In [1]:
!pip install -q datasets transformers trl peft

In [2]:
import logging
import argparse
import os
from datasets import load_from_disk
import sys
from transformers import (
    AutoTokenizer, 
    AutoModelForCausalLM,
    TrainingArguments,
    )
from transformers.trainer_utils import get_last_checkpoint
from trl import SFTTrainer, setup_chat_format
from peft import LoraConfig
from scripts.chat_template import DatasetProcessor


In [5]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [4]:
processor = DatasetProcessor()

dataset_configs = [
    {
        "name": "jrobador/mathinstruct_es",
        "input_key": "question",
        "output_key": "answer",
        "rename_columns": {"instruction": "question", "output": "answer"},
        "remove_columns": ["question", "answer", "Unnamed: 0", "source"]
    },
    {
        "name": "jrobador/gsm8k_es",
        "input_key": "question",
        "output_key": "answer",
        "remove_columns": ["question", "answer", "Unnamed: 0"]
    }
]

# Process datasets
remove_ids = [18865, 143408]  # IDs of questions without answers
combined_dataset = processor.process_hf_datasets(dataset_configs, remove_ids)

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

MathInstruct_spanish.csv:   0%|          | 0.00/206M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/262040 [00:00<?, ? examples/s]

Filter:   0%|          | 0/262040 [00:00<?, ? examples/s]

Map:   0%|          | 0/262038 [00:00<?, ? examples/s]

README.md:   0%|          | 0.00/24.0 [00:00<?, ?B/s]

GSM8K-Socratic_spanish.csv:   0%|          | 0.00/5.67M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/7473 [00:00<?, ? examples/s]

Filter:   0%|          | 0/7473 [00:00<?, ? examples/s]

Map:   0%|          | 0/7473 [00:00<?, ? examples/s]

In [6]:
def setup_logging():
    """Configure logging with proper format and handlers."""
    logger = logging.getLogger(__name__)
    logger.setLevel(logging.INFO)
    
    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    handler = logging.StreamHandler(sys.stdout)
    handler.setFormatter(formatter)
    
    # Clear existing handlers to avoid duplicate logs
    logger.handlers.clear()
    logger.addHandler(handler)
    
    return logger

# Define the arguments as variables
epochs = 1  # Number of training epochs
train_batch_size = 2  # Training batch size
warmup_ratio = 0.03  # Warmup ratio for learning rate scheduler
model_id = 'meta-llama/Llama-3.2-3B-Instruct'  # Model identifier from HuggingFace hub
learning_rate = 5e-5  # Learning rate
weight_decay = 0  # Weight decay
output_dir = "./opt/checkpoints"  # Output directory for checkpoints
max_seq_length = 256  # Maximum sequence length
gradient_accumulation_steps = 4  # Number of gradient accumulation steps
output_data_dir = "./opts/results"  # Output data directory

logger = setup_logging()
try:
    # load datasets
    train_dataset = combined_dataset
    logger.info(f" loaded train_dataset length is: {len(train_dataset)}")
    # download model from model hub
    model = AutoModelForCausalLM.from_pretrained(model_id)
    tokenizer = AutoTokenizer.from_pretrained(model_id)
    tokenizer.pad_token = tokenizer.eos_token
    # define training args
    training_arguments = TrainingArguments(
        output_dir=output_dir,
        overwrite_output_dir=True if get_last_checkpoint(output_dir) is not None else False,
        num_train_epochs=epochs,
        per_device_train_batch_size=train_batch_size,
        gradient_accumulation_steps=gradient_accumulation_steps,
        fp16=False,
        bf16=True,
        save_strategy="steps",
        save_steps = 500,
        max_steps=-1,
        max_grad_norm=0.2,
        warmup_ratio=warmup_ratio,
        logging_dir=f"{output_data_dir}/logs",
        learning_rate=float(learning_rate),
        weight_decay=weight_decay,
        logging_steps=10,
        lr_scheduler_type="constant",
        eval_steps=500,
    )
    peft_config = LoraConfig(
        lora_alpha=16,
        lora_dropout=0.1,
        r=32,
        bias="none",
        task_type="CAUSAL_LM"
    )
    # Create SFTTrainer instance
    trainer = SFTTrainer(
        model=model,
        args=training_arguments,
        peft_config=peft_config,
        train_dataset=train_dataset,
        tokenizer=tokenizer,
        max_seq_length=max_seq_length,
        dataset_text_field="prompt"
    )
    # train model
    if get_last_checkpoint(output_dir) is not None:
        logger.info("***** continue training *****")
        last_checkpoint = get_last_checkpoint(output_dir)
        trainer.train(resume_from_checkpoint=last_checkpoint)
    else:
        trainer.train()
    # Saves the model to s3 uses os.environ["SM_MODEL_DIR"] to make sure checkpointing works
    trainer.save_model(output_data_dir)
except Exception as e:
    logger.error(f"Training failed: {str(e)}")
    raise

config.json:   0%|          | 0.00/878 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/20.9k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/1.46G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.5k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/296 [00:00<?, ?B/s]


Deprecated positional argument(s) used in SFTTrainer, please use the SFTConfig to set these arguments instead.


Map:   0%|          | 0/269511 [00:00<?, ? examples/s]

Step,Training Loss
10,2.2494
20,2.1672
30,1.91
40,1.7643
50,1.7414
60,1.5396
70,1.5133
80,1.3268
90,1.3191
100,1.311


In [7]:
trainer.save_model(output_data_dir)

In [13]:
from peft import PeftModel, PeftConfig
from transformers import AutoModelForCausalLM
import torch

# Cargar el modelo base
base_model = AutoModelForCausalLM.from_pretrained(model_id)

# Cargar el adaptador LoRA
lora_model = PeftModel.from_pretrained(base_model, output_data_dir, torch_dtype=torch.bfloat16)

# Fusionar el adaptador con el modelo base
lora_model = lora_model.merge_and_unload()

# Guardar el modelo combinado
lora_model.save_pretrained("./merged")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
from huggingface_hub import login
login(token="YOUR_TOKEN_NOT_MINE")

In [None]:
lora_model.push_to_hub("jrobador/MatIA", tokenizer, token = "YOUR_TOKEN_NOT_MINE")


model-00001-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

Upload 3 LFS files:   0%|          | 0/3 [00:00<?, ?it/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/2.92G [00:00<?, ?B/s]

HTTP Error 500 thrown while requesting PUT https://hf-hub-lfs-us-east-1.s3.us-east-1.amazonaws.com/repos/5a/26/5a265fe048a51a515d30a7976bb8b5844f831c7570658a16b9829678be7b3fac/060c9a4155866e3e3aeb53c249e6c2e1d3d7b28207d6b73a341372cb92c27e51?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Content-Sha256=UNSIGNED-PAYLOAD&X-Amz-Credential=AKIA2JU7TKAQLC2QXPN7%2F20241122%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20241122T060507Z&X-Amz-Expires=86400&X-Amz-Signature=4481daae59fcae9f010568a91f0d303c8300385c4081663122920c7d23daf5b9&X-Amz-SignedHeaders=host&partNumber=62&uploadId=whf.VQVbp2AYpDmKABepZlm_HQK3Zd95iQE05WF97UNuylRa1x6kAFD2hwDsxPLH7T3QSBpz6.VzqmPqXSplzzAmLBsQHofqlVq25gngCceCNbe0h7S7gAYWmzjXC39T&x-id=UploadPart
Retrying in 1s [Retry 1/5].


CommitInfo(commit_url='https://huggingface.co/jrobador/MatIA/commit/e6b33a7a19ed1f1c1f71444b019ac1b7785ec50b', commit_message='Upload LlamaForCausalLM', commit_description='', oid='e6b33a7a19ed1f1c1f71444b019ac1b7785ec50b', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jrobador/MatIA', endpoint='https://huggingface.co', repo_type='model', repo_id='jrobador/MatIA'), pr_revision=None, pr_num=None)