In [1]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Report which PyTorch backend is usable on this machine: "mps" when PyTorch
# says that backend is available, otherwise "cpu".
# NOTE(review): the cells below pin the model and the training run to the CPU,
# so this value is only printed, never used to place tensors.
if torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print(f"Entrenando en: {device}")


Entrenando en: mps


In [3]:
# Sizes and seed are named so they can be tuned in one place and so that a
# Restart & Run All reproduces exactly the same train/test split.
SUBSET_SIZE = 30000
TEST_SIZE = 5000
SPLIT_SEED = 42

# Load the "train" split and keep only its first SUBSET_SIZE rows.
raw_dataset = load_dataset("cfahlgren1/react-code-instructions", split="train")
subset_dataset = raw_dataset.select(range(SUBSET_SIZE))

# Without an explicit seed the shuffle performed by train_test_split changes on
# every run, which makes evaluation numbers incomparable between runs.
dataset = subset_dataset.train_test_split(test_size=TEST_SIZE, seed=SPLIT_SEED)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['created_at', 'model', 'messages', 'recommended', 'upvoted'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['created_at', 'model', 'messages', 'recommended', 'upvoted'],
        num_rows: 5000
    })
})


In [4]:
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"

# Load the tokenizer and make sure it can pad: when the checkpoint defines no
# pad token, the end-of-sequence token is reused for padding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 4-bit NF4 quantization settings (double quantization, float16 compute).
# NOTE(review): bnb_config is built here but never passed to from_pretrained
# below, so the model is loaded as plain float16 -- confirm this is intended.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

# Half-precision weights, with the whole model mapped onto the CPU.
model_load_options = {
    "torch_dtype": torch.float16,
    "device_map": {"": "cpu"},
    "low_cpu_mem_usage": True,
}
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **model_load_options)

Loading checkpoint shards: 100%|██████████| 3/3 [00:17<00:00,  5.79s/it]


In [5]:
def preprocess(batch, max_length=512):
    """Flatten each conversation in ``batch["messages"]`` to one string and tokenize it.

    Args:
        batch: Mapping with a ``"messages"`` key holding one entry per example.
            A list entry is treated as a sequence of message dicts whose string
            ``"content"`` values are joined with single spaces. Any other entry
            is converted with ``str()``.
        max_length: Length every example is truncated or padded to.
            Defaults to 512.

    Returns:
        The result of calling the module-level ``tokenizer`` on the list of
        flattened texts.
    """
    texts = []
    for msgs in batch["messages"]:
        if isinstance(msgs, list):
            # Keep only real string contents: a message with content=None (or a
            # non-dict entry) would otherwise make " ".join raise TypeError and
            # abort the whole map over the dataset.
            contents = [
                m["content"]
                for m in msgs
                if isinstance(m, dict) and isinstance(m.get("content"), str)
            ]
            text = " ".join(contents)
        else:
            text = str(msgs)
        texts.append(text)

    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=max_length
    )

# Run preprocess over every split in batches. The original columns (taken from
# the train split) are dropped so only the tokenizer's outputs remain.
original_columns = dataset["train"].column_names
tokenized_dataset = dataset.map(preprocess, batched=True, remove_columns=original_columns)



Map: 100%|██████████| 25000/25000 [00:18<00:00, 1377.06 examples/s]
Map: 100%|██████████| 5000/5000 [00:03<00:00, 1511.73 examples/s]


In [6]:
# Collator that batches the tokenized examples for the Trainer; mlm=False turns
# off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [7]:
# Hyperparameters for the fine-tuning run. Evaluation and checkpointing both
# happen once per epoch, and at most two checkpoints are kept on disk.
training_args = TrainingArguments(
    output_dir="./mistral-finetuned-react",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False,
    # Mixed precision is explicitly disabled for both half-precision flavours.
    fp16=False,
    bf16=False,
    # `no_cuda` is deprecated in favour of `use_cpu`; both force the run onto
    # the CPU even when an accelerator is present.
    use_cpu=True,
)




In [8]:
# Wire the model, hyperparameters, tokenized splits and collator together.
# The tokenizer is passed as `processing_class`: the `tokenizer=` keyword of
# Trainer is deprecated and emits a FutureWarning on current transformers.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    processing_class=tokenizer,
    data_collator=data_collator,
)


  trainer = Trainer(


In [None]:
# Start the training run with the Trainer configured in the previous cell.
trainer.train()

In [None]:
# Write the model (through the trainer) and the tokenizer files into the same
# directory.
FINETUNED_DIR = "./mistral-finetuned-react"
trainer.save_model(FINETUNED_DIR)
tokenizer.save_pretrained(FINETUNED_DIR)