In [11]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)


In [12]:
device = "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Entrenando en: {device}")


Entrenando en: mps


In [13]:
dataset = load_dataset("cfahlgren1/react-code-instructions", split="train")
dataset = dataset.select(range(30000))
dataset = dataset.train_test_split(test_size=5000)
print(dataset)


DatasetDict({
    train: Dataset({
        features: ['created_at', 'model', 'messages', 'recommended', 'upvoted'],
        num_rows: 25000
    })
    test: Dataset({
        features: ['created_at', 'model', 'messages', 'recommended', 'upvoted'],
        num_rows: 5000
    })
})


In [14]:
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.2"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16
)

model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto"
)

if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


ImportError: The installed version of bitsandbytes (<0.43.1) requires CUDA, but CUDA is not available. You may need to install PyTorch with CUDA support or upgrade bitsandbytes to >=0.43.1.

In [None]:
def preprocess(batch):
    texts = []
    for msgs in batch["messages"]:
        if isinstance(msgs, list):
            text = " ".join(m["content"] for m in msgs if "content" in m)
        else:
            text = str(msgs)
        texts.append(text)

    return tokenizer(
        texts,
        truncation=True,
        padding="max_length",
        max_length=1024  # subimos a 1024 si la VRAM lo permite
    )

tokenized_dataset = dataset.map(
    preprocess,
    batched=True,
    remove_columns=dataset["train"].column_names
)


In [None]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


In [None]:
training_args = TrainingArguments(
    output_dir="./mistral-finetuned-react",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    save_total_limit=2,
    push_to_hub=False
)


In [None]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
)


In [None]:
trainer.train()

In [None]:
trainer.save_model("./mistral-finetuned-react")
tokenizer.save_pretrained("./mistral-finetuned-react")
