In [None]:
!pip uninstall torch torchvision torchaudio -y
!pip install torch==2.7.0 torchvision torchaudio --index-url https://download.pytorch.org/whl/cu128
!pip install \
  transformers==4.49.0 \
  accelerate==1.4.0 \
  bitsandbytes==0.45.3 \
  peft==0.14.0 \
  trl==0.15.2 \
  datasets==3.3.2 \
  huggingface_hub==0.28.1
  
!pip install \
  "numpy<2" \
  pandas==2.2.3 \
  matplotlib==3.9.4 \
  seaborn==0.13.2

Verifico el entorno en el que me encuentro, ya que estoy trabajando en un Jupyter remoto.

In [None]:
import torch

# Fail early with a readable message instead of an opaque CUDA error further down.
if not torch.cuda.is_available():
    raise RuntimeError("CUDA no está disponible: este notebook necesita una GPU.")

cap = torch.cuda.get_device_capability()
# Run a tiny computation on the GPU so that an unusable CUDA setup fails in this cell.
# (Stored under a regular name: `_` is IPython's "last output" variable.)
_probe = torch.tensor([1.0]).cuda() * 2

# Importing the training stack here surfaces broken installs before any heavy work starts.
import bitsandbytes as bnb
import transformers, peft, trl, datasets

print(f"PyTorch: {torch.__version__}")
print(f"GPU: {torch.cuda.get_device_name(0)}")
print(f"Compute: sm_{cap[0]}{cap[1]}")
print(f"VRAM total: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")

# Report the versions that were actually imported, so the run can be reproduced.
for lib in (transformers, peft, trl, datasets, bnb):
    print(f"{lib.__name__}: {lib.__version__}")

In [None]:
import torch
import json
import numpy as np
import matplotlib.pyplot as plt

from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    EarlyStoppingCallback,
)
from datasets import load_dataset, DatasetDict
from peft import LoraConfig
from trl import SFTTrainer, SFTConfig

# Single seed shared by every source of randomness configured in this notebook.
SEED = 42

# Seed PyTorch first, then NumPy's legacy global generator.
for _seed_rng in (torch.manual_seed, np.random.seed):
    _seed_rng(SEED)

Inicio sesión en Hugging Face para poder acceder a los modelos y datasets.

In [None]:
import os

from huggingface_hub import login

# Never write a real token into the notebook: it would be saved and shared with the file.
# The token is read from the HF_TOKEN environment variable; when the variable is unset,
# `token=None` makes `login` ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))

Descargo el dataset y hago un análisis exploratorio.


In [None]:
# Load the "train" split of the dataset from the Hugging Face Hub.
raw_dataset = load_dataset(path="JulianVelandia/ColombianAccent", split="train")

n_examples = len(raw_dataset)
print(f"Total ejemplos: {n_examples}")
print(f"Columnas: {raw_dataset.column_names}")

# Pretty-print the first record; ensure_ascii=False keeps accented characters readable.
first_example = raw_dataset[0]
print("Ejemplo:")
print(json.dumps(first_example, indent=2, ensure_ascii=False))

In [None]:
def _word_counts(texts):
    """Return the number of whitespace-separated words of every string in `texts`."""
    return [len(text.split()) for text in texts]


# Read each column directly instead of iterating over whole rows: row iteration
# materializes every column of every example, once per list being built.
instruct_lens = _word_counts(raw_dataset["instruction"])
response_lens = _word_counts(raw_dataset["response"])

fig, axes = plt.subplots(1, 2, figsize=(12, 4))
panels = (
    (instruct_lens, 'steelblue', 'Longitud instrucciones (palabras)'),
    (response_lens, 'coral', 'Longitud respuestas (palabras)'),
)
for ax, (lengths, color, title) in zip(axes, panels):
    ax.hist(lengths, bins=40, color=color)
    ax.set_title(title)
    # Label both axes so each histogram can be read on its own.
    ax.set_xlabel('Palabras')
    ax.set_ylabel('Ejemplos')
plt.tight_layout()
plt.show()

Hago un preprocesamiento del dataset para adaptarlo al formato que necesito para el fine-tuning

In [None]:
# System prompt, written in Spanish because it is sent to the model verbatim.
# format_example embeds it in every training example and generate_response reuses it
# at inference time, so the text must stay identical in both places.
SYSTEM_PROMPT = (
    "Eres un asesor comercial colombiano parce, trabajas vendiendo productos y servicios en Colombia. "
    "Hablas con el acento, las palabras y las expresiones típicas colombianas "
    "'chimba', 'hagale', 'de una', 'listo pues', 'sumerce', 'a lo bien', entre otras. "
    "Tu objetivo es atender al cliente de manera cálida y cercana, resolver sus dudas, manejar objeciones "
    "con argumentos sólidos, y cerrar la venta de forma natural. Eres persuasivo pero honesto, nunca presionas "
    "de mala manera. Siempre mantienes el tono 100% colombiano en cada respuesta."
)

def format_example(example):
    """Render one dataset row as a single chat-formatted training string.

    The result is three consecutive turns (system, user, assistant), each written as
    ``<|start_header_id|>ROLE<|end_header_id|>``, a blank line, the turn content and a
    closing ``<|eot_id|>``.

    Args:
        example: Mapping with the keys ``"instruction"`` and ``"response"``.

    Returns:
        A dict with the single key ``"text"`` holding the rendered conversation.
    """
    turns = (
        ("system", SYSTEM_PROMPT),
        ("user", example["instruction"]),
        ("assistant", example["response"]),
    )
    rendered = "".join(
        f"<|start_header_id|>{role}<|end_header_id|>\n\n{content}<|eot_id|>"
        for role, content in turns
    )
    return {"text": rendered}

# Apply the formatter to every row and drop the source columns, leaving only "text".
source_columns = raw_dataset.column_names
formatted_dataset = raw_dataset.map(format_example, remove_columns=source_columns)

print(f"Dataset: {len(formatted_dataset)} ejemplos")
print("Ejemplo:")
# Show only the first 500 characters of the first rendered example.
preview = formatted_dataset[0]['text'][:500]
print(preview + "...")

Divido el dataset en entrenamiento y validación

In [None]:
# Hold out 5% of the examples; passing the seed makes the split repeatable.
split = formatted_dataset.train_test_split(test_size=0.05, seed=SEED)

# train_test_split names the held-out part "test"; it is exposed here as "validation".
dataset_dict = DatasetDict(train=split["train"], validation=split["test"])

for split_label, split_key in (("Train", "train"), ("Validation", "validation")):
    print(f"{split_label}: {len(dataset_dict[split_key])} ejemplos")

Cargo el modelo base al que se le va a hacer fine-tuning y configuro el tokenizer.

In [None]:
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"
# Dedicated padding token expected to be part of the Llama 3.1 vocabulary.
PAD_TOKEN = "<|finetune_right_pad_id|>"

# 4-bit NF4 quantization with double quantization; computation runs in bfloat16.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_use_double_quant=True,
)

# trust_remote_code is intentionally not passed: the Llama architecture is implemented
# inside transformers, so there is no need to allow code shipped with the checkpoint to run.
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    torch_dtype=torch.bfloat16,
)

# Disable the KV cache on the model config for training.
model.config.use_cache = False

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
# Do NOT reuse EOS as the padding token: the language-modeling collator ignores every
# label equal to pad_token_id, so with pad == eos the end-of-turn token would be excluded
# from the loss and the model would not be trained to stop its answers.
if PAD_TOKEN in tokenizer.get_vocab():
    tokenizer.pad_token = PAD_TOKEN
else:
    # Fallback for a tokenizer without the dedicated token (previous behaviour).
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

Preparo la configuración de LoRA para el fine-tuning

In [None]:
# Module names that receive a LoRA adapter, grouped by the part of the layer they belong to.
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
mlp_projections = ["gate_proj", "up_proj", "down_proj"]

# Rank-64 adapters with lora_alpha set to twice the rank; bias terms are left untouched.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=64,
    lora_alpha=128,
    lora_dropout=0.05,
    bias="none",
    target_modules=attention_projections + mlp_projections,
)

Preparo la configuración de entrenamiento para el SFTTrainer: 2 épocas y un learning rate de 1e-4, porque cuento con 30k ejemplos en el dataset.


In [None]:
# NOTE(review): absolute path at the filesystem root; the process needs write access to "/".
OUTPUT_DIR = "/llama_ventas_co_checkpoints"

# Evaluate and checkpoint every EVAL_STEPS optimizer steps instead of once per epoch.
# With only 2 epochs, per-epoch evaluation produces just 2 evaluations, so an
# EarlyStoppingCallback with a patience of 2 could never fire before training ended.
EVAL_STEPS = 200

training_args = SFTConfig(
    output_dir=OUTPUT_DIR,
    # --- data ---
    max_seq_length=512,
    dataset_text_field="text",
    # --- schedule: 8 sequences per device x 4 accumulation steps per optimizer step ---
    num_train_epochs=2,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    # --- optimizer ---
    optim="paged_adamw_8bit",
    learning_rate=1e-4,
    lr_scheduler_type="cosine",
    warmup_ratio=0.05,
    weight_decay=0.01,
    max_grad_norm=0.3,
    # --- logging / evaluation / checkpoints ---
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=EVAL_STEPS,
    # Saving on the same cadence as evaluation is required by load_best_model_at_end.
    save_strategy="steps",
    save_steps=EVAL_STEPS,
    # Keep disk usage bounded now that checkpoints are written more often.
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # --- precision ---
    fp16=False,
    bf16=True,
    tf32=True,
    report_to="none",
    seed=SEED,
)

Inicio el entrenamiento del modelo con la configuración de LoRA, el tokenizador, los training_args y el dataset ya preprocesado. También aplico early stopping para no seguir entrenando si el modelo deja de mejorar.

In [None]:
# Early stopping configured with a patience of 2 evaluations and a 0.005 threshold.
early_stopping = EarlyStoppingCallback(
    early_stopping_patience=2,
    early_stopping_threshold=0.005,
)

trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    peft_config=lora_config,
    args=training_args,
    train_dataset=dataset_dict["train"],
    eval_dataset=dataset_dict["validation"],
    callbacks=[early_stopping],
)

# Replace model-card generation with a no-op on this trainer instance.
trainer.create_model_card = lambda *args, **kwargs: None

# Single pass over the parameters, tallying trainable and total element counts.
# NOTE(review): for 4-bit quantized weights numel() may count packed storage, so the
# percentage below is probably only approximate — confirm before quoting it.
trainable = 0
total = 0
for param in trainer.model.parameters():
    n_elements = param.numel()
    total += n_elements
    if param.requires_grad:
        trainable += n_elements

print(f"Parámetros entrenables: {trainable:,} ({100 * trainable / total:.2f}%)")

In [None]:
# Run the fine-tuning loop. As the last expression of the cell, the value returned by
# train() is displayed as the cell output.
trainer.train()

Analizo las métricas de entrenamiento

In [None]:
log_history = trainer.state.log_history

# Split the trainer's log: entries with 'loss' (and no 'eval_loss') are training logs,
# entries with 'eval_loss' are evaluation logs.
train_logs = [entry for entry in log_history if 'loss' in entry and 'eval_loss' not in entry]
eval_logs  = [entry for entry in log_history if 'eval_loss' in entry]

fig, axes = plt.subplots(1, 2, figsize=(15, 5))

if train_logs:
    axes[0].plot([entry['step'] for entry in train_logs], [entry['loss'] for entry in train_logs],
                 linewidth=2, color='steelblue')
    axes[0].set_xlabel('Steps'); axes[0].set_ylabel('Loss')
    axes[0].set_title('Training Loss', fontweight='bold'); axes[0].grid(True, alpha=0.3)

if eval_logs:
    axes[1].plot([entry['epoch'] for entry in eval_logs], [entry['eval_loss'] for entry in eval_logs],
                 marker='s', linewidth=2, color='coral')
    axes[1].set_xlabel('Epoch'); axes[1].set_ylabel('Eval Loss')
    axes[1].set_title('Validation Loss', fontweight='bold'); axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Each summary line is guarded by the list it reads from: indexing train_logs[-1] under
# an `if eval_logs` guard raises IndexError when evaluation ran but no training log exists.
if train_logs:
    print(f"Train loss final: {train_logs[-1]['loss']:.4f}")
if eval_logs:
    print(f"Val loss final:   {eval_logs[-1]['eval_loss']:.4f}")
    print(f"Mejor val loss:   {min(entry['eval_loss'] for entry in eval_logs):.4f}")

Pruebas finales con el modelo entrenado

In [None]:
def generate_response(user_message: str, max_new_tokens: int = 60) -> str:
    """Generate the assistant's reply to a single customer message.

    Builds a one-turn prompt (system prompt, user message and an open assistant header),
    samples a continuation from the global ``model`` and decodes only the newly
    generated tokens.

    Args:
        user_message: Customer text placed in the user turn.
        max_new_tokens: Upper bound on the number of tokens to generate.

    Returns:
        The decoded reply, with special tokens removed and surrounding whitespace stripped.
    """
    prompt = (
        "<|start_header_id|>system<|end_header_id|>\n\n"
        f"{SYSTEM_PROMPT}<|eot_id|>"
        "<|start_header_id|>user<|end_header_id|>\n\n"
        f"{user_message}<|eot_id|>"
        "<|start_header_id|>assistant<|end_header_id|>\n\n"
    )

    # Put the model in evaluation mode before sampling, so that dropout (configured on
    # the LoRA layers) cannot perturb generation if the model was left in training mode.
    model.eval()

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Stop on the tokenizer's EOS and on the end-of-turn marker. Both may resolve to the
    # same id, so duplicates (and unresolved ids) are dropped while keeping the order.
    candidate_stop_ids = [
        tokenizer.eos_token_id,
        tokenizer.convert_tokens_to_ids("<|eot_id|>"),
    ]
    stop_ids = [
        token_id for token_id in dict.fromkeys(candidate_stop_ids) if token_id is not None
    ]

    with torch.no_grad(), torch.autocast("cuda", dtype=torch.bfloat16):
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=0.7,
            top_p=0.9,
            repetition_penalty=1.3,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=stop_ids,
        )

    # generate() returns prompt + continuation; slice the prompt tokens off before decoding.
    prompt_length = inputs['input_ids'].shape[1]
    return tokenizer.decode(
        outputs[0][prompt_length:],
        skip_special_tokens=True
    ).strip()


# Customer messages used for a qualitative check of the fine-tuned model.
test_prompts = [
    "Hola, buenos días",
    "Ese precio me parece muy caro",
    "¿Qué garantía tiene el producto?",
    "Déjeme pensarlo y le aviso",
    "¿Por qué debería comprarle a usted y no a la competencia?",
    "No tengo plata ahorita",
    "¿Hacen descuentos?",
    "Muchas gracias, me interesa",
]

# Print each customer message followed by the generated reply, numbered from 1.
for turn_number, customer_message in enumerate(test_prompts, start=1):
    print(f"\n{turn_number} Cliente: {customer_message}")
    advisor_reply = generate_response(customer_message)
    print(f"Asesor: {advisor_reply}")
    print("\n---")

Guardo el modelo entrenado y el tokenizador

In [None]:
# Directory that receives both the trained model files and the tokenizer files.
FINAL_MODEL_PATH = "/llama_ventas_colombiano_LoRA"

# Save the model held by the trainer, then the tokenizer, into the same directory.
trained_model = trainer.model
trained_model.save_pretrained(FINAL_MODEL_PATH)
tokenizer.save_pretrained(save_directory=FINAL_MODEL_PATH)