# Lab: Fine-tuning with Hugging Face


In [None]:
# Install the libraries
# (Google Colab already ships with them)
!pip install -U transformers datasets evaluate accelerate bertviz torch torchvision torchaudio -q


In [None]:
# This one does not come with Google Colab
!pip install evaluate

In [None]:
import os

from huggingface_hub import login

# Authenticate against the Hugging Face Hub without pasting a secret into the
# notebook: use the HF_TOKEN environment variable when it is set; otherwise
# token=None makes login() ask for the token interactively.
login(token=os.environ.get("HF_TOKEN"))


In [None]:
import evaluate
import numpy as np
import torch
from datasets import load_dataset
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          DataCollatorWithPadding, TrainingArguments, Trainer)

# Base checkpoint; another one can be swapped in,
# e.g. "dccuchile/bert-base-spanish-wwm-uncased" for Spanish text
MODEL_ID = "distilbert-base-uncased"

# 1) Dataset (example: binary SST-2; replace it with your own Spanish dataset).
#    The target column is renamed right away because Trainer expects "labels".
ds = load_dataset("glue", "sst2").rename_column("label", "labels")

# 2) Tokenizer matching the checkpoint
tok = AutoTokenizer.from_pretrained(MODEL_ID)

def preprocess(batch):
    """Tokenize the "sentence" entries of a batch with truncation enabled."""
    sentences = batch["sentence"]
    return tok(sentences, truncation=True)

# Drop the raw columns the model cannot consume, but keep "labels": removing
# every original column would also remove the targets, leaving the Trainer
# with nothing to compute a loss from.
drop_cols = [name for name in ds["train"].column_names if name != "labels"]
ds_tok = ds.map(preprocess, batched=True, remove_columns=drop_cols)

# 3) Data collator (pads each batch using the tokenizer's padding settings)
collator = DataCollatorWithPadding(tokenizer=tok)

# 4) Model with a classification head (num_labels=2 for binary classification)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_ID, num_labels=2)

# 5) Metrics
metric = evaluate.load("glue", "sst2")


def compute_metrics(eval_pred):
    """Score predictions with the GLUE/SST-2 metric.

    ``eval_pred`` unpacks into ``(logits, labels)``; the predicted class is the
    argmax over the last axis of the logits.
    """
    scores, references = eval_pred
    predicted = np.argmax(scores, axis=-1)
    return metric.compute(references=references, predictions=predicted)

# 6) Training arguments
args = TrainingArguments(
    output_dir="out-clf",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    # Evaluate and checkpoint once per epoch; both strategies are kept
    # identical because load_best_model_at_end needs them to line up.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # Mixed precision only when a CUDA GPU is present; a hard-coded True
    # makes the cell fail on CPU-only runtimes.
    fp16=torch.cuda.is_available(),
)

# 7) Trainer
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds_tok["train"],
    eval_dataset=ds_tok["validation"],
    # `processing_class` supersedes the deprecated `tokenizer` argument
    # (available in transformers >= 4.46).
    processing_class=tok,
    data_collator=collator,
    compute_metrics=compute_metrics,
)

# 8) Train and evaluate
# Order matters: fine-tune first, then score the trainer's evaluation dataset.
trainer.train()
eval_res = trainer.evaluate()
print(eval_res)

# 9) Save
# Writes the model currently held by the trainer to "out-clf/best".
trainer.save_model("out-clf/best")


In [None]:
# Challenge: extractive QA
from transformers import AutoModelForQuestionAnswering
# NOTE: rebinds the global `model` used by the classification cell.
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased")
# Preprocess with tokenizer(question, context, truncation=True, max_length=..., stride=..., return_overflowing_tokens=True, return_offsets_mapping=True)
# Use the default data collator or a QA-specific one; typical compute_metrics: EM/F1 (SQuAD).


In [None]:
# Challenge: NER (token classification)

from transformers import (
    AutoModelForTokenClassification,
    AutoTokenizer,
    DataCollatorForTokenClassification,
)

NER_CHECKPOINT = "bert-base-cased"

# The collator needs a tokenizer; load the one that belongs to the same
# checkpoint as the model instead of relying on a name from another cell.
tokenizer = AutoTokenizer.from_pretrained(NER_CHECKPOINT)

# NOTE(review): `num_tags` is not defined in the visible cells — set it to the
# number of entity tags in your dataset before running this cell.
model = AutoModelForTokenClassification.from_pretrained(NER_CHECKPOINT, num_labels=num_tags)
collator = DataCollatorForTokenClassification(tokenizer)
# Map the labels to token level (handle sub-words and use -100 on special tokens).


In [None]:
# Translation (seq2seq)

from transformers import AutoModelForSeq2SeqLM, AutoTokenizer, DataCollatorForSeq2Seq

MT_CHECKPOINT = "Helsinki-NLP/opus-mt-en-es"

# The collator needs the tokenizer that belongs to this translation checkpoint;
# a tokenizer left over from another cell would not match the model's vocabulary.
# NOTE(review): this tokenizer may need the `sentencepiece` package — confirm
# it is installed in your runtime.
tokenizer = AutoTokenizer.from_pretrained(MT_CHECKPOINT)
model = AutoModelForSeq2SeqLM.from_pretrained(MT_CHECKPOINT)
collator = DataCollatorForSeq2Seq(tokenizer, model=model)
# Tokenize using the text/summary/translation format; compute_metrics with sacreBLEU, etc.


## Fine-tuning en clasificación (SST-2, subconjunto)

In [None]:
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding
import evaluate
import random

# Checkpoint and its tokenizer for this quick demo
checkpoint = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

# Load SST-2 and shrink both splits so the demo runs fast:
# shuffle with a fixed seed, then keep the first 200 examples of each.
ds = load_dataset("glue", "sst2")
small_val = ds["validation"].shuffle(seed=42).select(range(200))
small_train = ds["train"].shuffle(seed=42).select(range(200))

def tokenize(ex):
    """Tokenize the "sentence" entries of ``ex`` with truncation enabled."""
    sentences = ex["sentence"]
    return tokenizer(sentences, truncation=True)

# Tokenize both subsets in batches
small_train = small_train.map(tokenize, batched=True)
small_val = small_val.map(tokenize, batched=True)

num_labels = 2
# No manual device placement here: no `device` variable exists in this
# notebook, and Trainer moves the model to the available device itself.
model = AutoModelForSequenceClassification.from_pretrained(checkpoint, num_labels=num_labels)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
# NOTE(review): the compute_metrics defined in this cell computes accuracy
# directly and does not call this metric object.
metric = evaluate.load("glue", "sst2")

def compute_metrics(eval_pred):
    """Return ``{"accuracy": ...}`` for an ``(logits, labels)`` pair.

    The predicted class is the argmax over the last axis of the logits;
    accuracy is the mean of the element-wise matches against ``labels``.
    """
    scores, labels = eval_pred
    predicted = np.argmax(scores, axis=-1)
    hits = predicted == labels
    return {"accuracy": hits.mean()}

# Training arguments for the quick demo run
args = TrainingArguments(
    output_dir="bert-sst2-demo",
    # `eval_strategy` is the current keyword; `evaluation_strategy` is its
    # older name and is rejected by recent transformers releases.
    eval_strategy="epoch",
    save_strategy="no",  # no checkpoints are written for this demo
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_steps=10,
    # Mixed precision only when a CUDA GPU is present
    fp16=torch.cuda.is_available(),
)

# Trainer wired to the 200-example train/validation subsets
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=small_train,
    eval_dataset=small_val,
    # `processing_class` supersedes the deprecated `tokenizer` argument
    # (available in transformers >= 4.46).
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

# Fine-tune on the small training subset, then score the validation subset.
trainer.train()
eval_res = trainer.evaluate()
# Bare expression: in a notebook this displays the metrics as the cell output.
eval_res
