In [1]:
# CELDA A: configuracion global
from pathlib import Path
import pandas as pd, numpy as np, json, torch
import matplotlib.pyplot as plt

# Global configuration shared by the cells of this notebook.
# NOTE(review): the K-Fold training cell further down re-assigns SEED,
# MODEL_NAME, OUT_BASE, FOLDS_CSV, N_EPOCHS (to 3), LR, WEIGHT_DECAY and
# LOGGING_STEPS. Once that cell has run, later cells see its values rather
# than the ones defined here.
SEED = 42
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"
OUT_BASE = Path("../models/beto_kfold")
FOLDS_CSV = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
# Create the output directory (and any missing parents) up front.
OUT_BASE.mkdir(parents=True, exist_ok=True)

# Hyperparameters we are going to tune
N_EPOCHS = 4                # raise within 3-5 to look for improvement
LR = 2e-5
TRAIN_BS = 8                # drop to 4 if VRAM runs out
EVAL_BS = 32
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 50
SAVE_TOTAL_LIMIT = 2


In [2]:
# 2_fine_tuning_beto_kfold.ipynb - celda principal de entrenamiento K-Fold
import os
from pathlib import Path
import numpy as np
import pandas as pd
import random
from collections import defaultdict

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    DataCollatorWithPadding,
    TrainingArguments,
    Trainer,
    set_seed
)
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score

# -------------------- CONFIG --------------------
# NOTE(review): SEED, MODEL_NAME, OUT_BASE, FOLDS_CSV, N_EPOCHS, LR,
# WEIGHT_DECAY and LOGGING_STEPS are also defined in the first configuration
# cell. Running this cell overwrites them (N_EPOCHS becomes 3 instead of 4),
# and any cell executed afterwards picks up these values.
SEED = 42
MODEL_NAME = "dccuchile/bert-base-spanish-wwm-cased"   # BETO
OUT_BASE = Path("../models/beto_kfold")
OUT_BASE.mkdir(parents=True, exist_ok=True)
FOLDS_CSV = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
N_EPOCHS = 3          # change as needed
LR = 2e-5
EVAL_STRATEGY = "epoch"
SAVE_STRATEGY = "epoch"
PER_DEVICE_TRAIN_BATCH_SIZE = 8   # adjust to the available GPU memory
PER_DEVICE_EVAL_BATCH_SIZE = 16
WEIGHT_DECAY = 0.01
LOGGING_STEPS = 50

# debug tip: for quick tests use N_EPOCHS=1 and a batch size of 8
# --------------------------------------------------------

# Seed the random number generators before any model or data work.
set_seed(SEED)
# `device` is only printed in this cell; the visible code never passes it to
# the model or to the Trainer.
device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

# -------------------- LOAD DATA --------------------
# Master table: one row per example together with its pre-assigned fold.
df_master = pd.read_csv(FOLDS_CSV, encoding="utf-8")
required_columns = {"text", "labels", "fold"}
assert required_columns.issubset(df_master.columns), df_master.columns

# Label mappings, derived once from the whole corpus so that id2label /
# label2id stay constant across folds. The label values are already numeric.
labels_sorted = df_master["labels"].unique().tolist()
labels_sorted.sort()
label2id = dict(zip(map(str, labels_sorted), map(int, labels_sorted)))
id2label = dict(zip(map(int, labels_sorted), map(str, labels_sorted)))
num_labels = len(labels_sorted)
print("num_labels:", num_labels, "label2id:", label2id)

# -------------------- TOKENIZER and COLLATOR --------------------
# Tokenization below leaves sequences unpadded; the collator is what pads
# each batch when it is assembled.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# tokenization helper
def tokenize_batch(batch):
    """Tokenize the ``"text"`` entry of a batch with the module-level tokenizer.

    Sequences are truncated to at most 256 tokens and are not padded here
    (``padding=False``).

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, truncation=True, padding=False, max_length=256)
    return encoded

# -------------------- METRICAS --------------------
def compute_metrics(eval_pred):
    """Turn raw evaluation output into accuracy and macro-averaged scores.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
        and ``recall_macro``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    # Shared options for the three macro-averaged scores; zero_division=0
    # scores a class with no predictions/support as 0.
    macro_opts = {"average": "macro", "zero_division": 0}
    return {
        "accuracy": accuracy_score(labels, preds),
        "f1_macro": f1_score(labels, preds, **macro_opts),
        "precision_macro": precision_score(labels, preds, **macro_opts),
        "recall_macro": recall_score(labels, preds, **macro_opts),
    }

# -------------------- FUNCION ENTRENAR POR FOLD --------------------
def train_one_fold(fold_id):
    """Fine-tune a fresh classifier with ``fold_id`` held out, then evaluate on it.

    Rows of ``df_master`` whose ``fold`` differs from ``fold_id`` form the
    training split; rows whose ``fold`` equals it form the validation split.
    A new model is loaded from ``MODEL_NAME`` on every call and trained with
    the module-level hyperparameters. Because ``load_best_model_at_end=True``
    with ``metric_for_best_model="f1_macro"``, the final evaluation and the
    saved model refer to the best checkpoint by that metric.

    Side effects: creates ``OUT_BASE / f"fold{fold_id}"`` and writes the
    Trainer checkpoints, ``metrics_fold.json`` and ``best_model/`` into it.

    Args:
        fold_id: Value of the ``fold`` column to hold out for validation.

    Returns:
        The metrics dict returned by ``trainer.evaluate`` on the held-out fold.

    Raises:
        ValueError: If either the training or the validation split is empty.
    """
    # Imported locally so the function does not rely on a module-level
    # ``import json`` that appears later in the cell having already run.
    import json

    print(f"\n\n=== TRAIN FOLD {fold_id} ===")

    held_out = df_master["fold"] == fold_id
    train_df = df_master.loc[df_master["fold"] != fold_id, ["text", "labels"]].reset_index(drop=True)
    val_df = df_master.loc[held_out, ["text", "labels"]].reset_index(drop=True)
    if train_df.empty or val_df.empty:
        # Fail early with a clear message instead of deep inside the Trainer.
        raise ValueError(
            f"fold {fold_id!r} yields an empty split "
            f"(train={len(train_df)}, val={len(val_df)})"
        )

    out_dir = OUT_BASE / f"fold{fold_id}"
    out_dir.mkdir(parents=True, exist_ok=True)

    def _to_torch_dataset(frame):
        # Build a tokenized dataset exposing only the tensors the model consumes.
        # Labels are cast once, vectorised, instead of with a per-row map pass.
        # int64 is spelled out because the platform default ``int`` can be 32-bit.
        frame = frame.assign(labels=frame["labels"].astype("int64"))
        ds = Dataset.from_pandas(frame)
        ds = ds.map(tokenize_batch, batched=True, batch_size=64)
        ds.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
        return ds

    ds_train = _to_torch_dataset(train_df)
    ds_val = _to_torch_dataset(val_df)

    # Load the model fresh for each fold so no weights carry over between folds.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id
    )

    training_args = TrainingArguments(
        output_dir=str(out_dir),
        evaluation_strategy=EVAL_STRATEGY,
        save_strategy=SAVE_STRATEGY,
        learning_rate=LR,
        per_device_train_batch_size=PER_DEVICE_TRAIN_BATCH_SIZE,
        per_device_eval_batch_size=PER_DEVICE_EVAL_BATCH_SIZE,
        num_train_epochs=N_EPOCHS,
        weight_decay=WEIGHT_DECAY,
        logging_strategy="steps",
        logging_steps=LOGGING_STEPS,
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
        save_total_limit=2,
        seed=SEED,
        fp16=torch.cuda.is_available(),   # use mixed precision when a GPU is present
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
    )

    # Train
    trainer.train()

    # Final evaluation on the held-out fold
    metrics = trainer.evaluate(ds_val)
    print("Fold metrics:", metrics)

    # Persist the per-fold metrics as JSON and save the model held by the trainer.
    (out_dir / "metrics_fold.json").write_text(json.dumps(metrics, default=str), encoding="utf-8")
    trainer.save_model(str(out_dir / "best_model"))

    return metrics

# -------------------- LOOP 5-FOLD --------------------
import json
# Train the folds in ascending order, storing each fold's final metrics
# under a "fold_<k>" key.
results = {}
fold_ids = sorted(df_master["fold"].unique())
for k in fold_ids:
    if torch.cuda.is_available():
        # Optional: clear the CUDA cache between folds.
        torch.cuda.empty_cache()
    metrics = train_one_fold(int(k))
    results.update({f"fold_{k}": metrics})

# -------------------- SUMMARY --------------------
# Build a dataframe with the key metrics of every fold. Each metric is read
# from its "eval_"-prefixed key first, then from the bare key, else None.
metric_cols = ["f1_macro", "accuracy", "precision_macro", "recall_macro"]
rows = []
for fold, m in results.items():
    row = {"fold": fold}
    for col in metric_cols:
        row[col] = m.get(f"eval_{col}", m.get(col, None))
    rows.append(row)
res_df = pd.DataFrame(rows)
res_df[metric_cols] = res_df[metric_cols].astype(float)

# Mean and standard deviation across folds.
described = res_df.describe()
summary = described.loc[["mean", "std"]]
print("\n\n=== SUMMARY ===")
print(res_df)
print(summary)

# Save the per-fold table as CSV.
csv_out = OUT_BASE / "kfold_results_summary.csv"
res_df.to_csv(csv_out, index=False)
print("‚úÖ Guardado resumen:", csv_out)


Device: cuda
num_labels: 4 label2id: {'0': 0, '1': 1, '2': 2, '3': 3}






=== TRAIN FOLD 0 ===


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4661, 'learning_rate': 1.6733333333333335e-05, 'epoch': 0.5}
{'loss': 0.0202, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0037644030526280403, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.3648, 'eval_samples_per_second': 542.749, 'eval_steps_per_second': 35.635, 'epoch': 1.0}
{'loss': 0.0049, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0036, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0018961597234010696, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2535, 'eval_samples_per_second': 780.967, 'eval_steps_per_second': 51.276, 'epoch': 2.0}
{'loss': 0.0026, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0023, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0015802672132849693, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2051, 'eval_samples_per_second': 965.152, 'eval_steps_per_second': 63.369, 'epoch': 3.0}
{'train_runtime': 66.2639, 'train_samples_per_second': 35.902, 'train_steps_per_second': 4.527, 'train_loss': 0.08328939239184062, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.0037644030526280403, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.4191, 'eval_samples_per_second': 472.423, 'eval_steps_per_second': 31.018, 'epoch': 3.0}


=== TRAIN FOLD 1 ===


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4827, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.5}
{'loss': 0.026, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.018651988357305527, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9902777777777778, 'eval_precision_macro': 0.9918032786885246, 'eval_recall_macro': 0.9891304347826086, 'eval_runtime': 0.2306, 'eval_samples_per_second': 858.788, 'eval_steps_per_second': 56.385, 'epoch': 1.0}
{'loss': 0.016, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0032, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0019352458184584975, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2188, 'eval_samples_per_second': 904.788, 'eval_steps_per_second': 59.405, 'epoch': 2.0}
{'loss': 0.0026, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0024, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.001600400311872363, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2032, 'eval_samples_per_second': 974.323, 'eval_steps_per_second': 63.971, 'epoch': 3.0}
{'train_runtime': 66.1755, 'train_samples_per_second': 35.95, 'train_steps_per_second': 4.533, 'train_loss': 0.08883576234181723, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.0019352458184584975, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.423, 'eval_samples_per_second': 468.062, 'eval_steps_per_second': 30.731, 'epoch': 3.0}


=== TRAIN FOLD 2 ===


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4731, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.5}
{'loss': 0.0142, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.04213862493634224, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9900779828940749, 'eval_precision_macro': 0.9906896551724138, 'eval_recall_macro': 0.989592094196804, 'eval_runtime': 0.2052, 'eval_samples_per_second': 965.037, 'eval_steps_per_second': 63.361, 'epoch': 1.0}
{'loss': 0.0044, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0031, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.042931824922561646, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9900779828940749, 'eval_precision_macro': 0.9906896551724138, 'eval_recall_macro': 0.989592094196804, 'eval_runtime': 0.212, 'eval_samples_per_second': 934.009, 'eval_steps_per_second': 61.324, 'epoch': 2.0}
{'loss': 0.0025, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0022, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.03865697234869003, 'eval_accuracy': 0.9949494949494949, 'eval_f1_macro': 0.9953008344312693, 'eval_precision_macro': 0.995, 'eval_recall_macro': 0.9956896551724138, 'eval_runtime': 0.233, 'eval_samples_per_second': 849.611, 'eval_steps_per_second': 55.783, 'epoch': 3.0}
{'train_runtime': 70.0093, 'train_samples_per_second': 33.981, 'train_steps_per_second': 4.285, 'train_loss': 0.08325882752736409, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.03865697234869003, 'eval_accuracy': 0.9949494949494949, 'eval_f1_macro': 0.9953008344312693, 'eval_precision_macro': 0.995, 'eval_recall_macro': 0.9956896551724138, 'eval_runtime': 0.4279, 'eval_samples_per_second': 462.715, 'eval_steps_per_second': 30.38, 'epoch': 3.0}


=== TRAIN FOLD 3 ===


Map:   0%|          | 0/792 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Map:   0%|          | 0/792 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/297 [00:00<?, ?it/s]

{'loss': 0.4541, 'learning_rate': 1.6632996632996633e-05, 'epoch': 0.51}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.01670241169631481, 'eval_accuracy': 0.9949748743718593, 'eval_f1_macro': 0.9950862318142403, 'eval_precision_macro': 0.9948979591836735, 'eval_recall_macro': 0.9953703703703703, 'eval_runtime': 0.2262, 'eval_samples_per_second': 879.764, 'eval_steps_per_second': 57.472, 'epoch': 1.0}
{'loss': 0.0487, 'learning_rate': 1.3400673400673401e-05, 'epoch': 1.01}
{'loss': 0.0048, 'learning_rate': 1.0033670033670035e-05, 'epoch': 1.52}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.003856295021250844, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2257, 'eval_samples_per_second': 881.85, 'eval_steps_per_second': 57.608, 'epoch': 2.0}
{'loss': 0.0043, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.02}
{'loss': 0.0042, 'learning_rate': 3.2996632996633e-06, 'epoch': 2.53}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0017929269233718514, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2643, 'eval_samples_per_second': 752.96, 'eval_steps_per_second': 49.188, 'epoch': 3.0}
{'train_runtime': 61.5353, 'train_samples_per_second': 38.612, 'train_steps_per_second': 4.827, 'train_loss': 0.08724421925014919, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.003856295021250844, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.3839, 'eval_samples_per_second': 518.298, 'eval_steps_per_second': 33.859, 'epoch': 3.0}


=== TRAIN FOLD 4 ===


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4188, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.5}
{'loss': 0.0573, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.004045216832309961, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2462, 'eval_samples_per_second': 804.237, 'eval_steps_per_second': 52.803, 'epoch': 1.0}
{'loss': 0.0044, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0125, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0042310478165745735, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.253, 'eval_samples_per_second': 782.561, 'eval_steps_per_second': 51.38, 'epoch': 2.0}
{'loss': 0.0027, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0022, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

{'eval_loss': 0.0015061694430187345, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2149, 'eval_samples_per_second': 921.258, 'eval_steps_per_second': 60.487, 'epoch': 3.0}
{'train_runtime': 61.7073, 'train_samples_per_second': 38.553, 'train_steps_per_second': 4.862, 'train_loss': 0.08299334287643433, 'epoch': 3.0}


  0%|          | 0/13 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.004045216832309961, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.2091, 'eval_samples_per_second': 947.0, 'eval_steps_per_second': 62.177, 'epoch': 3.0}


=== SUMMARY ===
     fold  f1_macro  accuracy  precision_macro  recall_macro
0  fold_0  1.000000  1.000000            1.000       1.00000
1  fold_1  1.000000  1.000000            1.000       1.00000
2  fold_2  0.995301  0.994949            0.995       0.99569
3  fold_3  1.000000  1.000000            1.000       1.00000
4  fold_4  1.000000  1.000000            1.000       1.00000
      f1_macro  accuracy  precision_macro  recall_macro
mean  0.999060  0.998990         0.999000      0.999138
std   0.002102  0.002259         0.002236      0.001928
‚úÖ Guardado resumen: ..\models\beto_kfold\kfold_results_summary.csv


In [3]:
# CELDA C: K-Fold con BETO (con early stopping)
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer, DataCollatorWithPadding, EarlyStoppingCallback, set_seed
from datasets import Dataset
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score
import warnings
# Silence every Python warning from this point on.
warnings.filterwarnings("ignore")

set_seed(SEED)
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print("Device:", device)

# Master table: one row per example together with its pre-assigned fold.
df_master = pd.read_csv(FOLDS_CSV, encoding="utf-8")
assert set(df_master.columns) >= {"text", "labels", "fold"}

# Label mappings derived from the whole corpus, shared by all folds.
labels_sorted = df_master["labels"].unique().tolist()
labels_sorted.sort()
label2id = dict(zip(map(str, labels_sorted), map(int, labels_sorted)))
id2label = dict(zip(map(int, labels_sorted), map(str, labels_sorted)))
num_labels = len(labels_sorted)
print("num_labels:", num_labels, "labels:", labels_sorted)

# Tokenization leaves sequences unpadded; the collator pads each batch.
tokenizer = AutoTokenizer.from_pretrained(
    MODEL_NAME,
    use_fast=True,
)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

def tokenize_batch(batch):
    """Encode the ``"text"`` entry of a batch with the module-level tokenizer.

    Truncates to at most 256 tokens and leaves the sequences unpadded.

    Args:
        batch: Mapping with a ``"text"`` entry holding the texts to encode.

    Returns:
        Whatever ``tokenizer`` returns for those texts.
    """
    options = {"truncation": True, "padding": False, "max_length": 256}
    return tokenizer(batch["text"], **options)

def compute_metrics(eval_pred):
    """Score predictions with accuracy and macro-averaged F1/precision/recall.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            argmax of ``logits`` along the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``f1_macro``, ``precision_macro``
        and ``recall_macro``.
    """
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)

    acc = accuracy_score(labels, preds)
    f1_mac = f1_score(labels, preds, average="macro", zero_division=0)
    prec = precision_score(labels, preds, average="macro", zero_division=0)
    rec = recall_score(labels, preds, average="macro", zero_division=0)

    scores = {}
    scores["accuracy"] = acc
    scores["f1_macro"] = f1_mac
    scores["precision_macro"] = prec
    scores["recall_macro"] = rec
    return scores

# K-Fold loop with early stopping: for every fold value, train a fresh model
# on the remaining folds, evaluate on the held-out fold and keep its scores.
# NOTE(review): this writes to the same OUT_BASE / "fold<k>" directories as the
# earlier K-Fold cell, so artefacts from that run are overwritten.
results = {}
torch_columns = ["input_ids", "attention_mask", "labels"]
metric_names = ("f1_macro", "accuracy", "precision_macro", "recall_macro")
for k in sorted(df_master["fold"].unique()):
    print(f"\n\n--- TRAIN FOLD {k} ---")
    out_dir = OUT_BASE / f"fold{k}"
    out_dir.mkdir(parents=True, exist_ok=True)

    # Split: fold k is held out for validation, everything else is training data.
    held_out = df_master["fold"] == k
    train_df = df_master.loc[df_master["fold"] != k, ["text", "labels"]].reset_index(drop=True)
    val_df = df_master.loc[held_out, ["text", "labels"]].reset_index(drop=True)

    ds_train = Dataset.from_pandas(train_df)
    ds_train = ds_train.map(tokenize_batch, batched=True, batch_size=64)
    ds_val = Dataset.from_pandas(val_df)
    ds_val = ds_val.map(tokenize_batch, batched=True, batch_size=64)

    # Expose only the tensors the model consumes.
    for split in (ds_train, ds_val):
        split.set_format(type="torch", columns=torch_columns)

    # Fresh weights for every fold.
    model = AutoModelForSequenceClassification.from_pretrained(
        MODEL_NAME,
        num_labels=num_labels,
        id2label=id2label,
        label2id=label2id,
    )

    training_args = TrainingArguments(
        output_dir=str(out_dir),
        # evaluate and checkpoint once per epoch
        evaluation_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=SAVE_TOTAL_LIMIT,
        # optimisation
        learning_rate=LR,
        weight_decay=WEIGHT_DECAY,
        num_train_epochs=N_EPOCHS,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        fp16=torch.cuda.is_available(),
        seed=SEED,
        # logging
        logging_strategy="steps",
        logging_steps=LOGGING_STEPS,
        # model selection: keep the checkpoint with the highest macro F1
        load_best_model_at_end=True,
        metric_for_best_model="f1_macro",
        greater_is_better=True,
    )

    early_stopping = EarlyStoppingCallback(early_stopping_patience=2)
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=ds_train,
        eval_dataset=ds_val,
        tokenizer=tokenizer,
        data_collator=data_collator,
        compute_metrics=compute_metrics,
        callbacks=[early_stopping],
    )

    trainer.train()
    metrics = trainer.evaluate(ds_val)
    print("Fold metrics:", metrics)

    # Persist the raw metrics and the model held by the trainer.
    metrics_path = out_dir / "metrics_fold.json"
    metrics_path.write_text(json.dumps(metrics, default=str))
    trainer.save_model(str(out_dir / "best_model"))

    # Keep only the headline scores; a missing metric becomes NaN.
    results[f"fold_{k}"] = {
        name: float(metrics.get(f"eval_{name}", np.nan)) for name in metric_names
    }

# Final summary: one row per fold, with the fold key as a regular "fold" column.
per_fold = pd.DataFrame.from_dict(results, orient="index")
res_df = per_fold.rename_axis("fold").reset_index()
csv_out = OUT_BASE / "kfold_results_summary.csv"
res_df.to_csv(csv_out, index=False)
print("‚úÖ Guardado resumen:", csv_out)
print(res_df)


Device: cuda
num_labels: 4 labels: [0, 1, 2, 3]


--- TRAIN FOLD 0 ---


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 0.4661, 'learning_rate': 1.6733333333333335e-05, 'epoch': 0.5}
{'loss': 0.0202, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0037641332019120455, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.178, 'eval_samples_per_second': 1112.419, 'eval_steps_per_second': 39.328, 'epoch': 1.0}
{'loss': 0.0049, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0036, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0018960441229864955, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1922, 'eval_samples_per_second': 1030.141, 'eval_steps_per_second': 36.419, 'epoch': 2.0}
{'loss': 0.0026, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0023, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0015802816487848759, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1579, 'eval_samples_per_second': 1253.823, 'eval_steps_per_second': 44.327, 'epoch': 3.0}
{'train_runtime': 63.3688, 'train_samples_per_second': 37.542, 'train_steps_per_second': 4.734, 'train_loss': 0.08328939239184062, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.0037641332019120455, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.326, 'eval_samples_per_second': 607.347, 'eval_steps_per_second': 21.472, 'epoch': 3.0}


--- TRAIN FOLD 1 ---


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4827, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.5}
{'loss': 0.026, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.018637211993336678, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9902777777777778, 'eval_precision_macro': 0.9918032786885246, 'eval_recall_macro': 0.9891304347826086, 'eval_runtime': 0.1799, 'eval_samples_per_second': 1100.87, 'eval_steps_per_second': 38.92, 'epoch': 1.0}
{'loss': 0.016, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0032, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0019353951793164015, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1388, 'eval_samples_per_second': 1426.565, 'eval_steps_per_second': 50.434, 'epoch': 2.0}
{'loss': 0.0026, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0024, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0016005111392587423, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1628, 'eval_samples_per_second': 1216.152, 'eval_steps_per_second': 42.995, 'epoch': 3.0}
{'train_runtime': 73.5472, 'train_samples_per_second': 32.347, 'train_steps_per_second': 4.079, 'train_loss': 0.08883576234181723, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.0019353951793164015, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1608, 'eval_samples_per_second': 1231.239, 'eval_steps_per_second': 43.529, 'epoch': 3.0}


--- TRAIN FOLD 2 ---


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4731, 'learning_rate': 1.6800000000000002e-05, 'epoch': 0.5}
{'loss': 0.0142, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0421612448990345, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9900779828940749, 'eval_precision_macro': 0.9906896551724138, 'eval_recall_macro': 0.989592094196804, 'eval_runtime': 0.151, 'eval_samples_per_second': 1311.345, 'eval_steps_per_second': 46.361, 'epoch': 1.0}
{'loss': 0.0044, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0031, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.042926888912916183, 'eval_accuracy': 0.98989898989899, 'eval_f1_macro': 0.9900779828940749, 'eval_precision_macro': 0.9906896551724138, 'eval_recall_macro': 0.989592094196804, 'eval_runtime': 0.1482, 'eval_samples_per_second': 1336.477, 'eval_steps_per_second': 47.249, 'epoch': 2.0}
{'loss': 0.0025, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0022, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.03865227475762367, 'eval_accuracy': 0.9949494949494949, 'eval_f1_macro': 0.9953008344312693, 'eval_precision_macro': 0.995, 'eval_recall_macro': 0.9956896551724138, 'eval_runtime': 0.156, 'eval_samples_per_second': 1269.591, 'eval_steps_per_second': 44.885, 'epoch': 3.0}
{'train_runtime': 62.232, 'train_samples_per_second': 38.228, 'train_steps_per_second': 4.821, 'train_loss': 0.08325882752736409, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.03865227475762367, 'eval_accuracy': 0.9949494949494949, 'eval_f1_macro': 0.9953008344312693, 'eval_precision_macro': 0.995, 'eval_recall_macro': 0.9956896551724138, 'eval_runtime': 0.2919, 'eval_samples_per_second': 678.37, 'eval_steps_per_second': 23.983, 'epoch': 3.0}


--- TRAIN FOLD 3 ---


Map:   0%|          | 0/792 [00:00<?, ? examples/s]

Map:   0%|          | 0/199 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/297 [00:00<?, ?it/s]

{'loss': 0.4541, 'learning_rate': 1.6632996632996633e-05, 'epoch': 0.51}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.016675373539328575, 'eval_accuracy': 0.9949748743718593, 'eval_f1_macro': 0.9950862318142403, 'eval_precision_macro': 0.9948979591836735, 'eval_recall_macro': 0.9953703703703703, 'eval_runtime': 0.1682, 'eval_samples_per_second': 1182.772, 'eval_steps_per_second': 41.605, 'epoch': 1.0}
{'loss': 0.0487, 'learning_rate': 1.3400673400673401e-05, 'epoch': 1.01}
{'loss': 0.0048, 'learning_rate': 1.0033670033670035e-05, 'epoch': 1.52}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.003854871727526188, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.176, 'eval_samples_per_second': 1130.516, 'eval_steps_per_second': 39.767, 'epoch': 2.0}
{'loss': 0.0043, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.02}
{'loss': 0.0042, 'learning_rate': 3.2996632996633e-06, 'epoch': 2.53}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0017921457765623927, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1682, 'eval_samples_per_second': 1183.333, 'eval_steps_per_second': 41.625, 'epoch': 3.0}
{'train_runtime': 68.3352, 'train_samples_per_second': 34.77, 'train_steps_per_second': 4.346, 'train_loss': 0.08724421925014919, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.003854871727526188, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.304, 'eval_samples_per_second': 654.602, 'eval_steps_per_second': 23.026, 'epoch': 3.0}


--- TRAIN FOLD 4 ---


Map:   0%|          | 0/793 [00:00<?, ? examples/s]

Map:   0%|          | 0/198 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-cased and are newly initialized: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


  0%|          | 0/300 [00:00<?, ?it/s]

{'loss': 0.4188, 'learning_rate': 1.6666666666666667e-05, 'epoch': 0.5}
{'loss': 0.0573, 'learning_rate': 1.3466666666666668e-05, 'epoch': 1.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.0040450915694236755, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.179, 'eval_samples_per_second': 1106.161, 'eval_steps_per_second': 39.107, 'epoch': 1.0}
{'loss': 0.0044, 'learning_rate': 1.0133333333333335e-05, 'epoch': 1.5}
{'loss': 0.0125, 'learning_rate': 6.800000000000001e-06, 'epoch': 2.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.004231057595461607, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1481, 'eval_samples_per_second': 1337.039, 'eval_steps_per_second': 47.269, 'epoch': 2.0}
{'loss': 0.0027, 'learning_rate': 3.4666666666666672e-06, 'epoch': 2.5}
{'loss': 0.0022, 'learning_rate': 1.3333333333333336e-07, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

{'eval_loss': 0.001506212865933776, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.1466, 'eval_samples_per_second': 1350.353, 'eval_steps_per_second': 47.74, 'epoch': 3.0}
{'train_runtime': 62.5302, 'train_samples_per_second': 38.046, 'train_steps_per_second': 4.798, 'train_loss': 0.08299334287643433, 'epoch': 3.0}


  0%|          | 0/7 [00:00<?, ?it/s]

Fold metrics: {'eval_loss': 0.0040450915694236755, 'eval_accuracy': 1.0, 'eval_f1_macro': 1.0, 'eval_precision_macro': 1.0, 'eval_recall_macro': 1.0, 'eval_runtime': 0.3153, 'eval_samples_per_second': 628.048, 'eval_steps_per_second': 22.204, 'epoch': 3.0}
‚úÖ Guardado resumen: ..\models\beto_kfold\kfold_results_summary.csv
     fold  f1_macro  accuracy  precision_macro  recall_macro
0  fold_0  1.000000  1.000000            1.000       1.00000
1  fold_1  1.000000  1.000000            1.000       1.00000
2  fold_2  0.995301  0.994949            0.995       0.99569
3  fold_3  1.000000  1.000000            1.000       1.00000
4  fold_4  1.000000  1.000000            1.000       1.00000


# Validaciones posteriores al resultado del entrenamiento

In [7]:
# DIAG 1: verificar tama√±os y ejemplos crudos (antes de tokenizar)
import pandas as pd
from pathlib import Path
# Load the master folds file; each row carries its text and its assigned fold.
PATH = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(PATH, encoding="utf-8")

# Fold under inspection (change the value and re-run to check other folds).
k = 0
is_val = df["fold"] == k
train_df = df.loc[~is_val].reset_index(drop=True)
val_df = df.loc[is_val].reset_index(drop=True)

print("Fold", k, "-> train:", len(train_df), " val:", len(val_df))
for header, split in (
    ("\n--- Muestras train (5) ---", train_df),
    ("\n--- Muestras val (5) ---", val_df),
):
    print(header)
    print(split["text"].head(5).to_list())

# Count raw texts that appear verbatim in both the train and validation splits.
train_set = set(train_df["text"].astype(str))
val_set = set(val_df["text"].astype(str))
print("\nIntersecci√≥n exacta textos train‚à©val:", len(train_set.intersection(val_set)))


Fold 0 -> train: 793  val: 198

--- Muestras train (5) ---
['Habla de verdad y solo cuenta su parte', 'La fuerza del pueblo es m√°s fuerte que cualquier crisis', 'M√°s del 60 % de los empleos creados en pandemia fueron informales', 'Se llena la boca de patria y vac√≠a los bolsillos del Estado', 'M√°s de 50 000 escolares abandonaron clases tras la pandemia']

--- Muestras val (5) ---
['No hay fuerza m√°s grande que la esperanza', '[PARTIDO] somos mayor√≠a, [ADVERSARIO] son ruido', '[ADVERSARIO] confunden cr√≠tica con traici√≥n, [PARTIDO] sabemos que es deber', 'El acceso a internet rural apenas cubre el 25 % de los hogares', 'El 80 % de los trabajadores agr√≠colas no tiene contrato']

Intersecci√≥n exacta textos train‚à©val: 0


In [8]:
# DIAG 2: comparar hash de input_ids tokenizados (fold 0)
from transformers import AutoTokenizer
import hashlib, numpy as np

# Fast tokenizer for the same checkpoint that is fine-tuned in this notebook.
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", use_fast=True)
def hash_input_ids(texts, tokenizer, batch_size=64, max_length=256):
    """Return the set of MD5 digests of the token-id sequence of every text.

    Two texts produce the same digest only when they tokenize to the same
    (truncated) id sequence, so intersecting the sets of two splits reveals
    examples that are identical after tokenization.

    Args:
        texts: list of strings to tokenize.
        tokenizer: callable invoked as
            ``tokenizer(batch, truncation=True, padding=False, max_length=...)``
            that returns a mapping whose ``"input_ids"`` entry holds one
            integer sequence per input text.
        batch_size: number of texts tokenized per call.
        max_length: truncation length passed to the tokenizer.

    Returns:
        set[str]: hexadecimal MD5 digests, one per distinct id sequence.
    """
    hset = set()
    for start in range(0, len(texts), batch_size):
        batch = texts[start:start + batch_size]
        # No return_tensors: unpadded sequences have different lengths, and
        # converting such a ragged batch to one array is version-dependent.
        enc = tokenizer(batch, truncation=True, padding=False, max_length=max_length)
        for ids in enc["input_ids"]:
            # A fixed dtype keeps the hashed bytes independent of the platform default int.
            digest = hashlib.md5(np.asarray(ids, dtype=np.int64).tobytes()).hexdigest()
            hset.add(digest)
    return hset

# Hash the tokenized texts of both splits (train_df / val_df are not defined in
# this cell; they must already exist in the session) and count shared digests.
train_hashes, val_hashes = (
    hash_input_ids(split["text"].astype(str).tolist(), tokenizer)
    for split in (train_df, val_df)
)
print("Hashes train:", len(train_hashes), " hashes val:", len(val_hashes))
print("Intersecci√≥n hashes tokenizados:", len(train_hashes.intersection(val_hashes)))


Hashes train: 793  hashes val: 198
Intersecci√≥n hashes tokenizados: 0


In [9]:
# DIAG 3: evaluar manualmente el best_model del fold 0 (sin Trainer)
import torch, numpy as np
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, accuracy_score

# Saved fold-0 model (adjust the path if your folder layout differs).
model_dir = "../models/beto_kfold/fold0/best_model"
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_dir)
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

# val_df is not defined in this cell; it must already exist in the session.
val_texts = val_df["text"].astype(str).tolist()
val_labels = val_df["labels"].astype(int).values

# Plain batched inference without Trainer, to cross-check the reported metrics.
bs = 32
preds = []
with torch.no_grad():
    for start in range(0, len(val_texts), bs):
        enc = tokenizer(
            val_texts[start:start + bs],
            padding=True,
            truncation=True,
            max_length=256,
            return_tensors="pt",
        ).to(model.device)
        logits = model(**enc).logits.cpu().numpy()
        preds.extend(np.argmax(logits, axis=1).tolist())

print("Eval manual accuracy:", accuracy_score(val_labels, preds))
print("Eval manual f1_macro:", f1_score(val_labels, preds, average="macro", zero_division=0))


Eval manual accuracy: 1.0
Eval manual f1_macro: 1.0


In [10]:
# DIAG 3: evaluar manualmente el best_model del fold 0 (sin Trainer)
import torch, numpy as np, pandas as pd
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from sklearn.metrics import f1_score, accuracy_score
from pathlib import Path

# Rebuild the fold-0 validation split from the folds file so this cell is self-contained.
PATH = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(PATH, encoding="utf-8")
k = 0
val_df = df.loc[df["fold"] == k].reset_index(drop=True)

# Saved fold-0 model (adjust the path if your folder layout differs).
model_dir = Path("../models/beto_kfold/fold0/best_model")
print("Comprobando existencia:", model_dir.exists())
tokenizer = AutoTokenizer.from_pretrained("dccuchile/bert-base-spanish-wwm-cased", use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(str(model_dir))
model = model.to("cuda" if torch.cuda.is_available() else "cpu")

val_texts = val_df["text"].astype(str).tolist()
val_labels = val_df["labels"].astype(int).values

# Batched inference without Trainer.
bs = 32
preds = []
for batch_start in range(0, len(val_texts), bs):
    batch = val_texts[batch_start:batch_start + bs]
    enc = tokenizer(batch, padding=True, truncation=True, max_length=256, return_tensors="pt").to(model.device)
    with torch.no_grad():
        logits = model(**enc).logits.cpu().numpy()
    preds += np.argmax(logits, axis=1).tolist()

print("Eval manual accuracy:", accuracy_score(val_labels, preds))
print("Eval manual f1_macro:", f1_score(val_labels, preds, average="macro", zero_division=0))

# Print up to 10 examples (label, prediction, text preview) for manual inspection.
preview = zip(val_labels[:10], preds[:10], val_texts[:10])
for idx, (label, pred, text) in enumerate(preview):
    print(idx, "lab:", label, "pred:", pred, "->", text[:180].replace("\n"," "))


Comprobando existencia: True
Eval manual accuracy: 1.0
Eval manual f1_macro: 1.0
0 lab: 3 pred: 3 -> No hay fuerza m√°s grande que la esperanza
1 lab: 1 pred: 1 -> [PARTIDO] somos mayor√≠a, [ADVERSARIO] son ruido
2 lab: 1 pred: 1 -> [ADVERSARIO] confunden cr√≠tica con traici√≥n, [PARTIDO] sabemos que es deber
3 lab: 2 pred: 2 -> El acceso a internet rural apenas cubre el 25 % de los hogares
4 lab: 2 pred: 2 -> El 80 % de los trabajadores agr√≠colas no tiene contrato
5 lab: 2 pred: 2 -> El d√©ficit fiscal se redujo al 2,1 % del PBI en el √∫ltimo a√±o
6 lab: 2 pred: 2 -> La inversi√≥n p√∫blica regional cay√≥ 14 % respecto al a√±o anterior
7 lab: 2 pred: 2 -> El 40 % de las familias peruanas vive sin acceso a alcantarillado
8 lab: 1 pred: 1 -> [ADVERSARIO] nos llaman resentidos, [PARTIDO] los llamamos responsables
9 lab: 2 pred: 2 -> La contaminaci√≥n del aire en Lima excede tres veces los l√≠mites de la OMS


In [11]:
# DIAG 5: inspeccionar trainer.args y eval_dataset
# Ejecuta esto **en la misma sesi√≥n** donde hayas creado trainer (o justo despu√©s de la training loop)
# Print a few TrainingArguments of `trainer` (not defined in this cell; it must
# already exist in the session).
print("Trainer args (s√≥lo algunas claves):")
# The loop variable is deliberately not called `k`: that name holds the fold
# index in other cells of this notebook and must not be overwritten here.
for arg_name in ["output_dir","evaluation_strategy","load_best_model_at_end","metric_for_best_model","save_strategy"]:
    print(arg_name, "->", getattr(trainer.args, arg_name, None))

print("\nEval dataset type:", type(getattr(trainer, "eval_dataset", None)))
try:
    ed = trainer.eval_dataset
    # Report the size when the dataset object supports len().
    if hasattr(ed, "__len__"):
        print("len(eval_dataset)=", len(ed))
    # Best effort: show the structure of the first formatted example.
    try:
        sample = ed[0]
        print("Ejemplo eval_dataset keys:", list(sample.keys()))
        if "labels" in sample:
            print("Ejemplo labels[0]:", sample["labels"])
        # Only the length of the token ids is printed, not the ids themselves.
        if "input_ids" in sample:
            print("input_ids len:", len(sample["input_ids"]))
    except Exception as e:
        print("No se pudo mostrar item de eval_dataset:", e)
except Exception as e:
    print("Error accediendo a trainer.eval_dataset:", e)


Trainer args (s√≥lo algunas claves):
output_dir -> ..\models\beto_kfold\fold4
evaluation_strategy -> IntervalStrategy.EPOCH
load_best_model_at_end -> True
metric_for_best_model -> f1_macro
save_strategy -> IntervalStrategy.EPOCH

Eval dataset type: <class 'datasets.arrow_dataset.Dataset'>
len(eval_dataset)= 198
Ejemplo eval_dataset keys: ['labels', 'input_ids', 'attention_mask']
Ejemplo labels[0]: tensor(2)
input_ids len: 13



## Evaluación de los diagnósticos de segundo nivel tras los resultados previos

In [12]:
# DIAG A: buscar label names dentro del texto
import re, pandas as pd
from pathlib import Path

F = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(F, encoding="utf-8")

# Prefer the label names stored in the data; otherwise fall back to a manual
# list (adjust it to your real label names).
if "labels_name" in df.columns:
    label_names = sorted(df["labels_name"].dropna().unique().tolist())
else:
    label_names = ["ad_hominem","logos","framing","retorica_vacia","retorica_vacia","retorica","vacia"]

text_col = df["text"].astype(str)

# Number of texts that literally contain each label name (case-insensitive);
# only names with at least one hit are recorded.
res = {}
for name in label_names:
    hits = int(text_col.str.contains(re.escape(name), case=False, na=False).sum())
    if hits > 0:
        res[name] = hits

print("Apariciones de label names en texto (si hay):", res)

# Show up to 10 offending texts per label name that was found.
for name, count in res.items():
    print(f"\n=== Ejemplos con '{name}' ===")
    found = text_col.str.contains(re.escape(name), case=False, na=False)
    print(df.loc[found, "text"].head(10).to_list())


Apariciones de label names en texto (si hay): {}


In [13]:
# DIAG B: buscar patrones de etiqueta num√©rica o coma+etiqueta al final
import re, pandas as pd
from pathlib import Path

F = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(F, encoding="utf-8")
texts_str = df["text"].astype(str)

# Texts ending with ", 0", ",0", " , 0", etc. (any single trailing digit).
# Both patterns are anchored only at the END of the text, so they must be
# searched anywhere in the string: `str.match` anchors at the start and would
# never find a trailing label. `str.contains` performs a regex search.
pattern_end_num = re.compile(r"[,\s]\s*[0-9]\s*$")
mask_end_num = texts_str.str.contains(pattern_end_num.pattern, regex=True, na=False)
print("textos que terminan con coma/espacio + d√≠gito (ej. ', 0'):", mask_end_num.sum())
if mask_end_num.any():
    print(df.loc[mask_end_num, "text"].head(10).to_list())

# Texts ending with a comma followed by a single word (e.g. ",ad_hominem").
pattern_end_word = re.compile(r",\s*[A-Za-z_√±√ë√°√©√≠√≥√∫√Å√â√ç√ì√ö-]{3,}\s*$")
mask_end_word = texts_str.str.contains(pattern_end_word.pattern, regex=True, na=False)
print("textos que terminan con coma + palabra (ej. ',ad_hominem'):", mask_end_word.sum())
if mask_end_word.any():
    print(df.loc[mask_end_word, "text"].head(10).to_list())


textos que terminan con coma/espacio + d√≠gito (ej. ', 0'): 0
textos que terminan con coma + palabra (ej. ',ad_hominem'): 0


In [14]:
# DIAG C: buscar apariciones de tokens ' 0 ' ' 1 ' como token independiente
import re, pandas as pd
from pathlib import Path

F = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(F, encoding="utf-8")
text_as_str = df["text"].astype(str)

# For every label id, count the texts containing that id as a digit that is
# not adjacent to another digit (i.e. not part of a longer digit run).
counts = {
    int(lid): int(
        text_as_str.str.contains(
            rf"(?<!\d){re.escape(str(lid))}(?!\d)", regex=True, na=False, case=False
        ).sum()
    )
    for lid in sorted(df["labels"].unique())
}

print("Apariciones de tokens num√©ricos exactos (posible fuga):", counts)

# Show up to 10 matching rows (text + label) for each id that was found.
for lid, cnt in counts.items():
    if cnt <= 0:
        continue
    print(f"\nEjemplos que contienen token '{lid}':")
    hit_mask = text_as_str.str.contains(rf'(?<!\d){lid}(?!\d)', regex=True, na=False)
    print(df.loc[hit_mask, ["text", "labels"]].head(10).to_dict(orient="records"))


Apariciones de tokens num√©ricos exactos (posible fuga): {0: 2, 1: 3, 2: 4, 3: 3}

Ejemplos que contienen token '0':
[{'text': 'El gasto en investigaci√≥n cient√≠fica es menor al 0,2 % del PBI', 'labels': 2}, {'text': 'El gasto en cultura representa menos del 0,1 % del presupuesto nacional', 'labels': 2}]

Ejemplos que contienen token '1':
[{'text': 'El d√©ficit fiscal se redujo al 2,1 % del PBI en el √∫ltimo a√±o', 'labels': 2}, {'text': 'El acceso a agua potable mejor√≥, pero a√∫n falta cobertura en 1,2 millones de hogares', 'labels': 2}, {'text': 'El gasto en cultura representa menos del 0,1 % del presupuesto nacional', 'labels': 2}]

Ejemplos que contienen token '2':
[{'text': 'El PBI del pa√≠s creci√≥ 2,5 % durante el √∫ltimo trimestre', 'labels': 2}, {'text': 'El d√©ficit fiscal se redujo al 2,1 % del PBI en el √∫ltimo a√±o', 'labels': 2}, {'text': 'El gasto en investigaci√≥n cient√≠fica es menor al 0,2 % del PBI', 'labels': 2}, {'text': 'El acceso a agua potable mejor√≥, pero a√

A continuación, tres pruebas concretas y ordenadas (muy rápidas) para ejecutar ahora. Primero detectamos si hay palabras, tokens o plantillas que permiten separar perfectamente las clases, al menos en esa ejecución concreta.

In [16]:
# TOP TOKENS por clase (TF-IDF + LogisticRegression entrenado en todo df)
from pathlib import Path
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Load the full corpus: raw texts plus integer labels.
FOLDS = Path("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv")
df = pd.read_csv(FOLDS, encoding="utf-8")
texts = df["text"].astype(str).tolist()
y = df["labels"].astype(int).values

# Unigram + bigram TF-IDF features; terms must occur in at least two documents.
vec = TfidfVectorizer(
    max_features=15000,
    ngram_range=(1, 2),
    min_df=2,
)
X = vec.fit_transform(texts)
vocab = np.array(vec.get_feature_names_out())

# One-vs-rest logistic regression fitted on the whole corpus; it is used to
# inspect per-class coefficients, not to estimate performance.
clf = LogisticRegression(
    max_iter=5000,
    solver="liblinear",
    multi_class="ovr",
)
clf.fit(X, y)

def top_tokens_for_class(clf, vocab, class_label, topn=20):
    """Return the ``topn`` highest-weighted tokens for one class of a linear model.

    Args:
        clf: fitted classifier exposing ``classes_`` and ``coef_``.
        vocab: NumPy array of feature names aligned with the columns of ``coef_``.
        class_label: the class label itself (not its index in ``clf.classes_``).
        topn: number of tokens to return; values <= 0 yield empty arrays.

    Returns:
        tuple: ``(tokens, weights)`` as arrays sorted by decreasing weight.

    Raises:
        ValueError: if ``class_label`` is not one of ``clf.classes_``.
    """
    # Map the label to its row position in the coefficient matrix.
    classes = list(clf.classes_)
    if class_label not in classes:
        raise ValueError(f"label {class_label} not in trained classifier classes: {classes}")
    pos = classes.index(class_label)

    coef_matrix = np.atleast_2d(np.asarray(clf.coef_))
    if coef_matrix.shape[0] == 1 and len(classes) == 2:
        # Binary problem: the single coefficient row describes classes_[1];
        # the weights of classes_[0] are its negation.
        coef = coef_matrix[0] if pos == 1 else -coef_matrix[0]
    else:
        coef = coef_matrix[pos]

    # Reverse first, then slice, so topn=0 gives an empty result instead of
    # the whole vocabulary (which a `[-0:]` slice would return).
    top_idx = np.argsort(coef)[::-1][:max(int(topn), 0)]
    return vocab[top_idx], coef[top_idx]

# Print the top 20 tokens for every label present in the dataframe.
labels_sorted = sorted(df["labels"].unique())
for lab in labels_sorted:
    toks, w = top_tokens_for_class(clf, vocab, lab, topn=20)
    print(f"\n=== Label {lab} ‚Äî top tokens (peso, token) ===")
    for tok_i, wt in zip(toks, w):
        # Convert the NumPy scalar to a plain float before formatting.
        wt_float = float(np.asarray(wt).item())
        print(f"{wt_float:.4f}\t{tok_i}")

# Collect the top 50 tokens per label as plain records for the CSV.
rows = [
    {"label": int(lab), "token": str(tok_i), "weight": float(np.asarray(wt).item())}
    for lab in labels_sorted
    for tok_i, wt in zip(*top_tokens_for_class(clf, vocab, lab, topn=50))
]

out = Path("../logs/top_tokens_by_label.csv")
# Create the target folder first; to_csv does not create missing directories.
out.parent.mkdir(parents=True, exist_ok=True)
pd.DataFrame(rows).to_csv(out, index=False, encoding="utf-8")
print("\nGuardado top tokens en:", out)



=== Label 0 ‚Äî top tokens (peso, token) ===
4.0122	promete
3.5170	pero
3.2571	habla
3.2245	habla de
3.0500	predica
2.5887	dice
2.0137	pol√≠tico
1.9992	que
1.9792	dice que
1.7194	el que
1.4268	quien
1.3589	su
1.0749	critica
0.9995	como
0.9702	vive
0.9592	gobierna
0.9331	siempre
0.9182	transparencia
0.8989	no
0.8688	al

=== Label 1 ‚Äî top tokens (peso, token) ===
8.0542	adversario
7.3069	partido
1.2397	viven
1.1766	adversario se
1.1245	adversario viven
0.9170	hablan
0.8722	partido seguimos
0.8722	seguimos
0.8387	partido la
0.7823	adversario hablan
0.7645	hablan de
0.7588	llaman
0.6950	desde
0.6686	reparten
0.6607	partido lo
0.6415	tenemos
0.6326	partido sabemos
0.6264	lo
0.6222	creen
0.6108	vivimos

=== Label 2 ‚Äî top tokens (peso, token) ===
4.2876	de
3.5847	de los
2.9911	el
2.1248	de las
2.0819	los
2.0003	las
1.5050	m√°s de
1.3286	en
1.2587	del
1.1785	rurales
1.0954	40
1.0760	60
1.0295	el 40
0.9861	70
0.9821	m√°s
0.9616	a√±o
0.9575	30
0.9560	70 de
0.9536	nacional
0.9443	25

=== Lab

In [17]:
# 1) Construir lista de tokens a enmascarar y evaluar LR (5-fold CV)
import re, pandas as pd, numpy as np
from pathlib import Path
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_score

# Read the per-label top tokens saved to CSV.
topdf = pd.read_csv("../logs/top_tokens_by_label.csv", encoding="utf-8")

# Keep the N highest-weighted tokens of each label, de-duplicated across labels.
N = 30
ranked = topdf.sort_values("weight", ascending=False)
top_per_label = ranked.groupby("label").head(N)
cands = top_per_label["token"].unique().tolist()

print("N√∫mero tokens a enmascarar:", len(cands))
print("Ejemplos:", cands[:40])

# funci√≥n que enmascara tokens (word-boundary, case-insensitive)
def mask_tokens(s, tokens):
    """Replace whole-word, case-insensitive occurrences of each token with ``<MASK>``.

    Tokens are applied one after another in the order given, so an earlier
    token can consume part of a later multi-word token. Whitespace in the
    result is collapsed to single spaces and trimmed.

    Args:
        s: input value; it is converted with ``str()`` before masking.
        tokens: iterable of literal strings to mask (regex-escaped internally).

    Returns:
        str: the masked, whitespace-normalized text.
    """
    padded = " " + str(s) + " "
    for token in tokens:
        padded = re.sub(r"(?i)\b" + re.escape(token) + r"\b", " <MASK> ", padded)
    return " ".join(padded.split())

# Load the master folds dataset and add a copy of the text with the candidate tokens masked.
df = pd.read_csv("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv", encoding="utf-8")
df["text_masked"] = [mask_tokens(text, cands) for text in df["text"].astype(str)]

# TF-IDF features over the masked text, then 5-fold stratified cross-validation.
vec = TfidfVectorizer(
    max_features=8000,
    ngram_range=(1, 2),
    min_df=2,
)
X_masked = vec.fit_transform(df["text_masked"].tolist())
y = df["labels"].astype(int).values

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
clf = LogisticRegression(max_iter=5000, multi_class="ovr", solver="liblinear")
scores_masked = cross_val_score(clf, X_masked, y, cv=skf, scoring="f1_macro")

mean_f1 = round(scores_masked.mean(), 4)
print("LR masked F1-macro por fold:", np.round(scores_masked,4), "mean:", mean_f1)


N√∫mero tokens a enmascarar: 120
Ejemplos: ['adversario', 'partido', 'de', 'promete', 'de los', 'pero', 'habla', 'habla de', 'predica', 'el', 'per√∫', 'dice', 'el per√∫', 'de las', 'los', 'pol√≠tico', 'las', 'que', 'dice que', 'es', 'somos', 'esperanza', 'nuestra', 'la', 'el que', 'cambio', 'ser√°', 'el cambio', 'juntos', 'm√°s de', 'futuro', 'hoy', 'quien', 'su', 'en', 'un', 'del', 'nada', 'fe', 'viven']
LR masked F1-macro por fold: [0.8474 0.8648 0.8475 0.8012 0.7954] mean: 0.8313


🧠 Interpretación

Tu modelo base (TF-IDF + LR) antes del enmascarado estaba probablemente en torno a 0.97–0.99 de F1-macro y, al quitar solo 120 tokens frecuentes (palabras y frases cortas), cae a 0.83.

➡️ Eso significa que casi 15-17 % de su poder predictivo venía de “atajos léxicos” (palabras que se repiten sistemáticamente por clase, como “ellos”, “nosotros”, “Perú”, “promete”, “más de”).

➡️ En otras palabras, el modelo no estaba generalizando realmente el sentido, sino asociando ciertos marcadores con clases.

📊 Qué implica esto

El dataset sigue siendo útil, pero debe refinarse para eliminar esos artefactos antes de seguir con el fine-tuning de BETO.

Si no se corrige, BETO aprenderá las mismas correlaciones y producirá un F1 ≈ 1 artificial.

La caída a 0.83 sigue mostrando que hay señal semántica real, solo que más delgada, lo que es bueno: el modelo aún distingue estilos, pero sin trampas léxicas bajará algo más la métrica y subirá la robustez.

In [18]:
import re, pandas as pd

# Reload the master folds dataset; the normalized text column is added to this frame.
df = pd.read_csv("../data/processed/clean_v2/folds/corpus_clean_v2_folds.csv", encoding="utf-8")

# One numeric literal: either space-grouped thousands ("50 000") or a plain
# digit run, optionally followed by decimal / dot-grouped parts ("2,5", "2.5",
# "1.200.000"). The lookahead stops "50 0001" from being read as "50 000" + "1".
_NUMBER_PATTERN = r"(?:\d{1,3}(?:[ \u00a0]\d{3})+(?!\d)|\d+)(?:[.,]\d+)*"
_PERCENT_PATTERN = _NUMBER_PATTERN + r"\s*%"


def normalize_numbers(text):
    """Replace numeric literals with placeholder tokens.

    Percentages (a number followed by optional whitespace and ``%``) become
    ``<PCT>``; every remaining number becomes ``<NUM>``. Comma or dot decimals
    and space-grouped thousands are treated as a single number. Whitespace in
    the result is collapsed to single spaces and trimmed.

    Args:
        text: input value; it is converted with ``str()`` before processing.

    Returns:
        str: the text with numbers replaced by ``<PCT>`` / ``<NUM>``.
    """
    s = str(text)
    # Percentages first, so their digits are not consumed by the generic rule.
    s = re.sub(_PERCENT_PATTERN, " <PCT> ", s)
    s = re.sub(_NUMBER_PATTERN, " <NUM> ", s)
    return " ".join(s.split())

# Add the number-normalized text as a new column and write the result to a separate CSV.
normalized_text = df["text"].map(normalize_numbers)
df["text_norm"] = normalized_text
df.to_csv(
    "../data/processed/clean_v2/folds/corpus_clean_v2_norm.csv",
    index=False,
)
print("Guardado dataset normalizado.")


Guardado dataset normalizado.


Hacemos TF-IDF sobre text_norm, entrenamos una LogisticRegression con 5-fold Stratified CV y mostramos el F1-macro por fold y la media.