In [None]:
import pandas as pd
import numpy as np
import torch
import shutil
import os
os.environ["WANDB_DISABLED"] = "true"
import matplotlib.pyplot as plt
from collections import defaultdict
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
from datasets import Dataset
import torch.nn as nn
from torch.nn import CrossEntropyLoss

In [None]:
# Load the labelled messages and normalise the two columns used downstream.
df = pd.read_excel("clasificador_analisis/clasificador/basico/tonos_dataset.xlsx")
df = df.rename(columns={"Mensaje": "text", "Etiqueta": "label"})

# Topic name -> integer class id. The insertion order also defines ``target_names``,
# so position i in ``target_names`` is the display name of class id i.
etiqueta_map = {
    "Gestión Pública e Instituciones": 0,
    "Economía, Empresa, Empleo e Infraestructuras": 1,
    "Sociedad, Igualdad y Derechos": 2,
    "Otros": 3
}
target_names = list(etiqueta_map.keys())

# ``Series.map`` turns every label that is not a key of ``etiqueta_map`` into NaN, which
# promotes the column to float64. ``dropna`` removes rows with a NaN in any column but
# leaves the float dtype behind, so the ids are cast back to integers explicitly:
# cross-entropy training needs integer class ids.
df["label"] = df["label"].map(etiqueta_map)
df = df.dropna().astype({"label": "int64"})

model_name = "VerificadoProfesional/SaBERT-Spanish-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_dir = "clasificador_analisis/clasificador/clasificador_tema/classweights_crossentropy/comparativa"

# Inverse-frequency class weights, normalised to sum to 1. The counts are reindexed over
# every class id so that position i of ``alpha`` always belongs to class i; without this,
# a class with no rows would silently shift the remaining weights onto the wrong classes.
class_counts = df["label"].value_counts().reindex(range(len(etiqueta_map)), fill_value=0)
if (class_counts == 0).any():
    clases_ausentes = [target_names[i] for i in class_counts.index[class_counts == 0]]
    raise ValueError(
        f"Cannot compute inverse-frequency class weights: no rows for classes {clases_ausentes}"
    )
class_weights = 1. / class_counts
class_weights = class_weights / class_weights.sum()
alpha = torch.tensor(class_weights.tolist(), dtype=torch.float)

# Epoch counts to compare: 3, 4, 5 and 6.
epoch_values = range(3, 7)  

# Running record of the best configuration found so far.
mejor_accuracy = 0
mejor_comb = ""
mejor_f1_equilibrio = float('inf')
mejor_result_path = ""
mejor_recall = 0
mejor_recall_std = float('inf')

# Per-configuration accuracy history and the full list of evaluated runs.
accuracy_por_comb = defaultdict(list)
top_results = []

In [None]:
def compute_loss(model, inputs, return_outputs=False, num_items_in_batch=None):
    """Class-weighted cross-entropy loss for a sequence-classification batch.

    Args:
        model: Callable invoked as ``model(**features)`` whose result exposes the
            classification scores through ``.get("logits")``.
        inputs: Mapping with the model features plus a ``"labels"`` entry holding
            the class ids. The mapping is not modified.
        return_outputs: When true, return ``(loss, outputs)`` instead of only the loss.
        num_items_in_batch: Accepted so the function can be called with the extra
            keyword that newer ``Trainer`` versions pass to ``compute_loss``; unused.

    Returns:
        The scalar loss tensor, or ``(loss, outputs)`` when ``return_outputs`` is true.

    Raises:
        ValueError: If ``inputs`` has no ``"labels"`` entry.
    """
    labels = inputs.get("labels")
    if labels is None:
        raise ValueError("compute_loss requires a 'labels' entry in inputs")

    # Run the forward pass without the labels: the loss is computed here with the class
    # weights, so the model's own built-in (unweighted) loss would be redundant work.
    features = {key: value for key, value in inputs.items() if key != "labels"}
    outputs = model(**features)
    logits = outputs.get("logits")

    # ``alpha`` is the module-level tensor of per-class weights; move it next to the logits.
    loss_fn = CrossEntropyLoss(weight=alpha.to(logits.device))
    # Class-index targets must be integer typed, so cast defensively.
    loss = loss_fn(logits.view(-1, logits.size(-1)), labels.view(-1).long())

    return (loss, outputs) if return_outputs else loss

In [None]:
# Shuffle the rows with a fixed seed and build the Hugging Face dataset.
df_balanced = df.sample(frac=1, random_state=43).reset_index(drop=True)
dataset = Dataset.from_pandas(df_balanced).shuffle(seed=43)

# 80/20 train+validation vs. test split. The seed is passed here as well: without it
# the test set changes on every run even though every other step is seeded, which makes
# the per-epoch comparison and the exported splits non-reproducible.
dataset = dataset.train_test_split(test_size=0.2, seed=43)
# Carve the validation set out of the training portion (another 80/20 split).
train_valid_split = dataset["train"].train_test_split(test_size=0.2, seed=43)
train_dataset = train_valid_split["train"]
valid_dataset = train_valid_split["test"]
test_dataset = dataset["test"]


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, padded/truncated to 128 tokens."""
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)


train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

class WeightedLossTrainer(Trainer):
    """Trainer that computes its loss with the notebook's class-weighted ``compute_loss``.

    ``Trainer`` calls ``self.compute_loss`` on the trainer object; a function assigned as
    an attribute of the model is never consulted. Overriding the method here is what
    actually makes training use the class weights.
    """

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # Extra keyword arguments passed by some Trainer versions are accepted and ignored.
        return compute_loss(model, inputs, return_outputs=return_outputs)


# Fixed list of class ids so the metrics always cover every class, in ``target_names``
# order, even when a class is absent from the test split or from the predictions.
label_ids = list(range(len(target_names)))

for num_epochs in epoch_values:
    comb_key = f"ep{num_epochs}"
    path = f"{base_dir}/{comb_key}"

    # Fresh model for every configuration so runs do not share fine-tuned weights.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_name, num_labels=len(target_names), ignore_mismatched_sizes=True
    )

    training_args = TrainingArguments(
        output_dir=path,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
        report_to="none",
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Evaluate on the held-out test split through the trainer: it batches the inputs,
    # handles device placement and runs the model in evaluation mode, instead of pushing
    # the whole test set through the model in a single forward pass.
    df_test = test_dataset.to_pandas()
    prediction_output = trainer.predict(test_dataset)
    logits = prediction_output.predictions
    if isinstance(logits, tuple):
        logits = logits[0]
    preds = np.argmax(logits, axis=1)

    y_true = df_test["label"].astype(int)
    y_pred = preds

    print("\n📊 MATRIZ DE CONFUSIÓN:")
    print(confusion_matrix(y_true, y_pred, labels=label_ids))
    print("\n📈 CLASSIFICATION REPORT:")
    print(classification_report(y_true, y_pred, labels=label_ids, target_names=target_names, zero_division=0))

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(
        y_true, y_pred, labels=label_ids, target_names=target_names, output_dict=True, zero_division=0
    )
    # Per-class recall: the mean is the macro recall, the std measures class imbalance.
    recalls = [report[label]["recall"] for label in target_names]
    recall_macro = np.mean(recalls)
    recall_std = np.std(recalls)

    accuracy_por_comb[comb_key].append(accuracy)

    top_results.append({
        "combinacion": comb_key,
        "accuracy": accuracy,
        "recall_macro": recall_macro,
        "recall_std": recall_std,
        "ruta": path
    })

    # A run replaces the current best when accuracy improves without macro recall
    # dropping; ties on accuracy are broken by higher macro recall, then by lower
    # recall spread across classes.
    es_mejor = (
        (accuracy > mejor_accuracy and recall_macro >= mejor_recall) or
        (accuracy == mejor_accuracy and recall_macro > mejor_recall) or
        (accuracy == mejor_accuracy and recall_macro == mejor_recall and recall_std < mejor_recall_std)
    )

    if es_mejor:
        anterior_path = mejor_result_path
        mejor_accuracy = accuracy
        mejor_comb = comb_key
        mejor_result_path = path
        mejor_recall = recall_macro
        mejor_recall_std = recall_std

        # Only one run directory is kept on disk: drop the previous best.
        if anterior_path and anterior_path != path and os.path.exists(anterior_path):
            shutil.rmtree(anterior_path)
            print(f"🗑️ Eliminado modelo anterior: {anterior_path}")

        # Export the exact splits the best model was trained and evaluated on.
        df_train = train_dataset.to_pandas()
        df_valid = valid_dataset.to_pandas()
        df_test_export = test_dataset.to_pandas()

        df_train.to_excel("clasificador_analisis/clasificador/clasificador_tema/classweights_crossentropy/mejor_train.xlsx", index=False)
        df_valid.to_excel("clasificador_analisis/clasificador/clasificador_tema/classweights_crossentropy/mejor_valid.xlsx", index=False)
        df_test_export.to_excel("clasificador_analisis/clasificador/clasificador_tema/classweights_crossentropy/mejor_test.xlsx", index=False)

        model.save_pretrained(f"{path}/modelo_final")
        tokenizer.save_pretrained(f"{path}/modelo_final")

        print(f"💾 Guardado nuevo mejor modelo: {comb_key} | Accuracy: {accuracy:.4f} | Recall: {recall_macro:.4f} | Path: {path}")
    else:
        # Discarded run: remove its output directory to free disk space.
        if path != mejor_result_path and os.path.exists(path):
            shutil.rmtree(path)
            print(f"⛔ Borrado modelo descartado: {path}")

print(f"\n🏁 Mejor combinación final → {mejor_comb} | Accuracy: {mejor_accuracy:.4f} | Recall: {mejor_recall:.4f} | STD: {mejor_recall_std:.4f}")