In [None]:
import os
import shutil
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch.nn as nn
from torch.nn import CrossEntropyLoss
from collections import defaultdict

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BertTokenizer,
    BertForSequenceClassification)

In [None]:
def predict_sentiment_with_neutral(text, model, tokenizer, alpha=1.5, neutral_threshold=0.1):
    """Predict a class index for ``text``, with a heuristic override towards class 1.

    The model's softmax probabilities are used to build a "neutral score":
    ``1 - (|p[0] - 0.5| ** alpha + |p[2] - 0.5| ** alpha)``. The score is high
    when the probabilities of classes 0 and 2 are both close to 0.5. When it
    exceeds ``1 - neutral_threshold`` the function returns 1 (the index mapped
    to "Neutro" in this notebook); otherwise it returns the argmax class.

    Args:
        text: Input text to classify (a single string is expected, since the
            probabilities are indexed as a 1-D vector of three classes).
        model: Sequence-classification model exposing ``.parameters()`` and
            returning an object with a ``.logits`` attribute.
        tokenizer: Callable tokenizer whose result supports ``.to(device)``.
        alpha: Exponent applied to each distance from 0.5.
        neutral_threshold: Margin below 1.0 that the neutral score must exceed.

    Returns:
        int: 1 when the neutral rule fires, otherwise the argmax class index.
    """
    device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(device)

    # Inference must run with dropout disabled. The model may still be in
    # training mode (e.g. right after fine-tuning), so switch to eval for the
    # forward pass and restore the caller's mode afterwards.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            probs = torch.softmax(model(**inputs).logits, dim=1).squeeze()
    finally:
        model.train(was_training)

    neutral_score = 1 - (abs(probs[0] - 0.5) ** alpha + abs(probs[2] - 0.5) ** alpha)
    return 1 if neutral_score > (1 - neutral_threshold) else int(torch.argmax(probs))


In [None]:
# --- Load the labelled dataset and encode the tone labels as class ids -------
df = pd.read_excel("clasificador_analisis/clasificador/datasets_originales/tonos_dataset.xlsx")
df = df.rename(columns={"Mensaje": "text", "Etiqueta": "label"})
# Labels outside this mapping become NaN and are discarded by the dropna below.
df["label"] = df["label"].map({"Negativo": 0, "Neutro": 1, "Positivo": 2})
df = df.dropna()
# If any label was unmapped, .map() produced a float column and it stays float
# after dropna(). Cast back to int so the cross-entropy loss downstream
# receives integer class indices.
df["label"] = df["label"].astype(int)

# --- Pretrained checkpoint and output location -------------------------------
model_name = "VerificadoProfesional/SaBERT-Spanish-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)
base_dir = "clasificador_analisis/clasificador/clasificador_tono/classweights_crossentropy/comparativa"

# --- Class weights: inverse class frequency, normalised to sum to 1 ----------
# sort_index() orders the weights by class id (0, 1, 2) so they line up with
# the logit columns.
class_weights = 1. / df["label"].value_counts().sort_index()
class_weights = class_weights / class_weights.sum()
# `alpha` is the weight tensor handed to CrossEntropyLoss in the training cell.
alpha = torch.tensor(class_weights.values, dtype=torch.float)

# Numbers of training epochs to compare (currently only 4).
epoch_values = range(4, 5)

# --- Best-run tracking -------------------------------------------------------
mejor_accuracy = 0.0
mejor_comb = ""
# Std-dev of per-class F1 of the best run; lower is more balanced.
mejor_f1_equilibrio = float("inf")
mejor_result_path = ""

# --- Per-configuration metric history ----------------------------------------
accuracy_por_comb = defaultdict(list)
# Absolute recall gaps (in percentage points) between each pair of classes.
recall_dist_por_comb = defaultdict(lambda: {
    "neu-neg": [],
    "neu-pos": [],
    "neg-pos": []
})
top_results = []

In [None]:
# Shuffle the rows with a fixed seed. Despite the name, no class re-balancing
# happens here: the frame is only shuffled.
df_balanced = df.sample(frac=1, random_state=42).reset_index(drop=True)

# 80/20 train+valid / test split. The seed is passed explicitly: without it
# train_test_split draws a fresh random split on every run, so the test set
# (and the exported mejor_*.xlsx files) would not be reproducible.
dataset = Dataset.from_pandas(df_balanced).shuffle(seed=42).train_test_split(test_size=0.2, seed=42)
# Second 80/20 split of the training portion into train / validation.
train_valid = dataset["train"].train_test_split(test_size=0.2, seed=42)

train_dataset = train_valid["train"]
valid_dataset = train_valid["test"]
test_dataset = dataset["test"]


def tokenize(x):
    """Tokenize a batch of examples, padding/truncating every text to 128 tokens."""
    return tokenizer(x["text"], padding="max_length", truncation=True, max_length=128)


train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

class WeightedLossTrainer(Trainer):
    """Trainer that optimises a class-weighted cross-entropy loss.

    Trainer computes the loss through its own ``compute_loss`` method, so a
    function attached to the model (``model.compute_loss = ...``) is never
    called. Overriding the method here is what makes the class weights take
    effect during training and evaluation.
    """

    def __init__(self, *args, class_weights=None, **kwargs):
        super().__init__(*args, **kwargs)
        # Captured at construction time so later rebinding of notebook globals
        # cannot change the loss mid-run.
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        # **kwargs absorbs extra arguments some transformers versions pass
        # (e.g. num_items_in_batch).
        labels = inputs["labels"]
        # Run the forward pass without labels so the model does not also
        # compute its own (unweighted) loss.
        outputs = model(**{k: v for k, v in inputs.items() if k != "labels"})
        logits = outputs.logits
        loss_fn = CrossEntropyLoss(weight=self.class_weights.to(logits.device))
        loss = loss_fn(logits, labels)
        # Trainer expects (loss, model_outputs) when return_outputs=True.
        return (loss, outputs) if return_outputs else loss


# Fixed class ids / display names so the reports keep three rows even if a
# class happens to be absent from the predictions.
class_ids = [0, 1, 2]
class_names = ["Negativo", "Neutro", "Positivo"]
results_dir = "clasificador_analisis/clasificador/clasificador_tono/classweights_crossentropy"

for num_epochs in epoch_values:
    comb_key = f"ep{num_epochs}"
    path = f"{base_dir}/{comb_key}"
    print(f"\n📚 Entrenando modelo - Épocas: {num_epochs}")

    # Fresh model per configuration; the classification head is re-initialised
    # to 3 labels if the checkpoint's head has a different size.
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, ignore_mismatched_sizes=True)

    # NOTE(review): newer transformers releases rename `evaluation_strategy`
    # to `eval_strategy`; kept as-is to match the version this notebook targets.
    training_args = TrainingArguments(
        output_dir=path,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = WeightedLossTrainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
        class_weights=alpha,
    )

    trainer.train()

    # Make sure dropout is disabled before scoring the held-out test set.
    model.eval()

    df_test = test_dataset.to_pandas()
    df_test["label_predicted"] = df_test["text"].apply(lambda x: predict_sentiment_with_neutral(x, model, tokenizer))

    y_true = df_test["label"]
    y_pred = df_test["label_predicted"]

    print("\n📊 MATRIZ DE CONFUSIÓN:")
    print(confusion_matrix(y_true, y_pred, labels=class_ids))

    print("\n📈 CLASSIFICATION REPORT:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

    acc = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, labels=class_ids, output_dict=True, target_names=class_names)
    recall_neg, recall_neu, recall_pos = report["Negativo"]["recall"], report["Neutro"]["recall"], report["Positivo"]["recall"]

    accuracy_por_comb[comb_key].append(acc)
    # Pairwise recall gaps, in percentage points.
    recall_dist_por_comb[comb_key]["neu-neg"].append(abs(recall_neu - recall_neg) * 100)
    recall_dist_por_comb[comb_key]["neu-pos"].append(abs(recall_neu - recall_pos) * 100)
    recall_dist_por_comb[comb_key]["neg-pos"].append(abs(recall_neg - recall_pos) * 100)

    # Per-class F1 in one call; its spread measures how balanced the classes
    # are (lower = more balanced).
    f1s = f1_score(y_true, y_pred, labels=class_ids, average=None)
    f1_eq = np.std(f1s)

    top_results.append({
        "combinacion": comb_key,
        "accuracy": acc,
        "f1_eq": f1_eq,
        "ruta": path
    })

    # Higher accuracy wins; ties are broken by the more balanced per-class F1.
    es_mejor = (acc > mejor_accuracy) or (acc == mejor_accuracy and f1_eq < mejor_f1_equilibrio)
    if es_mejor:
        # Drop the previous best run's artefacts before recording the new one.
        if mejor_result_path and os.path.exists(mejor_result_path):
            shutil.rmtree(mejor_result_path)
            print(f"🗑️ Eliminado modelo anterior: {mejor_result_path}")

        mejor_accuracy = acc
        mejor_comb = comb_key
        mejor_result_path = path
        mejor_f1_equilibrio = f1_eq

        # Export the exact splits used by the best run, plus model and tokenizer.
        train_dataset.to_pandas().to_excel(f"{results_dir}/mejor_train.xlsx", index=False)
        valid_dataset.to_pandas().to_excel(f"{results_dir}/mejor_valid.xlsx", index=False)
        test_dataset.to_pandas().to_excel(f"{results_dir}/mejor_test.xlsx", index=False)
        model.save_pretrained(f"{path}/modelo_final")
        tokenizer.save_pretrained(f"{path}/modelo_final")

        print(f"💾 Guardado nuevo mejor modelo: {comb_key} | Accuracy: {acc:.4f} | Path: {path}")
    else:
        # Not an improvement: remove this run's checkpoints to save disk space.
        if path != mejor_result_path and os.path.exists(path):
            shutil.rmtree(path)
            print(f"⛔ Borrado modelo descartado: {path}")

print(f"\n🏁 Mejor combinación final → {mejor_comb} | Accuracy: {mejor_accuracy:.4f}")


In [None]:
# Reload the exported best model. The training cell saves it under
# "<base_dir>/ep<N>/modelo_final" with a relative base_dir, so the directory
# is "ep4" (no underscore) and the path carries no leading slash.
model_path = "clasificador_analisis/clasificador/clasificador_tono/classweights_crossentropy/comparativa/ep4/modelo_final"
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)
# Inference only: disable dropout.
model.eval()

def predict_sentiment_with_neutral_alpha_thresh(text, model, tokenizer, alpha, neutral_threshold):
    """Return a class index for ``text``, forcing class 1 when the neutral rule fires.

    The neutral score is ``1 - (|p[0] - 0.5| ** alpha + |p[2] - 0.5| ** alpha)``
    computed on the softmax probabilities. If it is greater than
    ``1 - neutral_threshold`` the result is 1; otherwise it is the argmax class.

    Args:
        text: Text to classify.
        model: Model exposing ``.parameters()`` and returning ``.logits``.
        tokenizer: Callable tokenizer whose output supports ``.to(device)``.
        alpha: Exponent applied to each distance from 0.5.
        neutral_threshold: Margin below 1.0 that the neutral score must exceed.

    Returns:
        int: 1 when the neutral rule fires, else the argmax class index.
    """
    target_device = next(model.parameters()).device
    encoded = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    encoded = encoded.to(target_device)

    # No gradients are needed for inference.
    with torch.no_grad():
        raw_logits = model(**encoded).logits

    class_probs = torch.softmax(raw_logits, dim=1).squeeze()
    p_first, p_last = class_probs[0], class_probs[2]
    # Combined distance of the two outer classes from 0.5.
    distance_from_half = abs(p_first - 0.5) ** alpha + abs(p_last - 0.5) ** alpha

    if 1 - distance_from_half > (1 - neutral_threshold):
        return 1
    return int(torch.argmax(class_probs))

# Test split exported by the training cell. The path matches where that cell
# writes it: relative, under ".../classweights_crossentropy/".
# NOTE(review): this rebinds `df`, which earlier cells use for the full
# dataset; re-run the data-loading cell before re-running the split/training.
# NOTE(review): alpha/threshold are selected on the same split they are scored
# on, so the reported accuracy is optimistic. Consider tuning on
# mejor_valid.xlsx instead; confirm intent.
df = pd.read_excel("clasificador_analisis/clasificador/clasificador_tono/classweights_crossentropy/mejor_test.xlsx")
df = df.rename(columns={"label": "label_manual_num"})

# Search grids for the neutral-rule exponent and threshold.
alpha_values = np.arange(1.0, 1.2, 0.1)
threshold_values = np.arange(0.1, 0.2, 0.1)

best_acc = 0
best_combo = None
best_preds = None
results = []

# The loop variable is deliberately NOT called `alpha`: that global holds the
# class-weight tensor used by the training cell's loss and must not be clobbered.
for alpha_exp in alpha_values:
    for threshold in threshold_values:
        print(f"🔍 Probando Alpha={alpha_exp:.2f}, Threshold={threshold:.2f}")
        preds = df["text"].apply(
            lambda x: predict_sentiment_with_neutral_alpha_thresh(x, model, tokenizer, alpha_exp, threshold)
        )
        acc = accuracy_score(df["label_manual_num"], preds)

        results.append({
            "alpha": round(alpha_exp, 2),
            "threshold": round(threshold, 2),
            "accuracy": acc
        })

        if acc > best_acc:
            best_acc = acc
            best_combo = (round(alpha_exp, 2), round(threshold, 2))
            best_preds = preds

            print(f"\n🔥 Nueva mejor combinación encontrada: Alpha={alpha_exp:.2f}, Threshold={threshold:.2f}, Accuracy={acc:.4f}")
            print("\n📊 MATRIZ DE CONFUSIÓN:")
            print(confusion_matrix(df["label_manual_num"], best_preds, labels=[0, 1, 2]))
            print("\n📈 CLASSIFICATION REPORT:")
            # Explicit labels keep the report aligned with target_names even
            # if a class is absent from both truth and predictions.
            print(classification_report(
                df["label_manual_num"],
                best_preds,
                labels=[0, 1, 2],
                target_names=["Negativo", "Neutro", "Positivo"],
                digits=3
            ))

# best_combo stays None when no combination scored above 0 accuracy; indexing
# it would raise a TypeError.
if best_combo is None:
    print("\n⚠️ Ninguna combinación superó una accuracy de 0; no hay mejor combinación.")
else:
    print(f"\n🏆 Mejor combinación final → Alpha={best_combo[0]}, Threshold={best_combo[1]}, Accuracy={best_acc:.4f}")
