In [1]:
import os
import shutil
import torch
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import torch.nn as nn

from collections import defaultdict
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, f1_score
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    BertTokenizer,
    BertForSequenceClassification
)
from sklearn.model_selection import train_test_split

  warn(


In [None]:
def predict_sentiment_with_neutral(text, model, tokenizer, alpha=1.5, neutral_threshold=0.1):
    """Predict a class index (0, 1 or 2) for ``text`` with a neutral override.

    The text is tokenized (truncated to 128 tokens), run through ``model`` and
    the logits are turned into probabilities with a softmax. A "neutral score"
    is then computed from the probabilities at index 0 and index 2::

        1 - |p0 - 0.5| ** alpha - |p2 - 0.5| ** alpha

    The score approaches 1 only when both of those probabilities are close to
    0.5. If it exceeds ``1 - neutral_threshold`` the function returns 1;
    otherwise it returns the arg-max class.

    Args:
        text: Input passed to ``tokenizer``; the indexing below assumes a
            single example (one row of logits with three columns).
        model: Callable model exposing ``parameters()`` and returning an
            object with a ``logits`` attribute.
        tokenizer: Callable returning a batch object that supports ``.to(device)``.
        alpha: Exponent applied to the distances from 0.5.
        neutral_threshold: Margin below 1 that the neutral score must exceed.

    Returns:
        int: 1 when the neutral rule fires, otherwise the index of the
        highest probability.
    """
    model_device = next(model.parameters()).device
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128).to(model_device)

    # Inference must run with dropout disabled. The model may still be in
    # training mode (e.g. right after fine-tuning), which would make the
    # predictions stochastic, so switch to eval mode for the forward pass and
    # restore the caller's mode afterwards.
    was_training = getattr(model, "training", False)
    if was_training:
        model.eval()
    try:
        with torch.no_grad():
            logits = model(**inputs).logits
    finally:
        if was_training:
            model.train()

    probs = torch.softmax(logits, dim=1).squeeze().tolist()
    neutral_score = 1 - abs(probs[0] - 0.5) ** alpha - abs(probs[2] - 0.5) ** alpha
    if neutral_score > (1 - neutral_threshold):
        return 1
    return int(torch.argmax(torch.tensor(probs)))

In [None]:


# --- Load the labelled tone dataset and normalise its column names ---
df = pd.read_excel("clasificador_analisis/clasificador/datasets_originales/tonos_dataset.xlsx")
df = df.rename(columns={"Mensaje": "text", "Etiqueta": "label"})

# Text labels -> integer class ids used everywhere below.
etiqueta_map = {"Negativo": 0, "Neutro": 1, "Positivo": 2}
df["label"] = df["label"].map(etiqueta_map)
# Labels missing from etiqueta_map become NaN, which silently turns the whole
# column into floats. Drop incomplete rows, then cast the ids back to integers
# so downstream code receives integer class ids rather than floats.
df = df.dropna().astype({"label": "int64"})

# --- Base checkpoint and tokenizer ---
model_name = "VerificadoProfesional/SaBERT-Spanish-Sentiment-Analysis"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Root directory under which each training run writes its outputs.
base_dir = "clasificador_analisis/clasificador/clasificador_tono/class_balanced/comparativa"

# --- Search space and best-run bookkeeping ---
epoch_values = range(4, 5)  # currently a single configuration: 4 epochs
mejor_accuracy = 0
mejor_comb = ""
mejor_result_path = ""
mejor_f1_equilibrio = float("inf")  # lower is better (spread of per-class F1)
cumple_f1_al_menos_una_vez = False

# Per-configuration metric histories, keyed by the run identifier.
accuracy_por_comb = defaultdict(list)
recall_dist_por_comb = defaultdict(lambda: {"neu-neg": [], "neu-pos": [], "neg-pos": []})
top_results = []


In [None]:
# --- Balance the three classes by undersampling ---
# Use the smallest class as the target size. Sampling to the size of the
# neutral class alone raises ValueError whenever Negativo or Positivo has fewer
# rows than Neutro. When Neutro is the smallest class this is identical to
# sampling the other two down to its size.
class_sizes = [int((df["label"] == class_id).sum()) for class_id in (0, 1, 2)]
target_size = min(class_sizes)
if target_size == 0:
    raise ValueError(f"Cannot balance classes: at least one class is empty (sizes={class_sizes})")

df_neu = df[df["label"] == 1]
if len(df_neu) > target_size:
    # Only needed when Neutro is not the minority class.
    df_neu = df_neu.sample(n=target_size, random_state=42)
df_neg = df[df["label"] == 0].sample(n=target_size, random_state=42)
df_pos = df[df["label"] == 2].sample(n=target_size, random_state=42)

# Shuffle the concatenated frame so the classes are interleaved.
df_balanced = pd.concat([df_neg, df_neu, df_pos]).sample(frac=1, random_state=42).reset_index(drop=True)

# --- Stratified split: 20% test, then 20% of the remainder for validation ---
df_temp, df_test = train_test_split(df_balanced, test_size=0.2, stratify=df_balanced["label"], random_state=42)
df_train, df_valid = train_test_split(df_temp, test_size=0.2, stratify=df_temp["label"], random_state=42)

train_dataset = Dataset.from_pandas(df_train)
valid_dataset = Dataset.from_pandas(df_valid)
test_dataset = Dataset.from_pandas(df_test)


def tokenize(batch):
    """Tokenize the ``text`` column of a batch, padded/truncated to 128 tokens."""
    return tokenizer(batch["text"], padding="max_length", truncation=True, max_length=128)


train_dataset = train_dataset.map(tokenize, batched=True)
valid_dataset = valid_dataset.map(tokenize, batched=True)
test_dataset = test_dataset.map(tokenize, batched=True)

# Train one model per epoch setting, evaluate it on the test split with the
# neutral-aware predictor, and keep on disk only the best run so far.
# NOTE(review): the best run is selected on test-set accuracy, so the reported
# test metrics are optimistic; consider selecting on the validation split.
class_ids = [0, 1, 2]
class_names = ["Negativo", "Neutro", "Positivo"]

for num_epochs in epoch_values:
    comb_key = f"ep{num_epochs}"
    path = f"{base_dir}/ep_{num_epochs}"

    # Seed before building the model: with ignore_mismatched_sizes=True the
    # classification head may be freshly initialised, and that happens before
    # the Trainer is created. Seeding here makes every configuration start from
    # the same head weights.
    torch.manual_seed(42)
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=3, ignore_mismatched_sizes=True)

    training_args = TrainingArguments(
        output_dir=path,
        evaluation_strategy="epoch",
        save_strategy="epoch",
        learning_rate=3e-5,
        per_device_train_batch_size=16,
        per_device_eval_batch_size=16,
        num_train_epochs=num_epochs,
        weight_decay=0.01,
        logging_dir="./logs",
        logging_steps=10,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=valid_dataset,
        tokenizer=tokenizer,
    )

    trainer.train()

    # Make sure dropout is disabled for the test-set predictions regardless of
    # the mode the training loop left the model in.
    model.eval()

    # Note: this frame also carries the tokenizer output columns added by
    # Dataset.map, and it is what gets written to mejor_test.xlsx below.
    df_test = test_dataset.to_pandas()
    preds = df_test["text"].apply(lambda x: predict_sentiment_with_neutral(x, model, tokenizer))
    df_test["label_predicted"] = preds

    y_true = df_test["label"]
    y_pred = df_test["label_predicted"]

    # Passing labels explicitly keeps the matrix 3x3 and keeps target_names
    # aligned even if some class is never predicted.
    print("\n📊 MATRIZ DE CONFUSIÓN:")
    print(confusion_matrix(y_true, y_pred, labels=class_ids))

    print("\n📈 CLASSIFICATION REPORT:")
    print(classification_report(y_true, y_pred, labels=class_ids, target_names=class_names))

    accuracy = accuracy_score(y_true, y_pred)
    report = classification_report(y_true, y_pred, labels=class_ids, output_dict=True, target_names=class_names)
    recall_neg = report["Negativo"]["recall"]
    recall_neu = report["Neutro"]["recall"]
    recall_pos = report["Positivo"]["recall"]

    # Pairwise recall gaps, in percentage points.
    accuracy_por_comb[comb_key].append(accuracy)
    recall_dist_por_comb[comb_key]["neu-neg"].append(abs(recall_neu - recall_neg) * 100)
    recall_dist_por_comb[comb_key]["neu-pos"].append(abs(recall_neu - recall_pos) * 100)
    recall_dist_por_comb[comb_key]["neg-pos"].append(abs(recall_neg - recall_pos) * 100)

    # Spread of the per-class F1 scores: lower means a more even classifier.
    f1s = f1_score(y_true, y_pred, labels=class_ids, average=None)
    f1_eq = np.std(f1s)

    top_results.append({
        "combinacion": comb_key,
        "accuracy": accuracy,
        "f1_eq": f1_eq,
        "ruta": path
    })

    # Higher accuracy wins; on an exact tie, the more balanced F1 wins.
    es_mejor = (accuracy > mejor_accuracy) or (accuracy == mejor_accuracy and f1_eq < mejor_f1_equilibrio)

    if es_mejor:
        anterior_path = mejor_result_path
        mejor_accuracy = accuracy
        mejor_comb = comb_key
        mejor_result_path = path
        mejor_f1_equilibrio = f1_eq

        # Remove the artefacts of the run that has just been superseded.
        if anterior_path and anterior_path != path and os.path.exists(anterior_path):
            shutil.rmtree(anterior_path)
            print(f"🗑️ Eliminado modelo anterior: {anterior_path}")

        df_train.to_excel("clasificador_analisis/clasificador/clasificador_tono/class_balanced/mejor_train.xlsx", index=False)
        df_valid.to_excel("clasificador_analisis/clasificador/clasificador_tono/class_balanced/mejor_valid.xlsx", index=False)
        df_test.to_excel("clasificador_analisis/clasificador/clasificador_tono/class_balanced/mejor_test.xlsx", index=False)

        model.save_pretrained(f"{path}/modelo_final")
        tokenizer.save_pretrained(f"{path}/modelo_final")

        print(f"💾 Guardado nuevo mejor modelo: {comb_key} | Accuracy: {accuracy:.4f} | Path: {path}")
    else:
        # Discard the checkpoints of a run that did not beat the current best.
        if path != mejor_result_path and os.path.exists(path):
            shutil.rmtree(path)
            print(f"⛔ Borrado modelo descartado: {path}")

print(f"\n🏁 Mejor combinación final → {mejor_comb} | Accuracy: {mejor_accuracy:.4f}")


In [None]:
# Reload the saved 4-epoch model and its tokenizer for inference.
model_path = "clasificador_analisis/clasificador/clasificador_tono/class_balanced/comparativa/ep_4/modelo_final"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)
model.eval()  # inference only: disable dropout

# Test split saved by the training cell; the manual labels get an explicit
# name so they are not confused with predictions.
df = pd.read_excel(
    "clasificador_analisis/clasificador/clasificador_tono/class_balanced/mejor_test.xlsx"
).rename(columns={"label": "label_manual_num"})

In [None]:
def predict_sentiment_with_neutral_alpha_thresh(text, model, tokenizer, alpha, neutral_threshold):
    """Predict a class index (0, 1 or 2) for ``text`` with explicit rule parameters.

    Same decision rule as ``predict_sentiment_with_neutral`` but with ``alpha``
    and ``neutral_threshold`` required, so they can be swept in a grid search.
    The neutral score is ``1 - |p0 - 0.5| ** alpha - |p2 - 0.5| ** alpha``,
    where ``p0`` and ``p2`` are the softmax probabilities at index 0 and 2.
    If it exceeds ``1 - neutral_threshold`` the function returns 1; otherwise
    it returns the arg-max class.

    Args:
        text: Input passed to ``tokenizer``; the indexing below assumes a
            single example (one row of logits with three columns).
        model: Callable model exposing ``parameters()`` and returning an
            object with a ``logits`` attribute.
        tokenizer: Callable returning a batch object that supports ``.to(device)``.
        alpha: Exponent applied to the distances from 0.5.
        neutral_threshold: Margin below 1 that the neutral score must exceed.

    Returns:
        int: 1 when the neutral rule fires, otherwise the index of the
        highest probability.
    """
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # Put the inputs on the same device as the model; without this the forward
    # pass fails as soon as the model lives on a GPU.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        inputs = inputs.to(first_param.device)

    with torch.no_grad():
        logits = model(**inputs).logits

    probs = torch.softmax(logits, dim=1).squeeze().tolist()
    neutral_score = 1 - abs(probs[0] - 0.5) ** alpha - abs(probs[2] - 0.5) ** alpha
    if neutral_score > (1 - neutral_threshold):
        return 1
    return int(torch.argmax(torch.tensor(probs)))

# --- Grid search over the neutral-rule parameters (alpha, threshold) ---
results = []
# Explicit grids instead of np.arange with a fractional step: because of
# floating-point rounding, np.arange(1.0, 1.3, 0.1) actually yields FOUR points
# (1.0, 1.1, 1.2 and ~1.3), and np.arange(0.5, 0.6, 0.1) yields just 0.5.
# The grids below spell out that effective search space.
alpha_values = np.linspace(1.0, 1.3, num=4)
threshold_values = np.array([0.5])


def _class_probabilities(text, model, tokenizer):
    """Run one forward pass for ``text`` and return its softmax probabilities as a list."""
    inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    inputs = inputs.to(next(model.parameters()).device)
    with torch.no_grad():
        logits = model(**inputs).logits
    return torch.softmax(logits, dim=1).squeeze().tolist()


def _apply_neutral_rule(probs, alpha, neutral_threshold):
    """Apply the decision rule of predict_sentiment_with_neutral_alpha_thresh to given probabilities."""
    neutral_score = 1 - abs(probs[0] - 0.5) ** alpha - abs(probs[2] - 0.5) ** alpha
    if neutral_score > (1 - neutral_threshold):
        return 1
    return int(torch.argmax(torch.tensor(probs)))


print("\n🔍 Evaluando combinaciones de alpha y threshold...")

# The probabilities do not depend on alpha or threshold, so run the model once
# per text and reuse them for every combination, rather than repeating the
# forward pass for each grid point.
probs_per_text = df["text"].apply(lambda x: _class_probabilities(x, model, tokenizer))

for alpha in alpha_values:
    for threshold in threshold_values:
        preds = probs_per_text.apply(lambda p: _apply_neutral_rule(p, alpha, threshold))
        f1_macro = f1_score(df["label_manual_num"], preds, average='macro')
        results.append({
            "alpha": round(alpha, 2),
            "threshold": round(threshold, 2),
            "f1_macro": round(f1_macro, 4)
        })
        print(f"Alpha: {alpha:.1f}, Threshold: {threshold:.1f} → F1 Macro: {f1_macro:.4f}")

results_df = pd.DataFrame(results)
# Stable sort: on equal F1 the combination evaluated first stays first.
results_df = results_df.sort_values(by="f1_macro", ascending=False, kind="mergesort").reset_index(drop=True)

mejor = results_df.iloc[0]
print(f"\n✅ Mejor combinación global → Alpha: {mejor['alpha']}, Threshold: {mejor['threshold']}, F1 Macro: {mejor['f1_macro']:.4f}")