# Train Interspeech

## Data

In [1]:
# When True, the balancing cell below undersamples every 'PseudoEmo' class to the
# size of the smallest class and writes the result back to disk.
balanced = False

In [2]:
# Selects which tokenization cell runs: False -> tokenize the raw text,
# True -> lowercase the text (and download NLTK resources) before tokenizing.
preprocessed = False

In [None]:
import pandas as pd

# Load the CSV. The path points at the *female* pseudo-label file (the original
# comment said "male").
# NOTE(review): this is the same file the balancing cell writes when `balanced`
# is True, so re-running with balancing enabled re-samples already-balanced data.
# data = pd.read_csv('data/data_female.csv')
data = pd.read_csv('data/balanced_pseudo_female.csv')
print(len(data))
# Drop rows whose 'text' column is null (other columns may still contain nulls)
data = data.dropna(subset=['text'])
#data.to_csv('data/data_female.csv', index=False)
print(len(data))
data.head()

In [4]:
if balanced:
    # Step 1: size of the smallest 'PseudoEmo' class; every class is cut to this.
    min_count = data['PseudoEmo'].value_counts().min()

    # Step 2: undersample each class.
    # GroupBy.sample keeps the grouping column in the result (groupby.apply on the
    # grouping column is deprecated in recent pandas) and the fixed random_state
    # makes the selected subset reproducible across runs.
    data = (
        data.groupby('PseudoEmo')
        .sample(n=min_count, random_state=42)
        .reset_index(drop=True)
    )

    # Step 3: verify the balance.
    print(data['PseudoEmo'].value_counts())
    print(len(data))

    # NOTE(review): this overwrites the same file the loading cell reads from.
    data.to_csv('data/balanced_pseudo_female.csv', index=False)


In [None]:
# Split the data into three sets based on the 'NewPartition' column.
# .copy() gives each split its own DataFrame, so later column assignments on a
# split do not raise SettingWithCopyWarning or depend on `data`.
train_df = data.loc[data['NewPartition'] == 'Train'].copy()
dev_df = data.loc[data['NewPartition'] == 'Evaluation'].copy()
test_df = data.loc[data['NewPartition'] == 'Test'].copy()

# Show the first rows of each split
print("Conjunto Train:")
print(train_df.head())

print("Conjunto Development:")
print(dev_df.head())

print("Conjunto Test:")
print(test_df.head())

In [6]:
# # Establecer la semilla para garantizar reproducibilidad
# seed = 42

# # Seleccionar aleatoriamente las primeras N filas del conjunto de entrenamiento
# train_df = train_df.sample(n=1000, random_state=seed)

# # Seleccionar aleatoriamente las primeras N filas del conjunto de desarrollo
# dev_df = dev_df.sample(n=250, random_state=seed)

In [None]:
num_labels = 8

# Class id -> emotion code, listed in id order.
# NOTE(review): ids are taken from the original id2label (the only one of the two
# original maps that used the valid range 0..num_labels-1); confirm they match
# the encoding of the 'PseudoEmoNum' column.
id2label = {
    0: 'N',  # Neutral
    1: 'H',  # Happy
    2: 'A',  # Angry
    3: 'S',  # Sad
    4: 'C',  # Contempt
    5: 'U',  # Surprise
    6: 'D',  # Disgust
    7: 'F',  # Fear
}

# Derive the inverse map instead of hand-writing it, so the two can never
# disagree and no id falls outside 0..num_labels-1.
label2id = {label: idx for idx, label in id2label.items()}

# The model head has `num_labels` outputs, so ids must be exactly 0..num_labels-1.
assert sorted(id2label) == list(range(num_labels))
assert len(label2id) == num_labels

print(f"{id2label=}")
print(f"{label2id=}")

In [None]:
# Cast the numeric label column 'PseudoEmoNum' to int.
# assign() returns a new DataFrame, so this works without SettingWithCopyWarning
# even when the split is a filtered slice of `data`.
train_df = train_df.assign(PseudoEmoNum=train_df['PseudoEmoNum'].astype(int))
dev_df = dev_df.assign(PseudoEmoNum=dev_df['PseudoEmoNum'].astype(int))
test_df = test_df.assign(PseudoEmoNum=test_df['PseudoEmoNum'].astype(int))

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer


if not preprocessed:
    # Checkpoint used for both the tokenizer and (later) the model.
    # Alternative that was tried: "distilroberta-base"
    model_ckpt = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    def tokenize_function(examples):
        """Tokenize a batch: pad every text to max length and truncate long ones."""
        return tokenizer(examples['text'], padding="max_length", truncation=True)

    def _to_labeled_dataset(frame):
        """DataFrame -> tokenized Dataset whose label column is named 'labels'."""
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.map(tokenize_function, batched=True)
        return dataset.rename_column("PseudoEmoNum", "labels")

    # Same pipeline for the three splits.
    train_dataset, dev_dataset, test_dataset = map(
        _to_labeled_dataset, (train_df, dev_df, test_df)
    )



In [10]:
import re
from datasets import Dataset
from transformers import AutoTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk

if preprocessed:
    # NLTK resources needed by the stop-word list and the lemmatizer below
    for resource in ('stopwords', 'wordnet'):
        nltk.download(resource)

    # Checkpoint and tokenizer
    model_ckpt = "distilbert-base-uncased"
    tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

    # English stop words (change the language here if needed) and lemmatizer.
    # Both are only used by the disabled cleaning steps listed further down.
    stop_words = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    def preprocess_and_tokenize(examples):
        """Lowercase every text of the batch, then tokenize (max-length padding, truncation).

        Disabled cleaning steps, kept for reference (apply per text, in this order):
            text = re.sub(r"http\\S+|www\\S+|https\\S+", "", text, flags=re.MULTILINE)  # URLs
            text = re.sub(r"@\\w+|#\\w+", "", text)                                    # mentions/hashtags
            text = re.sub(r"[^a-zA-Z\\s]", "", text)                                   # special chars
            text = re.sub(r"\\d+", "", text)                                           # digits
            words = [w for w in text.split() if w not in stop_words]                  # stop words
            words = [lemmatizer.lemmatize(w) for w in words]                          # lemmatize
            text = " ".join(words)
        """
        lowered = [raw.lower() for raw in examples['text']]
        return tokenizer(lowered, padding="max_length", truncation=True)

    # Drop rows with a missing text or label in every split
    required = ["text", "PseudoEmoNum"]
    train_df, dev_df, test_df = (
        frame.dropna(subset=required) for frame in (train_df, dev_df, test_df)
    )

    def _to_preprocessed_dataset(frame):
        """DataFrame -> preprocessed, tokenized Dataset with a 'labels' column."""
        dataset = Dataset.from_pandas(frame)
        dataset = dataset.map(preprocess_and_tokenize, batched=True, num_proc=4)
        return dataset.rename_column("PseudoEmoNum", "labels")

    train_dataset, dev_dataset, test_dataset = map(
        _to_preprocessed_dataset, (train_df, dev_df, test_df)
    )

    # Inspect one processed example
    print(train_dataset[0])


## Code

In [None]:
from transformers import AutoTokenizer
import torch
print(torch.__version__)  # Shows the installed PyTorch version

In [None]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Sequence-classification head on top of the chosen checkpoint; the label maps
# are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    label2id=label2id,
    id2label=id2label,
)
model = model.to(device)
print(model.num_labels)

In [13]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


def compute_metrics(eval_pred):
    """Compute accuracy, macro/weighted F1 and the confusion matrix for an evaluation pass.

    Args:
        eval_pred: pair ``(logits, labels)``; the predicted class is the argmax of
            ``logits`` over its last axis.

    Returns:
        dict with keys ``'accuracy'``, ``'macro_f1'``, ``'weighted_f1'`` and
        ``'confusion_matrix'`` (nested list, row = true class id,
        column = predicted class id).
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # Full per-class / aggregate report as a nested dict
    report = classification_report(labels, predictions, output_dict=True)

    # Pin the label set to every class id the model can output. Without `labels`,
    # sklearn only uses the ids that actually occur, so the matrix would shrink
    # (and its row/column index would stop matching the class id) whenever a
    # class is absent from both the references and the predictions.
    all_class_ids = np.arange(np.shape(logits)[-1])
    conf_matrix = confusion_matrix(labels, predictions, labels=all_class_ids)

    metrics = {
        'accuracy': report['accuracy'],
        'macro_f1': report['macro avg']['f1-score'],
        'weighted_f1': report['weighted avg']['f1-score'],
        # 'weighted_precision': report['weighted avg']['precision'],
        # 'weighted_recall': report['weighted avg']['recall'],
        'confusion_matrix': conf_matrix.tolist()  # plain lists, so the value stays serializable
    }

    # # Optional: add per-class metrics
    # for label, scores in report.items():
    #     if label not in ["accuracy", "macro avg", "weighted avg"]:
    #         metrics[f'{label}_precision'] = scores['precision']
    #         metrics[f'{label}_recall'] = scores['recall']
    #         metrics[f'{label}_f1'] = scores['f1-score']
    #         metrics[f'{label}_support'] = scores['support']

    return metrics

In [None]:
from transformers import Trainer, TrainingArguments

batch_size = 32
# Log roughly once per epoch. Clamp to 1: with fewer training rows than
# `batch_size` the integer division yields 0, which TrainingArguments rejects
# for step-based logging.
logging_steps = max(1, len(train_dataset) // batch_size)
model_name = f"{model_ckpt}-finetuned-female"
# NOTE(review): recent transformers releases rename `evaluation_strategy` to
# `eval_strategy` and Trainer's `tokenizer` to `processing_class`; confirm
# against the installed version before upgrading.
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    # push_to_hub=True,
    log_level="error"
)

# Train on the train split, evaluate on the dev split after every epoch.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer
)

In [None]:
# Fine-tune the model with the Trainer configured above.
trainer.train()

In [None]:
# Run the trained model on the test split. `.metrics` carries the compute_metrics
# results; the plotting cell reads its 'test_confusion_matrix' entry.
preds_output = trainer.predict(test_dataset)
preds_metrics = preds_output.metrics
preds_metrics

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Tick labels must follow the confusion-matrix axes, which sklearn orders by
# ascending numeric class id; so take the names from id2label sorted by id
# (label2id's insertion order does not match that order).
# Assumes the matrix has one row/column per class id — TODO confirm when a class
# is missing from the test predictions.
labels = [id2label[class_id] for class_id in sorted(id2label)]
conf_matrix = np.array(preds_metrics['test_confusion_matrix'])

# Draw the confusion matrix
plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()

# Class names on both axes
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Write the count inside every cell; switch to white text on dark cells
# (cells above half of the maximum count).
thresh = conf_matrix.max() / 2
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(
        j, i, format(conf_matrix[i, j], 'd'),
        horizontalalignment="center",
        color="white" if conf_matrix[i, j] > thresh else "black"
    )

# Axis titles
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()