# Train Interspeech

## Data

In [1]:
import pandas as pd

# Cargar el CSV para male
data = pd.read_csv('data/data_female.csv')
print(len(data))
# Eliminar filas con valores nulos solo en la columna 'text'
data = data.dropna(subset=['text'])
print(len(data))
data.head()

17581
17576


Unnamed: 0,FileName,text,EmoClass,EmoAct,EmoVal,EmoDom,SpkrID,Gender,Split_Set,PodcastID,StratifyCol,NewPartition,PseudoEmo,InvEntropyNorm,PseudoEmoNum
0,MSP-PODCAST_3820_0101_0000,or you're just done with all of the stuff that...,N,4.2,3.4,4.4,2289,Female,Train,3820,3820_2289_N,Train,N,1.0,0
1,MSP-PODCAST_0727_0275,"or just end it. if you are uncertain about it,...",X,4.2,3.2,4.2,471,Female,Development,727,0727_471_X,Train,F,0.199354,7
2,MSP-PODCAST_0452_0122,it's like someone on a railroad track.,H,5.0,5.4,3.8,638,Female,Train,452,0452_638_H,Test,H,1.0,1
3,MSP-PODCAST_1664_0017_0003,i don't know why they're a key point for me. i...,N,4.4,3.6,3.8,953,Female,Train,1664,1664_953_N,Train,N,1.0,0
4,MSP-PODCAST_0675_0430,this was definitely one of them. omg.,X,3.2,4.4,4.0,1261,Female,Train,675,0675_1261_X,Test,N,0.222219,0


In [2]:
# Filtrar los datos en tres conjuntos basados en la columna 'Split_Set'
train_df = data.loc[data['NewPartition'] == 'Train']
dev_df = data.loc[data['NewPartition'] == 'Evaluation']
test_df = data.loc[data['NewPartition'] == 'Test']

# Verifica las primeras filas de cada conjunto
print("Conjunto Train:")
print(train_df.head())

print("Conjunto Development:")
print(dev_df.head())

print("Conjunto Test:")
print(test_df.head())

Conjunto Train:
                     FileName  \
0  MSP-PODCAST_3820_0101_0000   
1       MSP-PODCAST_0727_0275   
3  MSP-PODCAST_1664_0017_0003   
5       MSP-PODCAST_5756_0423   
7  MSP-PODCAST_0861_0097_0002   

                                                text EmoClass  EmoAct  EmoVal  \
0  or you're just done with all of the stuff that...        N     4.2     3.4   
1  or just end it. if you are uncertain about it,...        X     4.2     3.2   
3  i don't know why they're a key point for me. i...        N     4.4     3.6   
5  this is a great way to now add it in the back ...        N     3.4     4.8   
7   but it would be very hard for me to accept th...        N     4.8     4.4   

   EmoDom  SpkrID  Gender    Split_Set  PodcastID  StratifyCol NewPartition  \
0     4.4    2289  Female        Train       3820  3820_2289_N        Train   
1     4.2     471  Female  Development        727   0727_471_X        Train   
3     3.8     953  Female        Train       1664   1664_953_

In [3]:
# # Establecer la semilla para garantizar reproducibilidad
# seed = 42

# # Seleccionar aleatoriamente las primeras N filas del conjunto de entrenamiento
# train_df = train_df.sample(n=1000, random_state=seed)

# # Seleccionar aleatoriamente las primeras N filas del conjunto de desarrollo
# dev_df = dev_df.sample(n=250, random_state=seed)

In [4]:
num_labels = 8
id2label = dict(zip(train_df["PseudoEmoNum"], train_df["PseudoEmo"]))
label2id = dict(zip(train_df["PseudoEmo"], train_df["PseudoEmoNum"]))

In [5]:
# Convertir EmoClass a valores numéricos si es necesario
train_df['PseudoEmoNum'] = train_df['PseudoEmoNum'].astype(int)
dev_df['PseudoEmoNum'] = dev_df['PseudoEmoNum'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_df['PseudoEmoNum'] = train_df['PseudoEmoNum'].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dev_df['PseudoEmoNum'] = dev_df['PseudoEmoNum'].astype(int)


In [6]:
from datasets import Dataset
from transformers import AutoTokenizer

# Modelo
model_ckpt = "distilbert-base-uncased"

# Cargar el tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Función para tokenizar los datos
def tokenize_function(examples):
    # Verificar que estamos pasando una lista de textos
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)

# Asegurarse de que 'train_df' y 'dev_df' son objetos Dataset de Hugging Face
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

# Tokenizamos ambos conjuntos de datos
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Renombrar columna de labels
train_dataset = train_dataset.rename_column("PseudoEmoNum", "labels")
dev_dataset = dev_dataset.rename_column("PseudoEmoNum", "labels")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 9996/9996 [00:00<00:00, 15557.30 examples/s]
Map: 100%|██████████| 3321/3321 [00:00<00:00, 16719.86 examples/s]


## Code

In [7]:
from transformers import AutoTokenizer
import torch
print(torch.__version__)  # Esto debería mostrarte la versión de PyTorch instalada

2.5.1


In [8]:
from transformers import AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt).to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


def compute_metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)
    
    # Obtener reporte completo
    report = classification_report(labels, predictions, output_dict=True)
    
    # Obtener la matriz de confusión
    conf_matrix = confusion_matrix(labels, predictions)
    
    # Extraer métricas para cada clase y globales
    metrics = {
        'accuracy': report['accuracy'],
        'macro_f1': report['macro avg']['f1-score'],
        'weighted_f1': report['weighted avg']['f1-score'],
        'weighted_precision': report['weighted avg']['precision'],
        'weighted_recall': report['weighted avg']['recall'],
        'confusion_matrix': conf_matrix.tolist()  # Convertir a lista para asegurarse de que es serializable si es necesario
    }
    
    # # Añadir métricas específicas por clase si se requiere
    # for label, scores in report.items():
    #     if label not in ["accuracy", "macro avg", "weighted avg"]:
    #         metrics[f'{label}_precision'] = scores['precision']
    #         metrics[f'{label}_recall'] = scores['recall']
    #         metrics[f'{label}_f1'] = scores['f1-score']
    #         metrics[f'{label}_support'] = scores['support']
    
    return metrics

In [10]:
from transformers import Trainer, TrainingArguments

batch_size = 8
logging_steps = len(train_dataset) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=10,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    #push_to_hub=True,
    log_level="error"
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [None]:
trainer.train()

Epoch,Training Loss,Validation Loss


In [None]:
preds_output = trainer.predict(dev_dataset)
preds_metrics = preds_output.metrics
preds_metrics

In [None]:
import matplotlib.pyplot as plt
import numpy as np

labels = label2id.keys()
conf_matrix = preds_metrics['test_confusion_matrix']

# Crear el gráfico de la matriz de confusión
plt.figure(figsize=(10, 8))
plt.imshow(conf_matrix, interpolation='nearest', cmap=plt.cm.Blues)
plt.title("Confusion Matrix")
plt.colorbar()

# Agregar etiquetas
tick_marks = np.arange(len(labels))
plt.xticks(tick_marks, labels)
plt.yticks(tick_marks, labels)

# Rótulos de valores dentro de cada celda
conf_matrix = np.array(conf_matrix)

# Calcular el umbral
thresh = conf_matrix.max() / 2
for i, j in np.ndindex(conf_matrix.shape):
    plt.text(
        j, i, format(conf_matrix[i, j], 'd'),
        horizontalalignment="center",
        color="white" if conf_matrix[i, j] > thresh else "black"
    )

# Etiquetas de los ejes
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()