# Train Interspeech

## Data

In [None]:
import pandas as pd

# Load the CSV and report how many rows it holds
data = pd.read_csv('data.csv')
print(len(data))
# Keep only the rows that have a value in 'text' (nulls in other columns are
# left alone), then report the row count again
has_text = data['text'].notna()
data = data.loc[has_text]
print(len(data))
data.head()

116221
116193


Unnamed: 0,FileName,text,EmoClass,EmoAct,EmoVal,EmoDom,SpkrID,Gender,Split_Set
0,MSP-PODCAST_2432_0200,... happening there as well. and you have the ...,S,2.8,2.2,3.4,1425,Male,Train
1,MSP-PODCAST_0133_0033,look they're - they're reporting on absolute ...,D,6.8,2.8,6.6,54,Male,Development
2,MSP-PODCAST_0288_0019,"mr. [excess 00:01:24] also known as ike, bbc r...",H,5.333333,5.416667,4.75,123,Male,Train
3,MSP-PODCAST_2546_0333_0003,and instead of us just handing people masks...,H,4.6,4.0,4.8,1644,Male,Development
4,MSP-PODCAST_3820_0101_0000,or you're just done with all of the stuff that...,N,4.2,3.4,4.4,2289,Female,Train


In [2]:
# Partition the data into three subsets according to the 'Split_Set' column
split_column = data['Split_Set']
train_df = data.loc[split_column == 'Train']
dev_df = data.loc[split_column == 'Development']
test_df = data.loc[split_column == 'Test']

# Preview the first rows of every subset
for heading, subset in (
    ("Conjunto Train:", train_df),
    ("Conjunto Development:", dev_df),
    ("Conjunto Test:", test_df),
):
    print(heading)
    print(subset.head())

Conjunto Train:
                     FileName  \
0       MSP-PODCAST_2432_0200   
2       MSP-PODCAST_0288_0019   
4  MSP-PODCAST_3820_0101_0000   
6       MSP-PODCAST_0545_0449   
7       MSP-PODCAST_5492_2849   

                                                text EmoClass    EmoAct  \
0  ... happening there as well. and you have the ...        S  2.800000   
2  mr. [excess 00:01:24] also known as ike, bbc r...        H  5.333333   
4  or you're just done with all of the stuff that...        N  4.200000   
6  man, the power of contrast is so, i think, eas...        N  3.200000   
7  ... we're older. so why not allow a little bit...        A  5.400000   

     EmoVal  EmoDom  SpkrID  Gender Split_Set  
0  2.200000    3.40    1425    Male     Train  
2  5.416667    4.75     123    Male     Train  
4  3.400000    4.40    2289  Female     Train  
6  3.800000    3.80     227    Male     Train  
7  2.400000    5.00    2889    Male     Train  
Conjunto Development:
                      Fi

In [3]:
# Fixed seed so the random subsets below are reproducible
seed = 42

# Number of rows drawn from each split
train_rows, dev_rows = 1000, 250

# Draw a random subset of the training split
train_df = train_df.sample(n=train_rows, random_state=seed)

# Draw a random subset of the development split
dev_df = dev_df.sample(n=dev_rows, random_state=seed)

In [4]:
# Inspect the distinct labels present in 'EmoClass' before they are mapped to ids
unique_labels = train_df['EmoClass'].unique()
print("Etiquetas únicas en 'EmoClass' antes del mapeo:", unique_labels, sep="\n")


Etiquetas únicas en 'EmoClass' antes del mapeo:
['N' 'S' 'H' 'A' 'X' 'U' 'F' 'C' 'D' 'O']


In [5]:
# Class id -> emotion label. This is the single source of truth for the label
# order; the inverse mapping and the label count are derived from it so the
# three can never drift apart.
id2label = {
    0: "A",
    1: "S",
    2: "H",
    3: "U",
    4: "F",
    5: "D",
    6: "C",
    7: "N",
    8: "O",
    9: "X"
}
label2id = {label: idx for idx, label in id2label.items()}
num_labels = len(id2label)


def _encode_emo_class(frame):
    """Return ``frame`` with the 'EmoClass' column converted to integer ids.

    Args:
        frame: DataFrame holding an 'EmoClass' column of label strings.

    Returns:
        A new DataFrame whose 'EmoClass' values are the ids from ``label2id``.
        If the column already has an integer dtype (the cell was run before),
        ``frame`` is returned unchanged so that re-running the cell is safe.

    Raises:
        ValueError: if the column contains a value that is not a key of
            ``label2id`` (including missing values).
    """
    labels = frame['EmoClass']

    # Mapping integer ids through label2id would yield only NaN and make the
    # integer cast fail, so an already-encoded frame is passed through as is.
    if pd.api.types.is_integer_dtype(labels):
        return frame

    # Fail with an explicit message instead of an opaque NaN-to-int cast error.
    unknown = set(labels.unique()) - set(label2id)
    if unknown:
        raise ValueError(
            f"Unknown EmoClass labels: {sorted(map(str, unknown))}"
        )

    return frame.assign(EmoClass=labels.map(label2id).astype(int))


# Convert EmoClass to numeric ids (no-op when already converted)
train_df = _encode_emo_class(train_df)
dev_df = _encode_emo_class(dev_df)

In [None]:
from datasets import Dataset
from transformers import AutoTokenizer

# Name of the pretrained checkpoint; the tokenizer below is loaded from it
model_ckpt = "distilbert-base-uncased"

# Load the tokenizer for that checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Tokenization helper applied to the datasets
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of ``examples`` with the module-level tokenizer.

    The value of ``examples['text']`` is handed to ``tokenizer`` with truncation
    enabled and ``padding="max_length"``; the tokenizer's result is returned
    unchanged.

    NOTE(review): ``padding="max_length"`` presumably pads every example to the
    tokenizer's maximum length, which may be wasteful for short utterances —
    confirm whether dynamic padding would be preferable.
    """
    return tokenizer(examples['text'], padding="max_length", truncation=True)

# For each split: wrap the DataFrame in a Hugging Face Dataset, tokenize it in
# batches, and rename the 'EmoClass' column to 'labels'
train_dataset = (
    Dataset.from_pandas(train_df)
    .map(tokenize_function, batched=True)
    .rename_column("EmoClass", "labels")
)
dev_dataset = (
    Dataset.from_pandas(dev_df)
    .map(tokenize_function, batched=True)
    .rename_column("EmoClass", "labels")
)


  from .autonotebook import tqdm as notebook_tqdm


## Code

In [7]:
from transformers import AutoTokenizer
import torch
print(torch.__version__)  # Shows the installed PyTorch version

2.5.1


In [8]:
from transformers import AutoModelForSequenceClassification
import torch

# Use the GPU when PyTorch reports one as available, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Instantiate the sequence classifier from the checkpoint with the label
# mappings attached, then move it to the selected device
model = AutoModelForSequenceClassification.from_pretrained(
    model_ckpt,
    num_labels=num_labels,
    id2label=id2label,
    label2id=label2id,
)
model = model.to(device)

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from sklearn.metrics import classification_report, confusion_matrix
import numpy as np


def compute_metrics(eval_pred):
    """Compute the scalar evaluation metrics reported during training.

    Args:
        eval_pred: Pair ``(logits, labels)``. The predicted class of each row
            is the argmax of ``logits`` over the last axis.

    Returns:
        dict with two entries taken from scikit-learn's classification report:
        ``'accuracy'`` and ``'weighted_f1'`` (support-weighted F1).
        The confusion matrix is deliberately not returned because it is not a
        scalar metric.
    """
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)

    # zero_division=0 scores classes that are never predicted as 0, which is
    # the same value the default setting uses, but without emitting an
    # undefined-metric warning on every evaluation.
    report = classification_report(
        labels, predictions, output_dict=True, zero_division=0
    )

    return {
        'accuracy': report['accuracy'],
        'weighted_f1': report['weighted avg']['f1-score'],
    }

In [10]:
from transformers import Trainer, TrainingArguments

batch_size = 8
# Number of full batches of size `batch_size` in the training set; used as the
# logging interval below
logging_steps = len(train_dataset) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"

training_args = TrainingArguments(
    output_dir=model_name,
    # --- optimisation ---
    learning_rate=2e-5,
    weight_decay=0.01,
    num_train_epochs=10,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    # --- evaluation and checkpointing, both once per epoch ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=1,                # Cap on the number of checkpoints kept on disk
    # --- best-model selection: higher accuracy wins, reloaded after training ---
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,
    # --- logging ---
    logging_steps=logging_steps,
    log_level="error",
    disable_tqdm=False
)

# NOTE(review): the recorded output shows a warning on the Trainer(...) call;
# presumably a deprecated keyword — confirm against the installed transformers
# version before renaming any argument.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

  trainer = Trainer(


In [11]:
# Run fine-tuning with the configured Trainer; the returned TrainOutput is
# displayed as the cell result
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,Weighted F1
1,1.9187,1.914432,0.264,0.115489
2,1.7504,1.872409,0.276,0.206577
3,1.5526,1.886392,0.348,0.280479
4,1.2685,2.050945,0.288,0.263589
5,0.9832,2.15099,0.316,0.296491
6,0.7393,2.376164,0.284,0.265352
7,0.5761,2.483486,0.28,0.260726
8,0.462,2.617593,0.296,0.284329
9,0.4022,2.66708,0.288,0.275651
10,0.3621,2.734391,0.284,0.2695


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize

TrainOutput(global_step=1250, training_loss=1.0015172088623048, metrics={'train_runtime': 1010.1109, 'train_samples_per_second': 9.9, 'train_steps_per_second': 1.237, 'total_flos': 1324862976000000.0, 'train_loss': 1.0015172088623048, 'epoch': 10.0})

In [12]:
# Run prediction on the development set and display the resulting metrics.
# The metric keys carry a "test_" prefix even though the data is the
# development split.
# NOTE(review): the recorded metrics for this cell equal the epoch-10
# evaluation row of the training log rather than the highest-accuracy epoch —
# confirm that the best checkpoint is actually reloaded after training.
preds_output = trainer.predict(dev_dataset)
preds_output.metrics

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


{'test_loss': 2.734391450881958,
 'test_accuracy': 0.284,
 'test_weighted_f1': 0.2694996600208677,
 'test_runtime': 7.87,
 'test_samples_per_second': 31.766,
 'test_steps_per_second': 4.066}