# Train Interspeech

## Data

In [1]:
import pandas as pd

# Cargar el CSV
data = pd.read_csv('data.csv')
print(len(data))
# Eliminar filas con valores nulos solo en la columna 'text'
data = data.dropna(subset=['text'])
print(len(data))
data.head()

116221
116193


Unnamed: 0,FileName,text,EmoClass,EmoAct,EmoVal,EmoDom,SpkrID,Gender,Split_Set
0,MSP-PODCAST_2432_0200,... happening there as well. and you have the ...,S,2.8,2.2,3.4,1425,Male,Train
1,MSP-PODCAST_0133_0033,look they're - they're reporting on absolute ...,D,6.8,2.8,6.6,54,Male,Development
2,MSP-PODCAST_0288_0019,"mr. [excess 00:01:24] also known as ike, bbc r...",H,5.333333,5.416667,4.75,123,Male,Train
3,MSP-PODCAST_2546_0333_0003,and instead of us just handing people masks...,H,4.6,4.0,4.8,1644,Male,Development
4,MSP-PODCAST_3820_0101_0000,or you're just done with all of the stuff that...,N,4.2,3.4,4.4,2289,Female,Train


In [2]:
# Filtrar los datos en tres conjuntos basados en la columna 'Split_Set'
train_df = data.loc[data['Split_Set'] == 'Train']
dev_df = data.loc[data['Split_Set'] == 'Development']
test_df = data.loc[data['Split_Set'] == 'Test']

# Verifica las primeras filas de cada conjunto
print("Conjunto Train:")
print(train_df.head())

print("Conjunto Development:")
print(dev_df.head())

print("Conjunto Test:")
print(test_df.head())

Conjunto Train:
                     FileName  \
0       MSP-PODCAST_2432_0200   
2       MSP-PODCAST_0288_0019   
4  MSP-PODCAST_3820_0101_0000   
6       MSP-PODCAST_0545_0449   
7       MSP-PODCAST_5492_2849   

                                                text EmoClass    EmoAct  \
0  ... happening there as well. and you have the ...        S  2.800000   
2  mr. [excess 00:01:24] also known as ike, bbc r...        H  5.333333   
4  or you're just done with all of the stuff that...        N  4.200000   
6  man, the power of contrast is so, i think, eas...        N  3.200000   
7  ... we're older. so why not allow a little bit...        A  5.400000   

     EmoVal  EmoDom  SpkrID  Gender Split_Set  
0  2.200000    3.40    1425    Male     Train  
2  5.416667    4.75     123    Male     Train  
4  3.400000    4.40    2289  Female     Train  
6  3.800000    3.80     227    Male     Train  
7  2.400000    5.00    2889    Male     Train  
Conjunto Development:
                      Fi

In [None]:
# Seleccionar las primeras 1000 filas de los datos de entrenamiento y desarrollo
train_df = train_df.head(1000)  # Primeros 1000 ejemplos del conjunto de entrenamiento
dev_df = dev_df.head(250)  # Primeros 1000 ejemplos del conjunto de desarrollo


In [4]:
# Verificar las etiquetas únicas en EmoClass antes de mapear
print("Etiquetas únicas en 'EmoClass' antes del mapeo:")
print(train_df['EmoClass'].unique())


Etiquetas únicas en 'EmoClass' antes del mapeo:
['S' 'H' 'N' 'A' 'X' 'C' 'U' 'O' 'D' 'F']


In [5]:
num_labels = 10
id2label = {
    0: "A",
    1: "S",
    2: "H",
    3: "U",
    4: "F",
    5: "D",
    6: "C",
    7: "N",
    8: "O",
    9: "X"
}
label2id = {
    "A": 0,
    "S": 1,
    "H": 2,
    "U": 3,
    "F": 4,
    "D": 5,
    "C": 6,
    "N": 7,
    "O": 8,
    "X": 9
}

# Convertir EmoClass a valores numéricos si es necesario
train_df['EmoClass'] = train_df['EmoClass'].map(label2id).astype(int)
dev_df['EmoClass'] = dev_df['EmoClass'].map(label2id).astype(int)

In [6]:
from datasets import Dataset
from transformers import AutoTokenizer

# Modelo
model_ckpt = "distilbert-base-uncased"

# Cargar el tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_ckpt)

# Función para tokenizar los datos
def tokenize_function(examples):
    # Verificar que estamos pasando una lista de textos
    texts = examples['text']
    return tokenizer(texts, padding="max_length", truncation=True)

# Asegurarse de que 'train_df' y 'dev_df' son objetos Dataset de Hugging Face
train_dataset = Dataset.from_pandas(train_df)
dev_dataset = Dataset.from_pandas(dev_df)

# Tokenizamos ambos conjuntos de datos
train_dataset = train_dataset.map(tokenize_function, batched=True)
dev_dataset = dev_dataset.map(tokenize_function, batched=True)

# Renombrar columna de labels
train_dataset = train_dataset.rename_column("EmoClass", "labels")
dev_dataset = dev_dataset.rename_column("EmoClass", "labels")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 1000/1000 [00:00<00:00, 4540.77 examples/s]
Map: 100%|██████████| 1000/1000 [00:00<00:00, 6750.69 examples/s]


## Code

In [7]:
from transformers import AutoTokenizer
import torch
print(torch.__version__)  # Esto debería mostrarte la versión de PyTorch instalada

2.5.1+cu124


In [8]:
from transformers import AutoModelForSequenceClassification
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = AutoModelForSequenceClassification.from_pretrained(model_ckpt, num_labels=num_labels, id2label=id2label, label2id=label2id).to(device)

  return torch._C._cuda_getDeviceCount() > 0


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
from sklearn.metrics import accuracy_score, f1_score

def compute_metrics(eval_pred):
    # Desempaquetamos las predicciones y las etiquetas
    logits, labels = eval_pred

    # Convertimos logits a etiquetas predichas usando argmax
    predictions = torch.argmax(torch.tensor(logits), axis=-1)

    # Calculamos precisión
    accuracy = accuracy_score(labels, predictions)

    # Calculamos F1 score (micro promedio)
    f1 = f1_score(labels, predictions, average='micro')

    return {
        'accuracy': accuracy,
        'f1': f1
    }


In [10]:
from transformers import Trainer, TrainingArguments

batch_size = 64
logging_steps = len(train_dataset) // batch_size
model_name = f"{model_ckpt}-finetuned-emotion"
training_args = TrainingArguments(
    output_dir=model_name,
    num_train_epochs=2,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    disable_tqdm=False,
    logging_steps=logging_steps,
    #push_to_hub=True,
    log_level="error"
)

trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=dev_dataset,
    tokenizer=tokenizer
)

  trainer = Trainer(


In [11]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,2.1615,2.099988,0.236,0.236
2,1.949,2.051714,0.237,0.237


TrainOutput(global_step=32, training_loss=2.045714110136032, metrics={'train_runtime': 1718.8558, 'train_samples_per_second': 1.164, 'train_steps_per_second': 0.019, 'total_flos': 264972595200000.0, 'train_loss': 2.045714110136032, 'epoch': 2.0})

In [12]:
preds_output = trainer.predict(dev_dataset)
preds_output.metrics

{'test_loss': 2.0517144203186035,
 'test_accuracy': 0.237,
 'test_f1': 0.237,
 'test_runtime': 155067.0275,
 'test_samples_per_second': 0.006,
 'test_steps_per_second': 0.0}