In [1]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer
import torch
import pandas as pd
import torch.nn.functional as F
from tqdm import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Data

In [2]:
data_male = pd.read_csv('data/balanced_pseudo_male.csv')
data_female = pd.read_csv('data/balanced_pseudo_female.csv')

data_balanced = pd.concat([data_male, data_female], ignore_index=True)
len(data_balanced)
data_balanced.head()

Unnamed: 0,FileName,text,EmoClass,EmoAct,EmoVal,EmoDom,SpkrID,Gender,Split_Set,PodcastID,StratifyCol,NewPartition,PseudoEmo,InvEntropyNorm,PseudoEmoNum
0,MSP-PODCAST_4583_0017_0001,"from what it looks like, heidi was okay with j...",A,4.8,2.8,5.2,2357,Male,Train,4583,4583_2357_A,Train,A,1.0,2
1,MSP-PODCAST_3162_0475,gave them extra money and then they decided th...,A,6.2,2.6,6.2,2124,Male,Development,3162,3162_2124_A,Train,A,1.0,2
2,MSP-PODCAST_2355_0349,"yes, you're going to fucking die.",A,4.2,2.6,4.8,1601,Male,Train,2355,2355_1601_A,Test,A,1.0,2
3,MSP-PODCAST_5201_1197,and i already kind of had it in my mind,A,5.2,3.6,5.4,2789,Male,Development,5201,5201_2789_A,Train,A,1.0,2
4,MSP-PODCAST_5583_1543,i'm talking about a lot of you guys got to sta...,A,4.8,3.4,4.8,2953,Male,Train,5583,5583_2953_A,Train,A,1.0,2


In [10]:
def predict_proba(df, tokenizer, model, device):
    """
    Realiza predicciones con salida blanda para un DataFrame.
    
    Args:
    df (pd.DataFrame): DataFrame con los textos a predecir.
    
    Returns:
    pd.DataFrame: DataFrame con las probabilidades de cada clase.
    """
    all_probs = []
    
    for text in tqdm(df['text'], desc="Procesando predicciones", unit="texto"):
        # Tokenizar el texto
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
        
        # Mover a la GPU si es necesario
        inputs = {key: value.to(device) for key, value in inputs.items()}
        
        # Hacer la predicción
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Aplicar softmax para obtener probabilidades
        probs = F.softmax(outputs.logits, dim=1).cpu().numpy().flatten()
        
        all_probs.append(probs)
    
    # Convertir a DataFrame
    num_classes = len(all_probs[0])
    prob_df = pd.DataFrame(all_probs, columns=[f"Clase_{i}" for i in range(num_classes)])
    
    # Concatenar con el DataFrame original
    df_result = pd.concat([df.reset_index(drop=True), prob_df], axis=1)
    
    return df_result

## Male

In [3]:
# Ruta al checkpoint
male_path = "./models/distilbert-base-uncased-finetuned-male/checkpoint-870"

# Cargar el tokenizador
tokenizer_male = AutoTokenizer.from_pretrained(male_path)

# Cargar el modelo
model_male = AutoModelForSequenceClassification.from_pretrained(male_path)

# Mover el modelo a la GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_male.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): DistilBertSdpaAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)


In [5]:
# Predecir y obtener salidas blandas
df_predictions_male = predict_proba(data_balanced)

Procesando predicciones: 100%|██████████| 11344/11344 [02:21<00:00, 80.15texto/s]


In [9]:
# Mostrar el resultado
df_predictions_male

Unnamed: 0,FileName,text,EmoClass,EmoAct,EmoVal,EmoDom,SpkrID,Gender,Split_Set,PodcastID,...,InvEntropyNorm,PseudoEmoNum,Clase_0,Clase_1,Clase_2,Clase_3,Clase_4,Clase_5,Clase_6,Clase_7
0,MSP-PODCAST_4583_0017_0001,"from what it looks like, heidi was okay with j...",A,4.800000,2.80,5.200000,2357,Male,Train,4583,...,1.000000,2,0.021784,0.017141,0.615117,0.085869,0.048408,0.034944,0.130718,0.046019
1,MSP-PODCAST_3162_0475,gave them extra money and then they decided th...,A,6.200000,2.60,6.200000,2124,Male,Development,3162,...,1.000000,2,0.018133,0.010961,0.543206,0.072831,0.098502,0.023052,0.172499,0.060815
2,MSP-PODCAST_2355_0349,"yes, you're going to fucking die.",A,4.200000,2.60,4.800000,1601,Male,Train,2355,...,1.000000,2,0.014051,0.016372,0.722986,0.043221,0.029405,0.020655,0.107431,0.045878
3,MSP-PODCAST_5201_1197,and i already kind of had it in my mind,A,5.200000,3.60,5.400000,2789,Male,Development,5201,...,1.000000,2,0.070990,0.026357,0.040146,0.672142,0.019466,0.019884,0.036473,0.114542
4,MSP-PODCAST_5583_1543,i'm talking about a lot of you guys got to sta...,A,4.800000,3.40,4.800000,2953,Male,Train,5583,...,1.000000,2,0.013623,0.011982,0.720826,0.031806,0.052420,0.019218,0.112196,0.037931
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11339,MSP-PODCAST_2436_0067,everything's a lead organizer with the tps all...,U,5.600000,5.80,5.200000,1437,Female,Train,2436,...,1.000000,5,0.760902,0.093419,0.006885,0.016834,0.016783,0.062903,0.008460,0.033815
11340,MSP-PODCAST_1673_0033_0074,if i'd known it was a budding poet sleeping ne...,X,5.400000,4.20,5.000000,880,Female,Train,1673,...,0.226465,5,0.541963,0.126742,0.011534,0.130502,0.015816,0.049547,0.014506,0.109389
11341,MSP-PODCAST_1169_0096,"i just want somebody to step in and go, this ...",U,5.583333,5.75,5.333333,743,Female,Development,1169,...,1.000000,5,0.018234,0.009070,0.215654,0.016752,0.291869,0.067869,0.276344,0.104208
11342,MSP-PODCAST_5185_0412,god and every single person that's listening a...,X,4.200000,4.40,5.400000,2786,Female,Development,5185,...,0.231208,5,0.042067,0.186978,0.034574,0.010060,0.074141,0.576107,0.021849,0.054224


## Female

In [None]:
# Ruta al checkpoint
female_path = "./models/distilbert-base-uncased-finetuned-female/checkpoint-790"

# Cargar el tokenizador
tokenizer_female = AutoTokenizer.from_pretrained(female_path)

# Cargar el modelo
model_female = AutoModelForSequenceClassification.from_pretrained(female_path)

# Mover el modelo a la GPU si está disponible
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model_female.to(device)