# Dair-ai Emotion Classification Challenge
Este notebook resuelve un reto propuesto en el semillero de investigación de modelos generativos: construir un modelo de clasificación de emociones sobre el dataset "dair-ai/emotion".

In [None]:
%pip install torch huggingface_hub ipywidgets datasets pandas

In [None]:
# Authenticate this notebook session against the Hugging Face Hub.
# NOTE(review): login() presumably asks for an access token interactively — confirm.
from huggingface_hub import login
login()

Casi que no pero se logró :'D

In [30]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from datasets import load_dataset

# Download the dair-ai/emotion dataset from the Hugging Face Hub a single time
# and pick each split out of the returned dictionary, instead of resolving the
# same dataset once per split.
emotionSplits = load_dataset("dair-ai/emotion")
trainDS = emotionSplits["train"]
validDS = emotionSplits["validation"]
testDS = emotionSplits["test"]

# Emotion names and the integer class id used for each of them.
# NOTE(review): assumed to match the label encoding stored in the dataset —
# confirm against trainDS.features["label"].
labelsToIntegers = {
    "sadness": 0,
    "joy": 1,
    "love": 2,
    "anger": 3,
    "fear": 4,
    "surprise": 5,
}

def extractData(dataset):
    """Split an iterable of examples into texts, labels and a word set.

    Each example must support ``example["text"]`` and ``example["label"]``.

    Returns a tuple ``(texts, labels, vocabulary)`` where ``texts`` and
    ``labels`` are lists in iteration order and ``vocabulary`` is the set of
    whitespace-separated words found in all the texts.
    """
    # Walk the dataset exactly once, keeping each (text, label) pair together.
    pairs = [(example["text"], example["label"]) for example in dataset]
    sentences = [sentence for sentence, _ in pairs]
    targets = [target for _, target in pairs]
    words = {word for sentence in sentences for word in sentence.split()}
    return sentences, targets, words

# Pull texts, labels and the set of distinct words out of the training split.
texts, labels, vocabulary = extractData(trainDS)

# Same extraction for the validation split. Its vocabulary is not used to build
# the word index, so words seen only in validation become unknown tokens.
textsV, labelsV, vocabularyV = extractData(validDS)

# Build the word -> id index from the training vocabulary.
# Two special entries come first: id 0 is reserved for padding and id 1 for
# unknown words, so neither collides with a real word and every id produced
# below is smaller than len(vocabulary).
# Sorting the words makes the ids reproducible between runs (set order is not).
padToken = "<pad>"
unkToken = "<unk>"
vocabulary = [padToken, unkToken] + sorted(vocabulary - {padToken, unkToken})
indexedWords = {word: i for i, word in enumerate(vocabulary)}
padIndex = indexedWords[padToken]
unkIndex = indexedWords[unkToken]

# Translate every training text into a list of word ids.
tokenizedTexts = [[indexedWords.get(word, unkIndex) for word in text.split()] for text in texts]

# Same translation for the validation texts.
tokTextsValidation = [[indexedWords.get(word, unkIndex) for word in text.split()] for text in textsV]

# Pad every sequence on the right up to the longest sequence of both splits,
# so each split can be stacked into one rectangular tensor.
maxLength = max(len(seq) for seq in tokenizedTexts)
maxLengthV = max(len(seq) for seq in tokTextsValidation)
globalMaxLength = max(maxLength, maxLengthV)
paddedTexts = [seq + [padIndex] * (globalMaxLength - len(seq)) for seq in tokenizedTexts]
paddedTextsV = [seq + [padIndex] * (globalMaxLength - len(seq)) for seq in tokTextsValidation]

# Convert texts and labels to integer tensors.
textTensor = torch.tensor(paddedTexts, dtype=torch.long)
labelTensor = torch.tensor(labels, dtype=torch.long)
textVTensor = torch.tensor(paddedTextsV, dtype=torch.long)
labelVTensor = torch.tensor(labelsV, dtype=torch.long)

# TensorDataset pairs each padded text with its label; the DataLoader batches
# those pairs. Training batches are shuffled every epoch.
trainingTensorDS = TensorDataset(textTensor, labelTensor)
trainDataLoader = DataLoader(trainingTensorDS, batch_size=32, shuffle=True)

# Validation batches keep their original order.
validationTensorDS = TensorDataset(textVTensor, labelVTensor)
validationDataLoader = DataLoader(validationTensorDS, batch_size=32, shuffle=False)

# I will combine our embedding layer to a 2 layer feedforward network :), pretty simple
class EmotionClassificationModel(nn.Module):
    """Embedding lookup, mean pooling and a two-layer feedforward classifier.

    Args:
        vocabularySize: number of rows in the embedding table.
        embeddingDim: size of each embedding vector.
        hiddenDim: width of the hidden linear layer.
        numClasses: number of output scores per input sequence.
    """

    def __init__(self, vocabularySize, embeddingDim, hiddenDim, numClasses):
        super().__init__()
        # Lookup table that turns word ids into dense vectors.
        self.embedding = nn.Embedding(num_embeddings=vocabularySize, embedding_dim=embeddingDim)
        self.linear1 = nn.Linear(in_features=embeddingDim, out_features=hiddenDim)
        self.linear2 = nn.Linear(in_features=hiddenDim, out_features=numClasses)
        self.relu = nn.ReLU()

    def forward(self, text):
        """Return the unnormalised class scores for a batch of id sequences.

        ``text`` is an integer tensor of word ids; it is assumed to be shaped
        (batch, sequence) — TODO confirm against the caller.
        """
        tokenVectors = self.embedding(text)
        # Simple mean pooling: average the embeddings along dimension 1.
        sentenceVector = tokenVectors.mean(1)
        return self.linear2(self.relu(self.linear1(sentenceVector)))

# In OpenAI text-embedding-3-small the embedding dimension is 1536, let's try 200.
# The embedding table gets one row more than len(vocabulary), so an id equal to
# len(vocabulary) (e.g. a fallback id for out-of-vocabulary words) still maps to
# a valid row instead of raising an IndexError inside nn.Embedding.
emotionModel = EmotionClassificationModel(vocabularySize=len(vocabulary) + 1, embeddingDim=200, hiddenDim=60, numClasses=6)
# Loss function: cross entropy over the class scores.
criterion = nn.CrossEntropyLoss()
# Optimizer that updates the model parameters (Adam with its default settings).
optimizer = torch.optim.Adam(emotionModel.parameters())

# Let's try the validation dataset!
def validate(emotionModel, validationDataLoader, criterion):
    """Evaluate the model on a validation loader.

    Args:
        emotionModel: model to evaluate; it is switched to eval mode and left
            in that mode.
        validationDataLoader: iterable yielding ``(data, labels)`` batches.
        criterion: loss function called as ``criterion(outputs, labels)``.

    Returns:
        ``(avgLoss, accuracy)``: the mean of the per-batch losses and the
        fraction of correctly predicted examples. ``(0.0, 0.0)`` is returned
        when the loader yields no examples.
    """
    emotionModel.eval()
    totalLoss = 0.0
    correctPredictions = 0
    totalPredictions = 0
    numBatches = 0

    # No gradients are needed for evaluation.
    with torch.no_grad():
        for data, labels in validationDataLoader:
            outputs = emotionModel(data)
            totalLoss += criterion(outputs, labels).item()

            # The predicted class is the index of the highest score per row.
            predicted = outputs.argmax(dim=1)
            correctPredictions += (predicted == labels).sum().item()
            totalPredictions += labels.size(0)
            numBatches += 1

    # Nothing was evaluated: avoid dividing by zero.
    if numBatches == 0 or totalPredictions == 0:
        return 0.0, 0.0

    # Average over the batches of the loader that was actually passed in.
    avgLoss = totalLoss / numBatches
    accuracy = correctPredictions / totalPredictions
    return avgLoss, accuracy

# Let's train our model!
numEpochs = 10
for epoch in range(numEpochs):
    emotionModel.train()
    totalLoss = 0
    # The batch variables have their own names so the loop does not overwrite
    # the module-level `labels` list built during data preparation.
    for batchIdx, (batchTexts, batchLabels) in enumerate(trainDataLoader):
        predictions = emotionModel(batchTexts)
        loss = criterion(predictions, batchLabels)

        # Clear the gradients of the previous step, backpropagate, then update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        totalLoss += loss.item()

        # Progress report every 100 batches.
        if batchIdx % 100 == 0:
            print(f'Epoch [{epoch+1}/{numEpochs}], Step [{batchIdx+1}/{len(trainDataLoader)}], Loss: {loss.item():.4f}')

    # Mean of the per-batch losses for this epoch.
    avgLoss = totalLoss / len(trainDataLoader)
    print(f'Epoch [{epoch+1}/{numEpochs}], Average Loss: {avgLoss:.4f}')

    # TODO: re-enable per-epoch validation once validate() runs without errors.
    #valLoss, valAccuracy = validate(emotionModel, validationDataLoader, criterion)
    #print(f'Validation Loss: {valLoss: .4f}, Validation Accuracy: {valAccuracy:.4f}')


Using the latest cached version of the dataset since dair-ai/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'split' at C:\Users\juanb\.cache\huggingface\datasets\dair-ai___emotion\split\1.0.0\9ce63038044ae35ec1305d998d1882fcecd70ec8 (last modified on Fri Jul 19 13:23:08 2024).
Using the latest cached version of the dataset since dair-ai/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'split' at C:\Users\juanb\.cache\huggingface\datasets\dair-ai___emotion\split\1.0.0\9ce63038044ae35ec1305d998d1882fcecd70ec8 (last modified on Fri Jul 19 13:23:08 2024).
Using the latest cached version of the dataset since dair-ai/emotion couldn't be found on the Hugging Face Hub
Found the latest cached dataset configuration 'split' at C:\Users\juanb\.cache\huggingface\datasets\dair-ai___emotion\split\1.0.0\9ce63038044ae35ec1305d998d1882fcecd70ec8 (last modified on Fri Jul 19 13:23:08 2024).


Epoch [1/10], Step [1/500], Loss: 1.7506
Epoch [1/10], Step [101/500], Loss: 1.4550
Epoch [1/10], Step [201/500], Loss: 1.5355
Epoch [1/10], Step [301/500], Loss: 1.6549
Epoch [1/10], Step [401/500], Loss: 1.4374
Epoch [1/10], Average Loss: 1.5814
Epoch [2/10], Step [1/500], Loss: 1.5801
Epoch [2/10], Step [101/500], Loss: 1.4791
Epoch [2/10], Step [201/500], Loss: 1.5712
Epoch [2/10], Step [301/500], Loss: 1.5680
Epoch [2/10], Step [401/500], Loss: 1.6185
Epoch [2/10], Average Loss: 1.5491
Epoch [3/10], Step [1/500], Loss: 1.6667
Epoch [3/10], Step [101/500], Loss: 1.7071
Epoch [3/10], Step [201/500], Loss: 1.4523
Epoch [3/10], Step [301/500], Loss: 1.5500
Epoch [3/10], Step [401/500], Loss: 1.1937
Epoch [3/10], Average Loss: 1.3762
Epoch [4/10], Step [1/500], Loss: 1.0687
Epoch [4/10], Step [101/500], Loss: 1.0089
Epoch [4/10], Step [201/500], Loss: 1.1298
Epoch [4/10], Step [301/500], Loss: 0.8556
Epoch [4/10], Step [401/500], Loss: 0.8769
Epoch [4/10], Average Loss: 0.9616
Epoch [5