# Goal Of The File
Explore the emotion dataset in order to understand how to feed it to a text classifier
that will be trained from scratch.


In [53]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers.tokenization_utils_base
from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from umap import UMAP
from torch import nn
from torch.nn.functional import cross_entropy
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import neural_network
from torch.utils.data import DataLoader
# Set column width for dataframes
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 1000)

In [3]:
# Load the emotions dataset
emotions = load_dataset("emotion")

# Load all of the datasets
training_dataset, validation_dataset, test_dataset = emotions["train"], emotions["validation"], emotions["test"]


In [4]:
# Set the format of the datasets to pandas ?
emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head(10)


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,2
4,i am feeling grouchy,3
5,ive been feeling a little burdened lately wasnt sure why that was,0
6,ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny,5
7,i feel as confused about life as a teenager or as jaded as a year old man,4
8,i have been with petronas for years i feel that petronas has performed well and made a huge profit,1
9,i feel romantic too,2


In [6]:
training_dataset.features["label"]

ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)

In [35]:
# Get the model tokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
def transform_label(i):

    if i in [0, 3, 4]:
        return 0
    if i in [1, 2, 5]:
        return 1

    
# Function that will be used to tokenize inputs
def tokenize(batch: dict) -> transformers.tokenization_utils_base.BatchEncoding:
    """
    Function that takes in as input a batch that is a dictionary and encodes all of the "text" values.
    :param batch: dictionary of texts as a batch can be multiple texts
    :return: encoded texts for a given batch
    """
    output = tokenizer(batch["text"], padding=True, truncation=True,return_tensors='pt')
    output.update({"merged_label":[transform_label(i) for i in batch["label"]]})
    return output

In [36]:

# Reset the format of our data
emotions.reset_format()
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=2000)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

In [49]:
emotions_encoded_dataframe = emotions_encoded["train"]
def collate_fn(batch):
    return {key: [d[key] for d in batch] for key in batch[0]}
train_dataloader = DataLoader(emotions_encoded_dataframe, batch_size=5, shuffle=False, collate_fn=collate_fn)



In [56]:
"""
The following script will contain a class that will be used to configure our transformer
"""


class TransformerTextConfiguration:
    """
    Configuration class that will be used to set the text classifier model parameters
    """

    embedding_dimension = 768  # For simplicity same embedding dimension along all layers (size of the hidden states)
    head_dimension = 64  # embedding dimension of a single head (intrinsicly sets the number of attention heads)
    reasoning_factor = 0.5  # upscaling from the embedding dimension in Feed Forward(after multi-head attention layer)
    hidden_dropout_probability = 0.1  # percentage of dropped values after Feed Forward
    vocabulary_size = 30522  # Size of the vocabulary of the tokenizer
    max_position_embedding = 512  # maximum possible length of a sequence that the model
    num_hidden_layers = 6  # Number of (MHA + FF Layers) Equivalent of number of ConvBlocks in Computer Vision
    num_labels = 2  # Number of Labels
    batch_size = 64  # Size of the batch used for training
    number_epochs = 10


configuration_dictionary = TransformerTextConfiguration


In [76]:
# Check that MPS is available
if not torch.backends.mps.is_available():
    if not torch.backends.mps.is_built():
        print("MPS not available because the current PyTorch install was not "
              "built with MPS enabled.")
    else:
        print("MPS not available because the current MacOS version is not 12.3+ "
              "and/or you do not have an MPS-enabled device on this machine.")
else:
    print("MPS Available")

MPS Available


In [66]:
# Creation of the initial token embeddor and positional embeddor.
class Embeddings(nn.Module):
    def __init__(self,config):
        super().__init__()
        self.token_embeddings = nn.Embedding(config.vocabulary_size,config.embedding_dimension)
        self.positional_embeddings = nn.Embedding(config.max_position_embedding,config.embedding_dimension)
        self.layer_normalization = nn.LayerNorm(config.embedding_dimension,eps=1e-12)
        self.dropout = nn.Dropout()

    def forward(self,input_ids):
        # Create position IDs for input sequence
        sequence_length = input_ids.size(1)
        position_ids = torch.arange(sequence_length,dtype=torch.long).unsqueeze(0)
        
        # Create Token and Position Embeddings
        token_embeddings = self.token_embeddings(input_ids)
        position_embeddings = self.positional_embeddings(position_ids)

        # Combine token and position embeddings
        embeddings = token_embeddings + position_embeddings
        embeddings = self.layer_normalization(embeddings)
        embeddings = self.dropout(embeddings)

        return embeddings


In [75]:
device = torch.device("mps")
text = "i am learning about pytorch and transformers"

inputs = tokenizer(text, return_tensors="pt", add_special_tokens=False).to(device)
inputs.input_ids = inputs.input_ids.to(device)
# Test out the embedding layer
embedding_layer = Embeddings(configuration_dictionary).to(device)
input_embeddings=embedding_layer(inputs.input_ids)
input_embeddings.shape


RuntimeError: Placeholder storage has not been allocated on MPS device!