# Goal Of The File
Explore the emotion dataset in order to understand how to feed it to a text classifier
that will be trained from scratch.


In [17]:
import os
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import transformers.tokenization_utils_base
from datasets import load_dataset
from sklearn.preprocessing import MinMaxScaler
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification, Trainer, TrainingArguments
from umap import UMAP
from torch.nn.functional import cross_entropy
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn import neural_network

# Set column width for dataframes
pd.set_option("display.max_colwidth", None)
pd.set_option("display.width", 1000)

In [18]:
# Load the emotions dataset
emotions = load_dataset("emotion")

# Load all of the datasets
training_dataset, validation_dataset, test_dataset = emotions["train"], emotions["validation"], emotions["test"]


In [21]:
# Set the format of the datasets to pandas ?
emotions.set_format(type="pandas")
df = emotions["train"][:]
df.head(10)


Unnamed: 0,text,label
0,i didnt feel humiliated,0
1,i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake,0
2,im grabbing a minute to post i feel greedy wrong,3
3,i am ever feeling nostalgic about the fireplace i will know that it is still on the property,2
4,i am feeling grouchy,3
5,ive been feeling a little burdened lately wasnt sure why that was,0
6,ive been taking or milligrams or times recommended amount and ive fallen asleep a lot faster but i also feel like so funny,5
7,i feel as confused about life as a teenager or as jaded as a year old man,4
8,i have been with petronas for years i feel that petronas has performed well and made a huge profit,1
9,i feel romantic too,2


In [16]:
training_dataset.features["label"]

ClassLabel(names=['sadness', 'joy', 'love', 'anger', 'fear', 'surprise'], id=None)

In [51]:
# Function that will be used to tokenize inputs
def tokenize(batch: dict) -> transformers.tokenization_utils_base.BatchEncoding:
    """
    Function that takes in as input a batch that is a dictionary and encodes all of the "text" values.
    :param batch: dictionary of texts as a batch can be multiple texts
    :return: encoded texts for a given batch
    """
    print(len(batch))
    return tokenizer(batch["text"], padding=True, truncation=True)

In [57]:
# Reset the format of our data
emotions.reset_format()
emotions_encoded = emotions.map(tokenize, batched=True, batch_size=2000)

Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

2
2
2
2
2
2
2
2


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2


Map:   0%|          | 0/16000 [00:00<?, ? examples/s]

2


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2


Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

2
