### Dataset Loading and and separates it into the training, validation, and testing sets
Note: Dataset source: https://huggingface.co/datasets/dair-ai/emotion

In [1]:
import nlp

from datasets import load_dataset
dataset = load_dataset("dair-ai/emotion", "split")

train = dataset['train']
val = dataset['validation']
test = dataset['test']

2024-10-02 05:22:31.834114: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### Separates training data into 2 arrays: “tweets” and “labels”.

In [3]:
def get_tweet(data):
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels

tweets, labels = get_tweet(train)

tweets[0], labels[0]

('i didnt feel humiliated', 0)

### Initialize a tokenizer and calibrate it onto training data
This will assign each word a number by how commonly they appear in the dataset

In [4]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(tweets)

print(tweets[0])
print(tokenizer.texts_to_sequences(tweets[0]))

i didnt feel humiliated
[[2], [], [669], [2], [669], [1726], [43], [], [1997], [1550], [1550], [8884], [], [6051], [966], [93], [2], [8884], [2], [7], [43], [1550], [669]]


### Making all Sequences Same Shape

In [5]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen=50
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)
    return padded

padded_train_seq = get_sequences(tokenizer, tweets)

padded_train_seq


array([[   2,  139,    3, ...,    0,    0,    0],
       [   2,   40,  101, ...,    0,    0,    0],
       [  17, 3060,    7, ...,    0,    0,    0],
       ...,
       [   2,    3,  327, ...,    0,    0,    0],
       [   2,    3,   14, ...,    0,    0,    0],
       [   2,   47,    7, ...,    0,    0,    0]], dtype=int32)

### Preparing Data for Model

In [6]:
import numpy as np

classes = set(labels)
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
train_labels = names_to_ids(labels)

print(classes)
print(class_to_index)
print(index_to_class)
print(train_labels)

{0, 1, 2, 3, 4, 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
[0 0 3 ... 1 3 0]


### Creating Model

In [7]:
from tensorflow import keras

model = keras.models.Sequential([
keras.layers.Embedding(10000,16),
keras.layers.Bidirectional(keras.layers.LSTM(20, return_sequences=True)),
keras.layers.Bidirectional(keras.layers.LSTM(20)),
keras.layers.Dense(6, activation='softmax')
])
model.compile(
     loss='sparse_categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)

### Training Model

In [8]:
val_tweets, val_labels = get_tweet(val)
val_seq = get_sequences(tokenizer, val_tweets)
val_labels= names_to_ids(val_labels)
h = model.fit(
     padded_train_seq, train_labels,
     validation_data=(val_seq, val_labels),
     epochs=20,
     callbacks=[keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m49s[0m 73ms/step - accuracy: 0.3615 - loss: 1.5514 - val_accuracy: 0.6870 - val_loss: 0.8898
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 59ms/step - accuracy: 0.7556 - loss: 0.7012 - val_accuracy: 0.7760 - val_loss: 0.6469
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 60ms/step - accuracy: 0.8669 - loss: 0.4022 - val_accuracy: 0.8560 - val_loss: 0.4440
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 55ms/step - accuracy: 0.9194 - loss: 0.2357 - val_accuracy: 0.8700 - val_loss: 0.4161
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 53ms/step - accuracy: 0.9349 - loss: 0.1775 - val_accuracy: 0.8780 - val_loss: 0.3844
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 51ms/step - accuracy: 0.9577 - loss: 0.1323 - val_accuracy: 0.8830 - val_loss: 0.3685
Epoch 7/20
[1m5

### Evaluating and Testing Model

In [9]:
test_tweets, test_labels=get_tweet(test)
test_seq = get_sequences(tokenizer, test_tweets)
test_labels=names_to_ids(test_labels)
model.evaluate(test_seq, test_labels)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.8848 - loss: 0.3981


[0.3926566541194916, 0.8830000162124634]

In [12]:
import random

i = random.randint(0,len(test_labels)-1)
print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])
p = model.predict(np.expand_dims(test_seq[i], axis=0))[0]
print(test_seq[i])
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Predicted Emotion: ', pred_class)

Sentence: i managed to re learn feeling insecure again
Emotion: 4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 65ms/step
[  2 951   5 372 801   8 511 129   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
Predicted Emotion:  4


In [11]:
sentence = 'i am happy'
sequence = tokenizer.texts_to_sequences([sentence])
paddedSequence = pad_sequences(sequence, truncating = 'post', padding='post', maxlen=maxlen)
p = model.predict(np.expand_dims(paddedSequence[0], axis=0))[0]
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Sentence:', sentence)
print('Predicted Emotion: ', pred_class)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step
Sentence: i am happy
Predicted Emotion:  1
