### Dataset Loading and and separates it into the training, validation, and testing sets
Note: Dataset source: https://huggingface.co/datasets/dair-ai/emotion

In [1]:
import nlp

from datasets import load_dataset
dataset = load_dataset("dair-ai/emotion", "split")

train = dataset['train']
val = dataset['validation']
test = dataset['test']

2024-10-02 13:49:00.753035: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

### Separates training data into 2 arrays: “tweets” and “labels”.

In [3]:
def get_tweet(data):
    tweets = [x['text'] for x in data]
    labels = [x['label'] for x in data]
    return tweets, labels

tweets, labels = get_tweet(train)

tweets[:5], labels[:5]

(['i didnt feel humiliated',
  'i can go from feeling so hopeless to so damned hopeful just from being around someone who cares and is awake',
  'im grabbing a minute to post i feel greedy wrong',
  'i am ever feeling nostalgic about the fireplace i will know that it is still on the property',
  'i am feeling grouchy'],
 [0, 0, 3, 2, 3])

### Initialize a tokenizer and calibrate it onto training data
This will assign each word a number by how commonly they appear in the dataset

In [5]:
from tensorflow.keras.preprocessing.text import Tokenizer

tokenizer = Tokenizer(num_words=10000, oov_token='<UNK>')
tokenizer.fit_on_texts(tweets)

print(tweets[0])
print(tokenizer.texts_to_sequences(tweets[0]))
tokenizer.word_index

i didnt feel humiliated
[[2], [], [669], [2], [669], [1726], [43], [], [1997], [1550], [1550], [8884], [], [6051], [966], [93], [2], [8884], [2], [7], [43], [1550], [669]]


{'<UNK>': 1,
 'i': 2,
 'feel': 3,
 'and': 4,
 'to': 5,
 'the': 6,
 'a': 7,
 'feeling': 8,
 'that': 9,
 'of': 10,
 'my': 11,
 'in': 12,
 'it': 13,
 'like': 14,
 'so': 15,
 'for': 16,
 'im': 17,
 'me': 18,
 'but': 19,
 'was': 20,
 'have': 21,
 'is': 22,
 'this': 23,
 'am': 24,
 'with': 25,
 'not': 26,
 'about': 27,
 'be': 28,
 'as': 29,
 'on': 30,
 'you': 31,
 'just': 32,
 'at': 33,
 'when': 34,
 'or': 35,
 'all': 36,
 'because': 37,
 'more': 38,
 'do': 39,
 'can': 40,
 'really': 41,
 'up': 42,
 't': 43,
 'are': 44,
 'by': 45,
 'very': 46,
 'know': 47,
 'been': 48,
 'if': 49,
 'out': 50,
 'myself': 51,
 'time': 52,
 'how': 53,
 'what': 54,
 'get': 55,
 'little': 56,
 'had': 57,
 'now': 58,
 'will': 59,
 'from': 60,
 'being': 61,
 'they': 62,
 'people': 63,
 'them': 64,
 'would': 65,
 'he': 66,
 'want': 67,
 'her': 68,
 'some': 69,
 'think': 70,
 'one': 71,
 'still': 72,
 'ive': 73,
 'him': 74,
 'even': 75,
 'who': 76,
 'an': 77,
 'life': 78,
 'its': 79,
 'make': 80,
 'there': 81,
 'we': 

### Making all Sequences Same Shape

In [6]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

maxlen=50
def get_sequences(tokenizer, tweets):
    sequences = tokenizer.texts_to_sequences(tweets)
    padded = pad_sequences(sequences, truncating = 'post', padding='post', maxlen=maxlen)
    return padded

padded_train_seq = get_sequences(tokenizer, tweets)

padded_train_seq


array([[   2,  139,    3, ...,    0,    0,    0],
       [   2,   40,  101, ...,    0,    0,    0],
       [  17, 3060,    7, ...,    0,    0,    0],
       ...,
       [   2,    3,  327, ...,    0,    0,    0],
       [   2,    3,   14, ...,    0,    0,    0],
       [   2,   47,    7, ...,    0,    0,    0]], dtype=int32)

### Preparing Data for Model

In [7]:
import numpy as np

classes = set(labels)
class_to_index = dict((c,i) for i, c in enumerate(classes))
index_to_class = dict((v,k) for k, v in class_to_index.items())
names_to_ids = lambda labels: np.array([class_to_index.get(x) for x in labels])
train_labels = names_to_ids(labels)

print(classes)
print(class_to_index)
print(index_to_class)
print(train_labels)

{0, 1, 2, 3, 4, 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
{0: 0, 1: 1, 2: 2, 3: 3, 4: 4, 5: 5}
[0 0 3 ... 1 3 0]


### Creating Model

In [8]:
from tensorflow import keras

model = keras.models.Sequential([
keras.layers.Embedding(10000,16),
keras.layers.Bidirectional(keras.layers.LSTM(20, return_sequences=True)),
keras.layers.Bidirectional(keras.layers.LSTM(20)),
keras.layers.Dense(6, activation='softmax')
])
model.compile(
     loss='sparse_categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)

### Training Model

In [9]:
val_tweets, val_labels = get_tweet(val)
val_seq = get_sequences(tokenizer, val_tweets)
val_labels= names_to_ids(val_labels)
h = model.fit(
     padded_train_seq, train_labels,
     validation_data=(val_seq, val_labels),
     epochs=20,
     callbacks=[keras.callbacks.EarlyStopping(monitor='val_accuracy', patience=2)]
)

Epoch 1/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m39s[0m 61ms/step - accuracy: 0.3454 - loss: 1.5714 - val_accuracy: 0.7140 - val_loss: 0.8294
Epoch 2/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 61ms/step - accuracy: 0.8026 - loss: 0.5979 - val_accuracy: 0.8470 - val_loss: 0.4540
Epoch 3/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 65ms/step - accuracy: 0.9153 - loss: 0.2655 - val_accuracy: 0.8735 - val_loss: 0.3854
Epoch 4/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 68ms/step - accuracy: 0.9489 - loss: 0.1706 - val_accuracy: 0.8825 - val_loss: 0.3850
Epoch 5/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 58ms/step - accuracy: 0.9643 - loss: 0.1153 - val_accuracy: 0.8800 - val_loss: 0.3963
Epoch 6/20
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 61ms/step - accuracy: 0.9726 - loss: 0.0894 - val_accuracy: 0.8825 - val_loss: 0.4044


### Evaluating and Testing Model

In [10]:
test_tweets, test_labels=get_tweet(test)
test_seq = get_sequences(tokenizer, test_tweets)
test_labels=names_to_ids(test_labels)
model.evaluate(test_seq, test_labels)

[1m63/63[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - accuracy: 0.8888 - loss: 0.3745


[0.36838141083717346, 0.8889999985694885]

In [11]:
import random

i = random.randint(0,len(test_labels)-1)
print('Sentence:', test_tweets[i])
print('Emotion:', index_to_class[test_labels[i]])
p = model.predict(np.expand_dims(test_seq[i], axis=0))[0]
print(test_seq[i])
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Predicted Emotion: ', pred_class)

Sentence: i have not had any serious injuries or setbacks other than that infection in my foot a couple of months ago but i have noticed that my knees and inner foot have started to ache and feel tender during the longer runs
Emotion: 2
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
[   2   21   26   57  146 1678 7339   35 6973  120   95    9 3427   12
   11 1544    7  613   10  350  491   19    2   21 1479    9   11 6069
    4 2238 1544   21  198    5 1412    4    3  755  387    6  457 1947
    0    0    0    0    0    0    0    0]
Predicted Emotion:  1


In [12]:
sentence = 'i am happy'
sequence = tokenizer.texts_to_sequences([sentence])
paddedSequence = pad_sequences(sequence, truncating = 'post', padding='post', maxlen=maxlen)
p = model.predict(np.expand_dims(paddedSequence[0], axis=0))[0]
pred_class=index_to_class[np.argmax(p).astype('uint8')]
print('Sentence:', sentence)
print('Predicted Emotion: ', pred_class)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 60ms/step
Sentence: i am happy
Predicted Emotion:  1
