In [1]:
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import numpy as np
import io
import re
import string
import tqdm

physical_devices = tf.config.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0],True)

2022-11-03 04:48:43.135668: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-03 04:48:43.135704: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-03 04:48:46.389990: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 04:48:46.390500: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-03 04:48:46.390650: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.s

In [2]:
%load_ext tensorboard
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
pd.set_option('display.max_colwidth', None)
df = pd.read_csv("./data/text_emotion.csv")

In [4]:
cleaned_df = df.copy()
# lowercasing 'content' column
cleaned_df['content'] = cleaned_df['content'].str.lower()

# removing all words with #hastag and @name and urls
cleaned_df['content'].apply(lambda sentence: re.sub('@[A-Za-z0-9_]+','',sentence))
cleaned_df['content'].apply(lambda sentence: re.sub('#[A-Za-z0-9_]+','',sentence))
cleaned_df['content'].apply(lambda sentence: re.sub(r'http\S+','',sentence))

# removing all punctuation marks
exclude = set(string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: regex.sub("",sentence))

# removing unicode characters
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: sentence.encode("ascii","ignore").decode())

# removing stopwords

stopwords = [ "a", "about", "above", "after", "again", "against", "all", "am", "an", "and", "any", "are", "as", "at", "be", "because", 
             "been", "before", "being", "below", "between", "both", "but", "by", "could", "did", "do", "does", "doing", "down", "during",
             "each", "few", "for", "from", "further", "had", "has", "have", "having", "he", "he'd", "he'll", "he's", "her", "here", 
             "here's", "hers", "herself", "him", "himself", "his", "how", "how's", "i", "i'd", "i'll", "i'm", "i've", "if", "in", "into",
             "is", "it", "it's", "its", "itself", "let's", "me", "more", "most", "my", "myself", "nor", "of", "on", "once", "only", "or",
             "other", "ought", "our", "ours", "ourselves", "out", "over", "own", "same", "she", "she'd", "she'll", "she's", "should", 
             "so", "some", "such", "than", "that", "that's", "the", "their", "theirs", "them", "themselves", "then", "there", "there's",
             "these", "they", "they'd", "they'll", "they're", "they've", "this", "those", "through", "to", "too", "under", "until", "up",
             "very", "was", "we", "we'd", "we'll", "we're", "we've", "were", "what", "what's", "when", "when's", "where", "where's",
             "which", "while", "who", "who's", "whom", "why", "why's", "with", "would", "you", "you'd", "you'll", "you're", "you've",
             "your", "yours", "yourself", "yourselves" ]

def remove_stopwords(data):
    data['content'] = data['content'].apply(lambda x : ' '.join([word for word in x.split() if word not in (stopwords)]))
    return data

data_without_stopwords = remove_stopwords(cleaned_df)

# One hot encoding labels

In [5]:
print('Unique labels:')
for i, label in enumerate(cleaned_df['sentiment'].unique()):
    print('{}) {}'.format(i,label)) 

Unique labels:
0) empty
1) sadness
2) enthusiasm
3) neutral
4) worry
5) surprise
6) love
7) fun
8) hate
9) happiness
10) boredom
11) relief
12) anger


In [6]:
# map labels -> indices
sentiment_labels = cleaned_df['sentiment'].unique().copy()

# index key -> class
label_map = {}

#class -> index key
inverse_label_map = {}
for i, label in enumerate(sentiment_labels):
    label_map[i] = label 
    inverse_label_map[label] = i

In [7]:
# get all keys
indices = []
for key,value in label_map.items():
    indices.append(key)

In [8]:
# maps the word to the one hot vector
depth = len(indices)
one_hot_encoding = tf.one_hot(indices,depth)

2022-11-03 04:48:54.862297: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
one_hot_encoding

<tf.Tensor: shape=(13, 13), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)>

In [10]:
# getting one-hot encoding for labels

labels=[]
for sentiment in tqdm.tqdm(cleaned_df['sentiment'].values):
    o_h = one_hot_encoding[inverse_label_map[sentiment]]
    labels.append(o_h)

labels = np.asarray(labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [00:16<00:00, 2496.72it/s]


In [11]:
# label shape is: 
# 40000 (no_of_sequences), 33(time_sequence which is also = max_length of sentence, classes_no)
labels.shape

(40000, 13)

# Converting content to word vectors

## Reading embedded Twitter data

In [12]:
def read_twitter_embeddings_file(path):
    
    word_embedding_map = {}
    inverse_word_embedding_map = {}
    
    file = open(path,'r')
    vocab_size, embedding_dim = file.readline().split()
    vocab_size,embedding_dim = int(vocab_size),int(embedding_dim)
    print("vocab size:{}".format(vocab_size))
    print("vocab size:{}".format(embedding_dim))
    
    for i in tqdm.tqdm(range(vocab_size)):
    #while(True):
        line = file.readline()
        word,embedding = line.split()[0],line.split()[1:]
        
        if not line:
            break
        word_embedding_map[word] = embedding
        
    file.close()
    return word_embedding_map,vocab_size,embedding_dim

In [13]:
word_embedding_map,vocab_size,EMBEDDING_DIM = read_twitter_embeddings_file("./data/glove.twitter.27B.50d.txt")

vocab size:1193514
vocab size:50


100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1193514/1193514 [00:37<00:00, 31862.56it/s]


In [14]:
content_lines = list(cleaned_df['content'])
tokeniser_obj = Tokenizer()
tokeniser_obj.fit_on_texts(content_lines)
sequences = tokeniser_obj.texts_to_sequences(content_lines)

# pad sequences
word_index = tokeniser_obj.word_index
print('Found {} unique tokens'.format(len(word_index)))


content_pad = pad_sequences(sequences,maxlen=EMBEDDING_DIM)
print("labels: {}".format(labels))

Found 53490 unique tokens
labels: [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [15]:
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,EMBEDDING_DIM))

In [16]:
embedding_matrix.shape

(53491, 50)

In [17]:
count = 0
for word,i in word_index.items():
    if i> num_words:
        continue
    embedding_vector = word_embedding_map.get(word)
    if(embedding_vector is not None):
        # if embedding vector is not found, 
        embedding_matrix[i] = embedding_vector
        count+=1
print("number of words vectorised={}/{}".format(count,num_words))

number of words vectorised=24566/53491


# Building GRU model

In [18]:
# callback to save the best weights for the model
def saveWeightsCallback(path,monitor,mode,save_freq):
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=path,
        monitor = monitor,
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
        mode = mode,
        save_freq=save_freq,
)

In [19]:
from keras.initializers import Constant

def build_model(vocab_size,embedding_dim,max_length):
    model = keras.models.Sequential()
    model.add(layers.Embedding(num_words,
                               EMBEDDING_DIM,
                               weights=[embedding_matrix],
                               input_length = max_length,
                              trainable=False))
    # return_sequences=True, 
    model.add(layers.GRU(128,return_sequences=True))
    model.add(layers.GRU(128,return_sequences=True))
    model.add(layers.GRU(128))
    model.add(layers.Dense(13,activation='softmax'))
    model.summary()
    return model

In [20]:
GRU_model = build_model(vocab_size,EMBEDDING_DIM,EMBEDDING_DIM)

GRU_model.compile(
    optimizer= Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
    ]
)

GRU_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 50)            2674550   
                                                                 
 gru (GRU)                   (None, 50, 128)           69120     
                                                                 
 gru_1 (GRU)                 (None, 50, 128)           99072     
                                                                 
 gru_2 (GRU)                 (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 13)                1677      
                                                                 
Total params: 2,943,491
Trainable params: 268,941
Non-trainable params: 2,674,550
_________________________________________________________________
Model: "sequential"
____________________

In [21]:
VALIDATION_SPLIT = 0.3

indices = np.arange(content_pad.shape[0])
np.random.shuffle(indices)

content_pad = content_pad[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * content_pad.shape[0])

X_train_pad = content_pad[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
X_test_pad = content_pad[-num_validation_samples:]
y_test = labels[-num_validation_samples:]

In [22]:
history = GRU_model.fit(X_train_pad,
                        y_train,
                        batch_size=128,
                        epochs = 200,
                        verbose=2,
                        validation_data = (X_test_pad,y_test))

callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                     saveWeightsCallback(
                         path='./weights/GRU',
                         monitor = 'val_loss',
                         mode = 'min',
                         save_freq='epoch',
                     )],
         

Epoch 1/200
219/219 - 91s - loss: 2.1845 - accuracy: 0.2566 - val_loss: 2.0435 - val_accuracy: 0.3080 - 91s/epoch - 415ms/step
Epoch 2/200
219/219 - 78s - loss: 2.0117 - accuracy: 0.3135 - val_loss: 1.9811 - val_accuracy: 0.3197 - 78s/epoch - 354ms/step
Epoch 3/200
219/219 - 78s - loss: 1.9737 - accuracy: 0.3246 - val_loss: 1.9591 - val_accuracy: 0.3276 - 78s/epoch - 355ms/step
Epoch 4/200
219/219 - 77s - loss: 1.9537 - accuracy: 0.3301 - val_loss: 1.9427 - val_accuracy: 0.3315 - 77s/epoch - 353ms/step
Epoch 5/200
219/219 - 78s - loss: 1.9372 - accuracy: 0.3362 - val_loss: 1.9291 - val_accuracy: 0.3344 - 78s/epoch - 356ms/step
Epoch 6/200
219/219 - 77s - loss: 1.9260 - accuracy: 0.3377 - val_loss: 1.9247 - val_accuracy: 0.3368 - 77s/epoch - 354ms/step
Epoch 7/200
219/219 - 78s - loss: 1.9160 - accuracy: 0.3412 - val_loss: 1.9158 - val_accuracy: 0.3395 - 78s/epoch - 354ms/step
Epoch 8/200
219/219 - 78s - loss: 1.9071 - accuracy: 0.3454 - val_loss: 1.9122 - val_accuracy: 0.3367 - 78s/epo

219/219 - 45s - loss: 1.6845 - accuracy: 0.4246 - val_loss: 1.8933 - val_accuracy: 0.3571 - 45s/epoch - 204ms/step
Epoch 66/200
219/219 - 45s - loss: 1.6818 - accuracy: 0.4250 - val_loss: 1.8967 - val_accuracy: 0.3585 - 45s/epoch - 205ms/step
Epoch 67/200
219/219 - 44s - loss: 1.6768 - accuracy: 0.4237 - val_loss: 1.8961 - val_accuracy: 0.3574 - 44s/epoch - 203ms/step
Epoch 68/200
219/219 - 44s - loss: 1.6724 - accuracy: 0.4265 - val_loss: 1.9038 - val_accuracy: 0.3532 - 44s/epoch - 203ms/step
Epoch 69/200
219/219 - 44s - loss: 1.6686 - accuracy: 0.4279 - val_loss: 1.9042 - val_accuracy: 0.3564 - 44s/epoch - 202ms/step
Epoch 70/200
219/219 - 44s - loss: 1.6639 - accuracy: 0.4293 - val_loss: 1.9087 - val_accuracy: 0.3504 - 44s/epoch - 203ms/step
Epoch 71/200
219/219 - 44s - loss: 1.6609 - accuracy: 0.4304 - val_loss: 1.9074 - val_accuracy: 0.3579 - 44s/epoch - 200ms/step
Epoch 72/200
219/219 - 44s - loss: 1.6535 - accuracy: 0.4323 - val_loss: 1.9177 - val_accuracy: 0.3510 - 44s/epoch - 

Epoch 129/200
219/219 - 44s - loss: 1.3142 - accuracy: 0.5613 - val_loss: 2.2708 - val_accuracy: 0.3158 - 44s/epoch - 200ms/step
Epoch 130/200
219/219 - 44s - loss: 1.3083 - accuracy: 0.5636 - val_loss: 2.2816 - val_accuracy: 0.3147 - 44s/epoch - 200ms/step
Epoch 131/200
219/219 - 44s - loss: 1.3018 - accuracy: 0.5662 - val_loss: 2.2979 - val_accuracy: 0.3120 - 44s/epoch - 199ms/step
Epoch 132/200
219/219 - 44s - loss: 1.2944 - accuracy: 0.5683 - val_loss: 2.2933 - val_accuracy: 0.3077 - 44s/epoch - 200ms/step
Epoch 133/200
219/219 - 44s - loss: 1.2870 - accuracy: 0.5699 - val_loss: 2.3136 - val_accuracy: 0.3085 - 44s/epoch - 200ms/step
Epoch 134/200
219/219 - 44s - loss: 1.2766 - accuracy: 0.5760 - val_loss: 2.3292 - val_accuracy: 0.3126 - 44s/epoch - 199ms/step
Epoch 135/200
219/219 - 44s - loss: 1.2728 - accuracy: 0.5739 - val_loss: 2.3232 - val_accuracy: 0.3099 - 44s/epoch - 200ms/step
Epoch 136/200
219/219 - 44s - loss: 1.2647 - accuracy: 0.5763 - val_loss: 2.3468 - val_accuracy: 

Epoch 193/200
219/219 - 44s - loss: 0.7783 - accuracy: 0.7607 - val_loss: 3.3834 - val_accuracy: 0.2802 - 44s/epoch - 200ms/step
Epoch 194/200
219/219 - 44s - loss: 0.7737 - accuracy: 0.7599 - val_loss: 3.4117 - val_accuracy: 0.2760 - 44s/epoch - 201ms/step
Epoch 195/200
219/219 - 44s - loss: 0.7636 - accuracy: 0.7631 - val_loss: 3.4520 - val_accuracy: 0.2766 - 44s/epoch - 200ms/step
Epoch 196/200
219/219 - 43s - loss: 0.7537 - accuracy: 0.7682 - val_loss: 3.4621 - val_accuracy: 0.2783 - 43s/epoch - 195ms/step
Epoch 197/200
219/219 - 35s - loss: 0.7478 - accuracy: 0.7679 - val_loss: 3.4977 - val_accuracy: 0.2706 - 35s/epoch - 161ms/step
Epoch 198/200
219/219 - 43s - loss: 0.7396 - accuracy: 0.7724 - val_loss: 3.5218 - val_accuracy: 0.2799 - 43s/epoch - 194ms/step
Epoch 199/200
219/219 - 45s - loss: 0.7316 - accuracy: 0.7747 - val_loss: 3.5492 - val_accuracy: 0.2732 - 45s/epoch - 204ms/step
Epoch 200/200
219/219 - 44s - loss: 0.7219 - accuracy: 0.7792 - val_loss: 3.5722 - val_accuracy: 