# GRU

# Preprocessing

In [1]:
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import numpy as np
import io
import re
import string
import tqdm

physical_devices = tf.config.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0],True)

2022-11-02 16:00:17.983281: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-02 16:00:17.983303: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-02 16:00:19.235760: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-02 16:00:19.236090: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-02 16:00:19.236143: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.s

In [2]:
%load_ext tensorboard
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
pd.set_option('display.max_colwidth', None)
df = pd.read_csv("./data/text_emotion.csv")

## Data Cleaning
1. Getting rid of the punctuations marks from dataset
2. converting all content to lowercase
3. converting unicode characters to ascii

In [4]:
cleaned_df = df.copy()

# removing all punctuation marks
exclude = set(string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: regex.sub("",sentence))

# lowercasing 'content' column
cleaned_df['content'] = cleaned_df['content'].str.lower()

# removing unicode characters
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: sentence.encode("ascii","ignore").decode())


1. Now we want to find the total number of unique words (vocab_size) to rationalise the size of a embedding vector
1. We also want to find the maximum number of words that are in a sentence to justify the size of the input layer of the GRU

In [5]:
max_sentence_len = cleaned_df['content'].apply(lambda s: len(s.split())).max()
no_of_words = cleaned_df['content'].apply(lambda content: len(content.split())).sum()


# count vocab size
cache=set()

#counting padding and [unkownd] token
vocab_size= 2
for key,sentence in cleaned_df['content'].items():
    words = sentence.split()
    for word in words:
        if(word not in cache):
            vocab_size+=1
            cache.add(word)
            
print("max sentence length of 'content' is {}".format(max_sentence_len))
print("total number of words of content is {}".format(no_of_words))
print("total number of unique words/vocab_size of content is {}".format(vocab_size))

max sentence length of 'content' is 33
total number of words of content is 522873
total number of unique words/vocab_size of content is 53612


Since our max sentence length is 33 we will make an embedding of shape (64,) and pad the difference of the sentence. <br>
Since it is computationally heavy to have a vector of 63612 in size. <br>
We will use a vector of only 64, meaning that in the skip_grams algorithm, the words will only take 64 other context words into account

# using word2Vec to get continous vectors to use as embeddings instead of one-hot vectors
> using continuous vectors instead of one-hot vectors is better as continuos vectors contain contextual meaning learned from the unsupervised learning performed in the word2vec training process

Getting embeddings using Word2Vec. Word2vec has 2 algorithms, 
1. Continuous Bag of words
    - word is predicted from context "__ my name is Kevin"
2. Skip Gram
    - context is predicted from target "Hi __ __ __ __"

Combine the steps to one function

In [6]:
def generate_training_data(sequences, window_size,num_ns,vocab_size,seed):
    
    # each training sentence is appended to these list
    targets,contexts,labels = [],[],[]
    
    # sampling table for vocab_size tokens
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
    
    # iterate over all sentences in dataset
    for sequence in tqdm.tqdm(sequences):
       

        # generating positive skip-gram pairs for a sequence
        positive_skip_grams,_ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size = vocab_size,
            sampling_table=sampling_table,
            window_size=window_size, # TODO: change window size
            negative_samples=0
        )
        
        # produce negative samples and create training samples (x_train,labels)
        for target_word, context_word in positive_skip_grams:
    
    
            # expand context word to frmo dim shape (1,0) to (1,1)
            context_class = tf.expand_dims(
                tf.constant([context_word],dtype='int64'),1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class, # to tell the which sample is positive
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size, #TODO: may need to change to just the negative samples of the sentence itself instead of the entire vocab
                seed=seed,
                name='negative_sampling')
    
    
            # building the context and label vectors for a target word
            context = tf.concat([tf.squeeze(context_class,1),negative_sampling_candidates],0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")
    
    
            # append each element from the training ex to global lists
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)
    
    
    return targets, contexts, labels 
            
            

## Preparing training data for word2vec

In [7]:
# size of one sentence is 33 but we just use 64
sequence_length = 64

vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

2022-11-02 16:00:30.215321: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Tokenise the words in content according to their indices

In [8]:
# TODO change to smaller batch for better results
batch_size = 1024

### replace words with their respective tokens:

In [9]:
# creating a dataset of all sentences
text_ds = tf.data.Dataset.from_tensor_slices(cleaned_df['content'])
vectorize_layer.adapt(text_ds.batch(batch_size))

### build a inverse vocab which maps indexes -> words which can be handy

In [10]:
inverse_vocab = vectorize_layer.get_vocabulary()

In [11]:
# Vectorize the data in text_ds.
# prefetch does fetching of data and training at the same time using multiple thread
# improving performance
text_vector_ds = text_ds.batch(batch_size).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

As you can see we have successfully vectorised our sentences/sequences

In [12]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

40000


## Using unsupervised learning (word2Vec skip_gram) to predict context from targets. 
>While doing so, we are also training the weights on the embeddings. We can increase window size so that the embeddings learn more contextutal knowledge with respect to the words around them

In [None]:
# to determine window size, we see the median length of a sentence

print(cleaned_df['content'].apply(lambda s: len(s.split())).median())
print(cleaned_df['content'].apply(lambda s: len(s.split())).mean())

We go with window size of 12 since that is the median length of a sentence

In [None]:
# note higher window size is more computationally expensive
# from documentation, it is said that for small datasets, negative samples of range 5to 20 yields the best results

num_ns = 10

targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=12,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

Configuring training sets for Word2Vec model

In [None]:
# randomising the dataset

BATCH_SIZE = batch_size
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

In [None]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

In [None]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
              embedding_dim,
              input_length=1,
              name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
               embedding_dim,
               input_length=num_ns+1)

    def call(self, pair):
        target, context = pair
        # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        # dots: (batch, context)
        return dots

In [None]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [None]:
# embedding size for word2vec is chosen to be of shape (32,)
# the idea is that since there are a maximum of 33 words in a sentece
# the vector that a word takes will be in 32 dimension 
# vocab size +1 because of padding
embedding_dim = 32

word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
# call back to log training stats for TensorBoard
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [None]:
history = word2vec.fit(dataset, epochs=40, callbacks=[tensorboard_callback])

## Embedding lookup and analysis

In [None]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [None]:
out_v = io.open('./data/vectors1.tsv', 'w')
out_m = io.open('./data/metadata1.tsv', 'w')

for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

# GRU model

In [13]:
def read_tsv(path):
    result = []
    file = open(path,'r')
    while (True):
        line = file.readline()
        if not line:
            break
        result.append([float(value) for value in line.split()])
        
    file.close()
    return result

In [14]:
vectors = read_tsv('./data/vectors1.tsv')

In [15]:
def read_meta(path):
    result = []
    file = open(path,'r')
    while (True):
        line = file.readline()
        if not line:
            break
            
        result.append(line.strip())
    file.close()
    return result

In [16]:
classes = read_meta('./data/metadata1.tsv')

In [17]:
len(classes)

53611

## Preprocessing for GRU: one hot encoding the labels

In [18]:
print('Unique labels:')
for i, label in enumerate(cleaned_df['sentiment'].unique()):
    print('{}) {}'.format(i,label)) 

Unique labels:
0) empty
1) sadness
2) enthusiasm
3) neutral
4) worry
5) surprise
6) love
7) fun
8) hate
9) happiness
10) boredom
11) relief
12) anger


In [19]:
cleaned_df['sentiment'].nunique()

13

# Build a one hot vector each of size 13 for sentiments as labels

### build a map for the labels to the indices

In [20]:
# map labels -> indices
sentiment_labels = cleaned_df['sentiment'].unique().copy()

# index key -> class
label_map = {}

#class -> index key
inverse_label_map = {}
for i, label in enumerate(sentiment_labels):
    label_map[i] = label 
    inverse_label_map[label] = i

In [21]:
# get all keys
indices = []
for key,value in label_map.items():
    indices.append(key)

### Generates one-hot vector for labels

In [22]:
# maps the word to the one hot vector
depth = len(indices)
one_hot_encoding = tf.one_hot(indices,depth)

In [23]:
one_hot_encoding

<tf.Tensor: shape=(13, 13), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)>

In [24]:
one_hot_encoding.shape

TensorShape([13, 13])

Map the one hot encodings to the labels in the dataframe

In [25]:
cleaned_df['sentiment'].values

array(['empty', 'sadness', 'sadness', ..., 'love', 'happiness', 'love'],
      dtype=object)

In [26]:
# instantiate the preprocessed dataframe to feed into the model
# cleaned_df['label'] = cleaned_df['sentiment'].apply(lambda n: one_hot_encoding[inverse_label_map[n]])
# cleaned_df['label'] = cleaned_df['sentiment'].apply(lambda n: inverse_label_map[n])

labels=[]
for sentiment in tqdm.tqdm(cleaned_df['sentiment'].values):
    inner_list = []
    o_h = one_hot_encoding[inverse_label_map[sentiment]]
    for i in range(0,33):
        inner_list.append(o_h)
    labels.append(inner_list)

labels = np.asarray(labels)
#labels = np.asarray([(one_hot_encoding[inverse_label_map[sentiment]]) for sentiment in cleaned_df['sentiment'].values])
# labels = np.asarray([tf.keras.utils.to_categorical([inverse_label_map[sentiment]],33) for sentiment in cleaned_df['sentiment'].values])

100%|████████████████████████████████████████████████████████████████████████████████| 40000/40000 [00:04<00:00, 8075.25it/s]


In [27]:
# label shape is: 
# 40000 (no_of_sequences), 33(time_sequence which is also = max_length of sentence, classes_no)
labels.shape

(40000, 33, 13)

Getting embeddings using Word2Vec. Word2vec has 2 algorithms, 
1. Continuous Bag of words
    - word is predicted from context "__ my name is Kevin"
2. Skip Gram
    - context is predicted from target "Hi __ __ __ __"

## Convert Labels (sentiment) to tokenized vectors

In [30]:
content_lines = list(cleaned_df['content'])
tokeniser_obj = Tokenizer()
tokeniser_obj.fit_on_texts(content_lines)
sequences = tokeniser_obj.texts_to_sequences(content_lines)

# pad sequences
word_index = tokeniser_obj.word_index
print('Found {} unique tokens'.format(len(word_index)))

max_length = max_sentence_len
content_pad = pad_sequences(sequences,maxlen=max_length)
print("labels: {}".format(labels))

Found 53610 unique tokens
labels: [[[1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  ...
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]
  [1. 0. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]]

 [[0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  ...
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]
  [0. 1. 0. ... 0. 0. 0.]]

 ...

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  [0

In [31]:
print("shape of content tensor: {}".format(content_pad.shape))
print("shape of sentiment tensor: {}".format(labels.shape))

shape of content tensor: (40000, 33)
shape of sentiment tensor: (40000, 33, 13)


## Map embeddings from word2vec model for each word to the sequences (sentences) by createing a matrix

Map the embeddings to each word of the sentence

In [32]:
# make an inverse index of the classes name -> index
word_vector_map = {}
for index,word in enumerate(classes):
    word_vector_map[word] = vectors[index]

In [33]:
EMBEDDING_DIM=32
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,EMBEDDING_DIM))

In [34]:
embedding_matrix.shape

(53611, 32)

In [35]:
for word,i in word_index.items():
    if i> num_words:
        continue
    embedding_vector = word_vector_map.get(word)
    if(embedding_vector is not None):
        # if embedding vector is not found, 
        embedding_matrix[i] = embedding_vector

# Model of GRU

x (no_sequences,embeddings in a sequence=32,)

In [37]:
# callback to save the best weights for the model
def saveWeightsCallback(path,monitor,mode,save_freq):
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=path,
        monitor = monitor,
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
        mode = mode,
        save_freq=save_freq,
)

In [38]:
from keras.initializers import Constant

def build_model(vocab_size,embedding_dim,max_length):
    model = keras.models.Sequential()
    model.add(layers.Embedding(num_words,
                               EMBEDDING_DIM,
                               embeddings_initializer=Constant(embedding_matrix),
                               input_length = max_length,
                              trainable=False))
    model.add(layers.GRU(32,return_sequences=True, activation="tanh"))
    model.add(layers.Dense(13,activation='softmax'))
    model.summary()
    return model

In [39]:
GRU_model = build_model(vocab_size,EMBEDDING_DIM,max_length)

GRU_model.compile(
    optimizer= Adam(learning_rate=0.01),
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
        'mean_squared_error'
    ]
)

GRU_model.summary()



Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 33, 32)            1715552   
                                                                 
 gru (GRU)                   (None, 33, 32)            6336      
                                                                 
 dense (Dense)               (None, 33, 13)            429       
                                                                 
Total params: 1,722,317
Trainable params: 6,765
Non-trainable params: 1,715,552
_________________________________________________________________
Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 33, 32)            1715552   
                                                                 
 gru (GRU)            

In [40]:
len(labels)

40000

In [41]:
VALIDATION_SPLIT = 0.3

indices = np.arange(content_pad.shape[0])
np.random.shuffle(indices)

content_pad = content_pad[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * content_pad.shape[0])

X_train_pad = content_pad[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
X_test_pad = content_pad[-num_validation_samples:]
y_test = labels[-num_validation_samples:]

In [None]:
history = GRU_model.fit(X_train_pad,
                        y_train,
                        batch_size=128,
                        epochs = 100,
                        verbose=2,
                        validation_data = (X_test_pad,y_test))

callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                     saveWeightsCallback(
                         path='./weights/GRU',
                         monitor = 'val_loss',
                         mode = 'min',
                         save_freq='epoch',
                     )],
         

Epoch 1/100
219/219 - 3s - loss: 2.0969 - accuracy: 0.2539 - mean_squared_error: 0.0650 - val_loss: 2.1137 - val_accuracy: 0.2486 - val_mean_squared_error: 0.0652 - 3s/epoch - 14ms/step
Epoch 2/100
219/219 - 3s - loss: 2.0910 - accuracy: 0.2561 - mean_squared_error: 0.0649 - val_loss: 2.1117 - val_accuracy: 0.2501 - val_mean_squared_error: 0.0651 - 3s/epoch - 14ms/step
Epoch 3/100
219/219 - 3s - loss: 2.0861 - accuracy: 0.2581 - mean_squared_error: 0.0647 - val_loss: 2.1142 - val_accuracy: 0.2460 - val_mean_squared_error: 0.0652 - 3s/epoch - 14ms/step
Epoch 4/100
219/219 - 3s - loss: 2.0815 - accuracy: 0.2591 - mean_squared_error: 0.0647 - val_loss: 2.1121 - val_accuracy: 0.2486 - val_mean_squared_error: 0.0652 - 3s/epoch - 14ms/step
Epoch 5/100
219/219 - 3s - loss: 2.0783 - accuracy: 0.2604 - mean_squared_error: 0.0646 - val_loss: 2.1190 - val_accuracy: 0.2454 - val_mean_squared_error: 0.0653 - 3s/epoch - 14ms/step
Epoch 6/100
219/219 - 3s - loss: 2.0735 - accuracy: 0.2620 - mean_squa

## Brief description of method used

# LSTM implementation

# Experiments and results