# GRU

# Preprocessing

In [1]:
import pandas as pd
import os
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


import numpy as np
import io
import re
import string
import tqdm

physical_devices = tf.config.list_physical_devices('GPU')
#tf.config.experimental.set_memory_growth(physical_devices[0],True)

2022-11-03 04:24:07.472942: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-03 04:24:07.473034: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2022-11-03 04:24:09.960194: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:975] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-11-03 04:24:09.960619: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-11-03 04:24:09.960718: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.s

In [2]:
%load_ext tensorboard
SEED = 42
AUTOTUNE = tf.data.AUTOTUNE

In [3]:
pd.set_option('display.max_colwidth', None)
df = pd.read_csv("./data/text_emotion.csv")

## Data Cleaning
1. Getting rid of the punctuations marks from dataset
2. converting all content to lowercase
3. converting unicode characters to ascii

In [4]:
cleaned_df = df.copy()

# lowercasing 'content' column
cleaned_df['content'] = cleaned_df['content'].str.lower()

# removing all words with #hastag and @name and urls
cleaned_df['content'].apply(lambda sentence: re.sub('@[A-Za-z0-9_]+','',sentence))
cleaned_df['content'].apply(lambda sentence: re.sub('#[A-Za-z0-9_]+','',sentence))
cleaned_df['content'].apply(lambda sentence: re.sub(r'http\S+','',sentence))

# removing all punctuation marks
exclude = set(string.punctuation)
regex = re.compile('[%s]' % re.escape(string.punctuation))
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: regex.sub("",sentence))

# removing unicode characters
cleaned_df['content'] = cleaned_df['content'].apply(lambda sentence: sentence.encode("ascii","ignore").decode())


1. Now we want to find the total number of unique words (vocab_size) to rationalise the size of a embedding vector
1. We also want to find the maximum number of words that are in a sentence to justify the size of the input layer of the GRU

In [5]:
max_sentence_len = cleaned_df['content'].apply(lambda s: len(s.split())).max()
no_of_words = cleaned_df['content'].apply(lambda content: len(content.split())).sum()


# count vocab size
cache=set()

#counting padding and [unkownd] token
vocab_size= 2
for key,sentence in cleaned_df['content'].items():
    words = sentence.split()
    for word in words:
        if(word not in cache):
            vocab_size+=1
            cache.add(word)
            
print("max sentence length of 'content' is {}".format(max_sentence_len))
print("total number of words of content is {}".format(no_of_words))
print("total number of unique words/vocab_size of content is {}".format(vocab_size))

max sentence length of 'content' is 33
total number of words of content is 522873
total number of unique words/vocab_size of content is 53612


Since our max sentence length is 33 we will make an embedding of shape (64,) and pad the difference of the sentence. <br>
Since it is computationally heavy to have a vector of 63612 in size. <br>
We will use a vector of only 64, meaning that in the skip_grams algorithm, the words will only take 64 other context words into account

# using word2Vec to get continous vectors to use as embeddings instead of one-hot vectors
> using continuous vectors instead of one-hot vectors is better as continuos vectors contain contextual meaning learned from the unsupervised learning performed in the word2vec training process

Getting embeddings using Word2Vec. Word2vec has 2 algorithms, 
1. Continuous Bag of words
    - word is predicted from context "__ my name is Kevin"
2. Skip Gram
    - context is predicted from target "Hi __ __ __ __"

Combine the steps to one function

In [6]:
def generate_training_data(sequences, window_size,num_ns,vocab_size,seed):
    
    # each training sentence is appended to these list
    targets,contexts,labels = [],[],[]
    
    # sampling table for vocab_size tokens
    sampling_table = tf.keras.preprocessing.sequence.make_sampling_table(vocab_size)
    
    # iterate over all sentences in dataset
    for sequence in tqdm.tqdm(sequences):
       

        # generating positive skip-gram pairs for a sequence
        positive_skip_grams,_ = tf.keras.preprocessing.sequence.skipgrams(
            sequence,
            vocabulary_size = vocab_size,
            sampling_table=sampling_table,
            window_size=window_size, # TODO: change window size
            negative_samples=0
        )
        
        # produce negative samples and create training samples (x_train,labels)
        for target_word, context_word in positive_skip_grams:
    
    
            # expand context word to frmo dim shape (1,0) to (1,1)
            context_class = tf.expand_dims(
                tf.constant([context_word],dtype='int64'),1)
            negative_sampling_candidates, _, _ = tf.random.log_uniform_candidate_sampler(
                true_classes=context_class, # to tell the which sample is positive
                num_true=1,
                num_sampled=num_ns,
                unique=True,
                range_max=vocab_size, #TODO: may need to change to just the negative samples of the sentence itself instead of the entire vocab
                seed=seed,
                name='negative_sampling')
    
    
            # building the context and label vectors for a target word
            context = tf.concat([tf.squeeze(context_class,1),negative_sampling_candidates],0)
            label = tf.constant([1] + [0]*num_ns, dtype="int64")
    
    
            # append each element from the training ex to global lists
            targets.append(target_word)
            contexts.append(context)
            labels.append(label)
    
    
    return targets, contexts, labels 
            
            

## Preparing training data for word2vec

In [7]:
# size of one sentence is 33 but we just use 64
sequence_length = 64

vectorize_layer = layers.TextVectorization(
    max_tokens=vocab_size,
    output_mode='int',
    output_sequence_length=sequence_length)

2022-11-03 04:24:14.645545: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


## Tokenise the words in content according to their indices

In [8]:
# TODO change to smaller batch for better results
batch_size = 1024

### replace words with their respective tokens:

In [9]:
# creating a dataset of all sentences
text_ds = tf.data.Dataset.from_tensor_slices(cleaned_df['content'])
vectorize_layer.adapt(text_ds.batch(batch_size))

### build a inverse vocab which maps indexes -> words which can be handy

In [10]:
inverse_vocab = vectorize_layer.get_vocabulary()

In [11]:
# Vectorize the data in text_ds.
# prefetch does fetching of data and training at the same time using multiple thread
# improving performance
text_vector_ds = text_ds.batch(batch_size).prefetch(AUTOTUNE).map(vectorize_layer).unbatch()

As you can see we have successfully vectorised our sentences/sequences

In [12]:
sequences = list(text_vector_ds.as_numpy_iterator())
print(len(sequences))

40000


## Using unsupervised learning (word2Vec skip_gram) to predict context from targets. 
>While doing so, we are also training the weights on the embeddings. We can increase window size so that the embeddings learn more contextutal knowledge with respect to the words around them

In [13]:
# to determine window size, we see the median length of a sentence

print(cleaned_df['content'].apply(lambda s: len(s.split())).median())
print(cleaned_df['content'].apply(lambda s: len(s.split())).mean())

12.0
13.071825


We go with window size of 12 since that is the median length of a sentence

In [35]:
# note higher window size is more computationally expensive
# from documentation, it is said that for small datasets, negative samples of range 5to 20 yields the best results

num_ns = 10

targets, contexts, labels = generate_training_data(
    sequences=sequences,
    window_size=2,
    num_ns=num_ns,
    vocab_size=vocab_size,
    seed=SEED)

targets = np.array(targets)
contexts = np.array(contexts)
labels = np.array(labels)

print('\n')
print(f"targets.shape: {targets.shape}")
print(f"contexts.shape: {contexts.shape}")
print(f"labels.shape: {labels.shape}")

100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [00:47<00:00, 833.56it/s]




targets.shape: (385005,)
contexts.shape: (385005, 11)
labels.shape: (385005, 11)


Configuring training sets for Word2Vec model

In [36]:
# randomising the dataset

BATCH_SIZE = batch_size
BUFFER_SIZE = 10000
dataset = tf.data.Dataset.from_tensor_slices(((targets, contexts), labels))
dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE, drop_remainder=True)
print(dataset)

<BatchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None))>


In [37]:
dataset = dataset.cache().prefetch(buffer_size=AUTOTUNE)
print(dataset)

<PrefetchDataset element_spec=((TensorSpec(shape=(1024,), dtype=tf.int64, name=None), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None)), TensorSpec(shape=(1024, 11), dtype=tf.int64, name=None))>


In [38]:
class Word2Vec(tf.keras.Model):
    def __init__(self, vocab_size, embedding_dim):
        super(Word2Vec, self).__init__()
        self.target_embedding = layers.Embedding(vocab_size,
              embedding_dim,
              input_length=1,
              name="w2v_embedding")
        self.context_embedding = layers.Embedding(vocab_size,
               embedding_dim,
               input_length=num_ns+1)

    def call(self, pair):
        target, context = pair
        # target: (batch, dummy?)  # The dummy axis doesn't exist in TF2.7+
        # context: (batch, context)
        if len(target.shape) == 2:
            target = tf.squeeze(target, axis=1)
        # target: (batch,)
        word_emb = self.target_embedding(target)
        # word_emb: (batch, embed)
        context_emb = self.context_embedding(context)
        # context_emb: (batch, context, embed)
        dots = tf.einsum('be,bce->bc', word_emb, context_emb)
        # dots: (batch, context)
        return dots

In [39]:
def custom_loss(x_logit, y_true):
      return tf.nn.sigmoid_cross_entropy_with_logits(logits=x_logit, labels=y_true)

In [40]:
# embedding size for word2vec is chosen to be of shape (32,)
# the idea is that since there are a maximum of 33 words in a sentece
# the vector that a word takes will be in 32 dimension 
# vocab size +1 because of padding
embedding_dim = 32

word2vec = Word2Vec(vocab_size, embedding_dim)
word2vec.compile(optimizer='adam',
                 loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
                 metrics=['accuracy'])
# call back to log training stats for TensorBoard
tensorboard_callback = tf.keras.callbacks.TensorBoard(log_dir="logs")

In [41]:
history = word2vec.fit(dataset, epochs=40, callbacks=[tensorboard_callback])

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


## Writing vectors to file

In [42]:
weights = word2vec.get_layer('w2v_embedding').get_weights()[0]
vocab = vectorize_layer.get_vocabulary()

In [43]:
out_v = io.open('./data/vectors1.tsv', 'w')
out_m = io.open('./data/metadata1.tsv', 'w')

for index, word in enumerate(vocab):
    if index == 0:
        continue  # skip 0, it's padding.
    vec = weights[index]
    out_v.write('\t'.join([str(x) for x in vec]) + "\n")
    out_m.write(word + "\n")
out_v.close()
out_m.close()

# GRU model

In [44]:
def read_tsv(path):
    result = []
    file = open(path,'r')
    while (True):
        line = file.readline()
        if not line:
            break
        result.append([float(value) for value in line.split()])
        
    file.close()
    return result

In [45]:
vectors = read_tsv('./data/vectors1.tsv')

In [46]:
def read_meta(path):
    result = []
    file = open(path,'r')
    while (True):
        line = file.readline()
        if not line:
            break
            
        result.append(line.strip())
    file.close()
    return result

In [47]:
classes = read_meta('./data/metadata1.tsv')

In [48]:
len(classes)

53611

## Preprocessing for GRU: one hot encoding the labels

In [49]:
print('Unique labels:')
for i, label in enumerate(cleaned_df['sentiment'].unique()):
    print('{}) {}'.format(i,label)) 

Unique labels:
0) empty
1) sadness
2) enthusiasm
3) neutral
4) worry
5) surprise
6) love
7) fun
8) hate
9) happiness
10) boredom
11) relief
12) anger


In [50]:
cleaned_df['sentiment'].nunique()

13

# Build a one hot vector each of size 13 for sentiments as labels

### build a map for the labels to the indices

In [51]:
# map labels -> indices
sentiment_labels = cleaned_df['sentiment'].unique().copy()

# index key -> class
label_map = {}

#class -> index key
inverse_label_map = {}
for i, label in enumerate(sentiment_labels):
    label_map[i] = label 
    inverse_label_map[label] = i

In [52]:
# get all keys
indices = []
for key,value in label_map.items():
    indices.append(key)

### Generates one-hot vector for labels

In [53]:
# maps the word to the one hot vector
depth = len(indices)
one_hot_encoding = tf.one_hot(indices,depth)

In [54]:
one_hot_encoding

<tf.Tensor: shape=(13, 13), dtype=float32, numpy=
array([[1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]],
      dtype=float32)>

In [55]:
one_hot_encoding.shape

TensorShape([13, 13])

Map the one hot encodings to the labels in the dataframe

In [56]:
cleaned_df['sentiment'].values

array(['empty', 'sadness', 'sadness', ..., 'love', 'happiness', 'love'],
      dtype=object)

In [57]:
labels=[]
for sentiment in tqdm.tqdm(cleaned_df['sentiment'].values):
    o_h = one_hot_encoding[inverse_label_map[sentiment]]
    labels.append(o_h)

labels = np.asarray(labels)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 40000/40000 [00:05<00:00, 7124.57it/s]


In [58]:
labels.shape

(40000, 13)

# instantiate the preprocessed dataframe to feed into the model
# cleaned_df['label'] = cleaned_df['sentiment'].apply(lambda n: one_hot_encoding[inverse_label_map[n]])
# cleaned_df['label'] = cleaned_df['sentiment'].apply(lambda n: inverse_label_map[n])

labels=[]
for sentiment in tqdm.tqdm(cleaned_df['sentiment'].values):
    inner_list = []
    o_h = one_hot_encoding[inverse_label_map[sentiment]]
    for i in range(0,33):
        inner_list.append(o_h)
    labels.append(inner_list)

labels = np.asarray(labels)
#labels = np.asarray([(one_hot_encoding[inverse_label_map[sentiment]]) for sentiment in cleaned_df['sentiment'].values])
# labels = np.asarray([tf.keras.utils.to_categorical([inverse_label_map[sentiment]],33) for sentiment in cleaned_df['sentiment'].values])

Getting embeddings using Word2Vec. Word2vec has 2 algorithms, 
1. Continuous Bag of words
    - word is predicted from context "__ my name is Kevin"
2. Skip Gram
    - context is predicted from target "Hi __ __ __ __"

## Convert Content to tokenized vectors

In [59]:
content_lines = list(cleaned_df['content'])
tokeniser_obj = Tokenizer()
tokeniser_obj.fit_on_texts(content_lines)
sequences = tokeniser_obj.texts_to_sequences(content_lines)

# pad sequences
word_index = tokeniser_obj.word_index
print('Found {} unique tokens'.format(len(word_index)))

max_length = max_sentence_len
content_pad = pad_sequences(sequences,maxlen=max_length)
print("labels: {}".format(labels))

Found 53610 unique tokens
labels: [[1. 0. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 [0. 1. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [60]:
print("shape of content tensor: {}".format(content_pad.shape))
print("shape of sentiment tensor: {}".format(labels.shape))

shape of content tensor: (40000, 33)
shape of sentiment tensor: (40000, 13)


## Map embeddings from word2vec model for each word to the sequences (sentences) by createing a matrix

Map the embeddings to each word of the sentence

In [61]:
# make an inverse index of the classes name -> index
word_vector_map = {}
for index,word in enumerate(classes):
    word_vector_map[word] = vectors[index]

In [62]:
EMBEDDING_DIM=32
num_words = len(word_index) + 1
embedding_matrix = np.zeros((num_words,EMBEDDING_DIM))

In [63]:
embedding_matrix.shape

(53611, 32)

In [64]:
count = 0
for word,i in word_index.items():
    if i> num_words:
        continue
    embedding_vector = word_vector_map.get(word)
    if(embedding_vector is not None):
        # if embedding vector is not found, 
        embedding_matrix[i] = embedding_vector
        count+=1
print("number of words successfully mapped to vectors: {}/{}".format(count,vocab_size))

number of words successfully mapped to vectors: 53610/53612


# Model of GRU

x (no_sequences,embeddings in a sequence=32,)

In [65]:
# callback to save the best weights for the model
def saveWeightsCallback(path,monitor,mode,save_freq):
    return tf.keras.callbacks.ModelCheckpoint(
        filepath=path,
        monitor = monitor,
        verbose = 1,
        save_best_only = True,
        save_weights_only = True,
        mode = mode,
        save_freq=save_freq,
)

In [66]:
from keras.initializers import Constant

def build_model(vocab_size,embedding_dim,max_length):
    model = keras.models.Sequential()
    model.add(layers.Embedding(num_words,
                               EMBEDDING_DIM,
                               weights=[embedding_matrix],
                               input_length = max_length,
                              trainable=False))
    model.add(layers.GRU(128,return_sequences=True))
    model.add(layers.GRU(128,return_sequences=True))
    model.add(layers.GRU(128)) 
    model.add(layers.Dense(13,activation='softmax'))
    model.summary()
    return model

In [67]:
GRU_model = build_model(vocab_size,EMBEDDING_DIM,max_length)

GRU_model.compile(
    optimizer= Adam(learning_rate=0.0001),
    loss='categorical_crossentropy',
    metrics=[
        'accuracy',
    ]
)

GRU_model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_4 (Embedding)     (None, 33, 32)            1715552   
                                                                 
 gru (GRU)                   (None, 33, 128)           62208     
                                                                 
 gru_1 (GRU)                 (None, 33, 128)           99072     
                                                                 
 gru_2 (GRU)                 (None, 128)               99072     
                                                                 
 dense (Dense)               (None, 13)                1677      
                                                                 
Total params: 1,977,581
Trainable params: 262,029
Non-trainable params: 1,715,552
_________________________________________________________________
Model: "sequential"
____________________

In [68]:
VALIDATION_SPLIT = 0.3

indices = np.arange(content_pad.shape[0])
np.random.shuffle(indices)

content_pad = content_pad[indices]
labels = labels[indices]
num_validation_samples = int(VALIDATION_SPLIT * content_pad.shape[0])

X_train_pad = content_pad[:-num_validation_samples]
y_train = labels[:-num_validation_samples]
X_test_pad = content_pad[-num_validation_samples:]
y_test = labels[-num_validation_samples:]

In [69]:
history = GRU_model.fit(X_train_pad,
                        y_train,
                        batch_size=128,
                        epochs = 100,
                        verbose=2,
                        validation_data = (X_test_pad,y_test))

callbacks = [EarlyStopping(monitor='val_loss', patience=3),
                     saveWeightsCallback(
                         path='./weights/GRU_word2vec',
                         monitor = 'val_loss',
                         mode = 'min',
                         save_freq='epoch',
                     )],
         

Epoch 1/100
219/219 - 27s - loss: 2.2174 - accuracy: 0.2358 - val_loss: 2.1405 - val_accuracy: 0.2632 - 27s/epoch - 122ms/step
Epoch 2/100
219/219 - 22s - loss: 2.1104 - accuracy: 0.2614 - val_loss: 2.0859 - val_accuracy: 0.2688 - 22s/epoch - 100ms/step
Epoch 3/100
219/219 - 28s - loss: 2.0728 - accuracy: 0.2756 - val_loss: 2.0656 - val_accuracy: 0.2841 - 28s/epoch - 129ms/step
Epoch 4/100
219/219 - 33s - loss: 2.0602 - accuracy: 0.2838 - val_loss: 2.0602 - val_accuracy: 0.2906 - 33s/epoch - 150ms/step
Epoch 5/100
219/219 - 36s - loss: 2.0522 - accuracy: 0.2846 - val_loss: 2.0523 - val_accuracy: 0.2908 - 36s/epoch - 166ms/step
Epoch 6/100
219/219 - 36s - loss: 2.0464 - accuracy: 0.2860 - val_loss: 2.0464 - val_accuracy: 0.2930 - 36s/epoch - 164ms/step
Epoch 7/100
219/219 - 47s - loss: 2.0422 - accuracy: 0.2895 - val_loss: 2.0451 - val_accuracy: 0.2892 - 47s/epoch - 215ms/step
Epoch 8/100
219/219 - 49s - loss: 2.0379 - accuracy: 0.2899 - val_loss: 2.0407 - val_accuracy: 0.2957 - 49s/epo

219/219 - 52s - loss: 1.8877 - accuracy: 0.3486 - val_loss: 2.0071 - val_accuracy: 0.3142 - 52s/epoch - 238ms/step
Epoch 66/100
219/219 - 51s - loss: 1.8861 - accuracy: 0.3475 - val_loss: 2.0131 - val_accuracy: 0.3133 - 51s/epoch - 232ms/step
Epoch 67/100
219/219 - 52s - loss: 1.8844 - accuracy: 0.3470 - val_loss: 2.0065 - val_accuracy: 0.3098 - 52s/epoch - 236ms/step
Epoch 68/100
219/219 - 52s - loss: 1.8811 - accuracy: 0.3492 - val_loss: 2.0073 - val_accuracy: 0.3162 - 52s/epoch - 238ms/step
Epoch 69/100
219/219 - 51s - loss: 1.8782 - accuracy: 0.3519 - val_loss: 2.0074 - val_accuracy: 0.3140 - 51s/epoch - 233ms/step
Epoch 70/100
219/219 - 51s - loss: 1.8748 - accuracy: 0.3529 - val_loss: 2.0080 - val_accuracy: 0.3164 - 51s/epoch - 234ms/step
Epoch 71/100
219/219 - 52s - loss: 1.8728 - accuracy: 0.3545 - val_loss: 2.0076 - val_accuracy: 0.3137 - 52s/epoch - 237ms/step
Epoch 72/100
219/219 - 51s - loss: 1.8706 - accuracy: 0.3549 - val_loss: 2.0117 - val_accuracy: 0.3154 - 51s/epoch - 

# implementing GRU without word2vec

In [None]:
depth = len(indices)
one_hot_encoding = tf.one_hot(indices,depth)

2022-11-03 06:10:33.270883: W tensorflow/core/framework/cpu_allocator_impl.cc:82] Allocation of 6400000000 exceeds 10% of free system memory.


In [None]:
labels=[]
for sentiment in tqdm.tqdm(cleaned_df['sentiment'].values):
    o_h = one_hot_encoding[inverse_label_map[sentiment]]
    labels.append(o_h)

labels = np.asarray(labels)

In [None]:
train_split = int(0.7*len(cleaned_df['content']))
X_train = cleaned_df.loc[:train_split-1,'content']
y_train = labels[:train_split]

X_test = cleaned_df.loc[train_split:,'content']
y_test = labels[train_split:]

In [None]:
tokeniser_obj = Tokenizer()
total_contents = cleaned_df['content']
tokeniser_obj.fit_on_texts(total_contents)

#pad sequences
max_length = max([len(s.split()) for s in total_contents])

# define vocan size 
vocab_size = len(tokeniser_obj.word_index)+1

X_train_tokens =  tokeniser_obj.texts_to_sequences(X_train)
X_test_tokens =tokeniser_obj.texts_to_sequences(X_test)

X_train_pad = pad_sequences(X_train_tokens,maxlen=max_length,padding='post')
X_test_pad = pad_sequences(X_test_tokens,maxlen=max_length,padding='post')

In [None]:
model = keras.models.Sequential()

EMBEDDING_DIM = 100

model.add(layers.Embedding(vocab_size,EMBEDDING_DIM,input_length=max_length))
model.add(layers.GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(layers.Dense(13,activation='softmax'))

model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['accuracy'])

In [None]:
model.fit(X_train_pad,y_train,batch_size=128,epochs=28,validation_data=(X_test_pad,y_test),verbose=2)