In [1]:
MAX_NB_WORDS = 56000 # max no. of words for tokenizer
MAX_SEQUENCE_LENGTH = 30 # max length of text (words) including padding
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM = 200 # embedding dimensions for word vectors (word2vec/GloVe)
GLOVE_DIR = "glove.twitter.27B.200d.txt"
print("Loaded Parameters:\n", MAX_NB_WORDS,MAX_SEQUENCE_LENGTH+5, VALIDATION_SPLIT,EMBEDDING_DIM,"\n", GLOVE_DIR)

Loaded Parameters:
 56000 35 0.2 200 
 glove.twitter.27B.200d.txt


In [2]:
print("Importing Modules...")
import numpy as np
import pandas as pd
import re, sys, os, csv, keras, pickle

Importing Modules...


Using TensorFlow backend.


In [13]:
from keras import regularizers, initializers, optimizers, callbacks
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten, Concatenate
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout, LSTM, GRU, Bidirectional
from keras.models import Model
from keras import backend as K
from keras.engine.topology import Layer, InputSpec
from tensorflow.keras.callbacks import ModelCheckpoint
print("Using Keras version",keras.__version__)

Using Keras version 2.4.2


In [14]:
print("Finished Importing Modules")

Finished Importing Modules


In [15]:
texts, labels = [], []
print("Reading from csv file...", end="")
with open('cleaned_data/emotion_data.csv') as csvfile:
    readCSV = csv.reader(csvfile, delimiter=',')
    for row in readCSV:
        texts.append(row[0])
        labels.append(row[1])
print("\nDone!")

Reading from csv file...
Done!


In [16]:
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [17]:
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data_int = pad_sequences(sequences, padding='pre', maxlen=(MAX_SEQUENCE_LENGTH-5))
data = pad_sequences(data_int, padding='post', maxlen=(MAX_SEQUENCE_LENGTH))

Found 34359 unique tokens.


In [18]:
labels = to_categorical(np.asarray(labels)) # convert to one-hot encoding vectors
print('Shape of data tensor:', data.shape)
print('Shape of label tensor:', labels.shape)

Shape of data tensor: (55775, 30)
Shape of label tensor: (55775, 5)


In [19]:
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

In [20]:
x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

print('Number of entries in each category:-')
print("Training:\n",y_train.sum(axis=0))
print("Validation:\n",y_val.sum(axis=0))

Number of entries in each category:-
Training:
 [ 7680. 12414. 13028.  6550.  4948.]
Validation:
 [1963. 3030. 3223. 1699. 1240.]


In [21]:
embeddings_index = {}
f = open("glove.twitter.27B.200d.txt", encoding="utf8")
print("Loading GloVe...")
for line in f:
    values = line.split()
    word = values[0]
    embeddings_index[word] = np.asarray(values[1:], dtype='float32')
f.close()
print("Done.\nProceeding with Embedding Matrix...", end="")
embedding_matrix = np.random.random((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # words not found in embedding index will be all-zeros.
        embedding_matrix[i] = embedding_vector
print("\nCompleted!")

Loading GloVe...
Done.
Proceeding with Embedding Matrix...
Completed!


In [22]:
print("Finished running setup.")

Finished running setup.
