In [7]:
import numpy as np
#from numpy import array
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import pandas as pd
import numpy as np
import os

from sklearn import metrics
from sklearn.utils import shuffle
import joblib

import spacy
nlp = spacy.load('en')

import keras

from keras.layers import Embedding, Flatten, Dense, LSTM, Bidirectional
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.preprocessing.text import one_hot
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from keras.preprocessing.text import Tokenizer

from sklearn.preprocessing import LabelEncoder

In [8]:
path_name = '/Users/ilya/Desktop/Computer-Science/Github/emotion-nlp-detector/dataset/'

with open(os.path.join(path_name,'train.txt')) as f:
    train_list = f.readlines()

with open(os.path.join(path_name,'test.txt')) as f:
    test_list = f.readlines()

with open(os.path.join(path_name,'val.txt'))as f:
    val_list = f.readlines()

In [9]:
full_list = val_list+test_list+train_list

df = pd.DataFrame()

df['sentence'] = [sentence.split(';')[0] for sentence in full_list]
df['emotion'] = [sentence.split(';')[1].strip('\n') for sentence in full_list]

df.head()

Unnamed: 0,sentence,emotion
0,im feeling quite sad and sorry for myself but ...,sadness
1,i feel like i am still looking at a blank canv...,sadness
2,i feel like a faithful servant,love
3,i am just feeling cranky and blue,anger
4,i can have for a treat or if i am feeling festive,joy


In [10]:
df.groupby('emotion').count().reset_index().rename(columns = {'sentence':'count'})

Unnamed: 0,emotion,count
0,anger,2709
1,fear,2373
2,joy,6761
3,love,1641
4,sadness,5797
5,surprise,719


In [11]:
def lemmatize_string(item):
    doc = nlp(item)
    return ' '.join([token.lemma_ for token in doc if token.lemma_ != '-PRON-'])

In [12]:
main_df = pd.DataFrame()
counter = 0

for chunk in np.array_split(df, 20):
    chunk['sentence'] = chunk['sentence'].apply(lambda x: lemmatize_string(x))
    main_df = main_df.append(chunk)
    counter = counter+1
    print(counter, end = ' ')

1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 

In [13]:
le = LabelEncoder()
encoded_labels = le.fit_transform(main_df['emotion'])
labels = to_categorical(encoded_labels)
labels[0]

array([0., 0., 0., 0., 1., 0.], dtype=float32)

In [14]:
sorted_list = sorted([len(sentence.split(' ')) for sentence in main_df['sentence']], reverse = True)
print('Max Number of words in sentnce: {}'.format(sorted_list[0]))

Max Number of words in sentnce: 61


In [15]:
hot_encode_dict = {0:'anger', 1:'fear', 2:'joy', 3:'love', 4:'sadness', 5:'surprise'}

vocab_size = 1000
sent_len = 62
dim_num = 100

In [16]:
tokenizer = Tokenizer(vocab_size, filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')
tokenizer.fit_on_texts(main_df['sentence'])
one_hot_rep = tokenizer.texts_to_sequences(main_df['sentence'])

#one_hot_rep = [one_hot(sentence, vocab_size) for sentence in main_df['sentence']]
padded_docs = pad_sequences(one_hot_rep, padding = 'pre', maxlen = sent_len)

## original implementation
#one_hot_rep = [one_hot(sentence, vocab_size) for sentence in main_df['sentence']]
#padded_docs = pad_sequences(one_hot_rep, padding = 'pre', maxlen = sent_len)

In [22]:
type(one_hot_rep[0][0])

int

In [17]:
joblib.dump(tokenizer, 'tokenizer.pkl')

['tokenizer.pkl']

In [12]:
x_train = padded_docs[:15000]
x_test = padded_docs[15000:]
y_train = labels[:15000]
y_test = labels[15000:]

In [13]:
model = Sequential()
model.add(Embedding(vocab_size, dim_num, input_length=sent_len))
model.add(Bidirectional(LSTM(128)))
model.add(Dense(6, activation='softmax'))

# compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
# summarize the model
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 62, 100)           100000    
_________________________________________________________________
bidirectional (Bidirectional (None, 256)               234496    
_________________________________________________________________
dense (Dense)                (None, 6)                 1542      
Total params: 336,038
Trainable params: 336,038
Non-trainable params: 0
_________________________________________________________________
None


In [14]:
earlystopping = EarlyStopping(monitor="val_loss", patience=2, restore_best_weights=True)

# fit the model
model.fit(x_train, y_train, epochs=15, verbose=1, validation_split = 0.7, callbacks=[earlystopping])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


<tensorflow.python.keras.callbacks.History at 0x1a27a8d950>

In [15]:
classes = model.predict_classes(x_test)

eval_results = model.evaluate(x_test, y_test)





In [16]:
val_df = pd.DataFrame()
val_df['actual_label'] = encoded_labels[15000:]
val_df['pred_label'] = classes

val_df.loc[:,'actual_label'] = [hot_encode_dict[label_code] for label_code in list(val_df['actual_label'])]
val_df.loc[:,'pred_label'] = [hot_encode_dict[label_code] for label_code in list(val_df['pred_label'])]

val_df = val_df.groupby('actual_label').count().reset_index().rename(columns = {'pred_label':'label_count','actual_label':'label'})\
    .merge(val_df[val_df['actual_label'] == val_df['pred_label']].groupby('actual_label').count().reset_index()\
        .rename(columns = {'pred_label':'correct_label_count','actual_label':'label'}), left_on = ['label'], right_on = ['label'])

val_df.loc[:,'class_accuracy'] = round(val_df['correct_label_count']/val_df['label_count'],2)

val_df.sort_values(by = 'class_accuracy', ascending=True)

Unnamed: 0,label,label_count,correct_label_count,class_accuracy
5,surprise,172,90,0.52
3,love,393,272,0.69
0,anger,691,524,0.76
1,fear,628,482,0.77
2,joy,1639,1420,0.87
4,sadness,1477,1323,0.9


In [17]:
test_sentence = 'What an awesome day to deploy a real model with embeddings!'

In [18]:
one_hot_rep = tokenizer.texts_to_sequences([test_sentence])
padded_sent = pad_sequences(one_hot_rep, padding = 'pre', maxlen = sent_len)
response = np.argmax(model.predict(padded_sent), axis=-1)[0]

hot_encode_dict[response]

'joy'

In [23]:
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

Saved model to disk


In [27]:
from keras.models import model_from_json

# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")

Loaded model from disk


In [29]:
one_hot_rep = tokenizer.texts_to_sequences([test_sentence])
padded_sent = pad_sequences(one_hot_rep, padding = 'pre', maxlen = sent_len)
response = np.argmax(loaded_model.predict(padded_sent), axis=-1)[0]

hot_encode_dict[response]

'joy'