## Import libraries

In [108]:
import re
import nltk
from nltk.corpus import stopwords
import pandas as pd
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.models import KeyedVectors
from textblob import TextBlob
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.models import load_model
from keras.utils import np_utils
from keras.preprocessing import sequence
from keras.layers.embeddings import Embedding
from keras.utils import plot_model
import string
import warnings
warnings.filterwarnings('ignore')

## Path to folders/files

In [3]:
PATH_TO_FOLDER = '../AI_data/'
PATH_TO_Glove_vectors = '../glove.twitter.27B/glove.twitter.27B.200d.txt'
PATH_TO_Word2Vec_vectors = '../glove.twitter.27B/glove.twitter.27B.200d.word2vec'

In [174]:
# np.unique(data_frame['emotion'].values)
emotion_labels = {'anger':0, 'happiness':1, 'love':2, 'neutral':3, 'sadness':4, 'worry': 5}
emotion_labels_reverse = {0:'anger', 1:'happiness', 2:'love', 3:'neutral', 4:'sadness', 5:'worry'}

In [8]:
convert_glove_to_word2vec_vectors(PATH_TO_Glove_vectors, PATH_TO_Word2Vec_vectors)

In [32]:
model = read_word2vec_vectors(PATH_TO_Word2Vec_vectors)

## Read data

In [4]:
def read_data(filename, delimiter):
    data = pd.read_csv(filename, delimiter='\t')
    data = data.drop('Unnamed: 0', axis=1)
    return data

In [5]:
def convert_glove_to_word2vec_vectors(filepath, output_filepath):
    glove2word2vec(filepath, output_filepath)

In [6]:
def read_word2vec_vectors(filepath):
    model = KeyedVectors.load_word2vec_format(filepath, binary=False)
    return model

In [7]:
def preprocess(text):
    text = re.sub(r"http\S+", "", text)
    tokens = nltk.word_tokenize(text)
    tokens = [word.lower() for word in tokens]
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    tokens = [word for word in stripped if word.isalpha()]
    stop_words = stopwords.words('english')
    token_list = []
    for word in tokens:
        if not word in stop_words:
            token_list.append(word)
    #token_list = [TextBlob(word).correct() for word in token_list]
    return token_list

In [8]:
def preprocess_df(data_frame):
    train_data = []
    for review in data_frame['review'].values:
        review = preprocess(review)
        train_data.append(review)
    return train_data

In [27]:
def convert_labels_into_categorical(data_frame):
    categorical_list = []
    for emotion in data_frame['emotion'].values:
        categorical_list.append(emotion_labels[emotion])
    categorical_list = np_utils.to_categorical(categorical_list)
    return categorical_list

In [273]:
def unique_words(data):
    unique_words = []
    for l in data:
        unique_words += l
    unique_words = list(set(unique_words))
    return unique_words

In [274]:
def convert_words_into_numbers(data, unique_words):
    data_in_numbers = []
    for l in data:
        data_in_numbers.append([unique_words.index(word) for word in l])
    return data_in_numbers

## Main function

In [275]:
filename_train_data = PATH_TO_FOLDER + 'emotion_trainingdataset.csv'
train_data = read_data(filename_train_data, '\t')

filename_test_data = PATH_TO_FOLDER + 'emotion_testdataset.csv'
test_data = read_data(filename_test_data, '\t')

In [276]:
X_train = preprocess_df(train_data)
X_test = preprocess_df(test_data)
unique_words = unique_words(X_train + X_test)
X_train_set = convert_words_into_numbers(X_train, unique_words)
X_test = convert_words_into_numbers(X_test, unique_words)

In [277]:
y_train_set = convert_labels_into_categorical(train_data)

In [272]:
'''
import random
num = list(range(len(X_train_set)))
print(num)
random.shuffle(num)
print(num)
X_train_set = [X_train_set[n] for n in num]
y_train_set = [y_train_set[n] for n in num]
'''

'\nimport random\nnum = list(range(len(X_train_set)))\nprint(num)\nrandom.shuffle(num)\nprint(num)\nX_train_set = [X_train_set[n] for n in num]\ny_train_set = [y_train_set[n] for n in num]\n'

In [278]:
length = len(X_train_set)
point = int(length * 0.75)
X_train = X_train_set[:point]
y_train = y_train_set[:point]
X_validation = X_train_set[point:]
y_validation = y_train_set[point:]

In [279]:
number_of_words = len(unique_words)
max_length_of_input = 200

X_train = sequence.pad_sequences(X_train, max_length_of_input)
X_test = sequence.pad_sequences(X_test, max_length_of_input)
X_validation = sequence.pad_sequences(X_validation, max_length_of_input)

embedding_vector_length = 32
model = Sequential()
model.add(Embedding(number_of_words, embedding_vector_length, input_length = max_length_of_input))
model.add(Dropout(0.2))
#model.add(LSTM(20, return_sequences=True))
#model.add(Dropout(0.2))
model.add(LSTM(20))
model.add(Dense(6, activation = 'sigmoid'))
model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['accuracy'])
print(model.summary())
#plot_model(model, to_file='model_LSTM.png', show_shapes=True, show_layer_names=True)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_28 (Embedding)     (None, 200, 32)           235776    
_________________________________________________________________
dropout_25 (Dropout)         (None, 200, 32)           0         
_________________________________________________________________
lstm_41 (LSTM)               (None, 20)                4240      
_________________________________________________________________
dense_23 (Dense)             (None, 6)                 126       
Total params: 240,142
Trainable params: 240,142
Non-trainable params: 0
_________________________________________________________________
None


In [280]:
print(len(X_train[0]), len(X_validation[0]))

200 200


In [None]:
print(np.array(X_validation).shape)
print(np.array(y_validation).shape)

model.fit(X_train, y_train, epochs=100, batch_size=32, validation_data=(X_validation, y_validation))

In [None]:
score_train, accuracy_train = model.evaluate(X_train, y_train)
print(score_train, accuracy_train)

In [203]:
model.save('../AI_models/5_model_LSTM' + '_' + str(30) + '_' + str(accuracy_train) + '.h5')

In [182]:
def predict_class(test_sentence, model):
    test_sentence = preprocess(test_sentence)
    test_sentence = [unique_words.index(word) for word in test_sentence]
    print(test_sentence)
    test_sentence = sequence.pad_sequences([test_sentence], max_length_of_input)
    print(model.predict(test_sentence))
    predicted_class = model.predict_classes(test_sentence)[0]
    emotion = emotion_labels_reverse[predicted_class]
    print(emotion)

In [183]:
test_sentence = "WOW i just drank a drink of water - 12 ice cubes that took ages to melt. i now have brian freeze"
predict_class(test_sentence, model)

[2567, 4252, 2516, 1520, 2308, 5558, 4966, 5786, 1416, 5406, 6791]
[[2.5340414e-04 4.2796298e-04 4.4181332e-04 6.4654440e-01 2.0610722e-04
  6.8068277e-04]]
neutral
