# Train Emotion Recognition Model

Here we'll train a emotion recognition model, using the output data from the sentiment analysis.

In [1]:
# Add project path to the PYTHONPATH

import nltk
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from datetime import datetime,timedelta

import os
import sys
import pandas as pd
import numpy as np
import pickle
import ast
import random

from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM , GlobalAveragePooling1D, GlobalMaxPooling1D, Bidirectional, Conv1D, Dense, concatenate,MaxPooling1D
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.preprocessing import LabelBinarizer
import keras

import seaborn as sns
import matplotlib.pyplot as plt
from cleaning import cf

import scikitplot as skplt


# nltk.download('punkt')

## Load Dataset

Load the emotion labeled dataset

In [2]:
data = pd.read_csv('emotion_tweets_cleaned.csv')

data['emoji'] = data.tweet.apply(lambda x: cf.extract_emojis(x))
# data['tweet'] = data.apply(lambda x: remove_hashtags(x), axis=1)
data['tweet'] = data.tweet.apply(lambda x: cf.remove_specific_links(x))
data['tweet'] = data.tweet.apply(lambda x: cf.remove_urls(x))
data["tweet"] = data["tweet"].apply(lambda x: cf.replace_contractions(x))
data['tweet'] = data.tweet.apply(lambda x: cf.remove_punctuation(x))

# data['tweet'] = data.apply(lambda x: cf.add_emojis(x), axis=1)

data = data.drop(data[data.emotion == 'anger'].sample(6000).index)

data = data[data.tweet.apply(lambda x: len(x)) > 60]

data['len'] = data['tweet'].apply(lambda s : len(s))


data.sample(20)

Unnamed: 0,id,created_at,date,tweet,hashtags,username,nlikes,nreplies,nretweets,search,emotion,lang,emoji,len
28148,1124371022185140224,1556905916000,2019-05-03 12:51:56,realestateagent latoyaforbesgroup latoyalyonel...,"['#blessed', '#realestateagent', '#latoyaforbe...",LatoyaUrRealtor,0,0,0,#trust,trust,en,[],157
23,1224106016524197889,1580684591000,2020-02-02 18:03:11,people are afraid to make a change because the...,"['#people', '#afraid', '#change', '#focus', '#...",MichelleAileene,0,0,0,#afraid,fear,en,[],189
14399,1011309671808811009,1529949989000,2018-06-25 13:06:29,black namaste white violent smile writers poet...,"['#writers', '#poet', '#writer', '#spokenword'...",stiffmidlefinga,0,0,0,#hate,anger,en,[],69
19427,962439792674865153,1518298502000,2018-02-10 16:35:02,at the moment i m trying around with different...,"['#drawing', '#scrapbooking', '#creepy', '#ske...",IICherubinaII,2,0,0,#fear,fear,en,[],215
461,1070466884003344386,1544054168000,2018-12-05 18:56:08,thank you riesling always there for me when bo...,"['#wine', '#thankful', '#sweaty', '#riesling',...",NKH3,1,0,0,#thankful,joy,en,"[🙏, 🙏, 🥶, 🔥]",157
6156,979384672873603077,1522338476000,2018-03-29 10:47:56,strikes again left this outside in the rain ch...,"['#angrytweet', '#hermes']",R_Leeee,1,0,0,#angrytweet,anger,en,[],208
16934,1127708795118018562,1557701703000,2019-05-12 17:55:03,join us for a wonderland tea party with alice ...,"['#thekentuckycastle', '#tkcevents', '#castle'...",thecastlepost,1,0,1,#happy,joy,en,[],179
15124,1021559301696049152,1532393691000,2018-07-23 19:54:51,the outcome of being without power for three d...,"['#pissed', '#hungrynow']",KeMo_Allen76,0,0,0,#pissed,anger,en,"[🤦, 🏽, ♀, 😠, 🤦, 🏽, ♀]",90
14493,1009877668198801408,1529608573000,2018-06-21 14:16:13,ihearya brave anxiety all shapes sizes gender ...,"['#ihearya', '#brave', '#anxiety', '#anxious']",mrsroo83,1,0,0,#anxious,fear,en,[],89
23201,1160303455212359680,1565472875000,2019-08-10 16:34:35,trust is choosing to make something important ...,"['#vulnerability', '#trust', '#inspiration']",LeeMichaelWalt1,2,0,0,#trust,trust,en,"[✨, 🕊]",142


### Text Preparation




In [3]:
# Place text into a list of lists, only keywords are included
# All the keywords on a n x m list | n = #tweets in dataset m= #wordsxtweet
sequences = [x for x in data.tweet]

# Tokenize, convert words to numbers
# df['tokenizer'] = df.clean_tweet.apply(lambda x: ' '.join(ast.literal_eval(x)))
tokenizer = Tokenizer(num_words=20000, lower=True)
tokenizer.fit_on_texts(data.tweet)
tokenized = tokenizer.texts_to_sequences(sequences)

#Padding : Make all inputs the same size
maxlen = np.max([len(i) for i in tokenized])
print('Max length of the cleaned strings:' , maxlen) 
X = pad_sequences(tokenized, maxlen=maxlen)

# Encode outcome variable
encoder = LabelBinarizer()
encoder.fit(data.emotion.unique())
y = encoder.transform(data.emotion)

Max length of the cleaned strings: 62


### Train, Validation and Test Datasets

In [4]:
train_keys = [x for x in range(0,len(X)-1)]
val_keys = list(np.random.choice(len(X), size=int(len(X)*.25), replace=False))
train_keys = list(set(train_keys) - set(val_keys))
test_keys = list(np.random.choice(len(X), size=int(len(X)*.1), replace=False))
train_keys = list(set(train_keys) - set(test_keys))

X_train,X_val,X_test = X[train_keys], X[val_keys],X[test_keys]
print(len(X_train),len(X_val),len(X_test))
y_train,y_val,y_test = y[train_keys], y[val_keys],y[test_keys]
print(len(y_train),len(y_val),len(y_test))


14089 5209 2083
14089 5209 2083


## Model

Define the **LSTM** + **CNN** model

In [5]:
input_dim = min(tokenizer.num_words, len(tokenizer.word_index) + 1)
num_classes = len(data.emotion.unique())
embedding_dim = 500
lstm_units = 16
lstm_dropout = 0.2
recurrent_dropout = 0.2
spatial_dropout=0.4
filters=8
kernel_size=3

In [6]:
input_layer = Input(shape=(maxlen,))
output_layer = Embedding( input_dim=input_dim, output_dim=embedding_dim, input_shape=(maxlen,))(input_layer)
output_layer = SpatialDropout1D(spatial_dropout)(output_layer)


output_layer = Bidirectional(LSTM(lstm_units, return_sequences=True, dropout=lstm_dropout, recurrent_dropout=recurrent_dropout))(output_layer)
output_layer = Conv1D(filters, kernel_size=kernel_size, padding='valid',  kernel_initializer='glorot_uniform')(output_layer)

avg_pool = GlobalAveragePooling1D()(output_layer)
max_pool = GlobalMaxPooling1D()(output_layer)
output_layer = concatenate([avg_pool, max_pool])

output_layer = Dense(num_classes, activation='sigmoid')(output_layer)

model = Model(input_layer, output_layer)


In [140]:

optimizer = keras.optimizers.Adam(lr=0.0005)
model.compile(loss='categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            [(None, 63)]         0                                            
__________________________________________________________________________________________________
embedding_4 (Embedding)         (None, 63, 500)      10000000    input_5[0][0]                    
__________________________________________________________________________________________________
spatial_dropout1d_4 (SpatialDro (None, 63, 500)      0           embedding_4[0][0]                
__________________________________________________________________________________________________
bidirectional_4 (Bidirectional) (None, 63, 32)       66176       spatial_dropout1d_4[0][0]        
____________________________________________________________________________________________

## Train model

Do the training process with the given data

In [None]:
model.fit(
    x = X_train,
    y = y_train,
    batch_size=128,
    epochs=100,
    validation_data=(X_val, y_val)
)