# Predict Emotion

The main objective of this notebook is to predict emotions from tweets with a previously trained emotion recognition model.

In [1]:
# Add the parent of the notebook's working directory to sys.path so that
# modules located there can be imported.

import os
import sys
from pathlib import Path

project_root = Path(os.getcwd(), os.pardir).resolve().as_posix()
# Guard the append so that re-running this cell does not keep adding
# duplicate entries to sys.path.
if project_root not in sys.path:
    sys.path.append(project_root)

In [2]:
import pickle

## Load Tokenizer

Load the `.pickle` file that contains the tokenizer.

In [3]:
# NOTE: unpickling can execute arbitrary code, so only load a tokenizer file
# that comes from a trusted source.
tokenizer_path = Path('../datasets/sentiment_analysis/tokenizer.pickle').resolve()
tokenizer = pickle.loads(tokenizer_path.read_bytes())

## Load Model

Rebuild the model architecture and load the trained emotion recognition weights into it.

In [4]:
from tensorflow.keras.layers import Input, Embedding, SpatialDropout1D, LSTM
from tensorflow.keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from tensorflow.keras.layers import Bidirectional, Conv1D, Dense, concatenate
from tensorflow.keras.models import Model

In [5]:
# Hyperparameters used below to rebuild the network before the saved weights
# are loaded. NOTE(review): they presumably have to match the values used
# when the weights were trained — confirm against the training notebook.

# Size of the Embedding layer's vocabulary. `tokenizer.num_words` is None when
# the tokenizer was created without a vocabulary cap, and min() cannot compare
# None with an int, so fall back to the full vocabulary size in that case.
vocabulary_size = len(tokenizer.word_index) + 1
if tokenizer.num_words is None:
    input_dim = vocabulary_size
else:
    input_dim = min(tokenizer.num_words, vocabulary_size)

num_classes = 4            # units of the final softmax layer
embedding_dim = 500        # output dimension of the Embedding layer
input_length = 100         # number of token ids per input sequence
lstm_units = 128           # units of the LSTM wrapped by Bidirectional
lstm_dropout = 0.1         # `dropout` argument of the LSTM
recurrent_dropout = 0.1    # `recurrent_dropout` argument of the LSTM
spatial_dropout = 0.2      # rate of the SpatialDropout1D layer
filters = 64               # number of Conv1D filters
kernel_size = 3            # Conv1D kernel size

In [6]:
# Rebuild the network with the functional API:
# Embedding -> SpatialDropout1D -> Bidirectional LSTM -> Conv1D
# -> concatenated global average / max pooling -> softmax Dense.
input_layer = Input(shape=(input_length,))

# Map token ids to dense vectors.
embedded = Embedding(
    input_dim=input_dim,
    output_dim=embedding_dim,
    input_shape=(input_length,),
)(input_layer)
embedded = SpatialDropout1D(spatial_dropout)(embedded)

# The LSTM returns the full sequence so that the convolution can run over it.
lstm_cell = LSTM(
    lstm_units,
    return_sequences=True,
    dropout=lstm_dropout,
    recurrent_dropout=recurrent_dropout,
)
sequence_features = Bidirectional(lstm_cell)(embedded)

conv_features = Conv1D(
    filters,
    kernel_size=kernel_size,
    padding='valid',
    kernel_initializer='glorot_uniform',
)(sequence_features)

# Collapse the sequence axis in two ways and join both summaries.
avg_pool = GlobalAveragePooling1D()(conv_features)
max_pool = GlobalMaxPooling1D()(conv_features)
pooled_features = concatenate([avg_pool, max_pool])

output_layer = Dense(num_classes, activation='softmax')(pooled_features)

model = Model(input_layer, output_layer)

In [7]:
# Load the trained weights from the `.h5` file into the `model` built in the
# previous cell.
model_weights_path = Path('../models/emotion_recognition/model_weights.h5').resolve()
model.load_weights(model_weights_path.as_posix())

## Load data

Load the data whose labels will be predicted by the model.

**data_path**: Path to the `.csv` file that will be used

In [8]:
import pandas as pd

In [10]:
# Read the CSV file to run predictions on; its `text` column is what gets
# preprocessed and fed to the model further down.
data_path = Path('../datasets/sentiment_analysis/dataset.csv').resolve()
# Alternative input file (presumably a file of tweets to predict on — TODO confirm):
# ../datasets/predict/1151893341782585349-1151863653320159233_kyoto_animation.csv
data = pd.read_csv(data_path)
# Preview the first rows.
data.head()

Unnamed: 0,label,id,date,user,text
0,anger,1337090985638187009,2020-12-10 17:45:07,maahiwins,New day #angry maahi https://t.co/fUIR5zhu2R
1,anger,1337084058350784516,2020-12-10 17:17:35,SandPap39180050,@RoyalMail @RoyalMailHelp where is my parcel t...
2,anger,1337055644294242306,2020-12-10 15:24:41,shady2405,It's been a really hard few weeks with loosing...
3,anger,1337042537912135680,2020-12-10 14:32:36,AnnaLogue75,I'm fighting to get just the minimal dose to g...
4,anger,1337017566959644673,2020-12-10 12:53:23,Ninnersgirl,@GetSpectrum A Spectrum tech comes to my house...


## Load Encoder

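Load the `.pickle` file that contains the encoder; its `classes_` attribute is used below to turn predicted class indices into emotion names.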

In [11]:
# NOTE: unpickling can execute arbitrary code, so only load an encoder file
# that comes from a trusted source.
encoder_path = Path('../models/emotion_recognition/encoder.pickle').resolve()
encoder = pickle.loads(encoder_path.read_bytes())

## Preprocess data

Preprocess the tweet text, convert it to sequences of token ids, and pad the sequences to the model's input length.

In [12]:
from nlp import preprocess
from tensorflow.keras.preprocessing.sequence import pad_sequences

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\leena\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Run the project's `preprocess` helper on the tweet text, split each
# resulting text into tokens, and map the tokens to integer ids with the
# tokenizer.
cleaned_data = preprocess(data.text)
sequences = [text.split() for text in cleaned_data]
list_tokenized = tokenizer.texts_to_sequences(sequences)
# Pad/truncate to `input_length`, the length the model's Input layer was built
# with, instead of repeating the literal 100 here; the two values can no
# longer drift apart if the configuration cell is edited.
x_data = pad_sequences(list_tokenized, maxlen=input_length)

Time to clean up: 8.24 sec


## Results

Predict the labels and summarize how the predictions are distributed across the emotion classes.

In [14]:
import numpy as np

In [15]:
# Run the model on the padded sequences. The following cells treat each row of
# y_pred as one score per emotion class (the output of the softmax layer).
y_pred = model.predict(x_data)

In [16]:
# Average predicted probability of each class over all samples.
mean_probabilities = np.sum(y_pred, axis=0) / len(y_pred)
for index in range(len(mean_probabilities)):
    print(f"{encoder.classes_[index]}: {mean_probabilities[index]!s}")

anger: 0.015743947
fear: 0.08909034
joy: 0.43358883
sadness: 0.46157965


In [17]:
# Share of samples assigned to each class when every sample is given its most
# probable class.
y_pred_argmax = y_pred.argmax(axis=1)
data_len = len(y_pred_argmax)
# np.unique returns only the class ids that were actually predicted, so the
# name has to be looked up by the class id itself. Looking it up by the
# enumeration position attaches the wrong names whenever some class is never
# predicted.
class_ids, class_counts = np.unique(y_pred_argmax, return_counts=True)
for class_id, count in zip(class_ids, class_counts):
    print(encoder.classes_[class_id] + ": " + str(int(count) / data_len))

anger: 0.00017329520838748808
fear: 0.10865609565895502
joy: 0.43072524044710164
sadness: 0.46044536868555586


In [22]:
# Spot check: predicted class index for the rows at positions 5 to 9.
y_pred[5:10].argmax(axis=1)

array([1, 1, 1, 1, 3], dtype=int64)

In [23]:
# Show the original tweet text of the row at position 9, the last row of the
# slice inspected in the previous cell.
data.text.iloc[9]

'Listen to "How Are You Coping Through the Pandemic? Several Strategies That can Help you!" by Grandpa Jim. ⚓ https://t.co/BjiZKfMQ7V #richmond #harrisburg #raleigh #mke #annapolis #dover #norfolk #mad #sad #angry #happy #fun #lol #lmao #jacksonville #jax #sd #nd'