<a href="https://colab.research.google.com/github/ssttefann/EmotionClassification/blob/master/notebooks/deep_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Emotion Classification in Text - Deep Learning Approach**



In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

# Tensorflow
try:
  # Install the TensorFlow nightly build; the "!" shell escape only works under IPython/Colab.
  !pip install -q tf-nightly
except Exception:
  pass
import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

import tensorflow_datasets as tfds
tfds.disable_progress_bar()

# General
import numpy as np
import pandas as pd
import re

# Visualization
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot') 
# plt.style.use('dark_background')

try:
  import pyprind
except Exception:
  !pip install pyprind
  import pyprind

# Data preprocessing
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

from google.colab import drive
drive.mount('/content/drive')

[K     |████████████████████████████████| 532.2MB 29kB/s 
[K     |████████████████████████████████| 2.9MB 36.8MB/s 
[K     |████████████████████████████████| 2.8MB 56.6MB/s 
[K     |████████████████████████████████| 460kB 59.6MB/s 
[K     |████████████████████████████████| 778kB 51.8MB/s 
[31mERROR: tensorflow 1.15.0 has requirement gast==0.2.2, but you'll have gast 0.3.3 which is incompatible.[0m
[?25hCollecting pyprind
  Downloading https://files.pythonhosted.org/packages/1e/30/e76fb0c45da8aef49ea8d2a90d4e7a6877b45894c25f12fb961f009a891e/PyPrind-2.11.2-py3-none-any.whl
Installing collected packages: pyprind
Successfully installed pyprind-2.11.2
Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive

In [0]:
import json
import tensorflow as tf

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

## Loading the data

In [3]:
# Seed NumPy's global RNG first so that the row shuffle below is repeatable.
np.random.seed(123)

# Read the CSV, discard its first column (an id that is not used), then
# shuffle the rows by reindexing with a random permutation of the index.
dataset = pd.read_csv("/content/drive/My Drive/emotion.data")
dataset = dataset.drop(columns=dataset.columns[0])
dataset = dataset.reindex(np.random.permutation(dataset.index))
dataset.head()

Unnamed: 0,text,emotions
98301,i am feeling melancholy and have finally pinpo...,sadness
41147,i miss having someone to talk to who i have th...,joy
151024,i just feel like i get blamed for everything,sadness
202104,i also apologize for mentioning about him in m...,anger
73602,im finding is the difference in having a life ...,joy


## Preprocessing

In [0]:
# A LookupError here is treated as "the NLTK stopword corpus is not installed yet":
# download it once and read the list again.
try:
  english_stopwords = stopwords.words('english')
except LookupError:
  import nltk
  nltk.download('stopwords')
  english_stopwords = stopwords.words('english')

# Some HTML leftovers appear in the given dataset; filter them like stop words.
stop = english_stopwords + ['img', 'src', 'href']
print(stop[:10])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're"]


### Tokenizing

In [0]:

def tokenizer(text, stop_words=None):
    ''' Removes links, non-word characters and stop words,
        and returns list of all words that are left

        Parameters
        -----------
        text : string 
        stop_words : collection {string}, optional
            Words to discard. When omitted, the module-level ``stop``
            list is used.

        Returns
        ----------
        tokenized : list {string}
            Lower-cased words in their original order.
    '''
    # NOTE: a later cell rebinds the name `tokenizer` to a Keras Tokenizer
    # instance; this function is only reachable by that name before then.
    if stop_words is None:
        stop_words = stop
    stop_words = frozenset(stop_words)          # constant-time membership tests

    text = text.lower()
    # Drop only the token that starts with "http", not the rest of the line.
    # Side effect: any word that merely begins with "http" is dropped too.
    text = re.sub(r'http\S*', '', text)
    # \W matches everything except letters, digits and underscore, so hyphens
    # and other punctuation all become single spaces here.
    text = re.sub(r'\W+', ' ', text)
    return [w for w in text.split() if w not in stop_words]


### Stemming

In [0]:
porter = PorterStemmer()

def tokenizer_porter(text):
  ''' Tokenizes text with tokenizer() and stems every resulting word

      Parameters
      -----------
      text : string

      Returns
      ----------
      stems : list {string}
  '''
  stems = []
  for token in tokenizer(text):
    stems.append(porter.stem(token))
  return stems

## Creating Train and Test Data

In [4]:
# Fixed emotion-name -> integer-id mapping; the ids are assigned by hand
# rather than derived from the dataset.
label2id = {
    "joy": 0,
    "sadness": 1,
    "anger": 2,
    "fear": 3,
    "love": 4,
    "surprise": 5,
}
# Reverse lookup (id -> emotion name), derived so the two maps cannot drift apart.
id2label = {idx: label for label, idx in label2id.items()}

# Colour associated with each emotion label.
label2col = {
    "joy": "yellow",
    "sadness": "blue",
    "anger": "red",
    "fear": "grey",
    "love": "pink",
    "surprise": "orange",
}

print(label2id)
print(id2label)

{'joy': 0, 'sadness': 1, 'anger': 2, 'fear': 3, 'love': 4, 'surprise': 5}
{0: 'joy', 1: 'sadness', 2: 'anger', 3: 'fear', 4: 'love', 5: 'surprise'}


In [0]:
from sklearn.model_selection import train_test_split

# Hold out 10% of the rows for evaluation; the fixed random_state keeps the
# split the same on every run.
split = train_test_split(
    dataset['text'].values,
    dataset['emotions'].values,
    test_size=0.1,
    random_state=123,
)
X_train, X_test, y_train, y_test = split

In [0]:
# Replace every emotion name with its integer id from label2id.
# The names are overwritten in place, so re-running this cell on the already
# encoded labels fails with a KeyError.
y_train = list(map(label2id.__getitem__, y_train))
y_test = list(map(label2id.__getitem__, y_test))

## Training the model

In [0]:
# Text-encoding and model hyperparameters shared by the cells below.
vocab_size = 10_000        # num_words of the Keras Tokenizer and input size of the Embedding layers
embedding_dim = 16         # width of each embedding vector
max_length = 100           # every sequence is padded / truncated to this many tokens
padding_type = 'post'      # pad at the end of a sequence
trunc_type = 'post'        # truncate at the end of a sequence
oov_tok = "<OOV>"          # placeholder token for out-of-vocabulary words
training_size = 20_000     # not referenced by any visible cell

In [0]:
# NOTE: this rebinds the name `tokenizer`, shadowing the tokenizer() function
# defined in the Preprocessing section.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(X_train)     # the vocabulary is fitted on the training texts only
word_index = tokenizer.word_index

# Train and test sequences are padded / truncated with identical settings.
pad_options = dict(maxlen=max_length, padding=padding_type, truncating=trunc_type)

training_sequences = tokenizer.texts_to_sequences(X_train)
testing_sequences = tokenizer.texts_to_sequences(X_test)

training_padded = pad_sequences(training_sequences, **pad_options)
testing_padded = pad_sequences(testing_sequences, **pad_options)

In [9]:
# One-hot encode the integer labels, one column per entry of label2id.
num_classes = len(label2id)
y_train = keras.utils.to_categorical(y_train, num_classes=num_classes, dtype='float32')
y_test = keras.utils.to_categorical(y_test, num_classes=num_classes, dtype='float32')

y_test.shape

(41681, 6)

In [0]:
embedding_dim = 16
max_words = max_length      # alias of max_length; not referenced by any visible cell

# First model: embedding -> one bidirectional LSTM -> dense ReLU layer ->
# 6-way softmax (one output per emotion).
layer_stack = [
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(6, activation='softmax'),
]
model = tf.keras.Sequential(layer_stack)

# Categorical cross-entropy matches the one-hot encoded labels.
model.compile(
    optimizer=tf.keras.optimizers.Adam(1e-4),
    loss="categorical_crossentropy",
    metrics=['accuracy'],
)

In [0]:
def create_model():
  ''' Builds and compiles a fresh single-BiLSTM classifier

      Architecture: Embedding(vocab_size, embedding_dim) ->
      Bidirectional(LSTM(64)) -> Dense(64, relu) -> Dense(6, softmax).
      Compiled with categorical cross-entropy, Adam(1e-4) and the
      accuracy metric.

      Returns
      ----------
      model : tf.keras.Sequential (compiled, untrained)
  '''
  layer_stack = [
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dense(6, activation='softmax'),
  ]
  network = tf.keras.Sequential(layer_stack)
  network.compile(
      optimizer=tf.keras.optimizers.Adam(1e-4),
      loss="categorical_crossentropy",
      metrics=['accuracy'],
  )
  return network

In [0]:
# Print the layer-by-layer summary (output shapes and parameter counts) of the first model.
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 16)           160000    
_________________________________________________________________
bidirectional_3 (Bidirection (None, 128)               41472     
_________________________________________________________________
dense_6 (Dense)              (None, 64)                8256      
_________________________________________________________________
dense_7 (Dense)              (None, 6)                 390       
Total params: 210,118
Trainable params: 210,118
Non-trainable params: 0
_________________________________________________________________


In [0]:
import os

# Location on the mounted Google Drive where the first model's weights are written.
checkpoint_path = "/content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt"
checkpoint_dir = os.path.dirname(checkpoint_path)

# Callback that stores the weights only (not the full model) at checkpoint_path
# and reports each save (verbose=1).
cp_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path,
    save_weights_only=True,
    verbose=1,
)

In [0]:


# Train the first model for 10 epochs. The held-out test split doubles as
# validation data, and the checkpoint callback saves the weights during training.
history = model.fit(
    x=training_padded,
    y=y_train,
    epochs=10,
    validation_data=(testing_padded, y_test),
    callbacks=[cp_callback],
)


Epoch 1/10
Epoch 00001: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 2/10
Epoch 00002: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 3/10
Epoch 00003: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 4/10
Epoch 00004: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 5/10
Epoch 00005: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 6/10
Epoch 00006: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 7/10
Epoch 00007: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim.ckpt
Epoch 8/10

In [13]:
# Rebuild the architecture from scratch and load the checkpointed weights into it,
# so the evaluation below does not depend on the in-memory training run.
model = create_model()
model.load_weights(checkpoint_path)

evaluation = model.evaluate(testing_padded, y_test, verbose=2)
loss, acc = evaluation
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

1303/1303 - 21s - loss: 0.1211 - accuracy: 0.9260
Restored model, accuracy: 92.60%


In [26]:
# Classify ad-hoc sentences with the restored first model.
sentence = ["I am not happy at all"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# predict() yields one row of class probabilities per input sentence, so the
# argmax is taken per row (axis=-1). An argmax over the flattened array is
# only correct while `sentence` holds exactly one entry.
probabilities = model.predict(padded)
for label_id in np.argmax(probabilities, axis=-1):
  print(id2label[label_id])

joy


## Second model


In [0]:
def create_model_2lstm():
  ''' Builds and compiles a fresh classifier with two stacked BiLSTM layers

      Architecture: Embedding(vocab_size, embedding_dim) ->
      Bidirectional(LSTM(64, return_sequences=True)) ->
      Bidirectional(LSTM(32)) -> Dense(64, relu) -> Dropout(0.5) ->
      Dense(6, softmax).
      Compiled with categorical cross-entropy, the "adam" optimizer
      (library defaults) and the accuracy metric.

      Returns
      ----------
      model : tf.keras.Sequential (compiled, untrained)
  '''
  layer_stack = [
      tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
      # return_sequences=True hands the full sequence to the second LSTM.
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64,  return_sequences=True)),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(32)),
      tf.keras.layers.Dense(64, activation='relu'),
      tf.keras.layers.Dropout(0.5),
      tf.keras.layers.Dense(6, activation='softmax'),
  ]
  network = tf.keras.Sequential(layer_stack)
  network.compile(
      optimizer="adam",
      loss="categorical_crossentropy",
      metrics=['accuracy'],
  )
  return network

In [36]:
# Build the model in this cell so the summary also works on a fresh kernel:
# without this, `model2` is only defined by a cell further down.
model2 = create_model_2lstm()
model2.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 100, 16)           160000    
_________________________________________________________________
bidirectional_1 (Bidirection (None, 100, 128)          41472     
_________________________________________________________________
bidirectional_2 (Bidirection (None, 64)                41216     
_________________________________________________________________
dense_2 (Dense)              (None, 64)                4160      
_________________________________________________________________
dropout (Dropout)            (None, 64)                0         
_________________________________________________________________
dense_3 (Dense)              (None, 6)                 390       
Total params: 247,238
Trainable params: 247,238
Non-trainable params: 0
________________________________________________

In [0]:
# Location on the mounted Google Drive where the second model's weights are written.
checkpoint_path_2lstm = "/content/drive/My Drive/Colab Notebooks/training_1/cp16dim2lstm.ckpt"
checkpoint_dir_2lstm = os.path.dirname(checkpoint_path_2lstm)

# Callback that stores the weights only (not the full model) and reports each save.
cp_callback_2lstm = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_path_2lstm,
    save_weights_only=True,
    verbose=1,
)

# Fresh, untrained two-layer BiLSTM model.
model2 = create_model_2lstm()

In [42]:
# Train the second model for 2 epochs. The held-out test split doubles as
# validation data, and the checkpoint callback saves the weights during training.
history_2lstm = model2.fit(
    x=training_padded,
    y=y_train,
    epochs=2,
    validation_data=(testing_padded, y_test),
    callbacks=[cp_callback_2lstm],
)

Epoch 1/2
Epoch 00001: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim2lstm.ckpt
Epoch 2/2
Epoch 00002: saving model to /content/drive/My Drive/Colab Notebooks/training_1/cp16dim2lstm.ckpt


In [45]:
# Rebuild the two-layer architecture and load its checkpointed weights,
# then score it on the held-out split.
model2 = create_model_2lstm()
model2.load_weights(checkpoint_path_2lstm)

evaluation = model2.evaluate(testing_padded, y_test, verbose=2)
loss, acc = evaluation
print("Restored model, accuracy: {:5.2f}%".format(100*acc))

1303/1303 - 44s - loss: 0.0938 - accuracy: 0.9384
Restored model, accuracy: 93.84%


In [47]:
# Classify ad-hoc sentences with the restored second model.
sentence = ["I am not happy at all"]
sequences = tokenizer.texts_to_sequences(sentence)
padded = pad_sequences(sequences, maxlen=max_length, padding=padding_type, truncating=trunc_type)

# Predict once and reuse the result. The argmax is taken per row (axis=-1)
# so the cell stays correct when `sentence` holds more than one entry.
probabilities = model2.predict(padded)
for label_id in np.argmax(probabilities, axis=-1):
  print(id2label[label_id])

# Show the raw class probabilities as the cell output.
probabilities

joy


array([[8.5013151e-01, 8.3982740e-03, 1.0897810e-01, 3.1975593e-02,
        4.5883219e-04, 5.7785037e-05]], dtype=float32)

In [0]:
import matplotlib.pyplot as plt


def plot_graphs(history, string):
  ''' Plots a training metric together with its validation counterpart

      Parameters
      -----------
      history : object whose ``history`` attribute maps metric names to
                sequences of values (presumably one value per epoch)
      string : string
          Metric name; 'val_' + string must also be a key of history.history

      Returns
      ----------
      None, the figure is displayed with plt.show()
  '''
  validation_key = 'val_' + string
  for key in (string, validation_key):
    plt.plot(history.history[key])
  plt.xlabel("Epochs")
  plt.ylabel(string)
  plt.legend([string, validation_key])
  plt.show()
  
# Learning curves of the two-layer model: accuracy first, then loss.
for metric_name in ("accuracy", "loss"):
  plot_graphs(history_2lstm, metric_name)