## Emotion Classifier

A neural network-based classifier that identifies emotion in text limited to six basic emotions: anger, fear, joy, love, sadness, and surprise. 

Dataset: 
https://github.com/dair-ai/emotion_dataset 

Data has been largely preprocessed already, using technique from this paper: https://www.aclweb.org/anthology/D18-1404/

Data dictionary:

- text: string 
- emotions: class label

In [243]:
import warnings
warnings.filterwarnings('ignore')
import pandas as pd
import numpy as np 

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix

import tensorflow as tf
from tensorflow.keras import utils, losses, optimizers, Input
from tensorflow.keras.layers import Dense, Dropout, GlobalMaxPooling1D, Conv1D, Embedding, Flatten, TextVectorization
from tensorflow.keras.models import Sequential

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle 

### Load and read the data

In [244]:
!wget https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt
!wget https://www.dropbox.com/s/1pzkadrvffbqw6o/train.txt
!wget https://www.dropbox.com/s/2mzialpsgf9k5l3/val.txt

--2021-12-31 02:52:26--  https://www.dropbox.com/s/ikkqxfdbdec3fuj/test.txt
Resolving www.dropbox.com (www.dropbox.com)... 162.125.5.18, 2620:100:601d:18::a27d:512
Connecting to www.dropbox.com (www.dropbox.com)|162.125.5.18|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: /s/raw/ikkqxfdbdec3fuj/test.txt [following]
--2021-12-31 02:52:26--  https://www.dropbox.com/s/raw/ikkqxfdbdec3fuj/test.txt
Reusing existing connection to www.dropbox.com:443.
HTTP request sent, awaiting response... 302 Found
Location: https://ucd302ed20dc84283c69398acddd.dl.dropboxusercontent.com/cd/0/inline/Bc2hi6qvKzGwjklORNwtJUvo0hREiQ33a4_mKIAvER0K9JrnbVKor2rCZmwrSqFTNZQCRHoc8s0HFakybySZQ3pE0MspVkD-AWcvve1IwtDHA9ncLJ_8DqQLqGyNS96ZWRLmjX5Rq7HWIo3qfUnIZClM/file# [following]
--2021-12-31 02:52:26--  https://ucd302ed20dc84283c69398acddd.dl.dropboxusercontent.com/cd/0/inline/Bc2hi6qvKzGwjklORNwtJUvo0hREiQ33a4_mKIAvER0K9JrnbVKor2rCZmwrSqFTNZQCRHoc8s0HFakybySZQ3pE0MspVkD-AWcvve

In [245]:
!mkdir emotion_data
!mv *.txt emotion_data

mkdir: cannot create directory ‘emotion_data’: File exists


In [246]:
train_path = "emotion_data/train.txt"
test_path = "emotion_data/test.txt"
val_path = "emotion_data/val.txt"

In [247]:
data = pd.read_csv(train_path, sep=";", header=None, names=['text', 'emotion'],
                               engine="python")
data.emotion.unique()

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [248]:
data.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [249]:
data.count()

text       16000
emotion    16000
dtype: int64

In [250]:
from sklearn import preprocessing
 
# label_encoder object knows how to understand word labels.
label_encoder = preprocessing.LabelEncoder()
 
# Encode labels in column 'emotion'.
data['emotion']= label_encoder.fit_transform(data['emotion'])
 
data['emotion'].unique()

array([4, 0, 3, 5, 1, 2])

In [251]:
EMOTIONS = {
    0: 'anger',
    1: 'fear',
    2: 'joy',
    3: 'love',
    4: 'sadness',
    5: 'surprise' 
}

In [252]:
text = data.text
labels = data.emotion
data.head()

Unnamed: 0,text,emotion
0,i didnt feel humiliated,4
1,i can go from feeling so hopeless to so damned...,4
2,im grabbing a minute to post i feel greedy wrong,0
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,0


### Data Split

In [253]:
SEED = 100

In [254]:
X = data.drop('emotion', axis=1)
labels = data['emotion']

# create training and validation sets with 80-20 split
X_train, X_validation, y_train, y_validation = train_test_split(X, labels, test_size=0.2, random_state = SEED)

# split the validation sets to get a holdout dataset (for testing) 50-50 split
X_validation, X_test, y_validation, y_test = train_test_split(X_validation, y_validation, test_size=0.5, random_state = SEED)

print(X_train.shape)
print(X_validation.shape)
print(y_train.shape)
print(y_validation.shape)
print(X_test.shape)
print(y_test.shape)

(12800, 1)
(1600, 1)
(12800,)
(1600,)
(1600, 1)
(1600,)


In [255]:
"""
If you want to apply tf.data transformations to a DataFrame of a uniform dtype, the Dataset.from_tensor_slices method will create a dataset 
that iterates over the rows of the DataFrame. 
Each row is initially a vector of values. 
To train a model, you need (inputs, labels) pairs.
"""
batch_size = 32

# train dataset
train_numeric_ds = tf.data.Dataset.from_tensor_slices((X_train, y_train))

# in tensorflow it is expected that you pass batches 
# tf.keras models are optimized to make predictions on a batch, or collection, of examples at once. 
train_numeric_ds = train_numeric_ds.batch(batch_size)

# val dataset
val_numeric_ds = tf.data.Dataset.from_tensor_slices((X_validation, y_validation))
val_numeric_ds = val_numeric_ds.batch(batch_size)

# test dataset 
test_numeric_ds = tf.data.Dataset.from_tensor_slices((X_test, y_test))
test_numeric_ds = test_numeric_ds.batch(batch_size)

for row in train_numeric_ds.take(1):
  r = row[1] # batch of labels
  first_label = r[0] # first label in batch 
  print(first_label)

  num_label = first_label.numpy()
  print(EMOTIONS[num_label]) # this is how you get the class output. you'll have to map it to the emotion

tf.Tensor(0, shape=(), dtype=int64)
anger


### Vectorization


In [256]:
VOCAB_SIZE = 10000
MAX_SEQUENCE_LENGTH = 250

vectorize_layer = TextVectorization(
    max_tokens=VOCAB_SIZE,
    output_mode='int',
    output_sequence_length=MAX_SEQUENCE_LENGTH)

In [257]:
"""
Next, you will call adapt to fit the state of the preprocessing layer to the dataset. This will cause the model to build an index of strings to integers.

Note: It's important to only use your training data when calling adapt (using the test set would leak information).

Make a text-only dataset (without labels), then call adapt. 
"""

train_text = train_numeric_ds.map(lambda x, y: x)
vectorize_layer.adapt(train_text)

In [258]:
def vectorize_text(text, label):
  text = tf.expand_dims(text, -1)
  return vectorize_layer(text), label

In [259]:
# You are nearly ready to train your model. As a final preprocessing step, 
# you will apply the TextVectorization layer you created earlier to the train, validation, and test dataset.


train_data = train_numeric_ds.map(vectorize_text)
val_data = val_numeric_ds.map(vectorize_text)
test_data = test_numeric_ds.map(vectorize_text)

In [260]:
for row in train_data.take(1):
  # first label in batch 
  num_label = row[1][0].numpy()

  # this is how you get the class output. you'll have to map it to the emotion
  print(EMOTIONS[num_label]) 

anger


### Configure for performance


Use these methods when loading data to make sure that I/O does not become blocking.

.cache() keeps data in memory after it's loaded off disk. This will ensure the dataset does not become a bottleneck while training your model. If your dataset is too large to fit into memory, you can also use this method to create a performant on-disk cache, which is more efficient to read than many small files.

.prefetch() overlaps data preprocessing and model execution while training.


In [261]:
AUTOTUNE = tf.data.AUTOTUNE

train_data = train_data.cache().prefetch(buffer_size=AUTOTUNE)
val_data = val_data.cache().prefetch(buffer_size=AUTOTUNE)
test_data = test_data.cache().prefetch(buffer_size=AUTOTUNE)

### Modelling

In [262]:
def create_model(vocab_size, num_labels):
  model = Sequential()

  # 32 is the batch size
  model.add(Embedding(vocab_size, 32, mask_zero=True, input_length=250))
  model.add(Conv1D(32, 5, padding="valid", activation="relu", strides=2))
  model.add(GlobalMaxPooling1D())
  model.add(Dense(num_labels))

  return model

In [263]:
# `vocab_size` is `VOCAB_SIZE + 1` since `0` is used additionally for padding.
int_model = create_model(vocab_size=VOCAB_SIZE + 1, num_labels=6)

int_model.compile(
    loss=losses.SparseCategoricalCrossentropy(from_logits=True),
    optimizer='adam',
    metrics=['accuracy'])

int_model.summary()

Model: "sequential_20"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_20 (Embedding)    (None, 250, 32)           320032    
                                                                 
 conv1d_16 (Conv1D)          (None, 123, 32)           5152      
                                                                 
 global_max_pooling1d_8 (Glo  (None, 32)               0         
 balMaxPooling1D)                                                
                                                                 
 dense_18 (Dense)            (None, 6)                 198       
                                                                 
Total params: 325,382
Trainable params: 325,382
Non-trainable params: 0
_________________________________________________________________


In [264]:
history = int_model.fit(train_data, validation_data=val_data, epochs=5)

Epoch 1/5


ValueError: ignored

In [None]:
print("ConvNet model on int vectorized data:")
print(int_model.summary())

In [None]:
# model.compile(optimizer='sgd',
#               loss=tf.keras.losses.SparseCategoricalCrossentropy())

In [None]:
# epochs = 10
# history = model.fit(
#     train_data,
#     validation_data=val_data,
#     epochs=epochs)

In [None]:
#model.fit(X_train, y_train.values, batch_size = 200, epochs = 10, verbose = 1)

### Model Performance Evaluation

In [None]:
#print('Accuracy: '+ str(model.evaluate(X_validation, y_validation.values)[1]))