In [1]:
# read data
import pandas as pd
import os
import re

# https://www.kaggle.com/shrivastava/isears‐dataset
label_renamer = {'joy': 'happy', 'sadness': 'sad'}
data_isear = pd.read_csv('data/isear.csv', sep='|', engine='python')
data_isear['label'] = data_isear['Field1'].map(lambda l: label_renamer.setdefault(l, l))
data_isear = pd.concat([data_isear['SIT'], data_isear['label']], axis=1, keys=['text', 'label'])

# https://www.site.uottawa.ca/~diana/resources/emotion_stimulus_data/
texts = []
labels = []
path = 'data/Diman et al'
for file_name in os.listdir(path):
    if file_name != 'Readme.txt':
        with open(os.path.join(path, file_name)) as f:
            for line in f.readlines():
                line = re.sub(r'<cause>.*<\\cause>', '', line)
                token = line.split('>')
                label = token[0][1:]
                labels.append(label)
                texts.append(token[1][:-(len(label) + 2)])
data_diman = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

# https://www.aclweb.org/anthology/I17-1099/
texts = []
labels = []
with open('data/EMNLP_dataset/dialogues_text.txt') as f_text:
    with open('data/EMNLP_dataset/dialogues_emotion.txt') as f_label:
        label_decoder = {'1': 'anger', '2': 'disgust', '3': 'fear', '4': 'happy', '5': 'sad', '6': 'surprise'}
        for line in f_text:
            text_tokens = line.strip().split('__eou__')
            label_tokens = f_label.readline().strip().split(' ')
            for i in range(len(label_tokens)):
                if label_tokens[i] != '0':
                    texts.append(text_tokens[i])
                    labels.append(label_decoder[label_tokens[i]])
data_emnlp = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

# http://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
texts = []
labels = []
path = 'data/EmoInt'
for file_name in os.listdir(path):
    with open(os.path.join(path, file_name)) as f:
        for line in f:
            tokens = line.split('\t')
            texts.append(re.sub(r'@+\w+', '', tokens[1]))
            labels.append(label_renamer.setdefault(tokens[2], tokens[2]))
data_emoint = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

data = pd.concat([data_isear, data_diman, data_emnlp, data_emoint])

In [2]:
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TweetTokenizer

stopwords = set(stopwords.words('english'))
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
tknzr = TweetTokenizer()

def get_tokens(sentence):
    # tokens = nltk.word_tokenize(sentence)
    tokens = tknzr.tokenize(sentence)
    tokens = [token for token in tokens if token not in stopwords and len(token) > 1]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    return lemma
token_list = data['text'].apply(get_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# Encode data for model
import keras.preprocessing as pp
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# prepare tokenizer
t = pp.text.Tokenizer()
t.fit_on_texts(token_list)

max_len = 60
# integer encode the documents
encoded_texts = t.texts_to_sequences(data['text'])
X = pp.sequence.pad_sequences(encoded_texts, maxlen=max_len, padding='post')
le = preprocessing.LabelEncoder()
Y = le.fit_transform(data['label'])

# now splitting into test and training data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.20, random_state=4)

In [4]:
import numpy as np

# create the embedding matrix for the embedding layer
vocab_size = len(t.word_index) + 1
embedding_dim = 100
embedding_matrix = np.zeros((vocab_size, embedding_dim))
with open('data/glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(embedding_dim)) as f:
    for line in f:
        word, *vector = line.split()
        if word in t.word_index:
            idx = t.word_index[word]
            embedding_matrix[idx] = np.array(vector, dtype=np.float32)[:embedding_dim]

In [5]:
from keras.layers import *
from tensorflow.keras.models import Model

# build bidirectional LSTM
input_layer = Input(shape=(max_len,))
model = Embedding(vocab_size, embedding_dim, weights=[embedding_matrix], input_length=max_len, trainable=True)(input_layer)
model = Bidirectional(LSTM(embedding_dim, return_sequences=True, dropout=0.50), merge_mode='concat')(model)
model = TimeDistributed(Dense(embedding_dim, activation='relu'))(model)
model = GlobalMaxPool1D()(model)
model = Dense(100, activation='relu')(model)
output_layer = Dense(Y.max() + 1, activation='softmax', name='prediction')(model)
model = Model(inputs=input_layer, outputs=output_layer)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# train model
model.fit(X_train, Y_train, validation_split=0.25, epochs=10, verbose=2)

# evaluate the model
loss, accuracy = model.evaluate(X_test, Y_test, verbose=2)
print('Accuracy: %f' % (accuracy * 100))

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 60)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 60, 100)           1815200   
_________________________________________________________________
bidirectional (Bidirectional (None, 60, 200)           160800    
_________________________________________________________________
time_distributed (TimeDistri (None, 60, 100)           20100     
_________________________________________________________________
global_max_pooling1d (Global (None, 100)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 100)               10100     
_________________________________________________________________
prediction (Dense)           (None, 8)                 808   

In [6]:
texts = ['Oh yes!', 'I want to cry', 'What was that?']

# Predict
sequences = t.texts_to_sequences(texts)
to_predict = pp.sequence.pad_sequences(sequences, maxlen=max_len, padding='post')
prediction = model.predict([to_predict,])

# Display prediction results
for text in range(len(prediction)):
    print(texts[text])
    for label in range(len(prediction[text])):
        if prediction[text][label] >= .0001:
            print("\t{:<{}}: {:.2%}".format(le.classes_[label], 8, prediction[text][label]))

Oh yes!
	anger   : 7.99%
	disgust : 1.94%
	fear    : 0.86%
	happy   : 61.29%
	sad     : 2.55%
	shame   : 0.14%
	surprise: 25.24%
I want to cry
	anger   : 6.82%
	disgust : 2.75%
	fear    : 1.92%
	guilt   : 3.17%
	happy   : 28.03%
	sad     : 53.17%
	shame   : 3.51%
	surprise: 0.63%
What was that?
	anger   : 5.79%
	disgust : 1.02%
	fear    : 0.55%
	happy   : 20.20%
	sad     : 0.88%
	shame   : 0.10%
	surprise: 71.46%
