In [27]:
# Read data for model
import pandas as pd
import os
import re

# ISEAR dataset: https://www.kaggle.com/shrivastava/isears-dataset
# label_renamer maps corpus-specific label names onto the names used for the
# combined dataset; labels without an entry are kept unchanged.
label_renamer = {'joy': 'happy', 'sadness': 'sad'}
data_isear = pd.read_csv('data/isear.csv', sep='|', engine='python')
# dict.get (not setdefault): a pure lookup must not insert every unseen label
# into label_renamer as a side effect.
data_isear['label'] = data_isear['Field1'].map(lambda l: label_renamer.get(l, l))
# Keep only the situation text ('SIT') and the normalized label, renamed to the
# common ['text', 'label'] column layout.
data_isear = pd.concat([data_isear['SIT'], data_isear['label']], axis=1, keys=['text', 'label'])

# https://www.site.uottawa.ca/~diana/resources/emotion_stimulus_data/
# The slicing below assumes each line has the form "<label>text<\label>",
# optionally containing a "<cause>...<\cause>" span that is removed first.
texts = []
labels = []
pattern = re.compile(r"<cause>.*<\\cause>")
path = 'data/Diman et al'
for file_name in os.listdir(path):
    if file_name == 'Readme.txt':
        continue
    with open(os.path.join(path, file_name)) as f:
        for line in f:
            # A blank line (e.g. at the end of a file) has no tags to parse and
            # would otherwise raise IndexError on token[1].
            if not line.strip():
                continue
            line = pattern.sub('', line)
            token = line.split('>')
            # token[0] is "<label": drop the leading '<'.
            label = token[0][1:]
            # token[1] is "text<\label": drop the trailing "<\" + label.
            text = token[1][:-(len(label) + 2)]
            # Append only after both parts parsed so the two lists stay aligned.
            labels.append(label)
            texts.append(text)
data_diman = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

# https://www.aclweb.org/anthology/I17-1099/
# The text file and the emotion file are read in lockstep: line k of one
# corresponds to line k of the other. Utterances are separated by '__eou__',
# emotion codes by single spaces; code '0' is skipped.
texts = []
labels = []
label_decoder = {'1': 'anger', '2': 'disgust', '3': 'fear', '4': 'happy', '5': 'sad', '6': 'surprise'}
with open('data/EMNLP_dataset/dialogues_text.txt') as f_text, \
        open('data/EMNLP_dataset/dialogues_emotion.txt') as f_label:
    for line in f_text:
        text_tokens = line.strip().split('__eou__')
        label_tokens = f_label.readline().strip().split(' ')
        for i, code in enumerate(label_tokens):
            if code == '0':
                continue
            texts.append(text_tokens[i])
            labels.append(label_decoder[code])
data_emnlp = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

# http://saifmohammad.com/WebPages/EmotionIntensity-SharedTask.html
# Every file in data/EmoInt is read as tab-separated lines; column 1 is used as
# the text and column 2 as the label.
texts = []
labels = []
pattern = re.compile(r"@+\w+")  # @mentions, stripped from the text
path = 'data/EmoInt'
for file_name in os.listdir(path):
    with open(os.path.join(path, file_name)) as f:
        for line in f:
            # Skip blank lines, which would otherwise raise IndexError below.
            if not line.strip():
                continue
            tokens = line.split('\t')
            text = pattern.sub('', tokens[1])
            # dict.get (not setdefault): look the label up without inserting
            # new keys into the shared label_renamer mapping.
            label = label_renamer.get(tokens[2], tokens[2])
            # Append only after both fields were read so the lists stay aligned.
            texts.append(text)
            labels.append(label)
data_emoint = pd.concat([pd.Series(texts), pd.Series(labels)], axis=1, keys=['text', 'label'])

# Stack the four corpora into one frame. ignore_index=True gives the result a
# fresh 0..n-1 index; without it each corpus keeps its own 0-based index and
# the combined frame has duplicate index labels.
data = pd.concat([data_isear, data_diman, data_emnlp, data_emoint], ignore_index=True)

In [28]:
# Tokenize text data
import nltk
import ssl
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import TweetTokenizer

# Fall back to an unverified HTTPS context so nltk.download works on machines
# where certificate verification fails.
# NOTE(review): this disables certificate verification for every default HTTPS
# context created in this process, not just for NLTK downloads.
try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    # ssl module without _create_unverified_context: leave the default as is.
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Download the corpora BEFORE first use: stopwords.words() raises LookupError
# on a machine where the 'stopwords' corpus has not been fetched yet.
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')

# Rebinds the name `stopwords` from the NLTK corpus reader to a plain set of
# English stop words (set membership is what get_tokens needs).
stopwords = set(stopwords.words('english'))
tknzr = TweetTokenizer()

def get_tokens(sentence):
    """Tokenize a sentence and return its cleaned, lemmatized tokens.

    The sentence is split with the module-level TweetTokenizer (`tknzr`).
    Tokens that are a single character long or that appear in `stopwords`
    are dropped; each remaining token is passed through `get_lemma`.

    Returns a list of strings in their original order.
    """
    kept = (
        tok
        for tok in tknzr.tokenize(sentence)
        if len(tok) > 1 and tok not in stopwords
    )
    return [get_lemma(tok) for tok in kept]

def get_lemma(word):
    """Return the WordNet base form of `word`.

    Falls back to `word` itself when `wn.morphy` returns None.
    """
    base_form = wn.morphy(word)
    return word if base_form is None else base_form

# Run get_tokens over every row of data['text']: the result is a Series holding
# one list of tokens per text (stop words and one-character tokens removed,
# remaining tokens lemmatized).
token_list = data['text'].apply(get_tokens)

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/johngilbertson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [29]:
# Encode data for model
import keras.preprocessing as pp
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Build the word -> integer vocabulary from the cleaned token lists.
t = pp.text.Tokenizer()
t.fit_on_texts(token_list)

# Integer-encode the emotion labels; le.classes_ holds the label names.
le = preprocessing.LabelEncoder()
Y = le.fit_transform(data['label'])

# Integer-encode the documents and pad them at the end to max_len entries.
# NOTE(review): the vocabulary above is fitted on token_list (stop words
# removed, lemmatized) but the sequences here are built from the raw
# data['text']. Confirm this is intended; if it is changed to encode
# token_list, every text passed to t.texts_to_sequences for prediction must go
# through get_tokens as well, otherwise training and inference inputs diverge.
max_len = 60
encoded_texts = t.texts_to_sequences(data['text'])
X = pp.sequence.pad_sequences(encoded_texts, maxlen=max_len, padding='post')

# Hold out 20% of the samples for testing (fixed seed for a repeatable split).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.20, random_state=4
)

In [30]:
# Use pre-trained word vectors
# http://nlp.stanford.edu/data/glove.twitter.27B.zip
import numpy as np

embedding_dims = [25, 50, 100, 200] # DO NOT CHANGE
embedding_dim = embedding_dims[1]
# +1 because row 0 is left as an all-zero vector (t.word_index has no entry 0
# — assumption based on the original sizing; confirm against the tokenizer).
vocab_size = len(t.word_index) + 1
embedding_matrix = np.zeros((vocab_size, embedding_dim))
# Explicit UTF-8: without it the platform default encoding is used, which
# fails on non-ASCII tokens when that default is not UTF-8.
with open('data/glove.twitter.27B/glove.twitter.27B.{}d.txt'.format(embedding_dim), encoding='utf-8') as f:
    for line in f:
        # Split from the right so the last embedding_dim fields are always the
        # vector, even when the token itself contains whitespace-like
        # characters that a plain line.split() would break apart.
        fields = line.rstrip().rsplit(' ', embedding_dim)
        if len(fields) != embedding_dim + 1:
            continue  # malformed line: no token or too few vector components
        word = fields[0]
        idx = t.word_index.get(word)
        if idx is not None:
            # Words missing from the GloVe file keep their all-zero row.
            embedding_matrix[idx] = np.array(fields[1:], dtype=np.float32)

In [48]:
from keras.layers import *
from tensorflow.keras.models import Model

# Build bidirectional LSTM model.
# Pipeline: padded integer sequence -> embedding initialised from
# embedding_matrix (and fine-tuned, trainable=True) -> bidirectional LSTM over
# all time steps -> per-time-step dense layer -> max over time -> dense ->
# softmax over the label classes.
input_layer = Input(shape=(max_len,))
embedded = Embedding(
    vocab_size,
    embedding_dim,
    weights=[embedding_matrix],
    input_length=max_len,
    trainable=True,
)(input_layer)
recurrent = Bidirectional(
    LSTM(embedding_dim, return_sequences=True, dropout=0.50),
    merge_mode='concat',
)(embedded)
per_step = TimeDistributed(Dense(embedding_dim, activation='relu'))(recurrent)
pooled = GlobalMaxPool1D()(per_step)
dense_out = Dense(embedding_dim, activation='relu')(pooled)
output_layer = Dense(len(le.classes_), activation='softmax', name='prediction')(dense_out)

model = Model(inputs=input_layer, outputs=output_layer)
# Sparse loss because Y holds integer class ids rather than one-hot vectors.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train model, keeping 25% of the training split aside for validation.
model.fit(X_train, Y_train, validation_split=0.25, epochs=10, verbose=2)

# Evaluate model on the held-out test split; index 1 is the accuracy metric
# (index 0 is the loss).
eval_results = model.evaluate(X_test, Y_test, verbose=2)
accuracy = eval_results[1]
print('Accuracy: {:.2%}'.format(accuracy))

Model: "model_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_12 (InputLayer)        [(None, 60)]              0         
_________________________________________________________________
embedding_11 (Embedding)     (None, 60, 50)            907600    
_________________________________________________________________
bidirectional_11 (Bidirectio (None, 60, 100)           40400     
_________________________________________________________________
time_distributed_8 (TimeDist (None, 60, 50)            5050      
_________________________________________________________________
global_max_pooling1d_10 (Glo (None, 50)                0         
_________________________________________________________________
dense_14 (Dense)             (None, 50)                2550      
_________________________________________________________________
prediction (Dense)           (None, 8)                 408

In [43]:
# Access Reddit
import praw

# Create Reddit instance.
# Credentials are read from the environment instead of being hard-coded in the
# notebook: set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before running this
# cell (a missing variable raises KeyError). `os` and `re` are imported in the
# first cell.
# NOTE(review): the credentials that were previously committed here should be
# treated as leaked and revoked in the Reddit app settings.
reddit = praw.Reddit(
    client_id=os.environ["REDDIT_CLIENT_ID"],
    client_secret=os.environ["REDDIT_CLIENT_SECRET"],
    user_agent="python:emotiondetection:v1.0.0",
)

# Retrieve comments from subreddit submission
beaver_url = "https://www.reddit.com/r/technews/comments/mzfq8e/hundreds_lose_internet_service_in_northern_bc/"
one_of_us_url = "https://www.reddit.com/r/wallstreetbets/comments/n0spz6/one_of_us/"
submission = reddit.submission(url=one_of_us_url)
# limit=0: presumably drops the "load more comments" placeholders without
# fetching them — confirm against the PRAW documentation.
submission.comments.replace_more(limit=0)

# Parse comments
reddit_comments = []
# Matches "[deleted]", "[removed]" and u/<name> / r/<name> references.
pattern = re.compile(r"((\[deleted])|(\[removed])|(\b((u/)|(r/)))\w+)")
# Comments whose body starts with this marker are skipped entirely.
link_pat = re.compile(r"###\[View link]")
for comment in submission.comments.list():
    if link_pat.match(comment.body) is not None:
        continue
    comment_body = pattern.sub('', comment.body).strip()
    # Keep only comments that still have text after cleaning.
    if comment_body:
        reddit_comments.append(comment_body)

In [46]:
# Predict emotions of Reddit comments: encode and pad the comments with the
# same tokenizer and length used for the training data, then run the model.
sequences = t.texts_to_sequences(reddit_comments)
to_predict = pp.sequence.pad_sequences(sequences, maxlen=max_len, padding='post')
prediction = model.predict([to_predict,])

# Store prediction statistics
emotion_total = [0] * len(le.classes_)  # running sum of scores per class id
highest_perc = dict()                   # class id -> (best score, comment text)

def save_best(text, label):
    """Record comment `text` as the best example of class `label` if it scores higher.

    `text` is an index into `prediction` / `reddit_comments`, `label` is a
    class id. `highest_perc[label]` is replaced only when the new score is
    strictly greater than the stored one, so the earliest comment wins ties.
    """
    score = prediction[text][label]
    current = highest_perc.get(label)
    if current is None or score > current[0]:
        highest_perc[label] = (score, reddit_comments[text])

# Display prediction results
print('\t-- Comments --')
# Width of the longest label name, used to align the printed columns.
label_max_width = len(max(data['label'], key=len))
for text, scores in enumerate(prediction):
    print(reddit_comments[text])
    for label, score in enumerate(scores):
        emotion_total[label] += score
        save_best(text, label)
        # Scores below 0.01% are not printed.
        if score >= .0001:
            print("\t{:<{}}: {:.2%}".format(le.classes_[label], label_max_width, score))

# Display prediction averages
print('-' * 35)
print('\t-- Averages --')
for label, total in enumerate(emotion_total):
    print("\t{:<{}}: {:.2%}".format(le.classes_[label], label_max_width, total / len(reddit_comments)))

# Display highest percentage comment for each emotion
print('-' * 35)
print('\t-- Highest Percentage Comments --')
for label, best in highest_perc.items():
    percent, comment = best
    print(comment)
    print("\t{}: {:.2%}".format(le.classes_[label], percent))


	-- Comments --
OP sold his 30K car for 10K… def one of us.
	anger   : 22.74%
	disgust : 8.38%
	fear    : 28.97%
	guilt   : 2.41%
	happy   : 10.26%
	sad     : 13.67%
	shame   : 2.09%
	surprise: 11.49%
You should have double down and sold your wife's car also.....
	anger   : 11.40%
	disgust : 0.80%
	fear    : 3.10%
	guilt   : 0.06%
	happy   : 61.56%
	sad     : 4.96%
	shame   : 0.11%
	surprise: 18.01%
This is sometimes and unfortunately the way
	anger   : 1.61%
	disgust : 0.71%
	fear    : 2.16%
	guilt   : 1.39%
	happy   : 7.36%
	sad     : 85.75%
	shame   : 0.68%
	surprise: 0.34%
Probably got the car after joining the marines lol
	anger   : 1.28%
	disgust : 0.33%
	fear    : 1.23%
	guilt   : 0.36%
	happy   : 89.54%
	sad     : 6.31%
	shame   : 0.39%
	surprise: 0.56%
Oh look this stock is a deal... Let me  wait to see if the price rises... OH sht it rose 300% I waited to long I will buy. Oh crap earnings dropped better sell crap I lost 40% I can't sell I will hold. Crap I lost 50% sunk cos