In [1]:
from sklearn.metrics import confusion_matrix
from wordcloud import WordCloud, STOPWORDS 
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os
import re
import nltk

In [2]:
text_dict = {'text':[], 'happy':[], 'sad':[], 'fear':[], 'anger':[], 'love':[]}

In [3]:
def get_data(url):
    f = open(url, 'r', encoding = "UTF-8")
    text = f.read()
    text_array = text.split('\n')
    emotions = ['happy', 'sad', 'fear', 'anger', 'love']
    for sentence in text_array[:-1]:
        start_index = sentence.index('>')
        end_index = sentence[start_index:].index("<")
        text = sentence[start_index + 1:end_index +3]
        emotion = sentence[1:start_index]
        if emotion in emotions:
            text_dict['text'].append(text)
            for e in emotions:
                if emotion == e:
                    text_dict[emotion].append(1)
                else:
                    text_dict[e].append(0)

In [4]:
def get_data_2(url):
    f = open(url, 'r', encoding = "UTF-8")
    text = f.read()
    text_array = text.split('\n')
    emotions = ['happy', 'sad', 'fear', 'anger', 'love']
    emotion_dict = {"anger": 6, "joy":1, "fear":4, "sadness":2, "love":8, "surprise":3}
    for sentence in text_array:
        if ';' in sentence:
            a = sentence.split(';')
            text = a[0]
            emotion = a[1]
            if emotion == 'joy':
                emotion = 'happy'
            if emotion == 'sadness':
                emotion = 'sad'
            
            if emotion in emotions:
                text_dict['text'].append(text)
                for e in emotions:
                    if emotion == e:
                        text_dict[emotion].append(1)
                    else:
                        text_dict[e].append(0)

In [5]:
get_data('No Cause.txt')
get_data_2("train.txt")
get_data_2("test.txt")
get_data_2("val.txt")

In [6]:
train = pd.DataFrame(text_dict)

In [7]:
print("Shape of training data is", train.shape)

train.head()

Shape of training data is (20580, 6)


Unnamed: 0,text,happy,sad,fear,anger,love
0,This did the trick : the boys now have a more ...,1,0,0,0,0
1,"When Anna left Inspector Aziz , she was much h...",1,0,0,0,0
2,"And though , as Lachlan had planned , they had...",1,0,0,0,0
3,"Honestly , I 'm really happy for you",1,0,0,0,0
4,Lesley was totally happy about it,1,0,0,0,0


In [8]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()

In [9]:
def preprocess(df):
    for index, row in df.iterrows():
        filter_sentence = ''
        sentence = row['text']
    
        # Cleaning the sentence with regex
        sentence = re.sub(r'[^\w\s]', '', sentence)

        # Tokenization
        words = nltk.word_tokenize(sentence)

        # Stopwords removal
        words = [w for w in words if not w in stop_words]
        
        for words in words:
            filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
        
        df.loc[index, 'text'] = filter_sentence
    df = df[['text', 'happy', 'sad', 'fear', 'anger', 'love']]
    return(df)

In [10]:
train = preprocess(train)
train.head()

Unnamed: 0,text,happy,sad,fear,anger,love
0,this trick boy distant friendship david much ...,1,0,0,0,0
1,when anna left inspector aziz much happier,1,0,0,0,0
2,and though lachlan planned expected attack mo...,1,0,0,0,0
3,honestly i really happy,1,0,0,0,0
4,lesley totally happy,1,0,0,0,0


In [11]:
Y = train[['happy', 'sad', 'fear', 'anger', 'love']]

In [12]:
Y = np.array(Y)

In [13]:
X = np.array(train['text'])

In [14]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer

In [15]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)
X_new = []
for line in X:
    token_list = tokenizer.texts_to_sequences([line])[0]
    X_new.append(token_list)
max_sequence_len = max([len(x) for x in X_new])
input_sequences = np.array(pad_sequences(X_new, maxlen=max_sequence_len, padding='pre'))
total_words = len(tokenizer.word_index) + 1
X = input_sequences

In [16]:
from sklearn.model_selection import train_test_split

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2)

In [18]:
from keras.layers import Embedding, Dense, LSTM, Bidirectional, Dropout, Flatten
from keras.models import Sequential

In [19]:
model = Sequential()
model.add(Embedding(total_words, 8, input_length = X.shape[1]))
model.add(Flatten())
model.add(Dense(5, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics='accuracy')  #(# Pick a loss function and an optimizer)
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 35, 8)             131248    
_________________________________________________________________
flatten (Flatten)            (None, 280)               0         
_________________________________________________________________
dense (Dense)                (None, 5)                 1405      
Total params: 132,653
Trainable params: 132,653
Non-trainable params: 0
_________________________________________________________________
None


In [20]:
model.fit(X_train, y_train, epochs = 20)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x15a512ff640>

In [21]:
y_pred = np.argmax(model.predict(X_test), axis = -1)

In [22]:
y_test = [list(i).index(1) for i in list(y_test)]
y_test = np.array(y_test)

In [23]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

print("Accuracy of model on test set:")
print(accuracy_score(y_test, y_pred))
print("\nThe confusion matrix on test set")
print(confusion_matrix(y_test, y_pred))

Accuracy of model on test set:
0.8598153547133139
The confusion matrix on test set
[[1259   50   27   16   63]
 [  58 1109   23   41    9]
 [  32   34  426   23    4]
 [  21   52   21  495    3]
 [  78   11    5    6  250]]


In [24]:
def preprocess_sentence(sentence):
    
    sentence = re.sub(r'[^\w\s]', '', sentence)
    
    words = nltk.word_tokenize(sentence)
    words = [w for w in words if not w in stop_words]
    
    filter_sentence = ''
    for words in words:
        filter_sentence = filter_sentence  + ' ' + str(lemmatizer.lemmatize(words)).lower()
        
    return(filter_sentence)

In [25]:
def detect(text):
    text = preprocess_sentence(text)
    token_list = tokenizer.texts_to_sequences([text])[0]
    zeros = max(0, max_sequence_len - len(token_list))
    token_list = [0]*zeros + token_list
    token_list = np.array(token_list).reshape(1, max_sequence_len)
    index = np.argmax(model.predict(token_list), axis = -1)[0]
    
    emotions = ['happy', 'sad', 'fear', 'anger', 'love']
    return(emotions[index])

In [26]:
detect("I feel tired, sore, and lonely. I just want somebody to hold")

'sad'

In [27]:
detect("Not great. I feel suicidal and I have no one to talk to")

'happy'

In [28]:
detect("Happy, optimistic, inspired, enthusiastic, upbeat, silly, joyful")

'happy'