### Importing Librararies

In [1]:
import pandas as pd
import numpy as np

from functions import *

from tqdm import tqdm_notebook as tqdm

import nltk
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet
from nltk.stem import SnowballStemmer
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('omw-1.4')

import re
from bs4 import BeautifulSoup

from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix



[nltk_data] Downloading package wordnet to /Users/leo/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /Users/leo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/leo/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### Data Exploration and Preprocessing

In [2]:
emotion_dict = {1:'Anger', 2:'Anticipation', 3:'Disgust', 4:'Fear', 5:'Joy', 6:'Sadness', 7:'Surprise', 8:'Trust'}

In [3]:
#Opening and storing train data
train_data = open('data/training_set.txt', "r")
train_data = train_data.readlines()

# remove /n at the end of each line
for index, line in enumerate(train_data):
      train_data[index] = line.strip()

#Opening and storing dev data
dev_data = open('data/dev_set.txt', "r")
dev_data = dev_data.readlines()

# remove /n at the end of each line
for index, line in enumerate(dev_data):
      dev_data[index] = line.strip()

train_data[:5], dev_data[:5]
#It seems that every line is a different training sentence followed by its respective emotion

(['sentence\temotion',
  "I'm too old to be traded in .\t6",
  'Mother said you could always tell a lady by her hands .\t8',
  "I always said I'd leave off when the time came .\t6",
  "He'll be safe with me .\t2"],
 ['sentence\temotion',
  'What happens to the gold in our safe ?\t4',
  'Natural to get cold feet .\t8',
  'Not very lucky , is he ?\t7',
  "I'm just a little anxious to get up there and whoop ET's ass , that's all .\t2"])

In [4]:
#Loading training and dev data
train_df = txt_to_df(train_data)
dev_df = txt_to_df(dev_data)
train_df.head()

Unnamed: 0,sentence,emotion
1,I'm too old to be traded in .,6
2,Mother said you could always tell a lady by he...,8
3,I always said I'd leave off when the time came .,6
4,He'll be safe with me .,2
5,Lay off .,1


In [5]:
#High level inspection
dev_df.head()

Unnamed: 0,sentence,emotion
1,What happens to the gold in our safe ?,4
2,Natural to get cold feet .,8
3,"Not very lucky , is he ?",7
4,I'm just a little anxious to get up there and ...,2
5,Did you think we don't know about your affair ...,1


In [6]:
#Visualising the distributions of different emotions

label_counter(train_df,"emotion", set_name= "Training"), label_counter(dev_df, "emotion", "Dev")

(None, None)

In [7]:
word_counter(train_df["sentence"], "Training"), word_counter(dev_df["sentence"], "Dev")

(None, None)

In [8]:
#Applying the clean function found in functions.py to train and test
train_clean = clean(list(train_df["sentence"]), True, True)
dev_clean = clean(list(dev_df["sentence"]), True,True)

  0%|          | 0/14000 [00:00<?, ?it/s]


No parser was explicitly specified, so I'm using the best available HTML parser for this system ("lxml"). This usually isn't a problem, but if you run this code on another system, or in a different virtual environment, it may use a different parser and behave differently.





  0%|          | 0/1000 [00:00<?, ?it/s]

In [9]:
#Inspecting train_clean
train_clean[:10]

['i m too old to be trade in',
 'mother said you could alway tell a ladi by her hand',
 'i alway said i d leav off when the time came',
 'he ll be safe with me',
 'lay off',
 'you tell him to take care of you',
 'i hope so',
 'whi do you want to shut me out in the cold like this',
 'you taught me a lesson man',
 'i ll do everyth i can to make a success of it']

In [10]:
#Adding clean sentences back to original dataframe format
train_clean = pd.DataFrame(zip(train_clean, train_df.emotion), columns = train_df.columns, index = train_df.index)
dev_clean = pd.DataFrame(zip(dev_clean, dev_df.emotion), columns = dev_df.columns, index = dev_df.index)

In [11]:
#Inspecting the data
train_clean.sample(5)

Unnamed: 0,sentence,emotion
9012,then what did you just stuff under your bed noth,4
11834,miss windi the villa ha been ransack,4
11892,i guess that s for me to know and you to find ...,2
5837,are you okay i,2
11411,you ve been serv,1


In [12]:
#Verifying the distribution of words after the cleaning
word_counter(train_clean["sentence"], "Training"), word_counter(dev_clean["sentence"], "Dev")

(None, None)

In [13]:
#Initialising the CountVectorizer and fitting to train_clean
cv = CountVectorizer(max_df=0.85, stop_words = "english", binary=True)

X_train = cv.fit_transform(train_clean["sentence"])
y_train = np.array(train_clean["emotion"])

X_test = cv.transform(dev_clean["sentence"])
y_test = np.array(dev_clean["emotion"])

In [14]:
#Verifying the shape of the vecotrs that correspond to each sentence
X_train.toarray()[0].shape

(5144,)

In [15]:
#Verifying list of stop words used by CountVectorizer
cv.get_stop_words()

frozenset({'a',
           'about',
           'above',
           'across',
           'after',
           'afterwards',
           'again',
           'against',
           'all',
           'almost',
           'alone',
           'along',
           'already',
           'also',
           'although',
           'always',
           'am',
           'among',
           'amongst',
           'amoungst',
           'amount',
           'an',
           'and',
           'another',
           'any',
           'anyhow',
           'anyone',
           'anything',
           'anyway',
           'anywhere',
           'are',
           'around',
           'as',
           'at',
           'back',
           'be',
           'became',
           'because',
           'become',
           'becomes',
           'becoming',
           'been',
           'before',
           'beforehand',
           'behind',
           'being',
           'below',
           'beside',
           'besides'

### KNN Model

In [16]:
#Creating baseline model and recording results
modelknn = KNeighborsClassifier(
    n_neighbors=5, 
    weights='distance', 
    algorithm='brute', 
    leaf_size=30, p=2,        
    metric='cosine', 
    metric_params=None, 
    n_jobs=-1
)

modelknn.fit(X_train,y_train)

KNeighborsClassifier(algorithm='brute', metric='cosine', n_jobs=-1,
                     weights='distance')

In [17]:
prediction = modelknn.predict(X_test)

In [18]:
print(classification_report(prediction, y_test, target_names=emotion_dict.values()))

              precision    recall  f1-score   support

       Anger       0.56      0.33      0.42       355
Anticipation       0.37      0.30      0.33       208
     Disgust       0.04      0.10      0.06        30
        Fear       0.19      0.24      0.22        82
         Joy       0.32      0.35      0.34        88
     Sadness       0.18      0.31      0.23        51
    Surprise       0.21      0.28      0.24        71
       Trust       0.23      0.31      0.26       115

    accuracy                           0.31      1000
   macro avg       0.26      0.28      0.26      1000
weighted avg       0.37      0.31      0.33      1000



In [19]:
#Using gridsearch to try and find better model parameters
from sklearn.model_selection import GridSearchCV

k_range = list(range(1,100))
weight_options = ["uniform", "distance"]

param_grid = dict(n_neighbors = k_range, weights = weight_options)

knn = modelknn

grid = GridSearchCV(knn, param_grid, cv = 10, scoring = "accuracy")
grid.fit(X_train,y_train)

print (grid.best_score_)
print (grid.best_params_)
print (grid.best_estimator_)

0.31435714285714284
{'n_neighbors': 84, 'weights': 'distance'}
KNeighborsClassifier(algorithm='brute', metric='cosine', n_jobs=-1,
                     n_neighbors=84, weights='distance')


### NN

In [20]:
#Creating baseline model and recording results
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(hidden_layer_sizes=(100,100,100,100), activation='relu', solver='adam', max_iter=1000)
mlp.fit(X_train,y_train)

MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100), max_iter=1000)

In [21]:
nn_prediction = mlp.predict(X_test)
print(classification_report(nn_prediction, y_test, target_names=emotion_dict.values()))

              precision    recall  f1-score   support

       Anger       0.41      0.34      0.37       255
Anticipation       0.33      0.31      0.32       181
     Disgust       0.17      0.25      0.20        53
        Fear       0.25      0.25      0.25       106
         Joy       0.38      0.42      0.40        88
     Sadness       0.26      0.26      0.26        87
    Surprise       0.26      0.30      0.28        84
       Trust       0.27      0.29      0.28       146

    accuracy                           0.31      1000
   macro avg       0.29      0.30      0.30      1000
weighted avg       0.32      0.31      0.31      1000



In [22]:
#Using gridsearch to try and find better parameters

parameter_space = {
    'hidden_layer_sizes': [(100,100,100), (50,100,50), (20,30,40,100)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.001],
    'learning_rate': ['constant','adaptive'],
}

clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=3)
clf.fit(X_train, y_train)



GridSearchCV(cv=3,
             estimator=MLPClassifier(hidden_layer_sizes=(100, 100, 100, 100),
                                     max_iter=1000),
             n_jobs=-1,
             param_grid={'activation': ['tanh', 'relu'],
                         'alpha': [0.0001, 0.001],
                         'hidden_layer_sizes': [(100, 100, 100), (50, 100, 50),
                                                (20, 30, 40, 100)],
                         'learning_rate': ['constant', 'adaptive'],
                         'solver': ['sgd', 'adam']})

In [23]:
print (clf.best_score_)
print (clf.best_params_)
print (clf.best_estimator_)

0.3068575133617453
{'activation': 'tanh', 'alpha': 0.001, 'hidden_layer_sizes': (50, 100, 50), 'learning_rate': 'adaptive', 'solver': 'sgd'}
MLPClassifier(activation='tanh', alpha=0.001, hidden_layer_sizes=(50, 100, 50),
              learning_rate='adaptive', max_iter=1000, solver='sgd')
