In [80]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/himani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [81]:
# normalisation of the dataset
def normalize(X):
    from sklearn.preprocessing import StandardScaler
    X = StandardScaler().fit_transform(X)
    return X

In [82]:
#read the data set
data = pd.read_csv("./Data/phrases.csv")
# data=pd.DataFrame(data)
# print type(data)
print data['Emotion'].unique()

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


## We are applying an LSTM based model for the emotion prediction. a special kind of RNN, capable of learning long-term dependencies.

## Preprocess the data set via cleaning, tokenisation and lemmatization

In [83]:
# print data
def clean_dataset(data):
    translator = string.maketrans('', '')
    for index,row in data.iterrows():
        row['Phrase'] = row['Phrase'].replace('[','')
        row['Phrase'] = row['Phrase'].replace(']','')
        row['Phrase'] = row['Phrase'].strip()
        row['Phrase'] = row['Phrase'].translate(translator,string.punctuation)
    return data
data = clean_dataset(data)
print data.head()

   Emotion                                             Phrase
0      joy  On days when I feel closing to my partner and ...
1     fear  Every time I imagine that someone I love or I ...
2    anger  When I had been obviously unjustly treated and...
3  sadness  When I think about the short time that we live...
4  disgust  At a gathering I found myself involuntarily si...


In [84]:
def tokenise(data):
    ## Convert words to lower case and split them    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    
    for index,row in data.iterrows():
        text = row['Phrase'].lower().split(' ')
        text = [w.strip() for w in text if not w in stops and len(w) >= 2]
        text = " ".join(text)
        row['Phrase']=text
    # split the dataset into tokens
    return data
data = tokenise(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,days feel closing partner friends when feel pe...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility of eluc...
3,sadness,think short time live relate the periods life ...
4,disgust,gathering found involuntarily sitting next two...


In [85]:
def lemmatization(dataset):
    stemmer = SnowballStemmer('english')
    list_of_words=[]
    for index,row in dataset.iterrows():
        text = row['Phrase'].split()
        stemmed_words = [stemmer.stem(word) for word in text]
        list_of_words.append(stemmed_words)
        text = " ".join(stemmed_words)
        row['Phrase'] = text
    data['list_of_words'] = list_of_words
    return dataset
data = lemmatization(data)
data

Unnamed: 0,Emotion,Phrase,list_of_words
0,joy,day feel close partner friend when feel peac a...,"[day, feel, close, partner, friend, when, feel..."
1,fear,everi time imagin someon love could contact se...,"[everi, time, imagin, someon, love, could, con..."
2,anger,obvious unjust treat possibl of elucid,"[obvious, unjust, treat, possibl, of, elucid]"
3,sadness,think short time live relat the period life th...,"[think, short, time, live, relat, the, period,..."
4,disgust,gather found involuntarili sit next two peopl ...,"[gather, found, involuntarili, sit, next, two,..."
5,shame,realiz direct feel discont with partner way tr...,"[realiz, direct, feel, discont, with, partner,..."
6,guilt,feel guilti realiz consid materi thing more im...,"[feel, guilti, realiz, consid, materi, thing, ..."
7,joy,girlfriend taken exam went parent place,"[girlfriend, taken, exam, went, parent, place]"
8,fear,first time realiz mean death,"[first, time, realiz, mean, death]"
9,anger,car overtak anoth forc drive road,"[car, overtak, anoth, forc, drive, road]"


In [96]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(data['Phrase'])

sequences = tokenizer.texts_to_sequences(data['Phrase'])
tokenized_data = pad_sequences(sequences, maxlen=10)
# tokenized_data


In [97]:
from sklearn.preprocessing import OneHotEncoder
# enc = OneHotEncoder(handle_unknown='ignore')
# X = data['Emotion'].unique()
# # print X
# X=X.reshape(-1,1)
# # print X
# enc.fit(X)
# enc.categories_
emotion_data=data['Emotion'].values
# emotion_data=emotion_data.reshape(-1,1)
# emotion_data=enc.transform(emotion_data).toarray()

print emotion_data

# Import LabelEncoder
from sklearn import preprocessing
#creating labelEncoder
le = preprocessing.LabelEncoder()
# Converting string labels into numbers.
emo_enc=le.fit_transform(emotion_data)
print emo_enc


['joy' 'fear' 'anger' ... 'disgust' 'shame' 'guilt']
[4 2 0 ... 1 6 3]


In [98]:
train_phrase, validate_phrase = np.split(tokenized_data,[int(.8*len(tokenized_data))])
# train_label,validate_label=np.split(emotion_data,[int(.8*len(emotion_data))])
train_label,validate_label=np.split(emo_enc,[int(.8*len(emo_enc))])
train_label

array([4, 2, 0, ..., 6, 3, 4])

In [99]:
# Using sklearn's Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

clf = MultinomialNB()
clf.fit(train_phrase,train_label)
Y_test_pred = clf.predict(validate_phrase)
print Y_test_pred
sklearn_score_train = clf.score(train_phrase,train_label)
print("Sklearn's score on training data :",sklearn_score_train)
sklearn_score_test = clf.score(validate_phrase,validate_label)
print("Sklearn's score on testing data :",sklearn_score_test)
print("Classification report for testing data :-")
print(classification_report(validate_label, Y_test_pred))


from sklearn.metrics import accuracy_score
roc_auc_score(Y_test_pred, validate_label)

[5 0 2 ... 4 4 6]
("Sklearn's score on training data :", 0.16614932200620813)
("Sklearn's score on testing data :", 0.14435009797517961)
Classification report for testing data :-
              precision    recall  f1-score   support

           0       0.15      0.09      0.11       219
           1       0.11      0.19      0.14       220
           2       0.16      0.21      0.18       219
           3       0.10      0.03      0.04       218
           4       0.16      0.31      0.21       218
           5       0.15      0.13      0.14       218
           6       0.19      0.06      0.10       219

   micro avg       0.14      0.14      0.14      1531
   macro avg       0.15      0.14      0.13      1531
weighted avg       0.15      0.14      0.13      1531



ValueError: multiclass format is not supported