In [22]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package stopwords to /home/himani/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Read the data set (the later cells use its 'Emotion' and 'Phrase' columns)
# and drop every row that has a missing value.
data = pd.read_csv("./Data/phrases.csv").dropna()
# Show the distinct emotion labels present in the data.
# print() with a single argument behaves the same on Python 2 and Python 3.
print(data['Emotion'].unique())

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


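## We are applying an LSTM-based model for emotion prediction. An LSTM is a special kind of RNN, capable of learning long-term dependencies. (Note: the code cells in this notebook train a Multinomial Naive Bayes classifier on TF-IDF features, not an LSTM.)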

## Preprocess the data set via cleaning, tokenisation and lemmatization

In [24]:
# print data
import re

def _clean_phrase(text):
    """Normalise one phrase; see clean_dataset() for the list of steps."""
    text = text.replace('[', '').replace(']', '')
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+))', 'URL', text)
    text = re.sub(r'@[^\s]+', 'USER', text)
    # Lower-casing happens after the substitutions above, so the placeholder
    # tokens end up as 'url' and 'user' in the cleaned text.
    text = text.lower().replace("ё", "е")
    # Every run of characters other than letters and digits becomes one space.
    # Spaces are part of such runs, so no double spaces can remain, and no
    # punctuation survives (a separate punctuation-stripping pass is not needed).
    # The digit range is 0-9: a range of 1-9 would silently delete every '0'.
    text = re.sub(r'[^a-zA-Zа-яА-Я0-9]+', ' ', text)
    return text.strip()


def clean_dataset(data):
    """Return a copy of ``data`` whose 'Phrase' column holds normalised text.

    Each phrase goes through these steps, in order:

    1. square brackets are removed;
    2. web links (``www...`` / ``http(s)://...``) are replaced by ``URL``;
    3. ``@mentions`` are replaced by ``USER``;
    4. the text is lower-cased and "ё" is folded to "е";
    5. every run of characters other than Latin/Cyrillic letters and digits
       collapses to a single space;
    6. leading and trailing whitespace is stripped.

    The column is rewritten with one explicit assignment instead of writing
    into the rows yielded by ``iterrows()``: those rows may be copies, in
    which case the writes never reach the frame.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame with a 'Phrase' column of strings.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns; the input frame is
        left untouched.
    """
    cleaned = data.copy()
    cleaned['Phrase'] = cleaned['Phrase'].map(_clean_phrase)
    return cleaned
# Replace the raw frame with its cleaned counterpart and preview the first rows.
# print() with a single argument behaves the same on Python 2 and Python 3.
data = clean_dataset(data)
print(data.head())

   Emotion                                             Phrase
0      joy  on days when i feel closing to my partner and ...
1     fear  every time i imagine that someone i love or i ...
2    anger  when i had been obviously unjustly treated and...
3  sadness  when i think about the short time that we live...
4  disgust  at a gathering i found myself involuntarily si...


In [25]:
def bag_of_words(data):
    count_vect=CountVectorizer()
    count_vect.fit(data['Phrase']) #creates vocab of words
#     print (count_vect.vocabulary_)
    data_phrase=count_vect.transform(data['Phrase'])
#     print(data_phrase.shape)
#     print(type(data_phrase))
#     print(data_phrase.toarray())
    return data_phrase,count_vect
# Vectorise the cleaned phrases; keep the fitted vectorizer for later reuse.
data_phrase, count_vect = bag_of_words(data)
# Displayed as the cell output: the shape of the count matrix.
data_phrase.shape


(7652, 8930)

In [26]:
def tf_idf(data_phrase):
    """Re-weight a term-count matrix with TF-IDF.

    A new TfidfTransformer is fitted on ``data_phrase`` and applied to it.

    Returns a ``(dense_weights, transformer)`` pair: the weighted matrix
    converted with ``.toarray()``, and the fitted transformer.
    """
    transformer = TfidfTransformer()
    weighted = transformer.fit_transform(data_phrase)
    dense_weights = weighted.toarray()
    return dense_weights, transformer
# TF-IDF weight the count matrix; keep the fitted transformer for later reuse.
phrase_tfidf, tfidf_transformer = tf_idf(data_phrase)
# Displayed as the cell output: the shape of the dense TF-IDF array.
phrase_tfidf.shape

(7652, 8930)

In [27]:
# Encode the emotion names as integers. The fitted encoder stays in `le` so
# that predicted integers can be mapped back to emotion names later.
le = preprocessing.LabelEncoder()
emotion_data = data['Emotion'].values
emo_enc = le.fit_transform(emotion_data)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(emo_enc)



[4 2 0 ... 1 6 3]


In [28]:
# Sequential 80/20 split (no shuffling): the first 80% of the rows are used
# for training and the remaining rows for validation. One shared cut point
# keeps the feature rows and the labels aligned.
# NOTE(review): the vocabulary and the IDF weights were fitted on the full
# data set before this split, so the validation rows influenced the features.
split_at = int(.8 * len(phrase_tfidf))
train_phrase, validate_phrase = np.split(phrase_tfidf, [split_at])
train_label, validate_label = np.split(emo_enc, [split_at])
# A single pre-formatted string prints identically on Python 2 and Python 3.
print("data {} {}".format(train_phrase.shape, validate_phrase.shape))
print("label {} {}".format(train_label.shape, validate_label.shape))

data (6121, 8930) (1531, 8930)
label (6121,) (1531,)


In [39]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score

# Fit multinomial Naive Bayes on the training split and predict the
# validation split.
clf = MultinomialNB().fit(train_phrase, train_label)
label_predicted = clf.predict(validate_phrase)

# Report the classifier's score on both splits. Each message is formatted
# into one string: under Python 2, print("text", value) prints a tuple such
# as ("text", 0.80...) instead of a readable line.
sklearn_score_train = clf.score(train_phrase, train_label)
print("Sklearn's score on training data : {}".format(sklearn_score_train))
sklearn_score_test = clf.score(validate_phrase, validate_label)
print("Sklearn's score on testing data : {}".format(sklearn_score_test))
print("Classification report for testing data :-")
print(classification_report(validate_label, label_predicted))

("Sklearn's score on training data :", 0.8033001143603986)
("Sklearn's score on testing data :", 0.5329849771391247)
Classification report for testing data :-
              precision    recall  f1-score   support

           0       0.42      0.46      0.44       219
           1       0.62      0.34      0.44       220
           2       0.69      0.65      0.67       219
           3       0.44      0.61      0.52       218
           4       0.61      0.68      0.64       218
           5       0.64      0.47      0.54       218
           6       0.44      0.52      0.47       219

   micro avg       0.53      0.53      0.53      1531
   macro avg       0.55      0.53      0.53      1531
weighted avg       0.55      0.53      0.53      1531



In [30]:
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.model_selection import train_test_split, GridSearchCV

# End-to-end text classifier:
# token counts -> TF-IDF weighting -> multinomial Naive Bayes.
text_clf = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words="english", encoding='latin-1')),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])

# Hyper-parameter grid; each key is "<pipeline step name>__<parameter>".
tuned_parameters = dict(
    vect__ngram_range=[(1, 1), (1, 2), (2, 2)],
    tfidf__use_idf=(True, False),
    tfidf__norm=('l1', 'l2'),
    clf__alpha=[1, 1e-1, 1e-2],
)


In [31]:
# Hold out 20% of the cleaned phrases (and their encoded labels) for testing;
# random_state is fixed at 42.
x_train, x_test, y_train, y_test = train_test_split(
    data['Phrase'],
    emo_enc,
    test_size=0.20,
    random_state=42,
)

In [32]:
# Shapes of the four split pieces, space-separated on one line. A single
# pre-formatted string prints identically on Python 2 and Python 3.
print("{} {} {} {}".format(x_train.shape, x_test.shape, y_train.shape, y_test.shape))

(6121,) (1531,) (6121,) (1531,)


In [33]:
from sklearn.metrics import classification_report

# Exhaustive search over `tuned_parameters` with 10-fold cross-validation,
# ranking candidates by weighted F1.
clf_tune = GridSearchCV(
    estimator=text_clf,
    param_grid=tuned_parameters,
    cv=10,
    scoring="f1_weighted",
)
clf_tune.fit(x_train, y_train)

# Per-class metrics for the search object's predictions on the held-out split.
y_test_predicted = clf_tune.predict(x_test)
print(classification_report(y_test, y_test_predicted, digits=4))

              precision    recall  f1-score   support

           0     0.4267    0.4486    0.4374       214
           1     0.6590    0.5455    0.5969       209
           2     0.6300    0.6810    0.6545       210
           3     0.5023    0.4672    0.4842       229
           4     0.6211    0.7260    0.6695       219
           5     0.6239    0.6104    0.6171       231
           6     0.5261    0.5068    0.5163       219

   micro avg     0.5689    0.5689    0.5689      1531
   macro avg     0.5699    0.5694    0.5680      1531
weighted avg     0.5694    0.5689    0.5675      1531



In [34]:
# Winning hyper-parameter combination, followed by its cross-validated score.
for search_result in (clf_tune.best_params_, clf_tune.best_score_):
    print(search_result)

{'vect__ngram_range': (1, 2), 'tfidf__use_idf': True, 'tfidf__norm': 'l2', 'clf__alpha': 1}
0.5512129556996596


In [52]:
# raw_input() only exists on Python 2; fall back to input() on Python 3.
try:
    read_text = raw_input
except NameError:
    read_text = input

test_sentence = read_text("Enter text:")
test_sen = [test_sentence]
# Encode the sentence with the vocabulary and IDF weights fitted earlier.
# NOTE(review): the sentence is not passed through clean_dataset() first.
bow = count_vect.transform(test_sen)
tf_test = tfidf_transformer.transform(bow)
# print() with a single argument behaves the same on Python 2 and Python 3.
print(tf_test)
# `clf` is the Naive Bayes model fitted on the sequential 80/20 split.
predicted_test = clf.predict(tf_test)
# Map the predicted integer back to its emotion name (shown as the cell output).
le.inverse_transform(predicted_test)

Enter text:call me
  (0, 4874)	0.37377239775751403
  (0, 1145)	0.9275204551246289


array(['anger'], dtype=object)