In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#read the data set
data = pd.read_csv("./Data/phrases.csv")
data=data.dropna()
# print type(data)
print data['Emotion'].unique()

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


## We are applying an LSTM based model for the emotion prediction. a special kind of RNN, capable of learning long-term dependencies.

## Preprocess the data set via cleaning, tokenisation and lemmatization

In [3]:
# print data
def clean_dataset(data):
    translator = string.maketrans('', '')
    for index,row in data.iterrows():
        row['Phrase'] = row['Phrase'].replace('[','')
        row['Phrase'] = row['Phrase'].replace(']','')
        row['Phrase'] = row['Phrase'].strip()
        row['Phrase'] = row['Phrase'].translate(translator,string.punctuation)
    return data
data = clean_dataset(data)
print data.head()

   Emotion                                             Phrase
0      joy  On days when I feel closing to my partner and ...
1     fear  Every time I imagine that someone I love or I ...
2    anger  When I had been obviously unjustly treated and...
3  sadness  When I think about the short time that we live...
4  disgust  At a gathering I found myself involuntarily si...


In [4]:
def tokenise(data):
    ## Convert words to lower case and split them    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    
    for index,row in data.iterrows():
        text = row['Phrase'].lower().split(' ')
        text = [w.strip() for w in text if not w in stops and len(w) >= 2]
        text = " ".join(text)
        row['Phrase'] = text
    # split the dataset into tokens
    return data
data = tokenise(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,days feel closing partner friends when feel pe...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility of eluc...
3,sadness,think short time live relate the periods life ...
4,disgust,gathering found involuntarily sitting next two...


In [5]:
def lemmatization(dataset):
    stemmer = SnowballStemmer('english')
    list_of_words=[]
    for index,row in dataset.iterrows():
        text = row['Phrase'].split()
        stemmed_words = [stemmer.stem(word) for word in text]
        list_of_words.append(stemmed_words)
        text = " ".join(stemmed_words)
        row['Phrase'] = text
    data['list_of_words'] = list_of_words
    return dataset
data = lemmatization(data)
data

Unnamed: 0,Emotion,Phrase,list_of_words
0,joy,day feel close partner friend when feel peac a...,"[day, feel, close, partner, friend, when, feel..."
1,fear,everi time imagin someon love could contact se...,"[everi, time, imagin, someon, love, could, con..."
2,anger,obvious unjust treat possibl of elucid,"[obvious, unjust, treat, possibl, of, elucid]"
3,sadness,think short time live relat the period life th...,"[think, short, time, live, relat, the, period,..."
4,disgust,gather found involuntarili sit next two peopl ...,"[gather, found, involuntarili, sit, next, two,..."
5,shame,realiz direct feel discont with partner way tr...,"[realiz, direct, feel, discont, with, partner,..."
6,guilt,feel guilti realiz consid materi thing more im...,"[feel, guilti, realiz, consid, materi, thing, ..."
7,joy,girlfriend taken exam went parent place,"[girlfriend, taken, exam, went, parent, place]"
8,fear,first time realiz mean death,"[first, time, realiz, mean, death]"
9,anger,car overtak anoth forc drive road,"[car, overtak, anoth, forc, drive, road]"


In [6]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(data['Phrase'])

sequences = tokenizer.texts_to_sequences(data['Phrase'])
tokenized_data = pad_sequences(sequences, maxlen=50)
# tokenized_data


Using TensorFlow backend.


In [7]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X = data['Emotion'].unique()
# print X
X=X.reshape(-1,1)
# print X
enc.fit(X)
enc.categories_
emotion_data=data['Emotion'].values
emotion_data=emotion_data.reshape(-1,1)
emotion_data=enc.transform(emotion_data).toarray()

In [8]:
train_phrase, validate_phrase = np.split(tokenized_data,[int(.8*len(tokenized_data))])
test_t,test_v = np.split(data['Phrase'],[int(.8*len(data['Phrase']))])
train_label,validate_emotion=np.split(emotion_data,[int(.8*len(emotion_data))])
# train_label

In [9]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=50))
model.add(LSTM(100, dropout=0.2, return_sequences=True, recurrent_dropout=0.2))
model.add(LSTM(100, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(7, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_phrase, np.array(train_label), epochs=5,  validation_data=(validate_phrase, validate_emotion))

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Train on 6121 samples, validate on 1531 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f0fecbc6450>

In [10]:
predicted_emo=model.predict(validate_phrase)

In [11]:
test=(enc.inverse_transform(predicted_emo)).reshape(-1)
actual = (enc.inverse_transform(validate_emotion)).reshape(-1)
test

array(['shame', 'anger', 'sadness', ..., 'disgust', 'shame', 'fear'],
      dtype=object)

In [12]:
from sklearn.metrics import accuracy_score
accuracy_score(actual,test)

0.5225342913128674

In [13]:
def data_preprocess(text):
#     cleaning
    translator = string.maketrans('', '')
    text = text.replace('[','')
    text = text.replace(']','')
    text = text.strip()
    text = text.translate(translator,string.punctuation)
#     stop word removal
    stops = set(stopwords.words("english"))
    text = text.lower().split(' ')
    text = [w.strip() for w in text if not w in stops and len(w) >= 2]
    text = " ".join(text)
#     lemmatization
    stemmer = SnowballStemmer('english')
    text = text.split()
    stemmed_words = [stemmer.stem(word) for word in text]
    text = " ".join(stemmed_words)
    
    return text

In [29]:
sequences = tokenizer.texts_to_sequences([data_preprocess('I am happy!')])
test_data = pad_sequences(sequences, maxlen=50)
pred_emo = model.predict(test_data)
test_op = (enc.inverse_transform(pred_emo)).reshape(-1)
test_op

array(['joy'], dtype=object)