In [7]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [8]:
data = pd.read_csv("./Data/phrases.csv")
# data=pd.DataFrame(data)
# print type(data)
data = data.dropna()
print data['Emotion'].unique()

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


In [9]:
# print data
def clean_dataset(data):
    translator = string.maketrans('', '')
    for index,row in data.iterrows():
#         print(type(row['Phrase']))
        row['Phrase'] = str(row['Phrase']).replace('[','')
        row['Phrase'] = str(row['Phrase']).replace(']','')
        row['Phrase'] = str(row['Phrase']).strip()
        row['Phrase'] = str(row['Phrase']).translate(translator,string.punctuation)
    return data
data = clean_dataset(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,On days when I feel closing to my partner and ...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [10]:
def tokenise(data):
    ## Convert words to lower case and split them    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    
    for index,row in data.iterrows():
        text = str(row['Phrase']).lower().split(' ')
        text = [w.strip() for w in text if not w in stops and len(w) >= 2]
        text = " ".join(text)
        row['Phrase'] = text
    # split the dataset into tokens
    return data
data = tokenise(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,days feel closing partner friends when feel pe...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility of eluc...
3,sadness,think short time live relate the periods life ...
4,disgust,gathering found involuntarily sitting next two...


In [11]:
def lemmatization(dataset):
    stemmer = SnowballStemmer('english')
    list_of_words=[]
    for index,row in dataset.iterrows():
        text = str(row['Phrase']).split()
        stemmed_words = [stemmer.stem(word) for word in text]
        list_of_words.append(stemmed_words)
        text = " ".join(stemmed_words)
        row['Phrase'] = text
    data['list_of_words'] = list_of_words
    return dataset
data = lemmatization(data)
data.head()

Unnamed: 0,Emotion,Phrase,list_of_words
0,joy,day feel close partner friend when feel peac a...,"[day, feel, close, partner, friend, when, feel..."
1,fear,everi time imagin someon love could contact se...,"[everi, time, imagin, someon, love, could, con..."
2,anger,obvious unjust treat possibl of elucid,"[obvious, unjust, treat, possibl, of, elucid]"
3,sadness,think short time live relat the period life th...,"[think, short, time, live, relat, the, period,..."
4,disgust,gather found involuntarili sit next two peopl ...,"[gather, found, involuntarili, sit, next, two,..."


In [12]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(data['Phrase'])

sequences = tokenizer.texts_to_sequences(data['Phrase'])
tokenized_data = pad_sequences(sequences, maxlen=50)

Using TensorFlow backend.


In [13]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X = data['Emotion'].unique()
# print X
X=X.reshape(-1,1)
# print X
enc.fit(X)
enc.categories_
emotion_data=data['Emotion'].values
emotion_data=emotion_data.reshape(-1,1)
emotion_data=enc.transform(emotion_data).toarray()

In [14]:
train_phrase, validate_phrase = np.split(tokenized_data,[int(.8*len(tokenized_data))])
train_label,validate_emotion=np.split(emotion_data,[int(.8*len(emotion_data))])
# l= str(data['Phrase']).split('\n')
# for index,row in data.iterrows():
#     if type(row['Phrase'])==float:
#         print(row['Phrase'])
print(tokenized_data.shape)
print(emotion_data.shape)

(7652, 50)
(7652, 7)


In [15]:
model = Sequential()
model.add(Embedding(20000, 100, input_length=50))
model.add(Conv1D (kernel_size = (4), filters = 40, activation='relu'))
print(model.input_shape)
print(model.output_shape)
model.add(MaxPooling1D(pool_size = (4), strides=(1)))
print(model.output_shape)
model.add(LSTM(70, dropout=0.2, return_sequences=True, recurrent_dropout=0.2))
model.add(LSTM(35, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(7, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(train_phrase, np.array(train_label), epochs=7)

Instructions for updating:
Colocations handled automatically by placer.
(None, 50)
(None, 47, 40)
(None, 44, 40)
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Epoch 1/7
Epoch 2/7
Epoch 3/7
Epoch 4/7
Epoch 5/7
Epoch 6/7
Epoch 7/7


<keras.callbacks.History at 0x7f364987af50>

In [16]:
predicted_emo=model.predict(validate_phrase)

In [17]:
test=(enc.inverse_transform(predicted_emo)).reshape(-1)
actual = (enc.inverse_transform(validate_emotion)).reshape(-1)
test

array(['disgust', 'fear', 'sadness', ..., 'disgust', 'guilt', 'fear'],
      dtype=object)

In [18]:
from sklearn.metrics import accuracy_score
accuracy_score(actual,test)

0.4800783801436969