In [3]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/priya/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [4]:
# normalisation of the dataset
def normalize(X):
    from sklearn.preprocessing import StandardScaler
    X = StandardScaler().fit_transform(X)
    return X

In [5]:
#read the data set
data = pd.read_csv("./Data/phrases.csv")
# data=pd.DataFrame(data)
# print type(data)
print data['Emotion'].unique()

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


## We are applying an LSTM based model for the emotion prediction. a special kind of RNN, capable of learning long-term dependencies.

## Preprocess the data set via cleaning, tokenisation and lemmatization

In [6]:
# print data
def clean_dataset(data):
    translator = string.maketrans('', '')
    for index,row in data.iterrows():
        row['Phrase'] = row['Phrase'].replace('[','')
        row['Phrase'] = row['Phrase'].replace(']','')
        row['Phrase'] = row['Phrase'].strip()
        row['Phrase'] = row['Phrase'].translate(translator,string.punctuation)
    return data
data = clean_dataset(data)
print data.head()

   Emotion                                             Phrase
0      joy  On days when I feel closing to my partner and ...
1     fear  Every time I imagine that someone I love or I ...
2    anger  When I had been obviously unjustly treated and...
3  sadness  When I think about the short time that we live...
4  disgust  At a gathering I found myself involuntarily si...


In [8]:
def tokenise(data):
    ## Convert words to lower case and split them    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    
    for index,row in data.iterrows():
        text = row['Phrase'].lower().split(' ')
        text = [w.strip() for w in text if not w in stops and len(w) >= 2]
        text = " ".join(text)
    # split the dataset into tokens
    return data
data = tokenise(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,On days when I feel closing to my partner and ...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [9]:
def lemmatization(dataset):
    stemmer = SnowballStemmer('english')
    list_of_words=[]
    for index,row in dataset.iterrows():
        text = row['Phrase'].split()
        stemmed_words = [stemmer.stem(word) for word in text]
        list_of_words.append(stemmed_words)
        text = " ".join(stemmed_words)
        row['Phrase'] = text
    data['list_of_words'] = list_of_words
    return dataset
data = lemmatization(data)
data

Unnamed: 0,Emotion,Phrase,list_of_words
0,joy,on day when i feel close to my partner and oth...,"[on, day, when, i, feel, close, to, my, partne..."
1,fear,everi time i imagin that someon i love or i co...,"[everi, time, i, imagin, that, someon, i, love..."
2,anger,when i had been obvious unjust treat and had n...,"[when, i, had, been, obvious, unjust, treat, a..."
3,sadness,when i think about the short time that we live...,"[when, i, think, about, the, short, time, that..."
4,disgust,at a gather i found myself involuntarili sit n...,"[at, a, gather, i, found, myself, involuntaril..."
5,shame,when i realiz that i was direct the feel of di...,"[when, i, realiz, that, i, was, direct, the, f..."
6,guilt,i feel guilti when when i realiz that i consid...,"[i, feel, guilti, when, when, i, realiz, that,..."
7,joy,after my girlfriend had taken her exam we went...,"[after, my, girlfriend, had, taken, her, exam,..."
8,fear,when for the first time i realiz the mean of d...,"[when, for, the, first, time, i, realiz, the, ..."
9,anger,when a car is overtak anoth and i am forc to d...,"[when, a, car, is, overtak, anoth, and, i, am,..."


In [11]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, LSTM, Conv1D, MaxPooling1D, Dropout, Activation
from keras.layers.embeddings import Embedding
vocabulary_size = 20000
tokenizer = Tokenizer(num_words= vocabulary_size)
tokenizer.fit_on_texts(data['Phrase'])

sequences = tokenizer.texts_to_sequences(data['Phrase'])
tokenized_data = pad_sequences(sequences, maxlen=50)
# tokenized_data


In [12]:
from sklearn.preprocessing import OneHotEncoder
enc = OneHotEncoder(handle_unknown='ignore')
X = data['Emotion'].unique()
# print X
X=X.reshape(-1,1)
# print X
enc.fit(X)
enc.categories_
emotion_data=data['Emotion'].values
emotion_data=emotion_data.reshape(-1,1)
emotion_data=enc.transform(emotion_data).toarray()

In [13]:
train_phrase, validate_phrase = np.split(tokenized_data,[int(.8*len(tokenized_data))])
train_label,validate_emotion=np.split(emotion_data,[int(.8*len(emotion_data))])
# train_label