In [1]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
import string
import nltk
nltk.download('stopwords')
from nltk.stem import SnowballStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /home/priya/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
data = pd.read_csv("./Data/phrases.csv")
# data=pd.DataFrame(data)
# print type(data)
print data['Emotion'].unique()

['joy' 'fear' 'anger' 'sadness' 'disgust' 'shame' 'guilt']


In [3]:
def clean_dataset(data):
    translator = string.maketrans('', '')
    for index,row in data.iterrows():
        row['Phrase'] = row['Phrase'].replace('[','')
        row['Phrase'] = row['Phrase'].replace(']','')
        row['Phrase'] = row['Phrase'].strip()
        row['Phrase'] = row['Phrase'].translate(translator,string.punctuation)
    return data
data = clean_dataset(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,On days when I feel closing to my partner and ...
1,fear,Every time I imagine that someone I love or I ...
2,anger,When I had been obviously unjustly treated and...
3,sadness,When I think about the short time that we live...
4,disgust,At a gathering I found myself involuntarily si...


In [4]:
def tokenise(data):
    ## Convert words to lower case and split them    
    ## Remove stop words
    stops = set(stopwords.words("english"))
    
    for index,row in data.iterrows():
        text = row['Phrase'].lower().split(' ')
        text = [w.strip() for w in text if not w in stops and len(w) >= 2]
        text = " ".join(text)
        row['Phrase'] = text
    # split the dataset into tokens
    return data
data = tokenise(data)
data.head()

Unnamed: 0,Emotion,Phrase
0,joy,days feel closing partner friends when feel pe...
1,fear,every time imagine someone love could contact ...
2,anger,obviously unjustly treated possibility of eluc...
3,sadness,think short time live relate the periods life ...
4,disgust,gathering found involuntarily sitting next two...


In [5]:
def lemmatization(dataset):
    stemmer = SnowballStemmer('english')
    list_of_words=[]
    for index,row in dataset.iterrows():
        text = row['Phrase'].split()
        stemmed_words = [stemmer.stem(word) for word in text]
        list_of_words.append(stemmed_words)
        text = " ".join(stemmed_words)
        row['Phrase'] = text
    data['list_of_words'] = list_of_words
    return dataset
data = lemmatization(data)
print((data['Phrase']).tolist())
data.head()

[u'day feel close partner friend when feel peac also experi close contact peopl regard great', u'everi time imagin someon love could contact serious ill even death', u'obvious unjust treat possibl of elucid', u'think short time live relat the period life think use short time', u'gather found involuntarili sit next two peopl express opinion consid low discrimin', u'realiz direct feel discont with partner way tri put blame on instead sort feeli', u'feel guilti realiz consid materi thing more import care relat feel selfcent', u'girlfriend taken exam went parent place', u'first time realiz mean death', u'car overtak anoth forc drive road', u'recent thought hard work take studi how one want tri someth els read theoret book english understand', u'found bristl liver past tube', u'tire unmotiv shout girlfriend and brought negat side charact actual not import', u'think studi enough weekend think abl accomplish someth time', u'pass examin think well', u'one arrang meet someon person arriv late m

Unnamed: 0,Emotion,Phrase,list_of_words
0,joy,day feel close partner friend when feel peac a...,"[day, feel, close, partner, friend, when, feel..."
1,fear,everi time imagin someon love could contact se...,"[everi, time, imagin, someon, love, could, con..."
2,anger,obvious unjust treat possibl of elucid,"[obvious, unjust, treat, possibl, of, elucid]"
3,sadness,think short time live relat the period life th...,"[think, short, time, live, relat, the, period,..."
4,disgust,gather found involuntarili sit next two peopl ...,"[gather, found, involuntarili, sit, next, two,..."


In [9]:
from __future__ import print_function
import os
import sys
from keras.models import Sequential
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense,LSTM, Input, GlobalMaxPooling1D,Flatten,Activation
from keras.layers import Conv1D, MaxPooling1D, Embedding
from keras.models import Model
from keras.initializers import Constant

def glove(data):
    BASE_DIR = ''
    GLOVE_DIR = os.path.join(BASE_DIR, 'glove.6B')
    TEXT_DATA_DIR = os.path.join(BASE_DIR, 'Data')
    MAX_SEQUENCE_LENGTH = 50
    MAX_NUM_WORDS = 20000
    EMBEDDING_DIM = 300
    VALIDATION_SPLIT = 0.2

    # first, build index mapping words in the embeddings set
    # to their embedding vector

    print('Indexing word vectors.')

    embeddings_index = {}
    with open(os.path.join(GLOVE_DIR, 'glove.6B.300d.txt')) as f:
        for line in f:
            word, coefs = line.split(' ',1)#maxsplit=1
            coefs = np.fromstring(coefs, 'f', sep=' ')
            embeddings_index[word] = coefs

    print('Found %s word vectors.' % len(embeddings_index))

    # second, prepare text samples and their labels
    print('Processing text dataset')

    texts = []  # list of text samples
    labels_index = {}  # dictionary mapping label name to numeric id
    labels = []  # list of label ids
    texts = (data['Phrase']).tolist()
    labels = data['Emotion'].tolist()
    for x in data['Emotion'].unique():
        label_id = len(labels_index)
        labels_index[x] = label_id
    
    from sklearn import preprocessing
    le = preprocessing.LabelEncoder()
    labels = le.fit_transform(labels)

    print('Found %s texts.' % len(texts))

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=MAX_SEQUENCE_LENGTH)
    print('Shape of data tensor:', data.shape)
    print('Shape of label tensor:', labels.shape)

    # split the data into a training set and a validation set
    indices = np.arange(data.shape[0])
    np.random.shuffle(indices)
    data = data[indices]
    labels = labels[indices]
    num_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

    x_train = data[:-num_validation_samples]
    y_train = labels[:-num_validation_samples]
    print('label', y_train)
    x_val = data[-num_validation_samples:]
    y_val = labels[-num_validation_samples:]

    print('Preparing embedding matrix.')

    # prepare embedding matrix
    num_words = min(MAX_NUM_WORDS, len(word_index)) + 1
    embedding_matrix = np.zeros((num_words, EMBEDDING_DIM))
    for word, i in word_index.items():
        if i > MAX_NUM_WORDS:
            continue
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            # words not found in embedding index will be all-zeros.
            embedding_matrix[i] = embedding_vector

    # load pre-trained word embeddings into an Embedding layer
    # note that we set trainable = False so as to keep the embeddings fixed
    embedding_layer = Embedding(num_words,
                                EMBEDDING_DIM,
                                embeddings_initializer=Constant(embedding_matrix),
                                input_length=MAX_SEQUENCE_LENGTH,
                                trainable=False)

    print('Training model.')
    model = Sequential()
    model.add(embedding_layer)
#     model.add(Conv1D (kernel_size = (4), filters = 40, activation='relu'))
#     model.add(MaxPooling1D(pool_size = (4), strides=(1)))
#     print( model.output_shape)
#     model.add(LSTM(70, dropout=0.2, return_sequences=True, recurrent_dropout=0.2))
#     model.add(LSTM(35, dropout=0.2, recurrent_dropout=0.2))
#     model.add(Dense(7, activation='softmax'))
#     model.compile(loss='sparse_categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
#     model.fit(x_train, y_train,
#               epochs=20,
#     validation_data=(x_val, y_val))
    model.add(LSTM(512, return_sequences=True))
    model.add(LSTM(256, return_sequences=False))
    model.add(Dense(512))
    model.add(Activation('relu'))
    model.add(Dense(7))
    model.add(Activation('softmax'))


    #sgd = optimizers.SGD(lr=0.001, decay=1e-6, momentum=0.9, nesterov=True)
    model.compile(loss='sparse_categorical_crossentropy',optimizer='adam' ,metrics=['acc'])
    model.fit(x_train, y_train,
              epochs=30,
              batch_size=100,
    validation_data=(x_val, y_val))
## compille it here according to instructions

#model.compile()
    model.summary()

In [None]:
glove(data)