In [13]:
import pandas as pd
import numpy as np
import os
import nltk
from nltk.corpus import stopwords
import gensim, logging

In [2]:
# Load the train and test tables from tab-separated files.
# `DataFrame.from_csv` was deprecated and later removed from pandas;
# `read_csv(..., index_col=0, parse_dates=True)` reproduces its defaults
# (first column becomes the index).
tr_f = './Data/train.tsv'
train = pd.read_csv(tr_f, sep='\t', index_col=0, parse_dates=True)
te_f = './Data/test.tsv'
test = pd.read_csv(te_f, sep='\t', index_col=0, parse_dates=True)

In [3]:
from nltk.tokenize import RegexpTokenizer

def tokenize_stopwords(df):
    """Split one row's phrase into lower-cased word tokens, minus excluded words.

    Parameters
    ----------
    df : row-like
        A single row (the function is applied with ``axis=1``); only
        ``df['Phrase']`` is read.

    Returns
    -------
    list of str
        The lower-cased tokens, in order of appearance, excluding every entry
        of the module-level ``english_sw`` and the tokens ``'rrb'`` / ``'lrb'``.
    """
    # The pattern keeps runs of word characters only, so punctuation is dropped.
    tokenizer = RegexpTokenizer(r'\w+')
    # Build the exclusion set once per call; the original concatenated a new
    # list and scanned it linearly for every single token.
    excluded = set(english_sw) | {'rrb', 'lrb'}
    lowered = (token.lower() for token in tokenizer.tokenize(df['Phrase']))
    return [token for token in lowered if token not in excluded]

def keep_first(group):
    """Collapse a group to a Series holding its first ``Phrase`` and ``Sentiment``.

    ``group`` must support ``group[column].iloc[0]`` for both columns, e.g. the
    per-key DataFrame handed over by ``groupby(...).apply``.
    """
    first_values = {}
    for column in ("Phrase", "Sentiment"):
        first_values[column] = group[column].iloc[0]
    return pd.Series(first_values)

In [4]:
# Stack the train and test frames into a single frame.
full = pd.concat([train, test])

In [5]:
# Keep a copy of the concatenated frame; `full` is rebound by a later cell.
datas = full.copy()

In [6]:
# Reduce to one row per SentenceId, keeping the first Phrase/Sentiment of each group.
# NOTE: `full` is rebound here; the un-grouped rows remain available in `datas`.
full = full.groupby("SentenceId").apply(keep_first)

In [7]:
# Extra words for tokenize_stopwords to drop. Left empty here, so only the
# hard-coded 'rrb' / 'lrb' tokens are filtered out.
english_sw = []

In [8]:
# Tokenize every row's Phrase and store the resulting token list in a new column.
full['Phrase tokenized'] = full.apply(tokenize_stopwords, axis=1)

In [9]:
# Preview the first rows, including the new 'Phrase tokenized' column.
full.head()

Unnamed: 0_level_0,Phrase,Sentiment,Phrase tokenized
SentenceId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,A series of escapades demonstrating the adage ...,1.0,"[a, series, of, escapades, demonstrating, the,..."
2,"This quiet , introspective and entertaining in...",4.0,"[this, quiet, introspective, and, entertaining..."
3,"Even fans of Ismail Merchant 's work , I suspe...",1.0,"[even, fans, of, ismail, merchant, s, work, i,..."
4,A positively thrilling combination of ethnogra...,3.0,"[a, positively, thrilling, combination, of, et..."
5,Aggressive self-glorification and a manipulati...,1.0,"[aggressive, self, glorification, and, a, mani..."


In [10]:
# One token list per sentence; passed to Word2Vec as the training corpus.
sentences = full["Phrase tokenized"]

In [None]:
# Train a Word2Vec model on the token lists, with the minimum word count set to 1.
model = gensim.models.Word2Vec(sentences, min_count=1)

In [None]:
def vectorize(row):
    """Look up the Word2Vec vector of every token in ``row['Phrase tokenized']``.

    Reads the module-level ``model`` (the trained Word2Vec model).

    Returns
    -------
    list
        One vector per token, in token order.
    """
    # Index through `model.wv`: item access on the Word2Vec object itself is
    # deprecated in gensim and no longer available in gensim 4.
    word_vectors = model.wv
    return [word_vectors[word] for word in row['Phrase tokenized']]

In [None]:
# Attach the list of per-token vectors to each row.
full["vectors"] = full.apply(vectorize, axis = 1)

In [None]:
# Preview the first rows, including the new 'vectors' column.
full.head()

In [None]:
# Similarity between two words under the trained embedding.
# Queried via `model.wv`; the same method on the Word2Vec object itself is
# deprecated in gensim and no longer available in gensim 4.
model.wv.similarity('good', 'bad')

## Using pre-trained GloVe 

In [25]:
from keras.utils import np_utils

In [11]:
from collections import Counter
import operator

# Flatten all tokenized phrases into one list and count each token.
words = [token for tokens in full['Phrase tokenized'] for token in tokens]
dic1 = Counter(words)
print(len(dic1))

# (token, count) pairs ordered from most to least frequent.
sorted_words = dic1.most_common()
maxDictLength = len(dic1)
# Ids are assigned by frequency rank starting at 2; id 1 is kept for
# out-of-vocabulary tokens (`oovf`).
word_dict = {token: rank for rank, (token, _) in enumerate(sorted_words, start=2)}
oovf = 1

def words_to_dict(row):
    """Map each token of ``row["Phrase tokenized"]`` to its id in ``word_dict``.

    Tokens missing from ``word_dict`` receive the ``oovf`` id. Every id is
    wrapped in its own one-element list, so the result looks like
    ``[[id], [id], ...]``.
    """
    # NOTE(review): the per-token wrapping list looks unintentional — confirm
    # whether a flat list of ids was meant before relying on this column.
    return [[word_dict.get(token, oovf)] for token in row["Phrase tokenized"]]

17691


In [None]:
# Store the per-token dictionary ids of each row in a new column.
full["Dict values"] = full.apply(words_to_dict, axis=1)

In [53]:
# Parse the pre-trained GloVe text file into a {word: float32 vector} lookup.
embeddings_index = {}
GLOVE_DIR = 'Data/glove.6B/'
# `with` guarantees the file is closed even if a line fails to parse, and the
# explicit encoding avoids depending on the platform default.
with open(os.path.join(GLOVE_DIR, 'glove.6B.100d.txt'), encoding='utf-8') as f:
    for line in f:
        # Each line is the word followed by its whitespace-separated coefficients.
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))


Found 400000 word vectors.


In [60]:
# NOTE: `word_index` is produced by the Keras Tokenizer cell that appears
# further down in this notebook; that cell has to run before this one.
EMBEDDING_DIM = 100
# One row per tokenizer index (plus one extra row), initialised to zeros.
embedding_matrix = np.zeros((len(word_index) + 1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all-zero.
        continue
    embedding_matrix[i] = embedding_vector

In [54]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

# Fit a Keras tokenizer on the training phrases and encode each phrase as a
# list of word indices.
# `num_words` is the Keras 2 name of the legacy `nb_words` argument.
tokenizer = Tokenizer(num_words=15000)
tokenizer.fit_on_texts(train["Phrase"])
sequences = tokenizer.texts_to_sequences(train["Phrase"])

# Word -> index mapping learned by the tokenizer; its size is reported below.
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))

Found 15288 unique tokens.


## Keras Neural Network

In [55]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
#from keras.layers import Convolution2D, MaxPooling2D
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers import GlobalAveragePooling1D, GlobalMaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence
from keras.utils import np_utils

In [57]:
# Pad the index sequences into one 2-D array and turn the Sentiment column
# into a categorical (one column per class) label matrix.
x = pad_sequences(sequences)
labels = np_utils.to_categorical(np.asarray(train["Sentiment"]))

In [58]:
# Sanity check: (number of phrases, padded sequence length).
print(x.shape)

(156060, 49)


In [65]:
# Shuffle the row indices and split them into a training and a held-out part.
# `tr_ratio` was not defined in any visible cell; 0.8 reproduces the recorded
# split (124848 of 156060 rows for training).
tr_ratio = 0.8
# permutation(n) shuffles range(n) directly, no explicit arange needed.
idx = np.random.permutation(x.shape[0])
ll = int(tr_ratio*x.shape[0])

# First `ll` shuffled rows are used for training...
x_train = x[idx[:ll]]
y_train = labels[idx[:ll]]

# ...and the remaining rows for validation.
x_test = x[idx[ll:]]
y_test = labels[idx[ll:]]

In [66]:
# Padded sequence length; passed to the Embedding layer as `input_length`.
max_words = x.shape[1]

In [67]:
# Sanity check: shapes of the training inputs and their label matrix.
print(x_train.shape)
print(y_train.shape)

(124848, 49)
(124848, 5)


[   0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0 4070 5049]


In [68]:
# Build the classifier: frozen GloVe embedding -> 1-D convolution -> global
# max pooling -> dense hidden layer -> 5-way softmax.
# NOTE: this rebinds `model`, which earlier in the notebook referred to the
# Word2Vec model.
model = Sequential()

# Embedding weights come from the pre-built matrix and are not trained.
model.add(Embedding(len(word_index) + 1, EMBEDDING_DIM, weights=[embedding_matrix], input_length=max_words, trainable=False))
model.add(Dropout(0.2))

# `padding` is the Keras 2 name of the legacy `border_mode` argument.
model.add(Conv1D(128, 3, padding='valid', activation='relu'))

model.add(GlobalMaxPooling1D())

model.add(Dense(250))
model.add(Dropout(0.2))
model.add(Activation('relu'))

# One output unit per class.
model.add(Dense(5, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer = 'adam', metrics=['categorical_accuracy'])

In [51]:
from sklearn.utils import class_weight
# `y_train_int` was not defined in any visible cell; recover the integer class
# ids from the one-hot training labels.
y_train_int = np.argmax(y_train, axis=1)
# 'balanced' class weights for the training labels. `classes` and `y` are
# passed by keyword because newer scikit-learn releases require that.
class_w = class_weight.compute_class_weight('balanced', classes=np.unique(y_train_int), y=y_train_int)

In [71]:
# Train for 5 epochs, validating on the held-out split.
# `epochs` is the Keras 2 name of the legacy `nb_epoch` argument.
# Keras expects `class_weight` to be a {class index: weight} dict; the string
# 'auto' is not such a dict and was not applied as weighting, which left the
# computed `class_w` unused. Pass the balanced weights explicitly.
model.fit(x_train, y_train, validation_data=(x_test, y_test), epochs=5, batch_size=32, class_weight=dict(enumerate(class_w)), verbose=1)

Train on 124848 samples, validate on 31212 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f1687e9add8>