In [1]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from gensim.models import Word2Vec, KeyedVectors

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

In [2]:
data_no_trans_stem = pd.read_csv('../data/preproc_no_trans_stem.csv')
data_trans = pd.read_csv('../data/preproc_trans.csv')
data_stem = pd.read_csv('../data/preproc_stem.csv')
data_trans_stem = pd.read_csv('../data/preproc_trans_stem.csv')

First, make sure the negative and positive comments are even in numbers.

In [3]:
data_no_trans_stem.rating.value_counts()

0    1698
1    1204
Name: rating, dtype: int64

Since there are more negatives, drop random negative sentiment comments

In [4]:
negative_indices = data_no_trans_stem.index[data_no_trans_stem.rating == 0].tolist()
diff = abs(np.diff(data_no_trans_stem.rating.value_counts().values)[0])
indices = np.random.choice(negative_indices, diff, replace=False)
data_no_trans_stem = data_no_trans_stem.drop(indices)
data_trans = data_trans.drop(indices)
data_stem = data_stem.drop(indices)
data_trans_stem = data_trans_stem.drop(indices)

In [5]:
sentences_no_trans_stem = [[word for word in str(body).split()] for body in data_no_trans_stem.body]
sentences_trans = [[word for word in str(body).split()] for body in data_trans.body]
sentences_stem = [[word for word in str(body).split()] for body in data_stem.body]
sentences_trans_stem = [[word for word in str(body).split()] for body in data_trans_stem.body]

In [7]:
# constants
seed = 1234
min_word_count = 1
random_state = 42
word_vectors_file = 'GoogleNews-vectors-negative300.bin'

Word2vec model based on all datasets

In [8]:
#word2vec = KeyedVectors.load_word2vec_format(word_vectors_file, binary=True)

In [20]:
word2vec = Word2Vec(
#sentences=sentences_no_trans_stem+sentences_trans+sentences_stem+sentences_trans_stem,
sentences=sentences_no_trans_stem,
seed=seed,
min_count=min_word_count,
)

In [23]:
word2vec.wv.most_similar('hate')

[('just', 0.9971455335617065),
 ('video', 0.9971255660057068),
 ('why', 0.9971076846122742),
 ('he', 0.9970792531967163),
 ('was', 0.9970666170120239),
 ('for', 0.9969887733459473),
 ('like', 0.9969620704650879),
 ('not', 0.9968778491020203),
 ('your', 0.9968664646148682),
 ('so', 0.9968242049217224)]

In [25]:
pretrained_weights = word2vec.wv.vectors
vocab_size, emdedding_size = word2vec.wv.vectors.shape

Tokenizer based on all dataset

In [26]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_no_trans_stem)

In [27]:
words = tokenizer.word_index

Creating word embeddings

In [17]:
# used if pretrained word2vec model is used
#embeddings = np.zeros((vocab_size, emdedding_size))
#for word, i in words.items():
#    if word in word2vec.vocab:
#        embeddings[i-1] = word2vec[word]
#print('Null word embeddings: ',  np.sum(np.sum(embeddings, axis=1) == 0))

In [40]:
def build_model(X_train, Y_train, X_val, Y_val, es_patience=10, epochs=100, batch_size=128, optimizer='adam'):
    model = Sequential()
    model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[word2vec.wv.vectors]))
    model.add(LSTM(emdedding_size, activation='sigmoid', return_sequences=True))
    model.add(Dropout(0.5))
    model.add(LSTM(64))
    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss = 'binary_crossentropy', optimizer=optimizer, metrics = ['accuracy'])
    
    early_stopping = EarlyStopping(monitor='val_loss', patience=es_patience)
    hist = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), 
                 epochs=epochs, batch_size=batch_size, verbose = 3, callbacks=[early_stopping])
    return model, hist

In [29]:
X = tokenizer.texts_to_sequences(sentences_no_trans_stem)
X = pad_sequences(X)
Y = data_trans_stem.rating.values

In [30]:
#X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = random_state)
#X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state = random_state)
index = int(len(X) * 0.8)
# train set
X_t = X[:index]
Y_t = Y[:index]
# test set
X_tt = X[index:]
Y_tt = Y[index:]

In [55]:
# grid search parameters
optimizers = ['rmsprop', 'adam']
es_patience = [2, 6, 10]
epochs = [50]
batch_sizes = [8, 16, 32, 64]

In [41]:
# Cross validation setup
sk = StratifiedKFold(n_splits = 5, random_state = random_state, shuffle = True) 

In [56]:
results = []
for o in optimizers:
    optimizer_results = []
    for train_index, val_index in sk.split(X_t,Y_t):
        X_train, X_val = X_t[train_index], X_t[val_index]
        y_train, y_val = Y_t[train_index], Y_t[val_index]
        model, hist = build_model(X_train, y_train, X_val, y_val, 6, 50, 16, o)
        loss, acc = model.evaluate(X_val, y_val, verbose = 4, batch_size = 32)
        optimizer_results.append([loss, acc])
    results.append(optimizer_results)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Ep

In [61]:
# adam vs rmsprop
rms_results = np.array(results[0])
adam_results = np.array(results[1])
print('Average rmsprop: ', np.mean(rms_results[:,1]))
print('Average adam: ',  np.mean(adam_results[:,1]))

Average rmsprop:  0.8100890040397644
Average adam:  0.7934718012809754


In [240]:
best_results = {
    'optimizer': '',
    'es_patience': 0,
    'epochs': 0,
    'batches': 0
}
curr_acc = 0
curr_loss = 999

In [242]:
#Gridsearch
for o in optimizers:
    for es in es_patience:
        for e in epochs:
            for b in batch_sizes:
                print('Optimizer: ', o, ', ES: ', es, ', epochs: ', e, ', batches: ', b)
                #cross validation
                for train_index, val_index in sk.split(X_t,Y_t):
                    X_train, X_val = X_t[train_index], X_t[val_index]
                    y_train, y_val = Y_t[train_index], Y_t[val_index]
                    model, hist = build_model(X_train, y_train, X_val, y_val, es, e, b, o)
                    loss, acc = model.evaluate(X_val, y_val, verbose = 4, batch_size = 32)
                    if acc > curr_acc:
                        best_results['optimizer'] = o
                        best_results['es_patience'] = es
                        best_results['epochs'] = e
                        best_results['batches'] = b
                        curr_acc = acc
                        curr_loss = loss
                        

Optimizer:  rmsprop , ES:  2 , epochs:  50 , batches:  8
Optimizer:  rmsprop , ES:  2 , epochs:  50 , batches:  16
Optimizer:  rmsprop , ES:  2 , epochs:  50 , batches:  32
Optimizer:  rmsprop , ES:  2 , epochs:  50 , batches:  64
Optimizer:  rmsprop , ES:  6 , epochs:  50 , batches:  8
Optimizer:  rmsprop , ES:  6 , epochs:  50 , batches:  16
Optimizer:  rmsprop , ES:  6 , epochs:  50 , batches:  32
Optimizer:  rmsprop , ES:  6 , epochs:  50 , batches:  64
Optimizer:  rmsprop , ES:  10 , epochs:  50 , batches:  8
Optimizer:  rmsprop , ES:  10 , epochs:  50 , batches:  16
Optimizer:  rmsprop , ES:  10 , epochs:  50 , batches:  32
Optimizer:  rmsprop , ES:  10 , epochs:  50 , batches:  64
Optimizer:  adam , ES:  2 , epochs:  50 , batches:  8
Optimizer:  adam , ES:  2 , epochs:  50 , batches:  16
Optimizer:  adam , ES:  2 , epochs:  50 , batches:  32
Optimizer:  adam , ES:  2 , epochs:  50 , batches:  64
Optimizer:  adam , ES:  6 , epochs:  50 , batches:  8
Optimizer:  adam , ES:  6 , ep

In [243]:
best_results

{'optimizer': 'rmsprop', 'es_patience': 6, 'epochs': 50, 'batches': 16}

In [244]:
curr_acc

0.7567567825317383

In [245]:
curr_loss

0.6021022796630859