In [2]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from gensim.models import Word2Vec, KeyedVectors

import tensorflow as tf

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, Dropout, Activation, Input, Bidirectional, Conv1D, Conv2D, MaxPooling1D, MaxPooling2D, Flatten, TimeDistributed, GlobalMaxPooling1D, GlobalMaxPool1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.optimizers import Adam

from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from tensorflow import keras 
from tensorflow.python.keras import backend as k

In [3]:
# https://github.com/tensorflow/tensorflow/issues/33721
TF_FORCE_GPU_ALLOW_GROWTH=1
physical_devices = tf.config.list_physical_devices('GPU')
tf.config.experimental.set_memory_growth(physical_devices[0], True)
import os
os.environ["TF_FORCE_GPU_ALLOW_GROWTH"]="true"

Load IMDB dataset

In [4]:
# IMDB dataset
from keras.datasets import imdb

In [5]:
word_count = 5000
(X_train_imdb, y_train_imdb), (X_test_imdb, y_test_imdb) = imdb.load_data(num_words=word_count)
max_length = 500
X_train_imdb = sequence.pad_sequences(X_train_imdb, maxlen=max_length)
X_test_imdb = sequence.pad_sequences(X_test_imdb, maxlen=max_length)

Load my dataset

In [7]:
data_no_trans_stem = pd.read_csv('../data/preproc_no_trans_stem.csv')
#data_trans = pd.read_csv('../data/preproc_trans.csv')
data_stem = pd.read_csv('../data/preproc_stem.csv')
#data_trans_stem = pd.read_csv('../data/preproc_trans_stem.csv')

First, make sure the negative and positive comments are even in numbers.

In [8]:
data_no_trans_stem.rating.value_counts()

-1    3053
 0    1950
 1    1379
Name: rating, dtype: int64

Since there are more negatives, drop random negative sentiment comments.

Remove neutral sentiment comments from the training data, but keep it for word vector training.

In [9]:
w2v_no_trans_stem = data_no_trans_stem.copy()
w2v_stem = data_stem.copy()

data_no_trans_stem = data_no_trans_stem.loc[data_no_trans_stem.rating != -1]
data_stem = data_stem.loc[data_stem.rating != -1]

negative_indices = data_no_trans_stem.index[data_no_trans_stem.rating == 0].tolist()
diff = abs(np.diff(data_no_trans_stem.rating.value_counts().values)[0])
indices = np.random.choice(negative_indices, diff, replace=False)
data_no_trans_stem = data_no_trans_stem.drop(indices)
#data_trans = data_trans.drop(indices)
data_stem = data_stem.drop(indices)
#data_trans_stem = data_trans_stem.drop(indices)

In [10]:
sentences_no_trans_stem = [[word for word in str(body).split()] for body in data_no_trans_stem.body]
#sentences_trans = [[word for word in str(body).split()] for body in data_trans.body]
sentences_stem = [[word for word in str(body).split()] for body in data_stem.body]
#sentences_trans_stem = [[word for word in str(body).split()] for body in data_trans_stem.body]
sentences_w2v_no_trans_stem = [[word for word in str(body).split()] for body in w2v_no_trans_stem.body]
sentences_w2v_stem = [[word for word in str(body).split()] for body in w2v_stem.body]

In [11]:
# constants
seed = 1234
min_word_count = 1
random_state = 42

Word2vec model based on all datasets

In [12]:
#word2vec = KeyedVectors.load_word2vec_format(word_vectors_file, binary=True)
word2vec = Word2Vec(sentences=sentences_w2v_no_trans_stem, seed=random_state, min_count=min_word_count)
#word2vec =  Word2Vec.load('../models/word2vec.model')

In [13]:
word2vec.wv.most_similar('hate')

[('how', 0.9998462200164795),
 ('have', 0.9998425245285034),
 ('but', 0.9998337030410767),
 ('why', 0.9998334646224976),
 ('get', 0.99983149766922),
 ('laugh', 0.9998247027397156),
 ('game', 0.9998226761817932),
 ('much', 0.999820351600647),
 ('now', 0.999815046787262),
 ('should', 0.9998099207878113)]

In [14]:
pretrained_weights = word2vec.wv.vectors
vocab_size, emdedding_size = word2vec.wv.vectors.shape

Tokenizer based on all dataset

In [15]:
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(sentences_no_trans_stem)

Make sure the sentences are padded so that they all are the same length

In [16]:
X = tokenizer.texts_to_sequences(sentences_no_trans_stem)
X = sequence.pad_sequences(X)
Y = data_no_trans_stem.rating.values

Split the data  to 70% for training and 30% for testing. 10% of the training data goes into validation.

In [17]:
X_t, X_tt, Y_t, Y_tt = train_test_split(X, Y, test_size = 0.3, random_state = random_state)

Different NN architectures

In [18]:
def lstm_model(model, params):
    model.add(LSTM(params['lstm_units'], return_sequences=True, go_backwards=True))
    return model

def bilstm_model(model, params):
    model.add(Bidirectional(LSTM(params['lstm_units'], return_sequences=True)))
    return model

def cnn_model(model, params):
    model.add(Conv1D(params['filters'], params['kernels'], padding='same'))
    model.add(MaxPooling1D(pool_size=params['pools']))
    return model

def cnnlstm_model(model,params):
    model.add(cnn_model(model, params))
    model.add(lstm_model(model, params))
    return model

def cnnbilstm_model(model, params):
    model.add(cnn_model(model, params))
    model.add(bilstm_model(model, params))
    return model

Model building functions

In [19]:
def make_model(input_layer, add_layers, params):
    model = Sequential()
    model.add(input_layer)
    model = add_hidden_layers(model, params, add_layers)
    if params['timedist_output']:
        model.add(TimeDistributed(Dense(1, activation='sigmoid')))
    else:
        model.add(Dense(1, activation='sigmoid'))
    return model

def add_hidden_layers(model, params, add_layers):
    if add_layers == 'lstm':
        model = lstm_model(model, params)
    elif add_layers == 'bilstm':
        model = bilstm_model(model, params)
    elif add_layers == 'cnn':
        model = cnn_model(model, params)
    elif add_layers == 'cnnlstm':
        model = cnnlstm_model(model, params)
    elif add_layers == 'cnnbilstm':
        model = cnnbilstm_model(model, params)
    return model

def compile_model(model, params):
    model.compile(loss = 'binary_crossentropy', optimizer=str(params['optimizer']), metrics = ['accuracy'])
    return model

def fit_model(model, model_name, params, X_train, Y_train, X_val, Y_val):
    early_stopping = EarlyStopping(monitor='val_loss', patience=10)
    #path = '../models/checkpoints/'
    #param_values = "-".join(str(v) for v in params.values() if type(v) != Embedding)
    #print(param_values)
    #file_name = path+param_values+'.h5'
    #model_checkpoint = ModelCheckpoint(file_name, save_best_only=True, save_weights_only=True)
    callbacks = [early_stopping]#, model_checkpoint]
    hist = model.fit(X_train, Y_train, validation_data=(X_val, Y_val), epochs=30, batch_size=params['batch'], verbose = 2, callbacks=[callbacks])
    return model, hist

def cross_val(X, Y, model, params):
    temp_acc = 0
    best_model = None
    best_hist = None
    for train_index, val_index in sk.split(X,Y):
        X_train, X_val = X[train_index], X[val_index]
        y_train, y_val = Y[train_index], Y[val_index]
        model, hist = fit_model(model, params['models'], params, X_train, y_train, X_val, y_val)
        loss, acc = model.evaluate(X_tt, Y_tt, verbose = 2, batch_size = params['batch'])
        if acc > temp_acc:
            best_model = model
            best_hist = hist
            temp_acc = acc
    return best_hist, temp_acc, best_model
        
        

In [20]:
# Grid search parameters
optimizers = ['rmsprop', 'adam']
batch_sizes = [64, 256]
lstm_units = [32, 64, 128, 256]
filters = lstm_units
kernels = [2, 3]
pools = [2, 3]
timedist_output = [True, False]
embed_layers = [
    Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[word2vec.wv.vectors]), # with w2v vector weights
    Embedding(X.max()+1, 32, input_length=X.shape[1])
]

In [21]:
# Cross validation setup
sk = StratifiedKFold(n_splits = 3, random_state = random_state, shuffle = True) 

In [22]:
# For tracking best performing model
current_acc = 0
best_results = None

In [23]:
# For tracking the result of all the tests
results = []
params = dict()

In [23]:
# Gridsearch with cross validation performed. The average result is saved
models = ['lstm', 'bilstm']
for m in models:
    print('Running ', m, ' model')
    params['models'] = m
    for o in optimizers:
        params['optimizer'] = o
        for batch in batch_sizes:
            params['batch'] = batch
            for lu in lstm_units:
                params['lstm_units'] = lu
                for e in embed_layers:
                    for t in timedist_output:
                        params['timedist_output'] = t
                        model = make_model(e, m, params)
                        model = compile_model(model, params)
                        best_hist, temp_acc, best_model = cross_val(X_t, Y_t, model, params)
                        results.append([best_hist, temp_acc, best_model])
                        if temp_acc > current_acc:
                            current_acc = temp_acc
                            params['e'] = e
                            best_results = params

bilstm-adam-256-256-False
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30


Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30


In [24]:
results

[[<tensorflow.python.keras.callbacks.History at 0x268150bd208>,
  0.793963611125946,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x2680d750088>],
 [<tensorflow.python.keras.callbacks.History at 0x26850df4488>,
  0.8084225058555603,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x268416c5788>],
 [<tensorflow.python.keras.callbacks.History at 0x268724f95c8>,
  0.7979141473770142,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x26850e07808>],
 [<tensorflow.python.keras.callbacks.History at 0x2688494ce08>,
  0.8012832999229431,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x26872509f08>],
 [<tensorflow.python.keras.callbacks.History at 0x2688c618308>,
  0.8117917776107788,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x26880f877c8>],
 [<tensorflow.python.keras.callbacks.History at 0x2688c2c03c8>,
  0.803422212600708,
  <tensorflow.python.keras.engine.sequential.Sequential at 0x2688c4b7dc8>],
 [<tensorflow.python.keras.cal

In [25]:
results_np = np.array(results)
accuracy_results = results_np[:, 1]
indx = accuracy_results.argmax(axis=0)
hist, acc, model = results[indx]
print('YouTube dataset accuracy: %.4f' % acc)
import json
json.dump(hist.history, open('../models/history/bilstmadam256False.json', 'w'))

YouTube dataset accuracy: 0.8237


In [26]:
model_architecture = []
models = ['lstm', 'bilstm']
for m in models:
    params['models'] = m
    for o in optimizers:
        params['optimizer'] = o
        for batch in batch_sizes:
            params['batch'] = batch
            for lu in lstm_units:
                params['lstm_units'] = lu
                for e in embed_layers:
                    for t in timedist_output:
                        params['timedist_output'] = t
                        params['e'] = e
                        model_architecture.append(params)

In [27]:
best_model_architecture = model_architecture[indx]
# Since the shape of data is different, redo embedding layer
best_model_architecture['e'] = Embedding(word_count, 32, input_length=max_length)
print(best_model_architecture)
# Test best performing architecture on IMDB data
model = Sequential()
model.add(best_model_architecture['e'])
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer='adam', metrics = ['accuracy'])
model.fit(X_train_imdb, y_train_imdb, epochs=10, batch_size=256)
imdb_loss, imdb_acc = model.evaluate(X_test_imdb, y_test_imdb, verbose=0)
print("Accuracy: %.2f%%" % (imdb_acc*100))

{'models': 'bilstm', 'optimizer': 'adam', 'batch': 256, 'lstm_units': 256, 'timedist_output': False, 'e': <tensorflow.python.keras.layers.embeddings.Embedding object at 0x000001C933A02C48>, 'kernels': 2, 'pools': 2, 'filters': 256}
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Accuracy: 74.60%


In [28]:
results2 = []

In [None]:
# Gridsearch with cross validation performed. The average result is saved
models = ['cnn', 'cnnlstm', 'cnnbilstm']
for m in models:
    print('Running ', m, ' model')
    params['models'] = m
    for o in optimizers:
        params['optimizer'] = o
        for f in filters:
            params['filters'] = f
            for batch in batch_sizes:
                params['batch'] = batch
                if m == 'cnn':
                    for k in kernels:
                        params['kernels'] = k
                        for p in pools:
                            params['pools'] = p
                            for e in embed_layers:
                                for t in timedist_output:
                                    params['timedist_output'] = t
                                    model = make_model(e, m, params)
                                    model = compile_model(model, params)
                                    best_hist, temp_acc, best_model = cross_val(X_t, Y_t, model, params)
                                    results2.append([best_hist, temp_acc, best_model])
                                    if temp_acc > current_acc:
                                        current_acc = temp_acc
                                        params['e'] = e
                                        best_results = params
                else: #cnn lstm and cnn bilstm. saves iterations
                    for k in kernels:
                        params['kernels'] = k
                        for p in pools:
                            params['pools'] = p
                            for lu in lstm_units:
                                params['lstm_units'] = lu
                                for e in embed_layers:
                                    for t in timedist_output:
                                        params['timedist_output'] = t
                                        model = make_model(e, m, params)
                                        model = compile_model(model, params)
                                        best_hist, temp_acc, best_model = cross_val(X_t, Y_t, model, params)
                                        results2.append([best_hist, temp_acc, best_model])
                                        if temp_acc > current_acc:
                                            current_acc = temp_acc
                                            params['e'] = e
                                            best_results = params