In [117]:
import pandas as pd
pd.options.mode.chained_assignment = None
import numpy as np

from gensim.models import Word2Vec

from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, Dropout, Activation, Input
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers.merge import concatenate
from keras.layers.normalization import BatchNormalization
from keras.models import Model
from keras.callbacks import EarlyStopping
from keras.optimizers import Adam

from sklearn.model_selection import train_test_split

In [2]:
data_no_trans_stem = pd.read_csv('../data/preproc_no_trans_stem.csv')
data_trans = pd.read_csv('../data/preproc_trans.csv')
data_stem = pd.read_csv('../data/preproc_stem.csv')
data_trans_stem = pd.read_csv('../data/preproc_trans_stem.csv')

First, make sure the negative and positive comments are even in numbers.

In [3]:
data_no_trans_stem.rating.value_counts()

0    778
1    555
Name: rating, dtype: int64

Since there are more negatives, drop random negative sentiment comments

In [4]:
negative_indices = data_no_trans_stem.index[data_no_trans_stem.rating == 0].tolist()
diff = abs(np.diff(data_no_trans_stem.rating.value_counts().values)[0])
indices = np.random.choice(negative_indices, diff, replace=False)
data_no_trans_stem = data_no_trans_stem.drop(indices)
data_trans = data_trans.drop(indices)
data_stem = data_stem.drop(indices)
data_trans_stem = data_trans_stem.drop(indices)

In [5]:
sentences_no_trans_stem = [[word for word in str(body).split()] for body in data_no_trans_stem.body]
sentences_trans = [[word for word in str(body).split()] for body in data_trans.body]
sentences_stem = [[word for word in str(body).split()] for body in data_stem.body]
sentences_trans_stem = [[word for word in str(body).split()] for body in data_trans_stem.body]

In [6]:
# constants
seed = 1234
min_word_count = 1
random_state = 42

Word2vec model based on all datasets

In [27]:
word2vec = Word2Vec(
sentences=sentences_no_trans_stem+sentences_trans+sentences_stem+sentences_trans_stem,
seed=seed,
min_count=min_word_count,
)

In [28]:
pretrained_weights = word2vec.wv.vectors
vocab_size, emdedding_size = word2vec.wv.vectors.shape

Tokenizer based on all dataset

In [108]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences_trans_stem)

In [109]:
words = tokenizer.word_index

Creating word embeddings

In [110]:
embeddings = np.zeros((vocab_size, emdedding_size))
for word, i in words.items():
    if word in word2vec.wv.vocab:
        embeddings[i-1] = word2vec.wv[word]

In [118]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=emdedding_size, weights=[word2vec.wv.vectors]))
model.add(LSTM(units=emdedding_size))
model.add(BatchNormalization())
model.add(Dense(16, activation='relu'))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss = 'binary_crossentropy', optimizer=Adam(learning_rate = 1e-4), metrics = ['accuracy'])

In [119]:
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

In [130]:
X = tokenizer.texts_to_sequences(sentences_trans_stem)
X = pad_sequences(X)
Y = data_trans_stem.rating.values

In [131]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.3, random_state = random_state)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size = 0.1, random_state = random_state)

In [132]:
model.fit(X_train, Y_train, validation_data=(X_val, Y_val), 
                 epochs=100, batch_size=128, verbose = 2, callbacks=[early_stopping])

Epoch 1/100
6/6 - 1s - loss: 0.0189 - accuracy: 0.9957 - val_loss: 0.8634 - val_accuracy: 0.7179
Epoch 2/100
6/6 - 0s - loss: 0.0198 - accuracy: 0.9943 - val_loss: 0.7580 - val_accuracy: 0.7436
Epoch 3/100
6/6 - 0s - loss: 0.0197 - accuracy: 0.9957 - val_loss: 0.7710 - val_accuracy: 0.7564
Epoch 4/100
6/6 - 0s - loss: 0.0242 - accuracy: 0.9943 - val_loss: 0.8503 - val_accuracy: 0.7308
Epoch 5/100
6/6 - 0s - loss: 0.0184 - accuracy: 0.9971 - val_loss: 0.8960 - val_accuracy: 0.7308
Epoch 6/100
6/6 - 1s - loss: 0.0228 - accuracy: 0.9943 - val_loss: 1.0370 - val_accuracy: 0.7179
Epoch 7/100
6/6 - 1s - loss: 0.0203 - accuracy: 0.9957 - val_loss: 1.1513 - val_accuracy: 0.7179
Epoch 8/100
6/6 - 1s - loss: 0.0236 - accuracy: 0.9914 - val_loss: 1.2327 - val_accuracy: 0.7051
Epoch 9/100
6/6 - 1s - loss: 0.0199 - accuracy: 0.9943 - val_loss: 1.2310 - val_accuracy: 0.7308
Epoch 10/100
6/6 - 1s - loss: 0.0272 - accuracy: 0.9914 - val_loss: 1.1312 - val_accuracy: 0.6923
Epoch 11/100
6/6 - 1s - loss:

<tensorflow.python.keras.callbacks.History at 0x2b444a94b48>

In [125]:
model.evaluate(X_test, Y_test, verbose = 2, batch_size = 32)

11/11 - 0s - loss: 1.1345 - accuracy: 0.7057


[1.1344636678695679, 0.7057057023048401]