# Book Reviews Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
import gensim
import keras

## Data Inspection

In [2]:
filename = "bookReviewsData.csv"
df = pd.read_csv(filename, header=0)
df.head

<bound method NDFrame.head of                                                  Review  Positive Review
0     This was perhaps the best of Johannes Steinhof...             True
1     This very fascinating book is a story written ...             True
2     The four tales in this collection are beautifu...             True
3     The book contained more profanity than I expec...            False
4     We have now entered a second time of deep conc...             True
...                                                 ...              ...
1968  I purchased the book with the intention of tea...             True
1969  There are so many design books, but the Graphi...             True
1970  I am thilled to see this book being available ...             True
1971  As many have stated before me the book starts ...            False
1972  I love this book! It is a terrific blend of ha...             True

[1973 rows x 2 columns]>

In [3]:
df.shape

(1973, 2)

In [4]:
df.describe

<bound method NDFrame.describe of                                                  Review  Positive Review
0     This was perhaps the best of Johannes Steinhof...             True
1     This very fascinating book is a story written ...             True
2     The four tales in this collection are beautifu...             True
3     The book contained more profanity than I expec...            False
4     We have now entered a second time of deep conc...             True
...                                                 ...              ...
1968  I purchased the book with the intention of tea...             True
1969  There are so many design books, but the Graphi...             True
1970  I am thilled to see this book being available ...             True
1971  As many have stated before me the book starts ...            False
1972  I love this book! It is a terrific blend of ha...             True

[1973 rows x 2 columns]>

In [5]:
nan_count = np.sum(df.isnull(), axis=0)
nan_count

Review             0
Positive Review    0
dtype: int64

In [6]:
np.sum(df["Positive Review"] == True, axis=0)

980

In [7]:
np.sum(df["Positive Review"] == False, axis=0)

993

## Preprocessing

In [8]:
X = df["Review"]
y = df["Positive Review"]

In [9]:
# Preprocess text by removing stop words, converting to lowercase, removing punctuation, and tokenizing
X = X.apply(lambda row: gensim.utils.simple_preprocess(row))
X.head()

0    [this, was, perhaps, the, best, of, johannes, ...
1    [this, very, fascinating, book, is, story, wri...
2    [the, four, tales, in, this, collection, are, ...
3    [the, book, contained, more, profanity, than, ...
4    [we, have, now, entered, second, time, of, dee...
Name: Review, dtype: object

In [10]:
# Create training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=1234)

In [11]:
# Produce word embeddings
word2vec_model = gensim.models.Word2Vec(X_train, vector_size=100, window=5, min_count=2)

In [12]:
vocabulary_size = len(word2vec_model.wv.key_to_index)
vocabulary_size

11106

In [13]:
X_train.head()

202     [bought, bead, fantasies, and, bead, fantasies...
1009    [this, is, wonderful, book, aimed, at, helping...
139     [will, not, try, to, say, that, asserted, the,...
1640    [agree, with, the, above, comments, all, one, ...
1159    [few, years, ago, had, the, pleasure, of, meet...
Name: Review, dtype: object

In [14]:
X_test.head()

1692    [bought, this, book, this, weekend, as, we, re...
1744    [when, first, came, to, iran, black, clad, wom...
1236    [this, book, is, packed, full, of, incredible,...
21      [while, this, book, is, good, attempt, at, pla...
894     [if, your, looking, to, increase, your, person...
Name: Review, dtype: object

In [15]:
# Replace every word with its corresponding word embedding
words = set(word2vec_model.wv.index_to_key)

X_train_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                                    for training_example in X_train], dtype=object)

X_test_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                                   for training_example in X_test], dtype=object)

In [16]:
for w in range(5):
    print("Number of word vectors in training example {0}: {1}".format(w, len(X_train_word_embeddings[w])))

Number of word vectors in training example 0: 64
Number of word vectors in training example 1: 115
Number of word vectors in training example 2: 172
Number of word vectors in training example 3: 35
Number of word vectors in training example 4: 65


In [17]:
for w in range(5):
    print("Number of word vectors in test example {0}: {1}".format(w, len(X_test_word_embeddings[w])))

Number of word vectors in test example 0: 124
Number of word vectors in test example 1: 78
Number of word vectors in test example 2: 111
Number of word vectors in test example 3: 41
Number of word vectors in test example 4: 21


In [18]:
# Take average of word embeddings to get feature vector for each example
X_train_feature_vector = []
for w in X_train_word_embeddings:
    if w.size:
        X_train_feature_vector.append(w.mean(axis=0))
    else:
        X_train_feature_vector.append(np.zeros(100, dtype=float))
        
X_test_feature_vector = []
for w in X_test_word_embeddings:
    if w.size:
        X_test_feature_vector.append(w.mean(axis=0))
    else:
        X_test_feature_vector.append(np.zeros(100, dtype=float))

In [19]:
for w in range(5):
    print("Length of training example {0}: {1}".format(w, len(X_train_feature_vector[w])))

Length of training example 0: 100
Length of training example 1: 100
Length of training example 2: 100
Length of training example 3: 100
Length of training example 4: 100


## Modeling

In [20]:
# Logistic regression
model = LogisticRegression(max_iter=200)
model.fit(X_train_feature_vector, y_train)

probability_predictions = model.predict_proba(X_test_feature_vector)[:, 1]
class_label_predictions = model.predict(X_test_feature_vector)

auc = roc_auc_score(y_test, probability_predictions)
print("AUC on test data: {:.4f}".format(auc))

AUC on test data: 0.7900


In [21]:
# Neural network
nn_model = keras.Sequential()

input_layer = keras.layers.InputLayer(shape=(100,))
nn_model.add(input_layer)

hidden_layer_1 = keras.layers.Dense(units=64, activation='relu')
nn_model.add(hidden_layer_1)

hidden_layer_2 = keras.layers.Dense(units=32, activation='relu')
nn_model.add(hidden_layer_2)

hidden_layer_3 = keras.layers.Dense(units=16, activation='relu')
nn_model.add(hidden_layer_3)

output_layer = keras.layers.Dense(units=1, activation='sigmoid')
nn_model.add(output_layer)

nn_model.summary()

In [22]:
optimizer = keras.optimizers.SGD(learning_rate=0.1)
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)
nn_model.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [23]:
class ProgBarLoggerNEpochs(keras.callbacks.Callback):
    def __init__(self, num_epochs: int, every_n: int = 50):
        self.num_epochs = num_epochs
        self.every_n = every_n
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.every_n == 0:
            s = 'Epoch [{}/{}]'.format(epoch + 1, self.num_epochs)
            logs_s = ['{}: {:.4f}'.format(k.capitalize(), v) for k, v in logs.items()]
            s_list = [s] + logs_s
            print(', '.join(s_list))

In [24]:
num_epochs = 100
history = nn_model.fit(
    np.array(X_train_feature_vector),
    y_train,
    epochs=num_epochs,
    verbose=0,
    validation_split=0.2,
    callbacks=[ProgBarLoggerNEpochs(num_epochs, every_n=5)]
)

Epoch [5/100], Accuracy: 0.4887, Loss: 0.6936, Val_accuracy: 0.5831, Val_loss: 0.6921
Epoch [10/100], Accuracy: 0.5106, Loss: 0.6925, Val_accuracy: 0.4958, Val_loss: 0.6913
Epoch [15/100], Accuracy: 0.5141, Loss: 0.6911, Val_accuracy: 0.6310, Val_loss: 0.6880
Epoch [20/100], Accuracy: 0.5500, Loss: 0.6890, Val_accuracy: 0.5380, Val_loss: 0.6848
Epoch [25/100], Accuracy: 0.5415, Loss: 0.6874, Val_accuracy: 0.5493, Val_loss: 0.6795
Epoch [30/100], Accuracy: 0.5577, Loss: 0.6842, Val_accuracy: 0.5606, Val_loss: 0.6732
Epoch [35/100], Accuracy: 0.5725, Loss: 0.6809, Val_accuracy: 0.5493, Val_loss: 0.6729
Epoch [40/100], Accuracy: 0.5718, Loss: 0.6812, Val_accuracy: 0.5915, Val_loss: 0.6661
Epoch [45/100], Accuracy: 0.5824, Loss: 0.6798, Val_accuracy: 0.5887, Val_loss: 0.6622
Epoch [50/100], Accuracy: 0.5775, Loss: 0.6794, Val_accuracy: 0.6169, Val_loss: 0.6629
Epoch [55/100], Accuracy: 0.5775, Loss: 0.6767, Val_accuracy: 0.6225, Val_loss: 0.6601
Epoch [60/100], Accuracy: 0.5831, Loss: 0.67