# Book Reviews Sentiment Analysis

In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
import keras

## Data Inspection

In [2]:
filename = "bookReviewsData.csv"
df = pd.read_csv(filename, header=0)
df.head

<bound method NDFrame.head of                                                  Review  Positive Review
0     This was perhaps the best of Johannes Steinhof...             True
1     This very fascinating book is a story written ...             True
2     The four tales in this collection are beautifu...             True
3     The book contained more profanity than I expec...            False
4     We have now entered a second time of deep conc...             True
...                                                 ...              ...
1968  I purchased the book with the intention of tea...             True
1969  There are so many design books, but the Graphi...             True
1970  I am thilled to see this book being available ...             True
1971  As many have stated before me the book starts ...            False
1972  I love this book! It is a terrific blend of ha...             True

[1973 rows x 2 columns]>

In [3]:
df.shape

(1973, 2)

In [4]:
df.describe

<bound method NDFrame.describe of                                                  Review  Positive Review
0     This was perhaps the best of Johannes Steinhof...             True
1     This very fascinating book is a story written ...             True
2     The four tales in this collection are beautifu...             True
3     The book contained more profanity than I expec...            False
4     We have now entered a second time of deep conc...             True
...                                                 ...              ...
1968  I purchased the book with the intention of tea...             True
1969  There are so many design books, but the Graphi...             True
1970  I am thilled to see this book being available ...             True
1971  As many have stated before me the book starts ...            False
1972  I love this book! It is a terrific blend of ha...             True

[1973 rows x 2 columns]>

In [5]:
nan_count = np.sum(df.isnull(), axis=0)
nan_count

Review             0
Positive Review    0
dtype: int64

In [6]:
np.sum(df["Positive Review"] == True, axis=0)

980

In [7]:
np.sum(df["Positive Review"] == False, axis=0)

993

## Preprocessing

In [8]:
X = df["Review"]
y = df["Positive Review"]

In [9]:
# Create training and test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.10, random_state=1234)

### Word Embeddings

In [10]:
# Preprocess text by removing stop words, converting to lowercase, removing punctuation, and tokenizing
X_train_preprocessed = X_train.apply(lambda row: gensim.utils.simple_preprocess(row))
X_test_preprocessed = X_test.apply(lambda row: gensim.utils.simple_preprocess(row))

In [24]:
# Produce word embeddings
word2vec_model = gensim.models.Word2Vec(X_train_preprocessed, vector_size=100, window=20, min_count=5)

In [25]:
we_vocab_size = len(word2vec_model.wv.key_to_index)
we_vocab_size

5045

In [26]:
X_train_preprocessed.head()

202     [bought, bead, fantasies, and, bead, fantasies...
1009    [this, is, wonderful, book, aimed, at, helping...
139     [will, not, try, to, say, that, asserted, the,...
1640    [agree, with, the, above, comments, all, one, ...
1159    [few, years, ago, had, the, pleasure, of, meet...
Name: Review, dtype: object

In [27]:
X_test_preprocessed.head()

1692    [bought, this, book, this, weekend, as, we, re...
1744    [when, first, came, to, iran, black, clad, wom...
1236    [this, book, is, packed, full, of, incredible,...
21      [while, this, book, is, good, attempt, at, pla...
894     [if, your, looking, to, increase, your, person...
Name: Review, dtype: object

In [28]:
# Replace every word with its corresponding word embedding
words = set(word2vec_model.wv.index_to_key)

X_train_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                                    for training_example in X_train_preprocessed], dtype=object)

X_test_word_embeddings = np.array([np.array([word2vec_model.wv[word] for word in words if word in training_example])
                                   for training_example in X_test_preprocessed], dtype=object)

In [29]:
for w in range(5):
    print("Number of word vectors in training example {0}: {1}".format(w, len(X_train_word_embeddings[w])))

Number of word vectors in training example 0: 59
Number of word vectors in training example 1: 103
Number of word vectors in training example 2: 155
Number of word vectors in training example 3: 35
Number of word vectors in training example 4: 60


In [30]:
for w in range(5):
    print("Number of word vectors in test example {0}: {1}".format(w, len(X_test_word_embeddings[w])))

Number of word vectors in test example 0: 114
Number of word vectors in test example 1: 74
Number of word vectors in test example 2: 103
Number of word vectors in test example 3: 38
Number of word vectors in test example 4: 18


In [31]:
# Take average of word embeddings to get feature vector for each example
X_train_feature_vector = []
for w in X_train_word_embeddings:
    if w.size:
        X_train_feature_vector.append(w.mean(axis=0))
    else:
        X_train_feature_vector.append(np.zeros(100, dtype=float))
        
X_test_feature_vector = []
for w in X_test_word_embeddings:
    if w.size:
        X_test_feature_vector.append(w.mean(axis=0))
    else:
        X_test_feature_vector.append(np.zeros(100, dtype=float))

In [32]:
for w in range(5):
    print("Length of training example {0}: {1}".format(w, len(X_train_feature_vector[w])))

Length of training example 0: 100
Length of training example 1: 100
Length of training example 2: 100
Length of training example 3: 100
Length of training example 4: 100


### TF-IDF

In [43]:
tfidf_vectorizer = TfidfVectorizer(min_df=5, ngram_range=(1, 5))
tfidf_vectorizer.fit(X_train)

In [44]:
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [45]:
tfidf_vocab_size = len(tfidf_vectorizer.vocabulary_)
tfidf_vocab_size

13067

## Modeling

### Logistic Regression with Word Embeddings

In [46]:
model_1 = LogisticRegression(max_iter=200)
model_1.fit(X_train_feature_vector, y_train)

probability_predictions_1 = model_1.predict_proba(X_test_feature_vector)[:, 1]
class_label_predictions_1 = model_1.predict(X_test_feature_vector)

auc = roc_auc_score(y_test, probability_predictions_1)
print("AUC on test data: {:.4f}".format(auc))

accuracy = accuracy_score(y_test, class_label_predictions_1)
print("Accuracy on test data: {:.4f}".format(accuracy))

AUC on test data: 0.8051
Accuracy on test data: 0.7374


### Logistic Regression with TF-IDF

In [47]:
model_2 = LogisticRegression(max_iter=200)
model_2.fit(X_train_tfidf, y_train)

probability_predictions_2 = model_2.predict_proba(X_test_tfidf)[:, 1]
class_label_predictions_2 = model_2.predict(X_test_tfidf)

auc = roc_auc_score(y_test, probability_predictions_2)
print("AUC on test data: {:.4f}".format(auc))

accuracy = accuracy_score(y_test, class_label_predictions_2)
print("Accuracy on test data: {:.4f}".format(accuracy))

AUC on test data: 0.9437
Accuracy on test data: 0.8939


### Neural Network with TF-IDF

In [48]:
model_3 = keras.Sequential()

input_layer = keras.layers.InputLayer(shape=(tfidf_vocab_size,))
model_3.add(input_layer)

hidden_layer_1 = keras.layers.Dense(units=64, activation='relu')
model_3.add(hidden_layer_1)

hidden_layer_2 = keras.layers.Dense(units=32, activation='relu')
model_3.add(hidden_layer_2)

hidden_layer_3 = keras.layers.Dense(units=16, activation='relu')
model_3.add(hidden_layer_3)

output_layer = keras.layers.Dense(units=1, activation='sigmoid')
model_3.add(output_layer)

model_3.summary()

In [49]:
optimizer = keras.optimizers.SGD(learning_rate=0.1)
loss_fn = keras.losses.BinaryCrossentropy(from_logits=False)
model_3.compile(optimizer=optimizer, loss=loss_fn, metrics=['accuracy'])

In [50]:
class ProgBarLoggerNEpochs(keras.callbacks.Callback):
    def __init__(self, num_epochs: int, every_n: int = 50):
        self.num_epochs = num_epochs
        self.every_n = every_n
    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.every_n == 0:
            s = 'Epoch [{}/{}]'.format(epoch + 1, self.num_epochs)
            logs_s = ['{}: {:.4f}'.format(k.capitalize(), v) for k, v in logs.items()]
            s_list = [s] + logs_s
            print(', '.join(s_list))

In [51]:
num_epochs = 50
history = model_3.fit(
    X_train_tfidf.toarray(),
    y_train,
    epochs=num_epochs,
    verbose=0,
    validation_split=0.2,
    callbacks=[ProgBarLoggerNEpochs(num_epochs, every_n=5)]
)

Epoch [5/50], Accuracy: 0.5394, Loss: 0.6909, Val_accuracy: 0.4930, Val_loss: 0.6916
Epoch [10/50], Accuracy: 0.6549, Loss: 0.6750, Val_accuracy: 0.6592, Val_loss: 0.6770
Epoch [15/50], Accuracy: 0.7232, Loss: 0.5626, Val_accuracy: 0.6085, Val_loss: 0.5987
Epoch [20/50], Accuracy: 0.8373, Loss: 0.3751, Val_accuracy: 0.5662, Val_loss: 0.8867
Epoch [25/50], Accuracy: 0.9120, Loss: 0.2541, Val_accuracy: 0.8338, Val_loss: 0.3769
Epoch [30/50], Accuracy: 0.9958, Loss: 0.0411, Val_accuracy: 0.8254, Val_loss: 0.3974
Epoch [35/50], Accuracy: 1.0000, Loss: 0.0064, Val_accuracy: 0.8282, Val_loss: 0.4370
Epoch [40/50], Accuracy: 1.0000, Loss: 0.0029, Val_accuracy: 0.8282, Val_loss: 0.4557
Epoch [45/50], Accuracy: 1.0000, Loss: 0.0018, Val_accuracy: 0.8282, Val_loss: 0.4753
Epoch [50/50], Accuracy: 1.0000, Loss: 0.0013, Val_accuracy: 0.8282, Val_loss: 0.4912


In [55]:
loss, accuracy = model_3.evaluate(X_test_tfidf.toarray(), y_test, verbose=0)
print("Accuracy on test data: {:.4f}".format(accuracy))

Accuracy on test data: 0.8535
