# Project 5

In [None]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

import string # contains punctuation to remove
from cleantext import clean # contains emojis to remove
import re # used for working with string data

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet') # Needed for lemmatization
nltk.download('stopwords')
nltk.download('omw-1.4')

from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

The data for this project came from Kaggle. It contains the qualitative ratings (review text) and numerical ratings for books on the Goodreads website, as well as other variables (such as the number of votes and comments the review text received).

In [None]:
# Load the Goodreads training CSV into a DataFrame.
# The path is relative, so the notebook must be run from the directory that contains "data/".
goodreads = pd.read_csv("data/goodreads_train.csv")

In [None]:
# Preview the first rows to see which columns were loaded
goodreads.head()

### General Preprocessing

In [None]:
# Report how many rows were loaded
print(f"The dataset has {goodreads.shape[0]} samples.")

We really don't need that much data for the purpose of this project. We should cut it down. Let's look at other factors before deciding how much to cut.

In [None]:
# Descriptive stats for the numerical variables (describe() summarises numeric columns by default)
goodreads.describe()

It really doesn't look like there are that many votes or comments, so let's drop those and just use the text to make things easier. We also don't need the ids and the dates.

In [None]:
# Keep only the review text and its rating.
# .copy() makes the result an independent frame: the "rating" column is overwritten
# in place further down, and writing into a bare column-slice of the original frame
# triggers pandas' SettingWithCopyWarning.
goodreads = goodreads[["review_text", "rating"]].copy()

In [None]:
# Count of reviews per rating value - the distribution looks heavily skewed
sns.countplot(x = goodreads["rating"], palette = "coolwarm")

We want to recode the ratings to a binary variable. Given the skew, let's do the following:
- **1 - 3 = 0 (not great)**  
- **4 - 5 = 1 (great)**

In [None]:
# Collapse the rating into a binary label: 1-3 -> 0 (not great), 4-5 -> 1 (great)
not_great = [1, 2, 3]
great = [4, 5]

is_not_great = goodreads["rating"].isin(not_great)
is_great = goodreads["rating"].isin(great)
conditions = [is_not_great, is_great]
values = [0, 1]

# np.select falls back to its default (0) for any rating matched by neither mask,
# so a value outside 1-5 would also be labelled 0.
# NOTE(review): this cell overwrites "rating" in place; running it a second time on the
# already-recoded column maps every row to 0. Re-run from the data-loading cell instead.
goodreads["rating"] = np.select(condlist = conditions, choicelist = values)

In [None]:
# Same count plot as before, now on the binary (0 / 1) label
sns.countplot(x = goodreads["rating"], palette = "coolwarm")

This is still a lot of data. The text preprocessing is going to take forever. Let's take a sample of 50,000 of each rating (so 100,000 total).

In [None]:
# Balanced down-sample: draw 50,000 rows from each label with a fixed seed,
# then renumber the index from 0
goodreads = (
    goodreads
    .groupby("rating")
    .sample(n = 50000, random_state = 10)
    .reset_index(drop = True)
)

In [None]:
# Sanity check: number of rows per label after the balanced sample
goodreads.rating.value_counts()

### Text Cleaning

In [None]:
# Build the shared text-cleaning resources once, then define the cleaning function
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
# Add "book" to the stopwords, given the context of this analysis.
# Note: This only improved the F1 score by .005 by slightly increasing the recall.
# So I opted to leave it out since it will probably be important for bi-grams.
# stop_words.add("book")

# Translation table that deletes every ASCII punctuation character in a single pass
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)

def clean_text(text):
    """Normalise one review into a space-separated string of lemmatized tokens.

    Steps, in order:
      1. Lowercase the text.
      2. Remove URLs (anything starting with "http" up to the next whitespace).
      3. Split on any run of whitespace (spaces, tabs, newlines), so words on
         either side of a line break stay separate and no empty tokens appear.
      4. Drop stop words. The check is done once on the token with only its
         leading/trailing punctuation stripped (so contractions such as "don't"
         still match the stop-word list) and once more after all punctuation has
         been removed.
      5. Remove all remaining ASCII punctuation from the token and lemmatize it.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. a missing value) is treated
        as an empty review.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; "" if nothing survives.
    """
    # A missing review would otherwise fail on .lower(); treat it as empty text
    if not isinstance(text, str):
        return ""

    cleaned_text = text.lower() # Make everything lowercase
    cleaned_text = re.sub(r"http\S+", "", cleaned_text) # remove URLs from the review

    kept_words = []
    # split() with no argument splits on every whitespace run, which handles
    # newlines and repeated spaces without gluing neighbouring words together
    for raw_word in cleaned_text.split():
        # Compare against the stop words while apostrophes are still present
        if raw_word.strip(string.punctuation) in stop_words:
            continue
        word = raw_word.translate(_PUNCTUATION_TABLE) # Remove punctuation
        # Skip tokens that were pure punctuation or that are stop words once stripped
        if not word or word in stop_words:
            continue
        # Lemmatize: convert to base form so similar words are not counted separately
        kept_words.append(lemmatizer.lemmatize(word))

    return ' '.join(kept_words) # get back to a string

In [None]:
# Run every review through clean_text (this overwrites the raw text column)
goodreads["review_text"] = goodreads["review_text"].apply(clean_text)
# Preview the first 5 cleaned reviews
goodreads["review_text"].head()

### Train/Test Split and Bag of Words
Split the data, create a bag of words from the training data.

In [None]:
# Hold out a test set; random_state fixes the shuffle so the split is reproducible.
# No test_size is passed, so scikit-learn's default split proportion applies.
train, test = train_test_split(goodreads, random_state = 10)

In [None]:
# Pull the text (predictor) and label (outcome) columns out of each split as NumPy arrays
X_train, y_train = train["review_text"].to_numpy(), train["rating"].to_numpy()
X_test, y_test = test["review_text"].to_numpy(), test["rating"].to_numpy()

In [None]:
# Bag of Words on single words, vocabulary capped at the top 5000 words.
# The vocabulary is learned from the training text only.
CV = CountVectorizer(stop_words = None, max_features = 5000)
X_train_uni = CV.fit_transform(X_train) # learn the vocabulary and build the count matrix in one step
# Shape of the training count matrix
X_train_uni.shape

In [None]:
# Rank the vocabulary by total count in the training matrix and show the ten most frequent words
vocabulary = CV.vocabulary_.items() # (word, column index) pairs
totals = X_train_uni.sum(axis = 0) # column sums = total count of each word
frequencies = sorted(
    ((word, totals[0, index]) for word, index in vocabulary), # pair each word with its total
    key = lambda pair: pair[1],
    reverse = True, # most frequent first
)
frequencies[ : 10]

In [None]:
# We use the vocabulary from the training data to transform the tokens in the test set.
# Only transform() is called here (no fit), so the test text does not influence the vocabulary.
X_test_uni = CV.transform(X_test)

### Model fit and accuracy

'Base' Model

In [None]:
# Baseline: Multinomial Naive Bayes with default settings, cross-validated on the training set
model = MultinomialNB()

# scoring=None uses the estimator's own score (accuracy); "f1" scores the positive class.
# One row per cross-validation fold is displayed under each heading.
for heading, metric in (("Mean Accuracies:", None), ("F1 Scores:", "f1")):
    print(heading)
    scores = cross_val_score(model, X_train_uni, y_train, scoring=metric, n_jobs=-1)
    display(pd.DataFrame(scores))

Hyper-Parameter Optimizations

In [None]:
# Grid-search the smoothing parameter (alpha) of Multinomial NB, selecting on F1
# NOTE(review): the grid is decade-spaced except that 0.01 is absent - confirm this is intentional
param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]}

model = MultinomialNB()
search = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
)
search.fit(X_train_uni, y_train)

# Uncomment to inspect the full per-candidate results table:
# display(pd.DataFrame(search.cv_results_))
print(f"Best f1 Score: {search.best_score_}")
print(f"Best Estimator: {search.best_estimator_}")

### Final Model

In [None]:
# Refit Multinomial NB on the full training set with the alpha chosen by the grid search,
# rather than a value copied by hand from an earlier run's output
best_alpha = search.best_params_["alpha"]
model = MultinomialNB(alpha=best_alpha)
model.fit(X_train_uni, y_train) # fit best model

y_pred = model.predict(X_test_uni) # make predictions

# calculate scoring metrics
# average="binary" reports precision/recall/F1 for the positive class (1 = great), which is
# the same quantity the grid search optimised with scoring="f1". Without it, pos_label is
# ignored and labels=[0] reports the metrics of the "not great" class instead.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred, average="binary", pos_label=1)
print(f"precision: {p}\nrecall: {r}\nfscore: {f}")

# plot confusion matrix
# labels=[0, 1] pins the row/column order so it is guaranteed to match the tick labels below
cm = confusion_matrix(y_test, y_pred, labels=[0, 1])
plt.figure()
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['not-great', 'great'])
ax.yaxis.set_ticklabels(['not-great', 'great']); # label rows too; ';' hides the returned list

Try using bigrams (i.e., word pairs) instead of just single words.

In [None]:
# Bag of Words on word pairs only (ngram_range=(2,2)), vocabulary capped at the top 10000 bi-grams.
# The vocabulary is learned from the training text only.
CV_bi_gram = CountVectorizer(stop_words = None, ngram_range=(2,2), max_features = 10000)
X_train_bi = CV_bi_gram.fit_transform(X_train) # learn the vocabulary and build the count matrix in one step
# Shape of the training count matrix
X_train_bi.shape

In [None]:
# Rank the bi-gram vocabulary by total count in the training matrix and show the ten most frequent
vocabulary_bi = CV_bi_gram.vocabulary_.items() # (bi-gram, column index) pairs
totals_bi = X_train_bi.sum(axis = 0) # column sums = total count of each bi-gram
frequencies_bi = sorted(
    ((word, totals_bi[0, index]) for word, index in vocabulary_bi), # pair each bi-gram with its total
    key = lambda pair: pair[1],
    reverse = True, # most frequent first
)
frequencies_bi[ : 10]

In [None]:
# We use the bi-gram vocabulary from the training data to transform the tokens in the test set.
# Only transform() is called here (no fit), so the test text does not influence the vocabulary.
X_test_bi = CV_bi_gram.transform(X_test)

### Model fit and accuracy with bi-grams

'Base' Model

In [None]:
# Baseline on bi-gram counts: Multinomial Naive Bayes with default settings, cross-validated
model = MultinomialNB()

# scoring=None uses the estimator's own score (accuracy); "f1" scores the positive class.
# One row per cross-validation fold is displayed under each heading.
for heading, metric in (("Mean Accuracies (bi-gram):", None), ("F1 Scores (bi_gram):", "f1")):
    print(heading)
    scores = cross_val_score(model, X_train_bi, y_train, scoring=metric, n_jobs=-1)
    display(pd.DataFrame(scores))

Hyper-Parameter Optimizations

In [None]:
# Grid-search the smoothing parameter (alpha) of Multinomial NB on bi-gram counts, selecting on F1
# NOTE(review): the grid is decade-spaced except that 0.01 is absent - confirm this is intentional
param_grid = {"alpha": [0.00001, 0.0001, 0.001, 0.1, 1, 10, 100, 1000]}

model = MultinomialNB()
search_bi = GridSearchCV(
    estimator=model,
    param_grid=param_grid,
    scoring="f1",
    n_jobs=-1,
    return_train_score=True,
)
search_bi.fit(X_train_bi, y_train)

# Uncomment to inspect the full per-candidate results table:
# display(pd.DataFrame(search_bi.cv_results_))
print(f"Best f1 Score (bi-gram): {search_bi.best_score_}")
print(f"Best Estimator (bi_gram): {search_bi.best_estimator_}")

### Final Model for bi-grams

In [None]:
# Refit Multinomial NB on the bi-gram training matrix with the alpha chosen by the bi-gram
# grid search, rather than a value copied by hand from an earlier run's output
best_alpha_bi = search_bi.best_params_["alpha"]
model = MultinomialNB(alpha=best_alpha_bi)
model.fit(X_train_bi, y_train) # fit best model

y_pred_bi = model.predict(X_test_bi) # make predictions

# calculate scoring metrics
# average="binary" reports precision/recall/F1 for the positive class (1 = great), which is
# the same quantity the grid search optimised with scoring="f1". Without it, pos_label is
# ignored and labels=[0] reports the metrics of the "not great" class instead.
p, r, f, s = precision_recall_fscore_support(y_test, y_pred_bi, average="binary", pos_label=1)
print(f"precision: {p}\nrecall: {r}\nfscore: {f}")

# plot confusion matrix
# labels=[0, 1] pins the row/column order so it is guaranteed to match the tick labels below
cm = confusion_matrix(y_test, y_pred_bi, labels=[0, 1])
plt.figure()
ax = sns.heatmap(cm, annot=True, fmt='g')
ax.set_xlabel('Predicted labels')
ax.set_ylabel('True labels')
ax.set_title('Confusion Matrix')
ax.xaxis.set_ticklabels(['not-great', 'great'])
ax.yaxis.set_ticklabels(['not-great', 'great']); # label rows too; ';' hides the returned list

Note: Doubling the training data (e.g., from 10000 total to 20000 total) increased the f1 scores by about .04 each. Increasing to 100000 total only increased the scores by about .02.