In [1]:
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
import re
from bs4 import BeautifulSoup

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/juliachen/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/juliachen/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
#! pip install bs4 # in case you don't have it installed
#! pip install contractions

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Kitchen_v1_00.tsv.gz

## Read Data

In [3]:
df = pd.read_csv("amazon_reviews_us_Kitchen_v1_00.tsv", 
                 sep='\t',
                 #usecols = ['star_rating','review_body'],
                 error_bad_lines=False,
                 warn_bad_lines=False)

## Keep Reviews and Ratings

In [4]:
review_df = df[["star_rating","review_body"]]

It important for us to check if there is any missing value in our dataset, beacause the missing value does not helpful to tell us any thing to predict the sentiment later.

By calling `review_df.isnull().sum()`, there are 3 missing values on star_rating, and 246 missing values on review_body, so we will go head drop them.

In [5]:
# drop na
# review_df.isnull().sum()
review_df_withoutmissing = review_df.dropna()

In [6]:
# three sample reviews
review_df_withoutmissing.head(3)

Unnamed: 0,star_rating,review_body
0,5.0,Beautiful. Looks great on counter.
1,5.0,I personally have 5 days sets and have also bo...
2,5.0,Fabulous and worth every penny. Used for clean...


Now we can look at how many review in each rating.

In [7]:
print("Count of review in each star rating:", "\n" , 
      review_df_withoutmissing["star_rating"].value_counts(), "\n")

Count of review in each star rating: 
 5.0    3124595
4.0     731701
1.0     426870
3.0     349539
2.0     241939
Name: star_rating, dtype: int64 



There are 426870 reviews in 1 star, 241939 reviews in 2 stars, 349539 reviews in 3 stars, 731701 reviews in 4 stars, 3124595 reviews in 5 stars.

# Labelling Reviews:
## The reviews with rating 4,5 are labelled to be 1 and 1,2 are labelled as 0. Discard the reviews with rating 3'

In [8]:
# creat a new column named label 
review_df_new = review_df_withoutmissing.copy()
review_df_new.loc[3, 'sentiment'] = None

# get the index of row where star_rating is 4, 5
# and get the index of row where star_rating is 1, 2
row_idx_1 = review_df_new[review_df_new['star_rating']>=4].index
row_idx_0 = review_df_new[review_df_new['star_rating']<=2].index

In [9]:
# assigning the labels 
review_df_new.loc[row_idx_1,'sentiment'] = 1
review_df_new.loc[row_idx_0,'sentiment'] = 0

In [10]:
positive = review_df_new[review_df_new['sentiment']==1]
print("There are", positive['star_rating'].count(), "positive reviews.")

There are 3856296 positive reviews.


In [11]:
negative = review_df_new[review_df_new['sentiment']==0]
print("There are", negative['star_rating'].count(), "negative reviews.")

There are 668809 negative reviews.


In [12]:
review3 = review_df_new[review_df_new['star_rating']==3]['star_rating']
print("There are", review3.count(), "reviews with the rating 3 stars.","\n")

There are 349539 reviews with the rating 3 stars. 



After we labeled reviews, we have 3856296 positive reviews, 668809 negative reviews, and 349539 reviews that got 3 stars which we will discard in later analysis. We can clearly see that our data now is imbalanced, since we have over 3 millon postive reviews, but only have less than 1 millon negative reviews.   

 ## We select 200000 reviews randomly with 100,000 positive and 100,000 negative reviews.



In [13]:
# get 100,000 sample from both positive and negative reviews

positive_sample = positive.sample(n = 100000, random_state=36)
negative_sample = negative.sample(n = 100000, random_state=36)

In [14]:
# combine two dataframes pandas and shuffle

samples = pd.concat([positive_sample, negative_sample])

In [15]:
# split data into 80% training dataset and 20% testing dataset.

from sklearn.model_selection import train_test_split
training_review, testing_review = train_test_split(samples, test_size=.2, random_state=42)

Now we have 160000 training data, and 40000 testing data. We will build models using features from training and see how it works on the testing. 

# Data Cleaning

## Convert the all reviews into the lower case.

In [16]:
print("Average Length of Review Before Data Cleaning: ",
      (training_review["review_body"].apply(len).sum() + \
       testing_review["review_body"].apply(len).sum())/200000)

Average Length of Review Before Data Cleaning:  323.05735


In [17]:
pd.options.mode.chained_assignment = None 
training_review.loc[:, "review_body"] = training_review.loc[:, "review_body"].str.lower()
testing_review.loc[:, "review_body"] = testing_review.loc[:, "review_body"].str.lower()

## remove the HTML and URLs from the reviews

In [18]:
# code ref: https://www.kaggle.com/hemrajsukriya/amazon-reviews-for-sentiment-analysis 
def remove_url(text):
    url=re.compile(r"https?://\S+|www\.\S+")
    return url.sub(r" ",text)
def remove_html(text):
    cleanr = re.compile('<.*?>')
    return cleanr.sub(r" ",text)

In [19]:
# review url and html and url
testing_review["review_body"] = testing_review["review_body"].map(lambda x:remove_url(x))
testing_review["review_body"] = testing_review["review_body"].map(lambda x:remove_html(x))
training_review["review_body"] = training_review["review_body"].map(lambda x:remove_url(x))
training_review["review_body"] = training_review["review_body"].map(lambda x:remove_html(x))

## perform contractions on the reviews.

In [20]:
#code ref: https://www.geeksforgeeks.org/nlp-expand-contractions-in-text-processing/
import contractions

training_review["review_body"] = training_review["review_body"].apply(
    lambda x: ' '.join(contractions.fix(word) for word in x.split()))
testing_review["review_body"] = testing_review["review_body"].apply(
    lambda x: ' '.join(contractions.fix(word) for word in x.split()))

## remove non-alphabetical characters

In [21]:
regex = '[^a-zA-Z]'
training_review["review_body"] = training_review["review_body"].replace(regex, ' ', regex=True)
testing_review["review_body"] = testing_review["review_body"].replace(regex, ' ', regex=True)

## Remove the extra spaces between the words

In [22]:
#code reference: https://stackoverflow.com/questions/43071415/remove-multiple-blanks-in-dataframe

training_review["review_body"] = training_review["review_body"].replace('\s+', ' ', regex=True)
testing_review["review_body"] = testing_review["review_body"].replace('\s+', ' ', regex=True)

In [23]:
print("Average Length of Review After Data Cleaning: ",
      (training_review["review_body"].apply(len).sum() + \
       testing_review["review_body"].apply(len).sum())/200000) 

Average Length of Review After Data Cleaning:  309.89008


# Pre-processing

In [24]:
print("Average Length of Review Before Pre-processing: ",
      (training_review["review_body"].apply(len).sum()+\
       testing_review["review_body"].apply(len).sum())/200000)

Average Length of Review Before Pre-processing:  309.89008


## remove the stop words 

In [25]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

training_review["review_body"] = training_review["review_body"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))
testing_review["review_body"] = testing_review["review_body"].apply(
    lambda x: ' '.join(word for word in x.split() if word not in stop_words))

## perform lemmatization  

In [26]:
from nltk.stem import WordNetLemmatizer
lemmatizer = nltk.stem.WordNetLemmatizer()

training_review["review_body"] = training_review["review_body"].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))
testing_review["review_body"] = testing_review["review_body"].apply(
    lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split()))

In [27]:
print("Average Length of Review After Pre-processing: ",
      (training_review["review_body"].apply(len).sum()+\
       testing_review["review_body"].apply(len).sum())/200000, "\n")

Average Length of Review After Pre-processing:  189.89327 



# TF-IDF Feature Extraction

In [28]:
from sklearn.feature_extraction.text import TfidfVectorizer

# convert raw review_body to a matrix of TF-IDF features
tfidf = TfidfVectorizer(ngram_range = (1, 2))


# transform training_review and testing_review to vectorized train_x_tfidf and test_x_tfidf
train_x_tfidf = tfidf.fit_transform(np.array(training_review["review_body"]))
test_x_tfidf = tfidf.transform(np.array(testing_review["review_body"]))

The default **TfidfVectorizer** have incorporated smooth_idf and use_idf. I used (1,2) ngram range in my case, which means both unigram and bigram will be used. N-gram refers to a string of n words in a row, so now, 1 word and 2 words will be considered. I use this is because it can boosted my prediction. It will make sense, since there are many words will always appear together.  


**tfidf.fit** function will calculate the parameters from the data, the **tfidf.transform** is basically applying the parameters to the data, **tfidf.fit_ransform** combine fit and transform function. In our case, we will have to get all necessary information from train data then apply it to test data. 

# Perceptron

In [29]:
from sklearn.linear_model import Perceptron

# fit the training data on the classifier
Percet= Perceptron()
Percet.fit(train_x_tfidf,training_review["sentiment"])

Perceptron()

Perceptron is a single neuron that can be used for classification problems. We may consider it as a linear model, but it is bit different from linear regression. the perceptron predicts a binary class label with $\pm(w^Tx_i)$ , whereas linear regression predicts a real value with $w^Tx_i$. In my perceptron model, I used defalut parameters, where is no penalty. 

In [30]:
# predict the labels on traning data
predictions_Percet_train = Percet.predict(train_x_tfidf)
# predict the labels on testing data
predictions_Percet_test = Percet.predict(test_x_tfidf)

In [31]:
# model accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Training Perceptron Accuracy Score:", accuracy_score(predictions_Percet_train, training_review["sentiment"]))
print("Training Perceptron Precision Score:",precision_score(predictions_Percet_train, training_review["sentiment"]))
print("Training Perceptron Recall Score:",recall_score(predictions_Percet_train, training_review["sentiment"]))
print("Training Perceptron F1 Score:", f1_score(predictions_Percet_train, training_review["sentiment"]),"\n")

print("Testing Perceptron Accuracy Score:", accuracy_score(predictions_Percet_test, testing_review["sentiment"]))
print("Testing Perceptron Precision Score:",precision_score(predictions_Percet_test, testing_review["sentiment"]))
print("Testing Perceptron Recall Score:",recall_score(predictions_Percet_test, testing_review["sentiment"]))
print("Testing Perceptron F1 Score:", f1_score(predictions_Percet_test, testing_review["sentiment"]),"\n")

Training Perceptron Accuracy Score: 0.99358125
Training Perceptron Precision Score: 0.9937630457334983
Training Perceptron Recall Score: 0.9934029686641012
Training Perceptron F1 Score: 0.9935829745755829 

Testing Perceptron Accuracy Score: 0.89785
Testing Perceptron Precision Score: 0.9022657930275596
Testing Perceptron Recall Score: 0.8943037033364731
Testing Perceptron F1 Score: 0.8982671048700328 



From the list of prediction scores, we can see that all training prediction scores are all over 99%. Testing prediction scores are a bit lower than training which tells us that our model is not overfitting and 90% also looks very decent. 

# SVM

In [32]:
from sklearn import svm
from sklearn.svm import LinearSVC

# fit the training data on the classifier
SVM = svm.LinearSVC()
SVM.fit(train_x_tfidf,training_review["sentiment"])

LinearSVC()

LinearSVC tends to be faster to converge given a large sample because it can deal with sparse dataset, in our case the traning data is very large, when fitting to the regular svc model with linear kernel will be hard to converge.



In [33]:
# predict the labels on traning data
predictions_SVM_train = SVM.predict(train_x_tfidf)
# predict the labels on testing data
predictions_SVM_test = SVM.predict(test_x_tfidf)

In [34]:
# model accuracy
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

print("Training SVM Accuracy Score:", accuracy_score(predictions_SVM_train, training_review["sentiment"]))
print("Training SVM Precision Score:",precision_score(predictions_SVM_train, training_review["sentiment"]))
print("Training SVM Recall Score:",recall_score(predictions_SVM_train, training_review["sentiment"]))
print("Training SVM F1 Score:", f1_score(predictions_SVM_train, training_review["sentiment"]), "\n")

print("Testing SVM Accuracy Score:", accuracy_score(predictions_SVM_test, testing_review["sentiment"]))
print("Testing SVM Precision Score:",precision_score(predictions_SVM_test, testing_review["sentiment"]))
print("Testing SVM Recall Score:",recall_score(predictions_SVM_test, testing_review["sentiment"]))
print("Testing SVM F1 Score:", f1_score(predictions_SVM_test, testing_review["sentiment"]),"\n")

Training SVM Accuracy Score: 0.9949125
Training SVM Precision Score: 0.9961253390328346
Training SVM Recall Score: 0.9937157890799366
Training SVM F1 Score: 0.9949191051632877 

Testing SVM Accuracy Score: 0.91585
Testing SVM Precision Score: 0.9165207822737959
Testing SVM Recall Score: 0.9152389990509965
Testing SVM F1 Score: 0.9158794421952317 



Looking at the results above, all training prediction scores are all over 99%; all testing prediction scores are a bit lower than training data. Precision and recall are very close, which means the number of false negative and false positive are similar. The overall accuracy is over 91.5%.

# Logistic Regression

In [35]:
from sklearn.linear_model import LogisticRegression

# fit the training data on the classifier
Logit= LogisticRegression(C = 10, penalty = 'l2', solver = 'liblinear')
Logit.fit(train_x_tfidf,training_review["sentiment"])

LogisticRegression(C=10, solver='liblinear')

For linear model, I applied GridSearch to select hyper-parameters. C is the inverse regularization parameter, and the larger C the less penalty for the parameters norm function. The penatly I choose L2 norm, which add squared magnitude as penalty to the loss function. 

In [36]:
# predict the labels on traning data
predictions_Logit_train = Logit.predict(train_x_tfidf)
# predict the labels on testing data
predictions_Logit_test = Logit.predict(test_x_tfidf)

In [37]:
# model accuracy
print("Training Logistic Accuracy Score:", accuracy_score(predictions_Logit_train, training_review["sentiment"]))
print("Training Logistic Precision Score:",precision_score(predictions_Logit_train, training_review["sentiment"]))
print("Training Logistic Recall Score:",recall_score(predictions_Logit_train, training_review["sentiment"]))
print("Training Logistic F1 Score:", f1_score(predictions_Logit_train, training_review["sentiment"]), "\n")

print("Testing Logistic Accuracy Score:", accuracy_score(predictions_Logit_test, testing_review["sentiment"]))
print("Testing Logistic Precision Score:",precision_score(predictions_Logit_test, testing_review["sentiment"]))
print("Testing Logistic Recall Score:",recall_score(predictions_Logit_test, testing_review["sentiment"]))
print("Testing Logistic F1 Score:", f1_score(predictions_Logit_test, testing_review["sentiment"]),"\n")

Training Logistic Accuracy Score: 0.99331875
Training Logistic Precision Score: 0.9944504855825115
Training Logistic Recall Score: 0.9922058163316206
Training Logistic F1 Score: 0.9933268828615125 

Testing Logistic Accuracy Score: 0.916625
Testing Logistic Precision Score: 0.9161706597309058
Testing Logistic Recall Score: 0.9169503404084902
Testing Logistic F1 Score: 0.9165603342590508 



The results of logistic regression are quiet simiar to LinearSVC. Again, all training prediction scores are all over 99%. Testing accuracy, precision, recall and F1 scores are all over 91.6%. 

# Naive Bayes

In [38]:
from sklearn import naive_bayes

# fit the training data on the classifier
Naive = naive_bayes.MultinomialNB()
Naive.fit(train_x_tfidf,training_review["sentiment"])

MultinomialNB()

The Naive Bayes classifier's idea is created based on Bayesian Theorem1. The fundamental assumption hold by this classifier is to treat each word independently. That is to say, the Naive Bayes classifier will ignore all rules, such as the occurrence of one word does not  affect the probability of the other word's happening but keeping track of the likelihood of
the labels by given words or phrases.

In [39]:
# predict the labels on training data
predictions_Naive_train = Naive.predict(train_x_tfidf)
# predict the labels on testing data
predictions_Naive_test = Naive.predict(test_x_tfidf)

In [40]:
# model accuracy
print("Training Naive Bayes Accuracy Score:", accuracy_score(predictions_Naive_train, training_review["sentiment"]))
print("Training Naive Bayes Precision Score:",precision_score(predictions_Naive_train, training_review["sentiment"]))
print("Training Naive Bayes Recall Score:",recall_score(predictions_Naive_train, training_review["sentiment"]))
print("Training Naive Bayes F1 Score:", f1_score(predictions_Naive_train, training_review["sentiment"]), "\n")

print("Testing Naive Bayes Accuracy Score:", accuracy_score(predictions_Naive_test, testing_review["sentiment"]))
print("Testing Naive Bayes Precision Score:",precision_score(predictions_Naive_test, testing_review["sentiment"]))
print("Testing Naive Bayes Recall Score:",recall_score(predictions_Naive_test, testing_review["sentiment"]))
print("Testing Naive Bayes F1 Score:", f1_score(predictions_Naive_test, testing_review["sentiment"]),"\n")

Training Naive Bayes Accuracy Score: 0.95515
Training Naive Bayes Precision Score: 0.9438299148824478
Training Naive Bayes Recall Score: 0.9657011317859198
Training Naive Bayes F1 Score: 0.9546402700345128 

Testing Naive Bayes Accuracy Score: 0.8967
Testing Naive Bayes Precision Score: 0.8679037663182114
Testing Naive Bayes Recall Score: 0.9208724725362204
Testing Naive Bayes F1 Score: 0.8936038726954373 



Compared to all three models above, the Naive Bayes model is a bit worse. The testing accuracy is close to the F1 score, but the precision score is lower than recall. It tells us that false positive is higher than false negative. The testing prediction result shows that the precision score is 86.79%, but the recall rate is 92.87%. Therefore, we can conclude that the model yields more false positives. 

To sum up, logistic regression gives the highest scores in all testing prediction scores (91.6%). LinearSVC also gives good results on all prediction scores (91.5%). Perceptron is also reasonable, which gives about 90% on all scores. For those three models, the training accuracy are all reach 99%. The naive Bayes model has imbalanced precision and recall on training data, and this is also amplified on the test set. 