In [48]:
import pandas as pd
import numpy as np
import nltk
import re
from bs4 import BeautifulSoup

In [49]:
! pip install bs4 # in case you don't have it installed

# Dataset: https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Beauty_v1_00.tsv.gz



## Read Data

In [50]:
dataframe = pd.read_csv("https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz", sep="\t", on_bad_lines="skip")

  dataframe = pd.read_csv("https://web.archive.org/web/20201127142707if_/https://s3.amazonaws.com/amazon-reviews-pds/tsv/amazon_reviews_us_Office_Products_v1_00.tsv.gz", sep="\t", on_bad_lines="skip")


## Keep Reviews and Ratings

In [51]:
df = dataframe[["star_rating", "review_body"]]
print( "- Three sample reviews before data cleaning + preprocessing.\n")
df.head(3)

- Three sample reviews before data cleaning + preprocessing.



Unnamed: 0,star_rating,review_body
0,5,Great product.
1,5,What's to say about this commodity item except...
2,5,"Haven't used yet, but I am sure I will like it."


In [52]:
df.loc[:, 'star_rating'] = pd.to_numeric(df.star_rating, errors="coerce")
df = df.dropna()
rating_stats = df.star_rating.value_counts()
for i , j in rating_stats.items():
    print(f'Number of reviews having rating {i} : {j}')

Number of reviews having rating 5.0 : 1582704
Number of reviews having rating 4.0 : 418348
Number of reviews having rating 1.0 : 306967
Number of reviews having rating 3.0 : 193680
Number of reviews having rating 2.0 : 138381


 ## We form three classes and select 100000 reviews randomly from each class.



In [53]:
print( "\n\n- Statistics of three classes :\n")
distribution = df.star_rating.value_counts()
print (f'Positive review: {distribution[4]+distribution[5]}, Neutral review: {distribution[3]}, Negative review: {distribution[1]+distribution[2]}')

# droping rows with neutral reviews i.e 3
df.drop(df[df.star_rating == 3].index, inplace=True)
df["class"] = df.star_rating.apply( lambda x : 1 if x > 3 else 0)
df1 = df[df["class"] == 1].sample(100000)
df2 = df[df["class"] == 0].sample(100000)

dataset = pd.concat([df1,df2],ignore_index=True)
dataset.head(3)



- Statistics of three classes :

Positive review: 2001052, Neutral review: 193680, Negative review: 445348


Unnamed: 0,star_rating,review_body,class
0,5.0,I ordered them from Discount Office Supplies a...,1
1,5.0,Really nice metallic colors. I use them to sig...,1
2,4.0,Just so you know it prints okay at best photos...,1


# Data Cleaning



# Pre-processing

In [54]:
# Average review length
print("Average reiviews length before cleaning data :",dataset.review_body.apply(len).mean(),end=",")


# - convert the all reviews into the lower case.
dataset.review_body = dataset.review_body.str.lower()

# - remove the HTML and URLs from the reviews
dataset.review_body = dataset.review_body.str.replace(r'<[^<>]*>', '', regex=True)
dataset.review_body = dataset.review_body.apply(lambda x : re.sub('http[s]?://\S+','', x))

# - remove non-alphabetical characters
dataset.review_body = dataset.review_body.apply(lambda x : re.sub('[^a-z\s\']', '', x))

# - remove extra spaces
dataset.review_body = dataset.review_body.str.strip()

# - remove multiple spaces in between review body
dataset.review_body = dataset.review_body.str.replace(r'^\s*|\s\s*', ' ', regex=True)

# - perform contractions on the reviews=
contraction_dict = {
    "i'm": "i am",
    "you're": "you are",
    "he's": "he is",
    "she's": "she is",
    "they're": "they are",
    "we're": "we are",
    "it's": "it is",
    "that's": "that is",
    "here's": "here is",
    "there's": "there is",
    "who's": "who is",
    "where's": "where is",
    "when's": "when is",
    "why's": "why is",
    "what's": "what is",
    "how's": "how is",
    "everybody's": "everybody is",
    "nobody's": "nobody is",
    "something's": "something is",
    "so's": "so is",
    "i'll": "i will",
    "you'll": "you will",
    "he'll": "he will",
    "she'll": "she will",
    "they'll": "they will",
    "it'll": "it will",
    "we'll": "we will",
    "that'll": "that will",
    "this'll": "this will",
    "these'll": "these will",
    "there'll": "there will",
    "where'll": "where will",
    "who'll": "who will",
    "what'll": "what will",
    "how'll": "how will",
    "i've": "i have",
    "you've": "you have",
    "he's": "he has",
    "she's": "she has",
    "we've": "we have",
    "they've": "they have",
    "should've": "should have",
    "could've": "could have",
    "would've": "would have",
    "might've": "might have",
    "must've": "must have",
    "what've": "what have",
    "what's": "what has",
    "where've": "where have",
    "where's": "where has",
    "there've": "there have",
    "there's": "there has",
    "these've": "these have",
    "who's": "who has",
    "don't": "do not",
    "can't": "cannot",
    "mustn't": "must not",
    "aren't": "are not",
    "couldn't": "could not",
    "wouldn't": "would not",
    "shouldn't": "should not",
    "isn't": "is not",
    "doesn't": "does not",
    "didn't": "did not",
    "hasn't": "has not",
    "hadn't": "had not",
    "haven't": "have not",
    "wasn't": "was not",
    "won't": "will not",
    "weren't": "were not",
    "ain't": "am not",
    "let's": "let us",
    "y'all": "you all",
    "where'd": "where did",
    "how'd": "how did",
    "why'd": "why did",
    "who'd": "who did",
    "when'd": "when did",
    "what'd": "what did",
    "g'day": "good day",
    "ma'am": "madam",
    "o'clock": "of the clock"
}

dataset.review_body = dataset.review_body.apply( lambda x : " ".join([contraction_dict[word] if word in contraction_dict else word for word in x.split()]) )

# Average review length
print(" Average reiviews length after cleaning data :",dataset.review_body.apply(len).mean())

Average reiviews length before cleaning data : 317.917045, Average reiviews length after cleaning data : 302.11495


## remove the stop words

In [55]:
from nltk.corpus import stopwords

# Average review length
review_length_before_preprocessing = dataset.review_body.apply(len).mean()

nltk.download('stopwords')
stop_words = stopwords.words('english')
dataset["review_body_lammatized"] = dataset.review_body.apply( lambda x : ' '.join([i for i in x.split() if i not in (stop_words)]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## perform lemmatization  

In [56]:
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import word_tokenize, pos_tag
from collections import defaultdict
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

lemmatizer = WordNetLemmatizer()

def lemmatize_word_according_pos ( statement ):

  tag_map = defaultdict(lambda : wn.NOUN)
  tag_map['J'] = wn.ADJ
  tag_map['V'] = wn.VERB
  tag_map['R'] = wn.ADV

  tokens = word_tokenize(statement)
  answer = []
  for token, tag in pos_tag(tokens):
      answer.append(lemmatizer.lemmatize(token, tag_map[tag[0]]))
  return ' '.join(answer)

dataset["review_body_lammatized"] = dataset.review_body_lammatized.apply( lambda x : lemmatize_word_according_pos(x))
print("Three sample reviews after data cleaning + preprocessing.")
print("Average reiviews length before Pre-processing data :",review_length_before_preprocessing,end=",")
print("Average reiviews length after Pre-processing data :",dataset.review_body_lammatized.apply(len).mean())

[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/hrishikesh/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


Three sample reviews after data cleaning + preprocessing.
Average reiviews length before Pre-processing data : 302.11495,Average reiviews length after Pre-processing data : 181.883135


In [57]:
dataset.head(3)


Unnamed: 0,star_rating,review_body,class,review_body_lammatized
0,5.0,i ordered them from discount office supplies a...,1,ordered discount office supply receive day ord...
1,5.0,really nice metallic colors i use them to sign...,1,really nice metallic color use sign canvas wra...
2,4.0,just so you know it prints okay at best photos...,1,know print okay best photo high contrast low d...


# TF-IDF Feature Extraction

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(max_features=5000)
review_list = dataset.review_body_lammatized.tolist()
result = tfidf.fit_transform(review_list)
features = tfidf.get_feature_names_out()
tfidf_dataset = pd.DataFrame( data = result.toarray(), columns = features )


In [59]:
tfidf_dataset.head(3)


Unnamed: 0,aa,aaa,abandon,ability,able,absolute,absolutely,absorb,absurd,abuse,...,youtube,yr,yrs,zebra,zero,zip,zipper,zire,zone,zoom
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.248742,0.0,0.0,0.0,0.0,0.0


In [60]:
# Train and Test split
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(tfidf_dataset, dataset["class"], test_size=0.2, random_state=42)

In [61]:
print(x_train.shape, y_train.shape, x_test.shape, y_test.shape)

(160000, 5000) (160000,) (40000, 5000) (40000,)


# Perceptron

In [62]:
from sklearn.linear_model import Perceptron 
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

perceptron = Perceptron(eta0=0.1) 
perceptron.fit(x_train, y_train) 
train_pred = perceptron.predict(x_train)

accuracy = accuracy_score(y_train, train_pred) 
precision = precision_score(y_train, train_pred) 
recall = recall_score(y_train, train_pred) 
f1 = f1_score(y_train, train_pred) 

print("Training data Metrix (Perceptron): ","Accuracy :", accuracy , ", Precision :", precision,", Recall :", recall,", F1 Score :", f1)

y_pred = perceptron.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred) 
precision_test = precision_score(y_test, y_pred) 
recall_test = recall_score(y_test, y_pred) 
f1_test = f1_score(y_test, y_pred) 

print("Testing Data Metrix (Perceptron):","Accuracy :", accuracy_test , ", Precision :", precision_test,", Recall :", precision_test,", F1 Score :", f1_test)

Training data Metrix (Perceptron):  Accuracy : 0.872125 , Precision : 0.8612543528640936 , Recall : 0.8871973702301049 , F1 Score : 0.8740333940796926
Testing Data Metrix (Perceptron): Accuracy : 0.8575 , Precision : 0.845993706124425 , Recall : 0.845993706124425 , F1 Score : 0.8597716984845504


# SVM

In [63]:
from sklearn import svm
svm_model = svm.LinearSVC(dual="auto") 
svm_model.fit(x_train, y_train) 
train_pred = svm_model.predict(x_train)

accuracy = accuracy_score(y_train, train_pred) 
precision = precision_score(y_train, train_pred) 
recall = recall_score(y_train, train_pred) 
f1 = f1_score(y_train, train_pred) 

print("Training data Metrix (SVM): ","Accuracy :", accuracy , ", Precision :", precision,", Recall :", recall,", F1 Score :", f1)


y_pred = svm_model.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred) 
precision_test = precision_score(y_test, y_pred) 
recall_test = recall_score(y_test, y_pred) 
f1_test = f1_score(y_test, y_pred) 

print("Testing Data Metrix (SVM):","Accuracy :", accuracy_test , ", Precision :", precision_test,", Recall :", precision_test,", F1 Score :", f1_test)

Training data Metrix (SVM):  Accuracy : 0.89953125 , Precision : 0.9013509780779951 , Recall : 0.8972839876510805 , F1 Score : 0.8993128848189511
Testing Data Metrix (SVM): Accuracy : 0.885875 , Precision : 0.8873267724442659 , Recall : 0.8873267724442659 , F1 Score : 0.8856147736099627


# Logistic Regression

In [66]:
from sklearn.linear_model import LogisticRegression
logistic_reg = LogisticRegression(max_iter=5000) 
logistic_reg.fit(x_train, y_train) 
train_pred = logistic_reg.predict(x_train)

accuracy = accuracy_score(y_train, train_pred) 
precision = precision_score(y_train, train_pred) 
recall = recall_score(y_train, train_pred) 
f1 = f1_score(y_train, train_pred) 

print("Training data Metrix (Logistic Regression): ","Accuracy :", accuracy , ", Precision :", precision,", Recall :", recall,", F1 Score :", f1)


y_pred = logistic_reg.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred) 
precision_test = precision_score(y_test, y_pred) 
recall_test = recall_score(y_test, y_pred) 
f1_test = f1_score(y_test, y_pred) 

print("Testing Data Metrix (Logistic Regression):","Accuracy :", accuracy_test , ", Precision :", precision_test,", Recall :", precision_test,", F1 Score :", f1_test)

Training data Metrix (Logistic Regression):  Accuracy : 0.89613125 , Precision : 0.8995310609116579 , Recall : 0.8918969590160861 , F1 Score : 0.8956977437474503
Testing Data Metrix (Logistic Regression): Accuracy : 0.887375 , Precision : 0.8904012905827788 , Recall : 0.8904012905827788 , F1 Score : 0.8868914609957568


# Naive Bayes

In [65]:
from sklearn.naive_bayes import MultinomialNB
naiveBayes = MultinomialNB()
naiveBayes.fit(x_train,y_train)

train_pred = naiveBayes.predict(x_train)

accuracy = accuracy_score(y_train, train_pred) 
precision = precision_score(y_train, train_pred) 
recall = recall_score(y_train, train_pred) 
f1 = f1_score(y_train, train_pred) 

print("Training data Metrix (Naive Bayes): ","Accuracy :", accuracy , ", Precision :", precision,", Recall :", recall,", F1 Score :", f1)

y_pred = naiveBayes.predict(x_test)

accuracy_test = accuracy_score(y_test, y_pred) 
precision_test = precision_score(y_test, y_pred) 
recall_test = recall_score(y_test, y_pred) 
f1_test = f1_score(y_test, y_pred) 

print("Testing data Metrix (Naive Bayes): ","Accuracy :", accuracy_test , ", Precision :", precision_test,", Recall :", precision_test,", F1 Score :", f1_test)

Training data Metrix (Naive Bayes):  Accuracy : 0.85950625 , Precision : 0.8582513388965002 , Recall : 0.8612871373754797 , F1 Score : 0.8597665583261072
Testing data Metrix (Naive Bayes):  Accuracy : 0.854625 , Precision : 0.8553027265437049 , Recall : 0.8553027265437049 , F1 Score : 0.854424834146952
