In [1]:
import pandas as pd
import nltk
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('punkt')
import re
from bs4 import BeautifulSoup
import contractions

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\jehil\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\jehil\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\jehil\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Read Data

In [2]:
review_data = pd.read_table('amazon_reviews_us_Office_Products_v1_00.tsv', on_bad_lines='skip')

  review_data = pd.read_table('amazon_reviews_us_Office_Products_v1_00.tsv', on_bad_lines='skip')


## Keep Reviews and Ratings

In [3]:
review_data = review_data[['star_rating', 'review_body']]
review_data['star_rating'] = pd.to_numeric(review_data['star_rating'], errors='coerce')
review_data.dropna(inplace=True)

 ## We form two classes and select 50000 reviews randomly from each class.



In [4]:
positive_reviews = review_data[review_data['star_rating'] >= 4]
negative_reviews = review_data[review_data['star_rating'] < 4]

In [5]:
SEED = 42
NUM_ROWS = 50000
reviews_subset = pd.concat([positive_reviews.sample(NUM_ROWS, random_state=SEED), negative_reviews.sample(NUM_ROWS, random_state=SEED)])

# Data Cleaning



In [6]:
def clean_review_text(text):
    text = BeautifulSoup(text, "html.parser").get_text()
    text = re.sub(r'[^a-zA-Z]', ' ', text)
    text = contractions.fix(text)
    text = re.sub(r'\s+', ' ', text)
    text = text.lower()
    return text

In [7]:
reviews_subset['cleaned_text'] = reviews_subset['review_body'].apply(clean_review_text)

  text = BeautifulSoup(text, "html.parser").get_text()
  text = BeautifulSoup(text, "html.parser").get_text()


In [8]:
avg_length_before_cleaning = reviews_subset['review_body'].apply(len).mean()
avg_length_after_cleaning = reviews_subset['cleaned_text'].apply(len).mean()
print(f'Average length of reviews before cleaning: {avg_length_before_cleaning}')
print(f'Average length of reviews after cleaning: {avg_length_after_cleaning}')

Average length of reviews before cleaning: 314.24925
Average length of reviews after cleaning: 299.44586


# Pre-processing

In [9]:
def preprocess_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    words = nltk.word_tokenize(text)
    filtered_words = [word for word in words if word not in stop_words]

    lemmatizer = nltk.stem.WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word) for word in filtered_words]
    
    return ' '.join(lemmatized_words)

In [10]:
reviews_subset['preprocessed_review'] = reviews_subset['cleaned_text'].apply(preprocess_text)

In [11]:
avg_length_before_preprocessing = reviews_subset['cleaned_text'].apply(len).mean()
avg_length_after_preprocessing = reviews_subset['preprocessed_review'].apply(len).mean()
print(f'Average length of reviews before preprocessing: {avg_length_before_preprocessing}')
print(f'Average length of reviews after preprocessing: {avg_length_after_preprocessing}')

Average length of reviews before preprocessing: 299.44586
Average length of reviews after preprocessing: 184.04841


## remove the stop words 

In [None]:
from nltk.corpus import stopwords
 

## perform lemmatization  

In [None]:
from nltk.stem import WordNetLemmatizer


# TF-IDF and BoW Feature Extraction

In [12]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import Perceptron
from sklearn.metrics import precision_score, recall_score, f1_score

In [13]:
X_train, X_test, y_train, y_test = train_test_split(reviews_subset['preprocessed_review'], (reviews_subset['star_rating'] >= 4).astype(int), test_size=0.2, random_state=42)

In [14]:
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [15]:
bow_vectorizer = CountVectorizer()
X_train_bow = bow_vectorizer.fit_transform(X_train)
X_test_bow = bow_vectorizer.transform(X_test)

# Perceptron Using Both Features

In [28]:
perceptron_tfidf = Perceptron()
perceptron_bow = Perceptron()

perceptron_tfidf.fit(X_train_tfidf, y_train)
perceptron_bow.fit(X_train_bow, y_train)

y_pred_tfidf = perceptron_tfidf.predict(X_test_tfidf)
y_pred_bow = perceptron_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'Precision Recall F1 (TF-IDF): {precision_tfidf:.2f} {recall_tfidf:.2f} {f1_tfidf:.2f}')
print(f'Precision Recall F1 (BoW): {precision_bow:.2f} {recall_bow:.2f} {f1_bow:.2f}')


Precision Recall F1 (TF-IDF): 0.78 0.78 0.78
Precision Recall F1 (BoW): 0.81 0.79 0.80


# SVM Using Both Features

In [26]:
from sklearn.svm import SVC

In [29]:
svm_model_tfidf = SVC()
svm_model_bow = SVC()

svm_model_tfidf.fit(X_train_tfidf, y_train)
svm_model_bow.fit(X_train_bow, y_train)


In [30]:

y_pred_tfidf = svm_model_tfidf.predict(X_test_tfidf)
y_pred_bow = svm_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'Precision Recall F1 (TF-IDF): {precision_tfidf:.2f} {recall_tfidf:.2f} {f1_tfidf:.2f}')
print(f'Precision Recall F1 (BoW): {precision_bow:.2f} {recall_bow:.2f} {f1_bow:.2f}')

Precision Recall F1 (TF-IDF): 0.86 0.86 0.86
Precision Recall F1 (BoW): 0.85 0.83 0.84


# Logistic Regression Using Both Features

In [21]:
from sklearn.linear_model import LogisticRegression

In [23]:
lr_model_tfidf = LogisticRegression(max_iter=1000)
lr_model_bow = LogisticRegression(max_iter=1000)

lr_model_tfidf.fit(X_train_tfidf, y_train)
lr_model_bow.fit(X_train_bow, y_train)

y_pred_tfidf = lr_model_tfidf.predict(X_test_tfidf)
y_pred_bow = lr_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'Precision Recall F1 (TF-IDF): {precision_tfidf:.2f} {recall_tfidf:.2f} {f1_tfidf:.2f}')
print(f'Precision Recall F1 (BoW): {precision_bow:.2f} {recall_bow:.2f} {f1_bow:.2f}')

Precision Recall F1 (TF-IDF): 0.86 0.84 0.85
Precision Recall F1 (BoW): 0.83 0.86 0.85


# Naive Bayes Using Both Features

In [24]:
from sklearn.naive_bayes import MultinomialNB

In [25]:
nb_model_tfidf = MultinomialNB()
nb_model_bow = MultinomialNB()

nb_model_tfidf.fit(X_train_tfidf, y_train)
nb_model_bow.fit(X_train_bow, y_train)

y_pred_tfidf = nb_model_tfidf.predict(X_test_tfidf)
y_pred_bow = nb_model_bow.predict(X_test_bow)

precision_tfidf = precision_score(y_test, y_pred_tfidf)
recall_tfidf = recall_score(y_test, y_pred_tfidf)
f1_tfidf = f1_score(y_test, y_pred_tfidf)

precision_bow = precision_score(y_test, y_pred_bow)
recall_bow = recall_score(y_test, y_pred_bow)
f1_bow = f1_score(y_test, y_pred_bow)

print(f'Precision Recall F1 (TF-IDF): {precision_tfidf:.2f} {recall_tfidf:.2f} {f1_tfidf:.2f}')
print(f'Precision Recall F1 (BoW): {precision_bow:.2f} {recall_bow:.2f} {f1_bow:.2f}')

Precision Recall F1 (TF-IDF): 0.83 0.81 0.82
Precision Recall F1 (BoW): 0.78 0.85 0.82
