# **Evaluation of Preprocessing Steps**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer
import string
import re

# Download the NLTK resources used by the helpers below:
#   'punkt'     -> word_tokenize
#   'wordnet'   -> WordNetLemmatizer
#   'stopwords' -> stopwords.words('english')
# quiet=True keeps the downloader log (which prints the local nltk_data path,
# including the OS user name) out of the saved notebook output.
# NOTE(review): recent NLTK releases may additionally require 'punkt_tab' for
# word_tokenize — confirm against the installed version.
for resource in ('punkt', 'wordnet', 'stopwords'):
    nltk.download(resource, quiet=True)

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\saman\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
# Load the dataset
# The path is relative, so the CSV must sit in the notebook's working directory.
# NOTE(review): the preview below shows an "Unnamed: 0" column, which suggests the
# CSV was written with its index — confirm whether that column is needed.
df = pd.read_csv("downsampled_dataset.csv")

In [15]:
# Print the (rows, columns) shape, then display the first five rows as the cell output.
print(df.shape)
df.head()

(29999, 16)


Unnamed: 0,marketplace,customer_id,review_id,product_id,product_parent,product_title,product_category,star_rating,helpful_votes,total_votes,vine,verified_purchase,review_headline,review_body,review_date,sentiment
0,US,44769940,R315SMIS3GYVLG,B0010K6TXQ,552991445,Pyle PTA1000 1000W Professional Power Amplifier,Electronics,1,0,0,N,Y,Broken on arrival,Only one channel worked.,2015-08-28,Negative
1,US,29650062,R15FX2RLC369WA,B00V3KM1Y4,626268159,iGotTech Cable Clips & Cord Management System:...,Electronics,4,0,0,N,Y,Holding up so far,I've been using these by my bed for a little l...,2015-08-22,Positive
2,US,51949299,R2SXWFPDZZBLIM,B00W9TDOMG,750325480,Lonve Music Player 16GB MP4/MP3 Player Black 1...,Electronics,5,3,3,N,N,"At this price? 5 stars, easily.","OK, I've reviewed a similar player and hated i...",2015-08-25,Positive
3,US,44531528,R2PNB95FOWAFVU,B00R3M4KEU,605481722,Jarv NMotion PRO Sport Wireless Bluetooth Earb...,Electronics,4,0,0,N,N,Good quality headphones.,I've enjoyed these headphones for a few weeks ...,2015-08-16,Positive
4,US,43639082,R3NJ9O0BNG216G,B00HY4PICU,493178902,FRiEQ 3.5mm Male To Male Car and Home Stereo C...,Electronics,5,0,0,N,Y,Five Stars,Awesome cable. Thanks to LOWERPRICEUSA for the...,2015-08-21,Positive


In [None]:
# Only the review text ("review_body") and its label ("sentiment") are needed for
# classification.
# .copy() makes the result an independent frame instead of a slice of the loaded
# data, so the later assignments to df["review_body"] in the preprocessing cell
# do not raise SettingWithCopyWarning.
df = df[["review_body", "sentiment"]].copy()
df.head(10)

## **Naive Bayes Model without preprocessing**


In [5]:
# Split into train and test sets (80/20), stratified on the sentiment label so
# both splits keep the same class proportions.
# The text and label columns are passed separately so train_test_split returns
# the four Series directly. This replaces the earlier pattern of splitting the
# whole frame and then dropping the label column in place, which was redundant
# and can trigger SettingWithCopyWarning. The rows selected depend only on the
# number of samples, random_state and the stratify labels, so the split itself
# is unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    df['review_body'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

print (f"y_train: {y_train.shape}/ x_train: {x_train.shape}")
print (f"y_test: {y_test.shape}/ x_test: {x_test.shape}")





y_train: (23999,)/ x_train: (23999,)
y_test: (6000,)/ x_test: (6000,)


In [6]:
# TF-IDF features using TfidfVectorizer with its default settings.
tfidf_vectorizer = TfidfVectorizer()

# Fit and transform the training data
# (the vectorizer is fitted on the training split only)
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Transform the test data
# (reuses the already-fitted vectorizer; nothing is re-fitted on the test split)
X_test_tfidf = tfidf_vectorizer.transform(x_test)


# Report both matrix shapes; the column counts should match because both come
# from the same fitted vectorizer.
print (f'Train set shape\t:{X_train_tfidf.shape}\nTest set shape\t:{X_test_tfidf.shape}')

Train set shape	:(23999, 20221)
Test set shape	:(6000, 20221)


In [7]:
# MultinomialNB is already imported in the first cell; only the metrics helpers
# are imported here.
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
# zero_division=0 reports precision/F1 as 0 for a class that is never predicted
# (same numbers as the default) without emitting UndefinedMetricWarning for each
# averaged metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (rows = true labels, columns = predicted labels)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7823333333333333

Classification Report:
              precision    recall  f1-score   support

    Negative       0.97      0.11      0.20      1008
     Neutral       0.00      0.00      0.00       411
    Positive       0.78      1.00      0.88      4581

    accuracy                           0.78      6000
   macro avg       0.58      0.37      0.36      6000
weighted avg       0.76      0.78      0.70      6000


Confusion Matrix:
[[ 115    0  893]
 [   1    0  410]
 [   2    0 4579]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# **Text Preprocessing**

### Lowercasing, Removing punctuation and special characters, Tokenization & Stopword removal, Stemming/Lemmatization

In [17]:
# Text normalization
def normalize_text(text):
    """Return ``text`` converted to lowercase.

    The value is coerced with ``str()`` first, matching ``remove_punctuation``
    and ``remove_spec_char``, so a non-string cell (a number, or a missing value
    read by pandas as NaN) no longer raises ``AttributeError``. Note that such
    values become their string form (NaN -> ``"nan"``).

    Parameters
    ----------
    text : object
        The value to normalize; normally a ``str``.

    Returns
    -------
    str
        The lowercase string form of ``text``.
    """
    return str(text).lower()

# Removing punctuation (not used)
def remove_punctuation(text):
    """Return the string form of ``text`` with all ``string.punctuation`` characters deleted.

    Every other character, including whitespace and digits, is kept in place.
    """
    unwanted = set(string.punctuation)
    kept_chars = [ch for ch in str(text) if ch not in unwanted]
    return ''.join(kept_chars)

# Removing special characters (not used)
def remove_spec_char(text):
    """Replace every non-alphanumeric ASCII character with a space and collapse whitespace runs.

    The value is coerced with ``str()`` first. Leading and trailing spaces that
    result from the replacement are kept (no strip is applied).

    Parameters
    ----------
    text : object
        The value to clean; normally a ``str``.

    Returns
    -------
    str
        The text containing only ``[a-zA-Z0-9]`` characters separated by single spaces.
    """
    text = str(text)
    # Raw strings: '\s' in a plain string literal is an invalid escape sequence
    # and raises a DeprecationWarning/SyntaxWarning on current Python versions.
    text = re.sub(r'[^a-zA-Z0-9]', ' ', text)
    text = re.sub(r'\s+', ' ', text)
    return text

# Tokenization & Stopword removal
def remove_stopwords(text):
    """Tokenize ``text`` with ``word_tokenize`` and drop English stopwords.

    The stopword set is loaded from the NLTK corpus once, on the first call, and
    cached on the function object. It was previously rebuilt for every review,
    which repeats the same corpus read for each row of the DataFrame.

    Matching is an exact, case-sensitive membership test against the NLTK list,
    so the text should be lowercased beforehand (the preprocessing cell applies
    ``normalize_text`` first).

    Parameters
    ----------
    text : str
        The review text to filter.

    Returns
    -------
    str
        The remaining tokens joined by single spaces (punctuation tokens
        produced by the tokenizer are kept).
    """
    stop_words = getattr(remove_stopwords, '_stop_words', None)
    if stop_words is None:
        stop_words = frozenset(stopwords.words('english'))
        remove_stopwords._stop_words = stop_words
    tokens = word_tokenize(text)
    return ' '.join([word for word in tokens if word not in stop_words])

# Lemmatization (not used)
def lemmatize_text(text):
    """Tokenize ``text`` and return its tokens, each passed through
    ``WordNetLemmatizer.lemmatize`` with default arguments, joined by single spaces.
    """
    wnl = WordNetLemmatizer()
    lemmas = map(wnl.lemmatize, word_tokenize(text))
    return ' '.join(lemmas)

# Stemming
def stem_words(text):
    """Split ``text`` on whitespace, Porter-stem each piece, and rejoin with single spaces."""
    stemmer = PorterStemmer()
    stemmed = []
    for token in text.split():
        stemmed.append(stemmer.stem(token))
    return ' '.join(stemmed)


In [18]:
# Apply preprocessing steps
#
# Active pipeline, applied to every review in this order:
#   1. normalize_text   - lowercase
#   2. remove_stopwords - tokenize and drop English stopwords
#   3. stem_words       - Porter stemming
#
# remove_punctuation and remove_spec_char (which would run after step 1) and
# lemmatize_text (the alternative to step 3) are defined above but are
# deliberately not applied in this run.
df["review_body"] = (
    df["review_body"]
    .apply(normalize_text)
    .apply(remove_stopwords)
    .apply(stem_words)
)

In [19]:
# Preview the first ten rows after preprocessing as a sanity check.
# NOTE(review): the saved output below still contains unstemmed forms such as
# "using" and "enjoyed", which suggests it was produced with a different step
# selection than the preprocessing cell currently applies — re-run to confirm.
df.head(10)

Unnamed: 0,review_body,sentiment
0,one channel worked .,Negative
1,'ve using bed little le month . 've holding pr...,Positive
2,"ok , 've reviewed similar player hated , like ...",Positive
3,'ve enjoyed headphone week . issue bluetooth s...,Positive
4,awesome cable . thanks lowerpriceusa good deal...,Positive
5,great,Positive
6,work fine money .,Positive
7,work great,Positive
8,"reasonable price , prompt service",Positive
9,,Positive


## **Naive Bayes Model with preprocessing**

In [20]:
# Split the preprocessed data into train and test sets (80/20), stratified on
# the sentiment label.
# The text and label columns are passed separately so train_test_split returns
# the four Series directly, instead of splitting the whole frame and dropping
# the label column in place (redundant, and a source of SettingWithCopyWarning).
# The rows selected depend only on the number of samples, random_state and the
# stratify labels, so the split itself is unchanged.
x_train, x_test, y_train, y_test = train_test_split(
    df['review_body'],
    df['sentiment'],
    test_size=0.2,
    random_state=42,
    stratify=df['sentiment'],
)

print (f"y_train: {y_train.shape}/ x_train: {x_train.shape}")
print (f"y_test: {y_test.shape}/ x_test: {x_test.shape}")


y_train: (23999,)/ x_train: (23999,)
y_test: (6000,)/ x_test: (6000,)


In [21]:
# A fresh TfidfVectorizer (default settings) for the preprocessed text; this
# rebinds the names used by the earlier TF-IDF cell.
tfidf_vectorizer = TfidfVectorizer()
# Fit and transform the training data
# (the vectorizer is fitted on the training split only)
X_train_tfidf = tfidf_vectorizer.fit_transform(x_train)

# Transform the test data
# (reuses the already-fitted vectorizer; nothing is re-fitted on the test split)
X_test_tfidf = tfidf_vectorizer.transform(x_test)

In [22]:
# MultinomialNB is already imported in the first cell; only the metrics helpers
# are imported here.
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

# Initialize Naive Bayes classifier
nb_classifier = MultinomialNB()

# Train the classifier
nb_classifier.fit(X_train_tfidf, y_train)

# Predict on the test data
y_pred = nb_classifier.predict(X_test_tfidf)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Classification report
# zero_division=0 reports precision/F1 as 0 for a class that is never predicted
# (same numbers as the default) without emitting UndefinedMetricWarning for each
# averaged metric.
print("\nClassification Report:")
print(classification_report(y_test, y_pred, zero_division=0))

# Confusion matrix (rows = true labels, columns = predicted labels)
print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

Accuracy: 0.7848333333333334

Classification Report:
              precision    recall  f1-score   support

    Negative       0.96      0.13      0.23      1008
     Neutral       0.00      0.00      0.00       411
    Positive       0.78      1.00      0.88      4581

    accuracy                           0.78      6000
   macro avg       0.58      0.38      0.37      6000
weighted avg       0.76      0.78      0.71      6000


Confusion Matrix:
[[ 132    0  876]
 [   1    0  410]
 [   4    0 4577]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
