In [9]:
import pandas as pd
import numpy as np
import re
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

In [10]:
# Load the eBay review dataset from CSV and preview its first rows
data = pd.read_csv('ebay_reviews.csv')
data.head()

Unnamed: 0,category,review title,review content,rating
0,Headsets,Wireless gaming headset,This gaming headset ticks all the boxes # look...,5
1,Headsets,"Good for those with a big head, low budget","Easy setup, rated for 6 hours battery but mine...",3
2,Headsets,MezumiWireless Gaming Headset,I originally bought this wireless headset for ...,5
3,Headsets,HW- S2 great headset.,"This is my 2nd Mezumi headset, It kills the fi...",5
4,Headsets,BEST HEADPHONES I'VE PURCHASED IN MY ENTIRE LIFE,This is probably the best headset I've purchas...,5


In [11]:
# Preprocess the review content
_NON_LETTER_PATTERN = re.compile(r'[^a-z\s]')
_stop_words_cache = None


def _get_stop_words():
    """Return the NLTK English stop words as a frozenset, loading them only once."""
    global _stop_words_cache
    if _stop_words_cache is None:
        # Loaded lazily so defining this cell never touches the NLTK corpus;
        # the first real call pays the cost and later calls reuse the set.
        _stop_words_cache = frozenset(stopwords.words('english'))
    return _stop_words_cache


def preprocess_text(text):
    """Normalize one review into lowercase, letters-only, stop-word-free text.

    Args:
        text: Raw review text. Any non-string value (for example ``None`` or
            the float ``NaN`` that pandas uses for an empty CSV cell) is
            treated as an empty review instead of raising.

    Returns:
        str: The remaining words joined by single spaces, or ``''`` when
        nothing is left after cleaning.
    """
    if not isinstance(text, str):
        return ''
    # Convert to lowercase
    text = text.lower()
    # Remove special characters and numbers (everything except a-z and whitespace)
    text = _NON_LETTER_PATTERN.sub('', text)
    words = text.split()
    if not words:
        return ''
    # Remove stopwords
    # NOTE(review): apostrophes are stripped before this lookup, so contractions
    # such as "I've" become "ive" and may no longer match the stop-word list -
    # confirm whether that is intended.
    stop_words = _get_stop_words()
    return ' '.join(word for word in words if word not in stop_words)


In [16]:
# Clean every review and store the result next to the raw text
data['cleaned_review'] = data['review content'].map(preprocess_text)

# Preview raw and cleaned text side by side
data.loc[:, ['review content', 'cleaned_review']].head()

Unnamed: 0,review content,cleaned_review
0,This gaming headset ticks all the boxes # look...,gaming headset ticks boxes looks grate built l...
1,"Easy setup, rated for 6 hours battery but mine...",easy setup rated hours battery mine lasted ses...
2,I originally bought this wireless headset for ...,originally bought wireless headset xbox latest...
3,"This is my 2nd Mezumi headset, It kills the fi...",nd mezumi headset kills first one better range...
4,This is probably the best headset I've purchas...,probably best headset ive purchased till date ...


In [13]:
def label_sentiment(rating):
    """Map a numeric star rating to a sentiment label.

    Args:
        rating: Numeric rating of a review, or a missing value
            (``None``, ``NaN`` or ``pd.NA``).

    Returns:
        ``'negative'`` for ratings of 2 or lower, ``'neutral'`` for a rating
        of exactly 3, ``'positive'`` for any other rating, and ``None`` when
        the rating is missing.
    """
    # A missing rating fails both comparisons below and would otherwise be
    # silently counted as 'positive'. Returning None keeps it visible, so such
    # rows can be dropped before training.
    if pd.isna(rating):
        return None
    if rating <= 2:
        return 'negative'
    if rating == 3:
        return 'neutral'
    return 'positive'

In [14]:
# Derive a sentiment label for each review from its rating
data['sentiment'] = data['rating'].map(label_sentiment)


In [15]:
# Spot-check the rating-to-sentiment mapping on the first rows
data.loc[:, ['rating', 'sentiment']].head()


Unnamed: 0,rating,sentiment
0,5,positive
1,3,neutral
2,5,positive
3,5,positive
4,5,positive


In [18]:
# Features are the cleaned review texts; targets are the sentiment labels
X, y = data['cleaned_review'], data['sentiment']

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the reviews for evaluation; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [22]:
# Fit the TF-IDF vectorizer on the training reviews only, then reuse the
# fitted vectorizer to transform the held-out reviews
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Print the shapes of both feature matrices as a sanity check
train_shape, test_shape = X_train_tfidf.shape, X_test_tfidf.shape
print(train_shape, test_shape)

(35804, 5000) (8952, 5000)


In [25]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

# Train a multinomial Naive Bayes classifier on the TF-IDF training features
# (fit() returns the fitted estimator itself)
model = MultinomialNB().fit(X_train_tfidf, y_train)

# Predict sentiment labels for the held-out reviews
# NOTE(review): the recorded evaluation output shows recall of 0.02 for
# 'negative' and 0.00 for 'neutral' - the classes look heavily imbalanced, so
# consider rebalancing or a different model before relying on these predictions.
y_pred = model.predict(X_test_tfidf)


In [26]:
# Per-class precision / recall / F1 on the held-out reviews
print(classification_report(y_true=y_test, y_pred=y_pred))
# Confusion matrix: rows are the true labels, columns are the predicted labels
print(confusion_matrix(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

    negative       0.86      0.02      0.03       379
     neutral       0.00      0.00      0.00       284
    positive       0.93      1.00      0.96      8289

    accuracy                           0.93      8952
   macro avg       0.59      0.34      0.33      8952
weighted avg       0.89      0.93      0.89      8952

[[   6    0  373]
 [   1    0  283]
 [   0    1 8288]]


In [28]:
import pickle

# Persist the fitted classifier and its vectorizer, one pickle file each, so
# both can be reloaded together later
for artifact_path, artifact in (
    ('naive_bayes_model.pkl', model),
    ('tfidf_vectorizer.pkl', vectorizer),
):
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)