In [1]:
import pandas as pd

# Step 1: Load the dataset
data = pd.read_csv("merged_final_dataset.csv")


In [2]:
data.head()

Unnamed: 0,main_category,title_x,average_rating,rating_number,features,description,price,details,parent_asin,rating,title_y,text,user_id
0,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,4,"fresh tasting and smelling, slightly acidic, l...",I happen to have their Instant Pod dual coffee...,AF2BLE54TEMGZ546U763ZHZRXC4A
1,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,5,"dynamic flavor, interesting flavor profile, ha...","I have tried this Leggero light roast, and the...",AF2BLE54TEMGZ546U763ZHZRXC4A
2,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,4,Pricey but so much flavor!,This is a great roast for ppl who aren't into ...,AEUDZQDVSZYCHEXQSXLB6NWQTMHA
3,Grocery,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,"['The food tag has a good tip, directly on the...",[],8.99,"{'Color': 'Yellow,Purple', 'Brand': 'Ktdorns',...",B07PK9L29R,5,Fun!,So much fun to color on cookies. As an artist ...,AGECC4F4CDL2AVODIRNCF3V63BEQ
4,Grocery,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,"['The food tag has a good tip, directly on the...",[],8.99,"{'Color': 'Yellow,Purple', 'Brand': 'Ktdorns',...",B07PK9L29R,5,Perfect touch !,These are perfect for adding creative touches....,AFF6LERKD46F2RLIKAMQTAQPOIWA


In [3]:
# Step 3: Data Preprocessing

# Check for missing values
print(data.isnull().sum())

# If there are missing values, handle them (for example, by imputation or dropping rows/columns)
data.dropna(inplace=True)  # For demonstration, assuming you're dropping rows with missing values

# Check data types of each column
print(data.dtypes)

main_category      0
title_x           13
average_rating     0
rating_number      0
features           0
description        0
price              0
details            0
parent_asin        0
rating             0
title_y           35
text              10
user_id            0
dtype: int64
main_category      object
title_x            object
average_rating    float64
rating_number       int64
features           object
description        object
price             float64
details            object
parent_asin        object
rating              int64
title_y            object
text               object
user_id            object
dtype: object


In [4]:
print(data.isnull().sum())

main_category     0
title_x           0
average_rating    0
rating_number     0
features          0
description       0
price             0
details           0
parent_asin       0
rating            0
title_y           0
text              0
user_id           0
dtype: int64


In [5]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.tokenize import word_tokenize

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('vader_lexicon')

def preprocess_text(text):
    # Check if text is not NaN
    if isinstance(text, str):
        # Lowercasing
        text = text.lower()

        # Removing special characters and extra spaces
        text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
        text = re.sub(' +', ' ', text)

        # Tokenization
        tokens = word_tokenize(text)

        # Stopwords removal
        stop_words = set(stopwords.words('english'))
        tokens = [word for word in tokens if word not in stop_words]

        # Lemmatization
        lemmatizer = WordNetLemmatizer()
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

        return ' '.join(tokens)
    else:
        # If text is NaN, return an empty string
        return ''

# Apply preprocessing to product titles
data['review_text'] = data['text'].apply(preprocess_text)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\Dell\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [6]:
data.head()

Unnamed: 0,main_category,title_x,average_rating,rating_number,features,description,price,details,parent_asin,rating,title_y,text,user_id,review_text
0,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,4,"fresh tasting and smelling, slightly acidic, l...",I happen to have their Instant Pod dual coffee...,AF2BLE54TEMGZ546U763ZHZRXC4A,happen instant pod dual coffee maker spot nesp...
1,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,5,"dynamic flavor, interesting flavor profile, ha...","I have tried this Leggero light roast, and the...",AF2BLE54TEMGZ546U763ZHZRXC4A,tried leggero light roast lungo medium roast d...
2,Grocery,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,['USDA ORGANIC: A more environmentally friendl...,['Instant® Compostable Espresso Capsules deliv...,8.49,{'Product Dimensions': '7.38 x 3.86 x 1.5 inch...,B0C2W77WJX,4,Pricey but so much flavor!,This is a great roast for ppl who aren't into ...,AEUDZQDVSZYCHEXQSXLB6NWQTMHA,great roast ppl arent bitter heavy taste like ...
3,Grocery,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,"['The food tag has a good tip, directly on the...",[],8.99,"{'Color': 'Yellow,Purple', 'Brand': 'Ktdorns',...",B07PK9L29R,5,Fun!,So much fun to color on cookies. As an artist ...,AGECC4F4CDL2AVODIRNCF3V63BEQ,much fun color cooky artist really work well g...
4,Grocery,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,"['The food tag has a good tip, directly on the...",[],8.99,"{'Color': 'Yellow,Purple', 'Brand': 'Ktdorns',...",B07PK9L29R,5,Perfect touch !,These are perfect for adding creative touches....,AFF6LERKD46F2RLIKAMQTAQPOIWA,perfect adding creative touch taste coloring a...


In [7]:
# Perform sentiment analysis
sid = SentimentIntensityAnalyzer()

def get_sentiment_score(text):
    sentiment_score = sid.polarity_scores(text)
    return sentiment_score['compound']

data['sentiment_score'] = data['review_text'].apply(get_sentiment_score)

# Convert sentiment scores to binary labels
data['sentiment_label'] = data['sentiment_score'].apply(lambda x: 1 if x >= 0 else 0)

# Drop intermediate columns if necessary
data.drop(['review_text', 'sentiment_score'], axis=1, inplace=True)

# Save or use the resulting DataFrame as needed

In [8]:
data= data.drop(columns=['features',	'description','details','main_category'])


In [9]:

data.rename(columns={'title_x':'product_name'},inplace = True)
data.head()

Unnamed: 0,product_name,average_rating,rating_number,price,parent_asin,rating,title_y,text,user_id,sentiment_label
0,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,8.49,B0C2W77WJX,4,"fresh tasting and smelling, slightly acidic, l...",I happen to have their Instant Pod dual coffee...,AF2BLE54TEMGZ546U763ZHZRXC4A,1
1,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,8.49,B0C2W77WJX,5,"dynamic flavor, interesting flavor profile, ha...","I have tried this Leggero light roast, and the...",AF2BLE54TEMGZ546U763ZHZRXC4A,1
2,"Instant Compostable Espresso Capsules, Lungo M...",4.3,85,8.49,B0C2W77WJX,4,Pricey but so much flavor!,This is a great roast for ppl who aren't into ...,AEUDZQDVSZYCHEXQSXLB6NWQTMHA,1
3,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,8.99,B07PK9L29R,5,Fun!,So much fun to color on cookies. As an artist ...,AGECC4F4CDL2AVODIRNCF3V63BEQ,1
4,"Edible Markers,Food Coloring Markers,Food colo...",4.3,1193,8.99,B07PK9L29R,5,Perfect touch !,These are perfect for adding creative touches....,AFF6LERKD46F2RLIKAMQTAQPOIWA,1


# Decision Tree

In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report


# Split the data into features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Decision Tree Classifier
clf = DecisionTreeClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.7425030530553168
Classification Report:
               precision    recall  f1-score   support

           0       0.32      0.32      0.32      4229
           1       0.84      0.84      0.84     17880

    accuracy                           0.74     22109
   macro avg       0.58      0.58      0.58     22109
weighted avg       0.74      0.74      0.74     22109



# GradientBoosting

In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report


# Split the data into features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Gradient Boosting Classifier
clf = GradientBoostingClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.8089013523904293
Classification Report:
               precision    recall  f1-score   support

           0       0.62      0.00      0.00      4229
           1       0.81      1.00      0.89     17880

    accuracy                           0.81     22109
   macro avg       0.72      0.50      0.45     22109
weighted avg       0.77      0.81      0.72     22109



# AdaBoosting

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the AdaBoost Classifier
clf = AdaBoostClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.8088108914921525
Classification Report:
               precision    recall  f1-score   support

           0       0.55      0.00      0.01      4229
           1       0.81      1.00      0.89     17880

    accuracy                           0.81     22109
   macro avg       0.68      0.50      0.45     22109
weighted avg       0.76      0.81      0.72     22109



# XGBoost

In [13]:

import pandas as pd
from sklearn.model_selection import train_test_split
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the XGBoost Classifier
clf = xgb.XGBClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.8041521552309014
Classification Report:
               precision    recall  f1-score   support

           0       0.41      0.06      0.10      4229
           1       0.81      0.98      0.89     17880

    accuracy                           0.80     22109
   macro avg       0.61      0.52      0.50     22109
weighted avg       0.74      0.80      0.74     22109



# KNN

In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report

# Split the data into features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the KNN Classifier
clf = KNeighborsClassifier()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.7849292143470985
Classification Report:
               precision    recall  f1-score   support

           0       0.36      0.16      0.22      4229
           1       0.82      0.93      0.88     17880

    accuracy                           0.78     22109
   macro avg       0.59      0.55      0.55     22109
weighted avg       0.74      0.78      0.75     22109



# Multinomial Naive Bayes

In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report


# Select features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Multinomial Naive Bayes model
clf = MultinomialNB()

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.7178524582749106
Classification Report:
               precision    recall  f1-score   support

           0       0.21      0.18      0.19      4229
           1       0.81      0.85      0.83     17880

    accuracy                           0.72     22109
   macro avg       0.51      0.51      0.51     22109
weighted avg       0.70      0.72      0.71     22109



# Categorical Naive Bayes

In [16]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import CategoricalNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OrdinalEncoder

# Select features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Convert categorical features to ordinal representation
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)

# Initialize the Categorical Naive Bayes Classifier
clf = CategoricalNB()

# Train the classifier
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)


Accuracy: 0.7910353249807771
Classification Report:
               precision    recall  f1-score   support

           0       0.40      0.18      0.24      4229
           1       0.83      0.94      0.88     17880

    accuracy                           0.79     22109
   macro avg       0.61      0.56      0.56     22109
weighted avg       0.75      0.79      0.76     22109



# Random Forest

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, mean_squared_error, mean_absolute_error
import numpy as np
# Select features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Convert categorical features to ordinal representation
encoder = OrdinalEncoder()
X_encoded = encoder.fit_transform(X)


# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_encoded, y, test_size=0.2, random_state=42)



# Initialize and train the Random Forest classifier
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)


# Predict on the testing set
y_pred = rf_classifier.predict(X_test)

# Evaluate the classifier
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Calculate RMSE and MAE
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
mae = mean_absolute_error(y_test, y_pred)
print("RMSE:", rmse)
print("MAE:", mae)

# Print classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.7770591161970238
RMSE: 0.472166161221848
MAE: 0.22294088380297616
Classification Report:
              precision    recall  f1-score   support

           0       0.36      0.22      0.27      4229
           1       0.83      0.91      0.87     17880

    accuracy                           0.78     22109
   macro avg       0.60      0.56      0.57     22109
weighted avg       0.74      0.78      0.75     22109



# Logistic Regression

In [18]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Select features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the Logistic Regression model
clf = LogisticRegression()

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred, zero_division='warn')
print("Classification Report:\n", report)


Accuracy: 0.8087204305938758
Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00      4229
           1       0.81      1.00      0.89     17880

    accuracy                           0.81     22109
   macro avg       0.40      0.50      0.45     22109
weighted avg       0.65      0.81      0.72     22109



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SVM

In [19]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# Select features and target
X = data[['average_rating', 'rating_number', 'price', 'rating']]
y = data['sentiment_label']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the SVM model
clf = SVC()

# Train the model
clf.fit(X_train, y_train)

# Make predictions
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

# Generate classification report
report = classification_report(y_test, y_pred)
print("Classification Report:\n", report)
