In [20]:
import os
import json
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import jaccard_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import pickle
import gc

In [10]:
# Data Loading and Cleaning

In [11]:
# kagglehub is called below but was never imported anywhere in the notebook,
# so running this cell on a fresh kernel failed with NameError.
import kagglehub

# Download latest version; the call returns the local directory that holds the dataset files
path = kagglehub.dataset_download("abdallahwagih/amazon-reviews")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/abdallahwagih/amazon-reviews/versions/1


In [12]:
# Full path of the reviews file inside the downloaded dataset directory
file_path = os.path.join(
    path,
    "Cell_Phones_and_Accessories_5.json",
)

In [13]:
# Read the JSONL data (one JSON object per line) into a DataFrame.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale, and blank lines are skipped because
# json.loads("") raises JSONDecodeError.
with open(file_path, 'r', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(data)

# Data Cleaning
# Drop rows with missing values (rebinding instead of inplace=True keeps the
# cell free of in-place mutation).
# NOTE(review): with no `subset`, a row is dropped when ANY column is missing,
# not only the columns used for modelling (reviewText / overall) - confirm
# this is intended.
df = df.dropna()

# Standardize text fields: lowercase, then remove every character that is
# neither a word character nor whitespace
df['reviewText'] = (
    df['reviewText']
    .str.lower()
    .str.replace(r'[^\w\s]', '', regex=True)
)

# Dataset Splitting
train, temp = train_test_split(df, test_size=0.2, random_state=42)  # 80% train
validation, test = train_test_split(temp, test_size=0.5, random_state=42)  # 10% validation, 10% test

# Save splits if needed
train.to_csv("train.csv", index=False)
validation.to_csv("validation.csv", index=False)
test.to_csv("test.csv", index=False)

In [14]:
# Report the cleaned row count, then show the first record as a sanity check
print(
    f"Number of rows in the dataset: {df.shape[0]}",
    df.iloc[0],
    sep="\n",
)

Number of rows in the dataset: 190920
reviewerID                                           A30TL5EWN6DFXT
asin                                                     120401325X
reviewerName                                              christina
helpful                                                      [0, 0]
reviewText        they look good and stick good i just dont like...
overall                                                         4.0
summary                                                  Looks Good
unixReviewTime                                           1400630400
reviewTime                                              05 21, 2014
Name: 0, dtype: object


In [None]:
# Jaccard Similarity on binary word-presence vectors

In [15]:
# Select relevant text data for analysis
text_data = df['reviewText']

# Create CountVectorizer instance for feature extraction
# (binary=True: each feature is 1 when the word occurs in the review)
vectorizer = CountVectorizer(binary=True)
binary_matrix = vectorizer.fit_transform(text_data)

# Compute Jaccard Similarity between the first reviews as a sample.
# The sample rows are densified once up front instead of calling .toarray()
# on both rows for every pair inside the loop, and the sample size is capped
# at the number of available rows so fewer than 10 reviews no longer raises
# IndexError.
sample_size = min(10, binary_matrix.shape[0])
sample_vectors = binary_matrix[:sample_size].toarray()

print("\nJaccard Similarity Sample (First 10 Reviews):")
for i in range(sample_size):
    for j in range(i + 1, sample_size):
        similarity = jaccard_score(sample_vectors[i], sample_vectors[j])
        print(f"Review {i + 1} vs Review {j + 1}: {similarity:.2f}")


Jaccard Similarity Sample (First 10 Reviews):
Review 1 vs Review 2: 0.11
Review 1 vs Review 3: 0.06
Review 1 vs Review 4: 0.08
Review 1 vs Review 5: 0.04
Review 1 vs Review 6: 0.10
Review 1 vs Review 7: 0.07
Review 1 vs Review 8: 0.05
Review 1 vs Review 9: 0.05
Review 1 vs Review 10: 0.09
Review 2 vs Review 3: 0.21
Review 2 vs Review 4: 0.09
Review 2 vs Review 5: 0.10
Review 2 vs Review 6: 0.16
Review 2 vs Review 7: 0.07
Review 2 vs Review 8: 0.08
Review 2 vs Review 9: 0.07
Review 2 vs Review 10: 0.10
Review 3 vs Review 4: 0.08
Review 3 vs Review 5: 0.20
Review 3 vs Review 6: 0.10
Review 3 vs Review 7: 0.07
Review 3 vs Review 8: 0.13
Review 3 vs Review 9: 0.09
Review 3 vs Review 10: 0.11
Review 4 vs Review 5: 0.07
Review 4 vs Review 6: 0.04
Review 4 vs Review 7: 0.07
Review 4 vs Review 8: 0.04
Review 4 vs Review 9: 0.07
Review 4 vs Review 10: 0.08
Review 5 vs Review 6: 0.03
Review 5 vs Review 7: 0.08
Review 5 vs Review 8: 0.05
Review 5 vs Review 9: 0.06
Review 5 vs Review 10: 0.08
Rev

In [None]:
# Logistic Regression on binary (word presence/absence) features

In [17]:
# Target variable: the 'overall' rating column of each split
y_train, y_validation, y_test = (
    split['overall'] for split in (train, validation, test)
)

In [28]:
# Binary word-presence features: the vocabulary is learned from the training
# split only, then reused unchanged for the validation and test splits
vectorizer = CountVectorizer(binary=True)
X_train = vectorizer.fit_transform(train['reviewText'])
X_validation, X_test = (
    vectorizer.transform(split['reviewText']) for split in (validation, test)
)

In [22]:
# Classifier configuration: liblinear solver, at most 1000 iterations, fixed seed.
# NOTE(review): the target ('overall') has more than two classes; check that the
# installed scikit-learn release still accepts liblinear for multiclass targets
# - TODO confirm.
logistic = LogisticRegression(
    solver='liblinear',
    max_iter=1000,
    random_state=42,
)


In [23]:
# Fit the classifier on the training features and their rating labels
logistic.fit(X=X_train, y=y_train)

In [24]:
# Score the validation split with the fitted model
y_validation_pred = logistic.predict(X_validation)
validation_accuracy = accuracy_score(y_true=y_validation, y_pred=y_validation_pred)

# Headline accuracy first, then the per-class precision / recall / F1 table
print(
    f"Validation Accuracy: {validation_accuracy:.2f}",
    "\nValidation Classification Report:",
    classification_report(y_true=y_validation, y_pred=y_validation_pred),
    sep="\n",
)


Validation Accuracy: 0.64

Validation Classification Report:
              precision    recall  f1-score   support

         1.0       0.56      0.53      0.54      1280
         2.0       0.25      0.10      0.15      1060
         3.0       0.38      0.28      0.32      2091
         4.0       0.45      0.30      0.36      3996
         5.0       0.73      0.90      0.81     10665

    accuracy                           0.64     19092
   macro avg       0.47      0.42      0.44     19092
weighted avg       0.59      0.64      0.61     19092



In [25]:
# Score the test split with the fitted model
y_test_pred = logistic.predict(X_test)
test_accuracy = accuracy_score(y_true=y_test, y_pred=y_test_pred)

# Headline accuracy first, then the per-class precision / recall / F1 table
print(
    f"Test Accuracy: {test_accuracy:.2f}",
    "\nTest Classification Report:",
    classification_report(y_true=y_test, y_pred=y_test_pred),
    sep="\n",
)

Test Accuracy: 0.65

Test Classification Report:
              precision    recall  f1-score   support

         1.0       0.60      0.55      0.58      1317
         2.0       0.32      0.15      0.20      1033
         3.0       0.39      0.28      0.33      2099
         4.0       0.44      0.30      0.35      3932
         5.0       0.73      0.91      0.81     10711

    accuracy                           0.65     19092
   macro avg       0.50      0.44      0.45     19092
weighted avg       0.60      0.65      0.61     19092



In [26]:
# Persist the fitted model first, then the vectorizer, each to its own pickle file
for artifact, target_path in (
    (logistic, 'logistic_model_jaccard.pkl'),
    (vectorizer, 'count_vectorizer_jaccard.pkl'),
):
    with open(target_path, 'wb') as handle:
        pickle.dump(artifact, handle)

# Cleanup: run a garbage-collection pass (its return value is the cell's output)
gc.collect()

28