# Import Dataset
https://jmcauley.ucsd.edu/data/amazon/

In [6]:
import gzip
import json
import pandas as pd
import warnings

# Suppress all warnings
warnings.filterwarnings('ignore')

def load_gz_json(path, max_records=None):
    data = []
    with gzip.open(path, 'rb') as f:
        for i, line in enumerate(f):
            data.append(json.loads(line))
            if max_records and (i + 1) >= max_records:
                break
    return pd.DataFrame(data)

# Load first 100,000 reviews
reviews_df = load_gz_json('data/Electronics.jsonl.gz', max_records=100000)
print(reviews_df.shape)
print(reviews_df.columns)

(100000, 10)
Index(['rating', 'title', 'text', 'images', 'asin', 'parent_asin', 'user_id',
       'timestamp', 'helpful_vote', 'verified_purchase'],
      dtype='object')


In [7]:
# Load metadata
meta_df = load_gz_json('data/meta_Electronics.jsonl.gz', max_records=100000)
print(meta_df.shape)
print(meta_df.columns)

(100000, 16)
Index(['main_category', 'title', 'average_rating', 'rating_number', 'features',
       'description', 'price', 'images', 'videos', 'store', 'categories',
       'details', 'parent_asin', 'bought_together', 'subtitle', 'author'],
      dtype='object')


# Preprocessing

In [8]:
%pip install nltk
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords 
from nltk.stem import WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('punkt')
nltk.download('punkt_tab') 

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess(text):
    tokens = word_tokenize(text.lower())
    tokens = [w for w in tokens if w.isalpha()]
    tokens = [w for w in tokens if w not in stop_words]
    tokens = [lemmatizer.lemmatize(w) for w in tokens]
    return tokens

reviews_df['tokens'] = reviews_df['text'].fillna('').apply(preprocess)
reviews_df['clean_text'] = reviews_df['tokens'].apply(lambda tokens: ' '.join(tokens))



Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\VGoli\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\VGoli\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\VGoli\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\VGoli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\VGoli\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Sentiment Labeling

In [9]:
def get_sentiment_label(rating):
    if rating >= 4:
        return 1   # positive
    elif rating <= 2:
        return 0   # negative
    else:
        return None  # neutral / skip

reviews_df['label'] = reviews_df['rating'].apply(get_sentiment_label)
reviews_df = reviews_df.dropna(subset=['label'])  # remove neutral or missing
reviews_df['label'] = reviews_df['label'].astype(int)


# Logistic Regression with TF-IDF

In [10]:
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, f1_score, classification_report
import numpy as np

X = reviews_df['clean_text'].values
y = reviews_df['label'].values

accuracies = []
fold = 1

def run_log_reg(X, y, vectorizer, model, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True, random_state=42)
    scores = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y)):
        X_train, X_test = X[train_idx], X[test_idx]
        y_train, y_test = y[train_idx], y[test_idx]

        X_train_vec = vectorizer.fit_transform(X_train)
        X_test_vec = vectorizer.transform(X_test)

        model.fit(X_train_vec, y_train)
        preds = model.predict(X_test_vec)

        acc = accuracy_score(y_test, preds)
        f1 = f1_score(y_test, preds)
        scores.append((acc, f1))
        print(f"Fold {fold + 1}: Accuracy={acc:.4f}, F1={f1:.4f}")
        print(classification_report(y_test, preds, digits=3))

    avg_acc = np.mean([s[0] for s in scores])
    print("Avg Accuracy:", avg_acc)
    print("Avg F1 Score:", np.mean([s[1] for s in scores]))
    return avg_acc

# 10 different configs for ablation study

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

configs = [
    {"max_features": 10000},
    {"max_features": 5000},
    {"max_features": 100000},
    {"max_features": 500000},
    {"ngram_range": (1, 2)},
    {"use_idf": False},   
    {"stop_words": None}, 
    {"lowercase": True},
    {"lowercase": False},
    {"max_df": 0.9},
    {"max_features": 10000, "ngram_range": (2, 2)},
]

for i, config in enumerate(configs):
    best_acc = 0
    best_model = None
    print(f"Running Configuration {i+1}: {config}")
    vectorizer = TfidfVectorizer(**config)
    model = LogisticRegression(max_iter=1000)
    acc = run_log_reg(X, y, vectorizer, model)
    if  acc > best_acc:
        best_acc = acc
        best_model = model

Running Configuration 1: {'max_features': 10000}
Fold 1: Accuracy=0.9286, F1=0.9599
              precision    recall  f1-score   support

           0      0.856     0.554     0.673      2462
           1      0.935     0.986     0.960     16112

    accuracy                          0.929     18574
   macro avg      0.896     0.770     0.816     18574
weighted avg      0.925     0.929     0.922     18574

Fold 2: Accuracy=0.9280, F1=0.9596
              precision    recall  f1-score   support

           0      0.853     0.552     0.670      2462
           1      0.935     0.985     0.960     16112

    accuracy                          0.928     18574
   macro avg      0.894     0.769     0.815     18574
weighted avg      0.924     0.928     0.921     18574

Fold 3: Accuracy=0.9259, F1=0.9584
              precision    recall  f1-score   support

           0      0.835     0.549     0.663      2461
           1      0.935     0.983     0.958     16113

    accuracy                

# Extreme Error Analysis

In [12]:
# Transform X to TF-IDF features
X_tfidf = vectorizer.transform(X)

best_model.fit(X_tfidf, y)
preds = best_model.predict(X_tfidf)

# Find the most confident wrong predictions
from sklearn.metrics import confusion_matrix

errors = (preds != y)
probs = best_model.predict_proba(X_tfidf)
conf_scores = np.max(probs, axis=1)

# Most confident incorrect predictions
extreme_errors = np.argsort(-conf_scores[errors])[:10]

for idx in np.where(errors)[0][extreme_errors]:
    print(f"\nTrue: {y[idx]}, Pred: {preds[idx]}, Confidence: {conf_scores[idx]:.2f}")
    print(reviews_df.iloc[idx]['clean_text'])



True: 0, Pred: 1, Confidence: 1.00
work great stay using outlet

True: 0, Pred: 1, Confidence: 1.00
heavy bulky easy use

True: 0, Pred: 1, Confidence: 1.00
useful easy use

True: 1, Pred: 0, Confidence: 1.00
dont waste money small wont latch

True: 1, Pred: 0, Confidence: 1.00
pay baby job perfectly waste money expensive webcam

True: 0, Pred: 1, Confidence: 1.00
needed new mouse wireless nice price work great burn battery ridiculous rate good value

True: 0, Pred: 1, Confidence: 0.99
work great large small city forget getting tv reception less populated area like state park people around antenna crank getting tv station get nothing even blue light lit thinking returning crank antenna disappointed

True: 0, Pred: 1, Confidence: 0.99
could great product better sticking remove tape replace tape

True: 0, Pred: 1, Confidence: 0.99
seemed great product mention anywhere would support video playback usb perhaps paid video personality sale page

True: 0, Pred: 1, Confidence: 0.99
thought gr