In [None]:
# pip install kagglehub
# pip install numpy == 1.26.4
# pip install pandas
# pip install plotly >= 24.2
# pip install statsmodels  # TODO: confirm version pin (original note said "statsmodel == 2.2.3")
# pip install scikit-surprise == 1.1.4
# pip install scikit-learn == 1.6.1
# pip install tqdm == 4.67.1

In [None]:
import kagglehub

# Download the latest version of the dataset through kagglehub and keep the
# returned location in `path`.
# NOTE(review): the loading cell further down reads 'data/ratings_Electronics.csv'
# rather than `path` — confirm the file is copied/linked there.
path = kagglehub.dataset_download("saurav9786/amazon-product-reviews")

print("Path to dataset files:", path)

In [None]:
# import packages
import pandas as pd
import numpy as np
# import seaborn as sns
# import matplotlib.pyplot as plt
import os
# import dash
# import dash_html_components as html
# import dash_cytoscape as cyto
# from matplotlib import colors as mcolors
from itertools import zip_longest
# from ast import literal_eval
import plotly
import plotly.express as px
import plotly.graph_objects as go
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

### Loading Data

In [None]:
# Load the ratings file; the four column names are supplied explicitly through `names`.
rating_df = pd.read_csv('data/ratings_Electronics.csv', names=['userId', 'productId','Rating','timestamp'])
# Show the loaded frame as the cell output.
rating_df

In [None]:
# Print a concise structural summary of the raw ratings frame (columns and dtypes).
rating_df.info()

In [None]:
# Descriptive statistics of the numeric columns, narrowed to the Rating column.
rating_df.describe()['Rating']

The product ratings range from 1 to 5, and the majority of ratings are 5.0.

In [None]:
# Check for missing values: count the null entries in every column.
print('Number of missing values across columns: \n', rating_df.isnull().sum())

There are no missing values that need to be handled.

### Exploratory Data Analysis

In [None]:
# Rating distribution: how many ratings were given at each rating value.
ratings_per_value = rating_df.groupby(['Rating']).size()
data = ratings_per_value.reset_index(name='number of ratings')

# One bar per rating value, coloured by the rating itself on a blue scale.
fig = px.bar(
    data,
    x="Rating",
    y="number of ratings",
    color="Rating",
    color_continuous_scale=px.colors.sequential.Blues,
)
fig.update_layout(
    title_text='Rating Distribution',
    xaxis_title='Rating',
    yaxis_title='Number of Ratings',
    showlegend=False,
)
fig.show()

Users tend to give positive reviews unless the product is highly unsatisfactory, in which case they rate it as 1.0.

In [None]:
# Headline counts for the raw data: ratings, distinct users, distinct products.
n_ratings = len(rating_df)
n_users = np.unique(rating_df["userId"]).size
n_products = np.unique(rating_df["productId"]).size

print("Total data ")
print("-" * 50)
print("\nTotal number of ratings :", n_ratings)
print("Total number of users :", n_users)
print("Total number of products :", n_products)

In [None]:
# Per-user activity: how many ratings each user gave and their mean rating,
# ordered so the most active users come first.
ratings_by_user = rating_df.groupby(by='userId')
user_rating_stats = (
    ratings_by_user
    .agg(total_ratings=('Rating', 'count'), avg_rating=('Rating', 'mean'))
    .sort_values(by='total_ratings', ascending=False)
    .reset_index()
)

user_rating_stats

In [None]:
# Distribution summary of the per-user rating counts and per-user average ratings.
user_rating_stats.describe()

According to the median and Q3, users normally rated once or twice, which may indicate high sparsity.

In [None]:
# Scatter of every user's average rating against the number of ratings they gave.
fig = px.scatter(
    user_rating_stats,
    x="avg_rating",
    y="total_ratings",
    title="User Rating Behavior",
    labels={"total_ratings": "Total Ratings Per User", "avg_rating": "Average Rating"},
    opacity=0.6,
    hover_data=["userId"],
)

# The x axis (average rating) is drawn on a log scale.
fig.update_layout(
    xaxis={"title": "Average Rating", "type": "log"},
    yaxis={"title": "Total Ratings Per User"},
    template="plotly_white",
)

fig.show()

In [None]:
# Per-product popularity: how many ratings each product received and its mean rating,
# ordered so the most-rated products come first.
ratings_by_product = rating_df.groupby(by='productId')
product_rating_stats = (
    ratings_by_product
    .agg(total_ratings=('Rating', 'count'), avg_rating=('Rating', 'mean'))
    .sort_values(by='total_ratings', ascending=False)
    .reset_index()
)

product_rating_stats

In [None]:
# Distribution summary of the per-product rating counts and per-product average ratings.
product_rating_stats.describe()

Most products received around two ratings each, as reflected in the median, though the mean is much higher. This indicates a long-tail distribution, where a few products are rated very frequently while most receive only a few ratings. This also indicates high sparsity.

In [None]:
# Histogram of ratings-per-product; the count axis is logarithmic so the long tail stays visible.
fig = px.histogram(
    product_rating_stats,
    x="total_ratings",
    nbins=100,
    log_y=True,
    title="Distribution of Total Ratings per Product",
    labels={"total_ratings": "Total Ratings per Product"},
    opacity=0.7,
)

fig.show()

In [None]:
# Scatter of every product's average rating against the number of ratings it received.
fig = px.scatter(
    product_rating_stats,
    x="avg_rating",
    y="total_ratings",
    title="Products Overall Rating",
    labels={"total_ratings": "Total Ratings Per Product", "avg_rating": "Average Rating"},
    opacity=0.6,
    hover_data=["productId"],
)

# The x axis (average rating) is drawn on a log scale.
fig.update_layout(
    xaxis={"title": "Average Rating", "type": "log"},
    yaxis={"title": "Total Ratings Per Product"},
    template="plotly_white",
)

fig.show()

In [None]:
# Data sparsity check: the share of the full user x product grid that actually holds a rating.
# (The variable is called `sparsity`, but a value close to 0 means the matrix is almost empty.)
observed_ratings = rating_df.shape[0]
possible_pairs = rating_df['userId'].nunique() * rating_df['productId'].nunique()
sparsity = observed_ratings / possible_pairs
print(f"Dataset sparsity: {sparsity:.6f}")

The value 0.000004 (or 0.0004%) is the fraction of all possible user-product pairs that actually have a rating, i.e. only a tiny fraction of possible interactions are observed. This indicates a highly sparse dataset.

In general, sparse data means that most entries are missing. In this case, there are many missing ratings because most users rated only a few products and most products received ratings from only a few users.

### Data Cleaning
One way to handle sparse data is to filter out some users and products: such users may be inactive or rarely active, while such products may have too few ratings to judge their popularity.

In [None]:
# Ratings per user are heavily skewed towards very small counts, so keep only
# active users: those who rated at least 3 distinct products.
user_product_counts = (
    rating_df.groupby('userId')['productId']
    .nunique()
    .reset_index(name='total_products')
)
is_active_user = user_product_counts['total_products'] >= 3
filtered_users = user_product_counts.loc[is_active_user, 'userId']
rating_df_filtered = rating_df[rating_df['userId'].isin(filtered_users)]

In [None]:
# Products with very few reviews make it difficult to tell whether they genuinely
# attract user interest, so keep only products that received more than 2 reviews
# (the median number of reviews per product is about 2).
# Review counts are taken from the unfiltered `rating_df`; the resulting product list
# is then applied to the already user-filtered frame.
product_review_counts = (
    rating_df.groupby('productId')
    .size()
    .reset_index(name='review_count')
    .query('review_count > 2')
)
rating_df_filtered = rating_df_filtered[rating_df_filtered['productId'].isin(product_review_counts['productId'])]

In [None]:
# Users with an unusually high number of reviews may be outliers or non-genuine,
# so drop everyone above the upper fence Q3 + 1.5 * IQR of ratings-per-user.
# NOTE(review): the quartiles come from `user_rating_stats`, which was built from the
# unfiltered `rating_df` — confirm this is the intended reference distribution, since
# it interacts with the "at least 3 products" filter applied earlier.
Q1, Q3 = (user_rating_stats["total_ratings"].quantile(q) for q in (0.25, 0.75))
IQR = Q3 - Q1
upper_threshold = Q3 + 1.5 * IQR

within_fence = user_rating_stats["total_ratings"] <= upper_threshold
filtered_users = user_rating_stats.loc[within_fence, "userId"]
rating_df_filtered = rating_df_filtered[rating_df_filtered["userId"].isin(filtered_users)]

In [None]:
# Same headline counts as before, now measured on the cleaned data.
print("Total data ")
print("-" * 50)
print("\nTotal number of ratings :", len(rating_df_filtered))
print("Total number of users :", np.unique(rating_df_filtered["userId"]).size)
print("Total number of products :", np.unique(rating_df_filtered["productId"]).size)

### Collaborative Filtering (Item-Based)
Collaborative Filtering aims to predict missing ratings in a user-item interaction matrix by leveraging similarities between users or items.
I will start with a memory-based approach, specifically Item-Based Collaborative Filtering (Item-Based CF).

Difference Between User-Based and Item-Based CF
- User-Based CF: Finds users with similar preferences and recommends items liked by those users.
- Item-Based CF: Finds items that are similar to what a user has already rated highly and recommends those items.

In this case, I will predict the ratings for products that users have not yet rated by looking at the similarity between items. If a user has given high ratings to certain products, Item-Based CF will recommend other similar products based on those preferences.

In [None]:
from surprise import Dataset, Reader, KNNWithMeans
from surprise.model_selection import train_test_split
from surprise import accuracy

#### Data Process for Model Building & Train-Test Split

In [None]:
# Only take a proportion (10%) of the filtered data, since the full data set is too
# big for this model (original note: the required size was in the TiB range).
# `random_state` is fixed so the same sample is drawn on every run.
sample_df = rating_df_filtered.sample(frac=0.10, random_state=2025)

In [None]:
# Declare the rating scale for Surprise. Both bounds are read from the full ratings
# table so that the scale does not depend on which rows happened to be sampled.
reader = Reader(rating_scale=(rating_df["Rating"].min(), rating_df["Rating"].max()))
# Surprise expects exactly three columns in the order: user, item, rating.
data = Dataset.load_from_df(sample_df[['userId', 'productId', 'Rating']], reader)

In [None]:
# Hold out 30% of the sampled ratings for evaluation; the fixed random_state keeps the split repeatable.
trainset, testset = train_test_split(data, test_size=0.3, random_state=2025)

#### Model Training

In [None]:
# Item-based neighbourhood model: similarities are computed between items
# (user_based=False) with the cosine measure.
sim_options = {"name": "cosine",
               "user_based": False}

model = KNNWithMeans(sim_options=sim_options)
model.fit(trainset)

#### Predict & Evaluate

In [None]:
# Predict a rating for every held-out entry of the test set.
predictions = model.test(testset)

In [None]:
# Root-mean-squared error between the predicted and the actual test ratings.
rmse = accuracy.rmse(predictions)

Since the rating scale runs from 1 to 5, an RMSE of 1.33 means that, on average, the predicted ratings deviate by ~1.33 points from the actual ratings.

#### Find similar products

In [None]:
# Similarity matrix held by the fitted model; it is indexed with Surprise inner ids below
# (presumably item-to-item, since the model was configured with user_based=False).
sim_matrix = model.sim

In [None]:
def get_similar_products(product_id, top_n=5):
    """Return the raw ids of the products most similar to ``product_id``.

    The product is looked up in the fitted model's trainset, every other item
    is ranked by its score in ``sim_matrix`` and the ``top_n`` best matches are
    returned, most similar first.

    Parameters
    ----------
    product_id : raw product id as it appears in the training data.
    top_n : int, number of similar products to return (default 5).

    Returns
    -------
    list of raw product ids, or the string
    "Product not found in training data" when the id is not in the trainset.
    """
    try:
        inner_id = model.trainset.to_inner_iid(product_id)
    except ValueError:
        # Surprise raises ValueError for ids that are not part of the trainset.
        return "Product not found in training data"

    similarity_scores = sim_matrix[inner_id]
    # Drop the product itself by id rather than by position: when scores tie
    # (e.g. another item also has similarity 1.0) it is not always ranked first.
    candidates = [
        (idx, score)
        for idx, score in enumerate(similarity_scores)
        if idx != inner_id
    ]
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return [model.trainset.to_raw_iid(idx) for idx, _ in candidates[:top_n]]

In [None]:
# Demonstration: list the recommendations for the first ten rows of the sample.
first_products = sample_df["productId"].iloc[:10]
for product in first_products:
    print(f"Top 5 similar products to {product}: {get_similar_products(product)}")

### Collaborative Filtering (with TimeSVD++)
Collaborative Filtering aims to predict missing ratings in a user-item interaction matrix by leveraging similarities between users or items.
Given the rating scale of 1 to 5, the previous model could be improved, as an error of about 1.33 points may misinterpret a mid-range product as good or a bad product as acceptable. 
I will explore a model-based approach that considers timestamps. Since scikit-surprise only provides SVD++, additional preprocessing is needed to incorporate the time variable, which approximates TimeSVD++.

In [None]:
from sklearn.preprocessing import MinMaxScaler
from scipy.sparse import coo_matrix
from surprise import SVDpp, Dataset, Reader
from surprise import accuracy

#### Data Process

In [None]:
# Take the whole filtered dataset, since the SVD++-based model can handle larger data.
# An explicit copy is used because the following cells add columns to `df`:
# `rating_df_filtered` is itself a filtered slice, so writing through an alias would
# raise pandas' SettingWithCopyWarning and modify the cleaned frame as a side effect.
df = rating_df_filtered.copy()

In [None]:
# Rescale the Rating column into the [0, 1] range and store it next to the original.
# NOTE(review): `rating_scaled` is not referenced again in the cells visible here.
scaler = MinMaxScaler(feature_range=(0, 1))
df['rating_scaled'] = scaler.fit_transform(df[['Rating']]) 
# Preview original vs. scaled values.
df[['Rating', 'rating_scaled']].head()

- TimeSVD++ modifies the bias term to include time-dependent user preferences
- Time-based normalization helps capture how a user's preferences shift over time, this means if a user’s rating pattern changes over time, their bias will be adjusted dynamically.
- Mean timestamp per user acts as a reference point to measure how much a given rating's timestamp deviates from the user’s usual behavior

In [None]:
# Time-based normalization: how far each rating's timestamp lies from the mean
# timestamp of the user who wrote it.
user_mean_time = df.groupby('userId')['timestamp'].mean()
# Vectorized lookup of every row's user mean; this replaces a row-by-row
# `df.apply(..., axis=1)`, which is very slow on a frame of this size.
df['time_dev'] = df['timestamp'] - df['userId'].map(user_mean_time)

# scale time deviation over the whole dataset (MinMaxScaler's default output range)
scaler = MinMaxScaler()
df['time_dev_norm'] = scaler.fit_transform(df[['time_dev']])


Adjust rating based on time bias

In [None]:
# Shift every rating by a small, time-dependent amount.
# NOTE(review): assuming `time_dev_norm` lies in [0, 1], the adjusted value can reach
# Rating + alpha_u, i.e. slightly above the (1, 5) scale declared for the Reader — confirm.
alpha_u = 0.004 # alpha_u is a time decay parameter that adjusts the influence of time on the user's rating behaviour
df['adjusted_rating'] = df['Rating'] + (alpha_u * df['time_dev_norm'])

In [None]:
# Build the Surprise dataset from the time-adjusted ratings on a declared 1-5 scale.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df[['userId', 'productId', 'adjusted_rating']], reader)

# 80/20 train/test split with a fixed seed.
trainset, testset = train_test_split(data, test_size=0.2, random_state=2025)

#### Model Training

In [None]:
# First SVD++ fit: 100 latent factors, one learning rate and one regularization
# term shared by all parameters; verbose=True reports training progress.
model = SVDpp(n_factors=100, lr_all=0.005, reg_all=0.02, verbose=True)
model.fit(trainset)

#### Predict & Evaluate

In [None]:
# Predict the held-out ratings and measure the error against the (time-adjusted) targets.
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

#### Hyperparameter tuning

In [None]:
# Second, manually chosen configuration: fewer factors, a higher learning rate,
# stronger regularization and 25 epochs. This overwrites the previous `model`.
model = SVDpp(n_factors=40, lr_all=0.01, reg_all=0.2, n_epochs=25)
model.fit(trainset)

In [None]:
# Re-evaluate on the same test set with the re-fitted model.
predictions = model.test(testset)
rmse = accuracy.rmse(predictions)

The RMSE has been reduced from 1.33 to 1.25. On average, the predicted ratings deviate by ~1.25 points from the actual ratings.

#### Find similar products
The SVDpp model learns latent factor representations for both users and products, hence I can access product embeddings (also called item factors) from the trained model to find similar products.

In [None]:
# Product embeddings: in Surprise's SVD++ the item factors are stored in `qi`
# (`pu` holds the user factors).
item_factors = model.qi

In [None]:
# The data is too large to pivot directly, so the user x product matrix is built in sparse COO form.
# coo_matrix needs integer positions, so both id columns are converted to categorical codes.
user_categories = df['userId'].astype("category")
product_categories = df['productId'].astype("category")
user_ids = user_categories.cat.codes
product_ids = product_categories.cat.codes
ratings = df['Rating'].values

# the "pivot": rows = users, columns = products, values = ratings
sparse_matrix = coo_matrix((ratings, (user_ids, product_ids)))

# position -> original id lookups, used to map matrix indices back after the pivot
user_mapping = dict(enumerate(user_categories.cat.categories))
product_mapping = dict(enumerate(product_categories.cat.categories))

print(f"Sparse Matrix Shape: {sparse_matrix.shape}")
print(sparse_matrix)

Since computing a full cosine-similarity matrix does not scale well to a dataset this large, I am using Nearest Neighbors to find the top 5 similar products. This will take some time.

In [None]:
from sklearn.neighbors import NearestNeighbors

# use a brute-force, cosine-distance Nearest Neighbors model to find similar products
k = 5 # k = 3,5,10
# k+1 neighbours are requested because every product is also queried against itself
knn = NearestNeighbors(n_neighbors=k+1, metric='cosine', algorithm='brute')
# transpose so that each row is one product (its ratings across users)
knn.fit(sparse_matrix.T)

# get top k similar products
distances, indices = knn.kneighbors(sparse_matrix.T)


For each product, store the top 5 similar products and check whether users who buy this product also purchase any of its similar products.

In [None]:
# Map the neighbour positions back to the original product IDs.
similar_products_dict = {}
for i, product_idx in enumerate(indices):
    original_product_id = product_mapping[i]
    # Exclude the product itself by index rather than by position: when several
    # products have identical rating vectors (cosine distance 0) the query product
    # is not guaranteed to be returned first. k+1 neighbours were requested, so k
    # remain after dropping it.
    neighbor_idx = [idx for idx in product_idx if idx != i][:k]
    similar_products_dict[original_product_id] = [product_mapping[idx] for idx in neighbor_idx]

# One row per product, one column per similar product.
similar_products_df = pd.DataFrame.from_dict(similar_products_dict, orient='index')
similar_products_df.columns = [f"Similar_Product_{i+1}" for i in range(k)]
similar_products_df = similar_products_df.reset_index().rename(columns={"index": "productId"})

In [None]:
# Display every product together with its similar products.
similar_products_df

Given each product's top 5 similar products, calculate the overlap purchase (users who bought at least one recommended product along with the original) vs. the non-overlap purchase (users who only bought the original product but not the recommendations).  

From there, determine the Overlap Rate, Precision of the top 5 products, Hit Rate, and Lift Score of the recommendation system.

In [None]:
# Map every user to the list of products they rated (treated as purchases in the evaluation below).
user_purchases_dict = df.groupby("userId")["productId"].apply(list).to_dict()

def compute_overlap_metrics(user_purchases_dict, similar_products_dict):
    """Score item-to-item recommendations against what users actually bought.

    For every product a user bought, its recommendation list is looked up in
    ``similar_products_dict`` and compared with that user's purchases.

    Parameters
    ----------
    user_purchases_dict : dict mapping user id -> iterable of purchased product ids.
    similar_products_dict : dict mapping product id -> iterable of recommended product ids.

    Returns
    -------
    tuple ``(overlap_rate, precision_at_k, recall_at_k, hit_rate)``:
      * overlap_rate   - users with at least one purchased recommendation / all users
      * precision_at_k - purchased recommendations / recommendations made
      * recall_at_k    - distinct purchased products that were recommended
                         / distinct purchased products (never above 1)
      * hit_rate       - users with at least one purchased recommendation
                         / users who received any recommendation
    Every ratio is 0 when its denominator is 0.
    """
    total_users = len(user_purchases_dict)
    overlapping_users = 0
    total_recommended_purchases = 0
    total_recommendations_made = 0
    total_recovered_purchases = 0
    total_actual_purchases = 0
    users_with_recommendations = 0

    for products in user_purchases_dict.values():
        purchased_set = set(products)
        total_actual_purchases += len(purchased_set)
        recovered = set()  # distinct purchases of this user that some list recommended
        received_recommendations = False

        # Iterate the de-duplicated set so a repeated purchase is not scored twice.
        for product in purchased_set:
            if product not in similar_products_dict:
                continue
            recommended_products = set(similar_products_dict[product])
            received_recommendations = True
            total_recommendations_made += len(recommended_products)

            # recommended products the user actually bought
            purchased_recommendations = purchased_set & recommended_products
            total_recommended_purchases += len(purchased_recommendations)
            recovered |= purchased_recommendations

        if received_recommendations:
            users_with_recommendations += 1
        if recovered:
            overlapping_users += 1
        # Recall counts each purchased product once, however many lists contained it.
        total_recovered_purchases += len(recovered)

    overlap_rate = overlapping_users / total_users if total_users else 0
    precision_at_k = total_recommended_purchases / total_recommendations_made if total_recommendations_made else 0
    recall_at_k = total_recovered_purchases / total_actual_purchases if total_actual_purchases else 0
    hit_rate = overlapping_users / users_with_recommendations if users_with_recommendations else 0

    return overlap_rate, precision_at_k, recall_at_k, hit_rate

# compute model performance
observed_overlap_rate, precision_at_k, recall_at_k, hit_rate = compute_overlap_metrics(user_purchases_dict, similar_products_dict)

# baseline: for every product, recommend 5 products drawn at random from the 50 most-rated ones
popular_products = df["productId"].value_counts().index[:50]  # Top 50 popular products
popular_array = popular_products.to_numpy()
rng = np.random.default_rng(2025)  # fixed seed so the baseline (and the lift score) is reproducible
random_recommendations = {
    # never "recommend" the product itself: it is trivially among its buyers' purchases
    # and would inflate the baseline overlap
    product: rng.choice(popular_array[popular_array != product], 5, replace=False).tolist()
    for product in similar_products_dict.keys()
}
random_overlap_rate, _, _, _ = compute_overlap_metrics(user_purchases_dict, random_recommendations)

# compute lift score: how many times better the model's overlap rate is than the baseline's
lift_score = observed_overlap_rate / random_overlap_rate if random_overlap_rate else 0

print(f"Overlap Rate: {observed_overlap_rate:.4f}")
print(f"Hit Rate: {hit_rate:.4f}")
# overlap rate and hit rate should be the same, since every user will receive recommendation based on what they purchase
print(f"Precision @ 5: {precision_at_k:.4f}")
print(f"Recall @ 5: {recall_at_k:.4f}")
print(f"Lift Score: {lift_score:.4f}")

- Overlap Rate / Hit Rate: 60.17% of users had at least one correct recommendation in their top 5. This means most users received at least one good suggestion.
- Precision: 12.42% of recommended products were actually purchased by users.
- Recall: 62.12% of all purchased products were successfully recommended. This means that across all purchases, a large portion was correctly predicted.
- Lift Score: The model is 13x better than randomly recommending popular products. A lift score this high indicates strong recommendation effectiveness.