In [8]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import mean_squared_error

from scipy.sparse import csr_matrix


In [9]:
#Load Data
DATA_PATH = "/content/drive/MyDrive/Electronics.csv.gz"

df = pd.read_csv(DATA_PATH, compression="gzip")
df.head()


Unnamed: 0,user_id,parent_asin,rating,timestamp
0,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B0047T79VS,3.0,1344406083000
1,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B01HHURN3W,3.0,1408995743000
2,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B00L0YLRUW,1.0,1439226089000
3,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B017T99JPG,5.0,1456772365000
4,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B01LW71IBJ,5.0,1456772571000


In [10]:
#Select & Rename Columns
df = df[['user_id', 'parent_asin', 'rating']]

df = df.rename(columns={
    'user_id': 'user',
    'parent_asin': 'item'
})

df['rating'] = df['rating'].astype(float)

df.head()


Unnamed: 0,user,item,rating
0,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B0047T79VS,3.0
1,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B01HHURN3W,3.0
2,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B00L0YLRUW,1.0
3,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B017T99JPG,5.0
4,AGCI7FAH4GL5FI65HYLKWTMFZ2CQ,B01LW71IBJ,5.0


In [14]:
#Reduce Data Size
#This prevents crashes and shows engineering thinking.
# Keep users with at least 5 interactions
user_counts = df['user'].value_counts()
df = df[df['user'].isin(user_counts[user_counts >= 5].index)]

# Optional: sample for faster experiments
# Ensure the sample size does not exceed the DataFrame's current size
# We will sample 100,000 rows as a reasonable subset.
if len(df) > 100_000:
    df = df.sample(n=100_000, random_state=42)
else:
    # If the dataframe already has less than 100,000 rows, use its current size
    df = df.sample(n=len(df), random_state=42)

df.shape

(10084, 3)

In [15]:
#Train / Test Split
train, test = train_test_split(
    df,
    test_size=0.2,
    random_state=42
)

len(train), len(test)


(8067, 2017)

In [16]:
#MODEL 1: Popularity Baseline
# Popularity baseline (benchmark & cold start)
popular_items = (
    train.groupby('item')
    .size()
    .sort_values(ascending=False)
)

def recommend_popular(n=10):
    return popular_items.head(n).index.tolist()

recommend_popular(10)


['B011BRUOMO',
 'B008J0Z9TA',
 'B01K8B8YA8',
 'B07H65KP63',
 'B08CLNX58K',
 'B0BGNG1294',
 'B07GZFM1ZM',
 'B07T8BSMXD',
 'B097RTX8R9',
 'B0BB6RBYJ6']

In [17]:
#8 — User‑Item Matrix (Sparse)
user_item_matrix = train.pivot_table(
    index='user',
    columns='item',
    values='rating'
)

sparse_matrix = csr_matrix(user_item_matrix.fillna(0))
sparse_matrix.shape


(1633, 7222)

In [18]:
#MODEL 2 (MAIN): Matrix Factorization (SVD)
svd = TruncatedSVD(
    n_components=50,
    random_state=42
)

latent_user_matrix = svd.fit_transform(sparse_matrix)
latent_item_matrix = svd.components_


In [19]:
#Recommendation Function (SVD)
def recommend_svd(user_id, n=10):
    if user_id not in user_item_matrix.index:
        return recommend_popular(n)

    user_idx = user_item_matrix.index.get_loc(user_id)
    scores = np.dot(
        latent_user_matrix[user_idx],
        latent_item_matrix
    )

    recommendations = (
        pd.Series(scores, index=user_item_matrix.columns)
        .sort_values(ascending=False)
        .head(n)
    )

    return recommendations.index.tolist()

recommend_svd(user_item_matrix.index[0])


['B07P9V8GSH',
 'B0BQ5JTZ89',
 'B09G3MBH6V',
 'B093GSFQYW',
 'B07FQK854Y',
 'B071W93333',
 'B0BZYV7R9C',
 'B0B933F7G2',
 'B089PMMT1X',
 'B0BG6VKYMY']

In [21]:
#Evaluation (RMSE)
preds = []
actuals = []

for _, row in test.iterrows():
    if row['user'] in user_item_matrix.index and row['item'] in user_item_matrix.columns:
        user_idx = user_item_matrix.index.get_loc(row['user'])
        item_idx = user_item_matrix.columns.get_loc(row['item'])

        pred = np.dot(
            latent_user_matrix[user_idx],
            latent_item_matrix[:, item_idx]
        )

        preds.append(pred)
        actuals.append(row['rating'])

# Calculate MSE and then take the square root for RMSE
mse = mean_squared_error(actuals, preds)
rmse = np.sqrt(mse)
rmse

np.float64(4.524706940680114)

In [22]:
#Train FINAL SVD on FULL DATA
# ================================
# FINAL MODEL: Train on FULL DATA
# ================================

from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix

# Create full user-item matrix
full_user_item = df.pivot_table(
    index='user',
    columns='item',
    values='rating'
).fillna(0)

sparse_full = csr_matrix(full_user_item)

final_svd = TruncatedSVD(
    n_components=50,
    random_state=42
)

final_user_factors = final_svd.fit_transform(sparse_full)
final_item_factors = final_svd.components_

print("Final SVD trained on full dataset")


Final SVD trained on full dataset


In [23]:
#Save model for Streamlit
import pickle

MODEL_PATH = "/content/drive/MyDrive/final_svd_model.pkl"

with open(MODEL_PATH, "wb") as f:
    pickle.dump({
        "user_factors": final_user_factors,
        "item_factors": final_item_factors,
        "users": full_user_item.index,
        "items": full_user_item.columns
    }, f)

print("Final SVD model saved successfully")


Final SVD model saved successfully
