<a href="https://colab.research.google.com/github/kavi-910/Prefina_Recommendation_System/blob/main/ALS_Recommendation_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
#Environment Setup

#Installing packages
!pip install --quiet \
  pandas numpy scikit-learn \
  lightfm implicit scipy \
  pyarrow tqdm ipywidgets

#Installing visualization tools
!pip install --quiet seaborn matplotlib

In [None]:
#Setting random seed
import random, os, numpy as np
SEED = 42              # single seed shared by the stochastic steps of the notebook
np.random.seed(SEED)   # NumPy's global generator
random.seed(SEED)      # Python's built-in `random` module

In [None]:
#Import Libraries & Configurations

#Core
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

#ML & Recommender
from lightfm import LightFM
from implicit.als import AlternatingLeastSquares

#Utility
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix
from tqdm.notebook import tqdm

# Display settings for clean output: never truncate the column list and allow
# a wide text representation before wrapping.
pd.options.display.max_columns = None
pd.options.display.width = 1000
# Enable tqdm's pandas integration.
tqdm.pandas()

In [None]:
#Uploading datasets
from google.colab import files
import pandas as pd

# Ask for the CSV files through Colab's upload widget. The reads below expect
# them to be available in the working directory under these exact names.
uploaded = files.upload()

# Products, customers and interactions, each loaded into its own DataFrame.
df_products, df_users, df_interactions = map(
    pd.read_csv,
    ('products_H&M.csv', 'customers_H&M.csv', 'customer_interactions_H&M.csv'),
)

In [None]:
# Quick look at each dataset: its (rows, columns) shape, then its first rows.
for _shape_label, _sample_label, _frame in (
    ("\n Products Shape: ", "\n Products Sample:", df_products),
    ("\n Users Shape: ", "\n Users Sample:", df_users),
    ("\n Interactions Shape: ", "\n Interactions Sample:", df_interactions),
):
    print(_shape_label, _frame.shape)
    print(_sample_label)
    display(_frame.head())

In [None]:
# Data cleaning
#
# The same four steps are applied, in place, to each of the three frames:
#   1 - strip surrounding whitespace from the column names and lowercase them
#   2 - remove exact duplicate rows
#   3 - remove rows in which every column is missing (how='all'); rows that
#       are only partially empty are kept
#   4 - rebuild a contiguous index
for _frame in (df_products, df_users, df_interactions):
    _frame.columns = _frame.columns.str.strip().str.lower()
    _frame.drop_duplicates(inplace=True)
    _frame.dropna(how='all', inplace=True)
    _frame.reset_index(drop=True, inplace=True)

# Shapes after cleaning
print("Products updated shape: ", df_products.shape)
print("Users updated shape: ", df_users.shape)
print("Interactions updated shape: ", df_interactions.shape)

In [None]:
# Interaction event weighting

# 1 - Locate the column holding the event name: 'event_type' when present,
#     otherwise fall back to 'event'
if 'event_type' in df_interactions.columns:
    event_col = 'event_type'
else:
    event_col = 'event'

# 2 - Weight assigned to each kind of event
event_weights = dict(
    click=2,
    wishlist=2.5,
    add_to_cart=3,
    rating=4,
    purchase=20,
)

# 3 - Translate event names into their numeric weight
df_interactions['event_weight'] = df_interactions[event_col].map(event_weights)

# 4 - Show the first ten (event, weight) pairs
print("Events mapped: ")
display(df_interactions.loc[:, [event_col, 'event_weight']].head(10))

In [None]:
# Aggregate user-product scores

# 1 - Sum the event weights of every (user_id, product_id) pair, then
# 2 - order the pairs from the highest total to the lowest
df_user_product_scores = (
    df_interactions
    .groupby(['user_id', 'product_id'])['event_weight']
    .sum()
    .reset_index()
    .sort_values(by='event_weight', ascending=False)
)

# 3 - Show the strongest pairs
print("Aggregated interaction scores: ")
display(df_user_product_scores.head())

In [None]:
# Encoding user_id and product_id as integer indices

# 1 - One encoder per identifier column
user_encoder, product_encoder = LabelEncoder(), LabelEncoder()

# 2 - Learn the distinct identifiers, then store each row's integer code
df_user_product_scores['user_idx'] = (
    user_encoder
    .fit(df_user_product_scores['user_id'])
    .transform(df_user_product_scores['user_id'])
)
df_user_product_scores['product_idx'] = (
    product_encoder
    .fit(df_user_product_scores['product_id'])
    .transform(df_user_product_scores['product_id'])
)

# 3 - Lookup tables in both directions (raw id -> index and index -> raw id)
user2idx = dict(zip(df_user_product_scores['user_id'], df_user_product_scores['user_idx']))
idx2user = {code: raw_id for raw_id, code in user2idx.items()}

product2idx = dict(zip(df_user_product_scores['product_id'], df_user_product_scores['product_idx']))
idx2product = {code: raw_id for raw_id, code in product2idx.items()}

# 4 - Show the frame with its new index columns
print("Encoded Mappings: ")
display(df_user_product_scores.head())



In [None]:
# Building the sparse user × product matrix

# 1 - Coordinates (row = user index, column = product index) and the stored values
user_indices = df_user_product_scores['user_idx'].to_numpy()
product_indices = df_user_product_scores['product_idx'].to_numpy()
weights_vec = df_user_product_scores['event_weight'].to_numpy()

# 2 - Matrix dimensions: one row per distinct user index, one column per
#     distinct product index
num_users, num_products = (
    df_user_product_scores[col].nunique() for col in ('user_idx', 'product_idx')
)

interaction_matrix = csr_matrix(
    (weights_vec, (user_indices, product_indices)),
    shape=(num_users, num_products),
)

# 3 - Shape and density (share of cells holding a non-zero value)
non_zero = interaction_matrix.count_nonzero()
density = non_zero / (interaction_matrix.shape[0] * interaction_matrix.shape[1])

print("Matrix shape: ", interaction_matrix.shape)
print("Non-zero entries: ", non_zero)
print(f"Matrix density: {density:.4%}")

# Preview: densify only the first 5 rows and show them as a DataFrame
matrix_preview = interaction_matrix[0:5].toarray()
import pandas as pd
matrix_df = pd.DataFrame(matrix_preview)
display(matrix_df)


In [None]:
# Matrix diagnostics & Sparsity Visualization
# Diagnostics: User/Product Activity stats

import matplotlib.pyplot as plt
import seaborn as sns

# 1 - Total interactions per user
# Row sums of the user × product matrix give each user's total weighted score.
user_activity = interaction_matrix.sum(axis=1).A1
plt.figure(figsize=(10, 4))
sns.histplot(user_activity, bins=30, kde=False, color='pink')
plt.title("Interactions per User")
plt.xlabel("Total Weighted Interactions")
plt.ylabel("Number of Users")
# Must be *called*: a bare `plt.show` only references the function.
plt.show()

In [None]:
# 2 - Total interactions per product
# Column sums of the user × product matrix give each product's total weighted score.
product_activity = interaction_matrix.sum(axis=0).A1
plt.figure(figsize=(10, 4))
sns.histplot(product_activity, bins=30, kde=False, color='purple')
plt.title("Interactions per Product")
plt.xlabel("Total Weighted Interactions")
plt.ylabel("Number of Products")
# Must be *called*: a bare `plt.show` only references the function.
plt.show()

In [None]:
# Sparsity heatmap of the top-left 20 × 20 corner of the interaction matrix
heat_fig, heat_ax = plt.subplots(figsize=(6, 6))
sns.heatmap(
    interaction_matrix[:20, :20].toarray(),
    cmap="Blues",
    cbar=False,
    ax=heat_ax,
)
heat_ax.set_title("Heatmap of First 20x20 Users-Products")
heat_ax.set_xlabel("Products")
heat_ax.set_ylabel("Users")
plt.show()

In [None]:
# Train–Test Split  (reproducible leave-one-out: ONE held-out item per eligible user)

import random
from scipy.sparse import csr_matrix, lil_matrix

# Dedicated generator seeded with SEED: re-running this cell always produces
# the same split, whatever has consumed the global `random` state before it.
split_rng = random.Random(SEED)

interaction_lil = interaction_matrix.tolil()          # full user×item, becomes the train set
test_matrix     = lil_matrix(interaction_matrix.shape)

for u in range(interaction_matrix.shape[0]):
    # column indices of this user's non-zero interactions
    items = interaction_matrix[u].nonzero()[1]
    # users with fewer than two items are left entirely in the train set
    if len(items) >= 2:
        test_item = split_rng.choice(items)           # hold-out ONE item
        test_matrix[u, test_item]  = interaction_matrix[u, test_item]
        interaction_lil[u, test_item] = 0             # remove from train

train_matrix = interaction_lil.tocsr()
test_matrix  = test_matrix.tocsr()

print("Train:", train_matrix.shape, "| non-zeros:", train_matrix.count_nonzero())
print("Test :",  test_matrix.shape,  "| non-zeros:", test_matrix.count_nonzero())


In [None]:
# Build the confidence matrix
alpha = 40   # multiplier applied to every stored training weight

# Scaling does not change the layout: rows are users and columns are items,
# exactly as in train_matrix.
conf_matrix = alpha * train_matrix

# Despite its name, item_user_conf is the very same user × item object (no
# transpose is taken); the model is fitted on it directly.
item_user_conf = conf_matrix

In [None]:
# Precision@K helper

from tqdm.auto import tqdm
import numpy as np

def precision_at_k(model, conf_matrix, test_matrix, item_user_conf, K=10):
    """Mean Precision@K over users with at least one held-out item.

    Parameters
    ----------
    model
        Fitted recommender exposing ``recommend(userid, user_items, N,
        filter_already_liked_items, recalculate_user)``.
    conf_matrix : scipy.sparse matrix, user × item
        Training confidences. Row ``u`` is handed to ``model.recommend`` as
        the interaction history of user ``u``.
    test_matrix : scipy.sparse CSR matrix, user × item
        Held-out interactions; the column indices stored in row ``u`` are the
        relevant items of user ``u``.
    item_user_conf
        Unused. Kept only so that existing calls keep working. The previous
        implementation sliced *column* ``u`` of this matrix, which for the
        user × item matrices used in this notebook is an item's column
        (shape 1 × num_users), not the user's history.
    K : int, default 10
        Number of recommendations requested per user.

    Returns
    -------
    float
        Average of ``hits / K`` over the evaluated users, or ``0.0`` when no
        user has a held-out item.
    """
    user_item_csr = conf_matrix.tocsr()   # row slicing below needs CSR
    precisions = []
    for u in range(test_matrix.shape[0]):
        test_items = test_matrix[u].indices
        if len(test_items) == 0:
            continue

        # 1 × num_items slice holding this user's training interactions
        user_items = user_item_csr[u]

        recs = model.recommend(
            userid=u,
            user_items=user_items,
            N=K,
            filter_already_liked_items=True,
            recalculate_user=True
        )
        # Accept both return shapes: an (ids, scores) tuple or a list of (id, score) pairs
        recommended = recs[0] if isinstance(recs, tuple) else [iid for iid, _ in recs]
        hits = len(np.intersect1d(test_items, recommended))
        precisions.append(hits / K)

    return np.mean(precisions) if precisions else 0.0


In [None]:
from implicit.als import AlternatingLeastSquares

# Baseline model: fit ALS on the scaled training matrix, then score it on the hold-out set
als = AlternatingLeastSquares(factors=50, regularization=0.05, iterations=30, random_state=42)
# item_user_conf is the same object as conf_matrix (user × item orientation)
als.fit(item_user_conf)

# Precision@10 of the fitted model against the held-out interactions
p10 = precision_at_k(
    model=als,
    conf_matrix=conf_matrix,
    test_matrix=test_matrix,
    item_user_conf=item_user_conf,
    K=10,
)
print("Precision@10:", p10)


In [None]:
# ── Hyper-parameter grid search ──
from itertools import product
import pandas as pd
from tqdm.auto import tqdm
from implicit.als import AlternatingLeastSquares

# Candidate values for each ALS hyper-parameter
param_grid = dict(
    factors=[20, 50, 100, 150],
    regularization=[0.005, 0.01, 0.05, 0.1],
    iterations=[15, 30],
)

# Every combination of the grid values, each expressed as keyword arguments
candidate_settings = [
    dict(zip(param_grid, combo)) for combo in product(*param_grid.values())
]

# Fit one model per combination and record its Precision@10 on the hold-out set
results = []
for settings in tqdm(candidate_settings):
    m = AlternatingLeastSquares(random_state=42, **settings)
    m.fit(item_user_conf)
    p10 = precision_at_k(m, conf_matrix, test_matrix, item_user_conf, K=10)
    results.append({**settings, "precision@10": round(p10, 4)})

# Best-scoring combinations first
grid_results = pd.DataFrame(results).sort_values("precision@10", ascending=False)
display(grid_results.head(10))

best = grid_results.iloc[0]
print("\n🔝 Best params:", dict(best))


In [None]:
from implicit.als import AlternatingLeastSquares

# 1️⃣ Create & train the final model (naming it `als`)
als = AlternatingLeastSquares(
    factors=50,
    regularization=0.1,
    iterations=15,
    random_state=42,   # seeded like the other ALS fits so the score below is repeatable
)
# item_user_conf is the same object as conf_matrix: user × item confidences
als.fit(item_user_conf)

# 2️⃣ Precision@10 of the final model on the held-out interactions
p10 = precision_at_k(als, conf_matrix, test_matrix, item_user_conf, K=10)
print("Precision@10:", p10)


In [None]:
# ── Evaluation @K using column-vector approach ──
import numpy as np
from tqdm.auto import tqdm
from collections import defaultdict

def _recommend_ids(model, item_user_mat, user_id, K):
    """Return only item IDs from model.recommend (API-agnostic)."""
    user_row = item_user_mat[:, user_id].T.tocsr()  # shape (1, num_items)
    recs = model.recommend(
        userid=user_id,
        user_items=user_row,
        N=K,
        filter_already_liked_items=True,
        recalculate_user=True,
    )
    return recs[0] if isinstance(recs, tuple) else [i for i, _ in recs]

def ranking_metrics(model, item_user_mat, test_mat, K=10):
    """Aggregate top-K ranking metrics over the users that have held-out items.

    For every row of ``test_mat`` holding at least one stored entry, the
    top-K recommendations are obtained through ``_recommend_ids`` and
    compared with that row's column indices.

    Returns a dict with the keys ``Precision``, ``Recall``, ``F1``,
    ``HitRate``, ``MAP`` and ``NDCG``, or an empty dict when no row of
    ``test_mat`` has entries. Precision, recall, hit rate and MAP are
    per-user averages; F1 is derived from the averaged precision and recall;
    NDCG is the total DCG divided by the total ideal DCG.
    """
    totals = defaultdict(float)
    n_evaluated = 0

    for u in tqdm(range(test_mat.shape[0]), desc=f"Eval@{K}"):
        held_out = test_mat[u].indices
        if held_out.size == 0:
            continue
        n_evaluated += 1

        ranked = _recommend_ids(model, item_user_mat, u, K)
        relevant = set(held_out)
        hits = len(set(ranked) & relevant)

        totals["P"] += hits / K
        totals["R"] += hits / len(relevant)
        totals["HR"] += 1 if hits else 0

        # One pass over the ranking feeds both average precision and DCG
        ap_sum, found, dcg = 0.0, 0, 0.0
        for rank, item in enumerate(ranked, 1):
            if item not in relevant:
                continue
            found += 1
            ap_sum += found / rank
            dcg += 1 / np.log2(rank + 1)
        totals["AP"] += ap_sum / len(relevant)

        # Ideal DCG: every relevant item (at most K of them) ranked first
        ideal_len = min(len(relevant), K)
        idcg = sum(1 / np.log2(pos + 1) for pos in range(1, ideal_len + 1))
        totals["DCG"] += dcg
        totals["IDCG"] += idcg

    if not n_evaluated:
        return {}

    precision = totals["P"] / n_evaluated
    recall = totals["R"] / n_evaluated
    f1 = 2 * precision * recall / (precision + recall + 1e-8)
    hit_rate = totals["HR"] / n_evaluated
    mean_ap = totals["AP"] / n_evaluated
    ndcg = totals["DCG"] / totals["IDCG"] if totals["IDCG"] else 0.0

    return {
        "Precision": precision,
        "Recall": recall,
        "F1": f1,
        "HitRate": hit_rate,
        "MAP": mean_ap,
        "NDCG": ndcg,
    }

# ── Run evaluation ──
metrics = ranking_metrics(model=als, item_user_mat=item_user_conf, test_mat=test_matrix, K=10)

# One line per metric, shown as a percentage
print("\n Evaluation @10 ")
for name in metrics:
    val = metrics[name]
    print(f"{name:<9}: {val*100:6.2f}%")


In [None]:
# # Heat-map
# import matplotlib.pyplot as plt
# import seaborn as sns
# import numpy as np

# # item_factors: (num_items × factors) matrix from ALS
# item_factors = als.item_factors

# # pairwise Pearson correlation between items
# corr = np.corrcoef(item_factors)

# plt.figure(figsize=(10, 10))
# sns.heatmap(
#     corr,
#     cmap="coolwarm",
#     square=True,
#     annot=True,
#     fmt=".2f",
#     xticklabels=[idx2product[i] for i in range(len(corr))],
#     yticklabels=[idx2product[i] for i in range(len(corr))]
# )
# plt.title("Item-factor correlation (ALS)")
# plt.tight_layout()
# plt.show()


In [None]:
# Tag mapping for product metadata
TAG_COLUMNS = ["product_type", "color", "material", "pattern", "season", "price_range"]

# product_id -> set of "<column>_<value>" tags built from the columns above
product_to_tags = {}
for _, row in df_products.iterrows():
    pid = row["product_id"]
    tag_set = set()
    for col in TAG_COLUMNS:
        raw_val = row[col]
        # A missing cell would otherwise be stringified into a bogus
        # "<column>_nan" tag and could surface among the recommended tags.
        if pd.isna(raw_val):
            continue
        tag_val = str(raw_val).strip().lower()
        tag_set.add(f"{col}_{tag_val}")
    product_to_tags[pid] = tag_set


In [None]:
from collections import Counter

def tags_from_products(product_ids):
    """Count how often each tag occurs across the given products.

    Tags are looked up in ``product_to_tags``; a product without an entry
    there contributes nothing. Returns a ``collections.Counter`` keyed by tag.
    """
    return Counter(
        tag
        for pid in product_ids
        for tag in product_to_tags.get(pid, [])
    )

In [None]:
# Tag recommendation

def recommend_products_numeric(user_idx, k=5):
    """Return the top-``k`` product IDs recommended by ``als`` for a user.

    Parameters
    ----------
    user_idx : int-like
        Row index of the user in the user × item matrices.
    k : int, default 5
        Number of products to return.

    Returns
    -------
    list
        Original product identifiers, mapped back through ``idx2product``.
    """
    row = int(user_idx)
    # The user's factors are recalculated from the row passed here, so it must
    # carry the same alpha-scaled confidences the model was fitted on
    # (conf_matrix), not the unscaled weights of train_matrix.
    rec_ids, _ = als.recommend(
        userid=row,
        user_items=conf_matrix[row],
        N=k,
        filter_already_liked_items=True,
        recalculate_user=True
    )
    return [idx2product[i] for i in rec_ids]

def recommend_tags_numeric(user_idx, k_products=5, k_tags=5):
    """Return the ``k_tags`` most frequent tags among a user's top products.

    The user's top ``k_products`` recommendations are fetched, their tags are
    counted, and the most common ``(tag, count)`` pairs are returned.
    """
    top_products = recommend_products_numeric(user_idx, k=k_products)
    return tags_from_products(top_products).most_common(k_tags)

# Interactive lookup: keep asking for a user index and print that user's
# recommended products and tags until 'quit' / 'exit' is entered.
while True:
    try:
        inp = input("\nEnter numeric user ID (or 'quit'): ").strip()
    except (EOFError, KeyboardInterrupt):
        # Input stream closed or prompt interrupted: leave the loop cleanly
        break
    if inp.lower() in {"quit", "exit"}:
        break

    # str.isdigit() also accepts characters such as '²' that int() rejects,
    # so the conversion is guarded instead of trusting isdigit() alone.
    try:
        uid = int(inp) if inp.isdigit() else -1
    except ValueError:
        uid = -1
    if not 0 <= uid < train_matrix.shape[0]:
        print(" Please enter a valid numeric user index (0 –", train_matrix.shape[0]-1, ")")
        continue

    print(f"\n Recommendations for user {uid}")

    print(" Products:")
    for pid in recommend_products_numeric(uid, k=5):
        print("  •", pid)

    print(" Tags:")
    for tag, cnt in recommend_tags_numeric(uid, k_products=5, k_tags=5):
        print(f"  • {tag}  (score: {cnt})")
