In [None]:
!pip install implicit


Collecting implicit
  Downloading implicit-0.7.2.tar.gz (70 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/70.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m70.3/70.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Building wheels for collected packages: implicit
  Building wheel for implicit (pyproject.toml) ... [?25l[?25hdone
  Created wheel for implicit: filename=implicit-0.7.2-cp312-cp312-linux_x86_64.whl size=10797486 sha256=7a032fdafc26e8de768d7df0e5d581569cf1eec07ba866dcbdecdd8101508f63
  Stored in directory: /root/.cache/pip/wheels/b2/00/4f/9ff8af07a0a53ac6007ea5d739da19cfe147a2df542b6899f8
Successfully built implicit
Installing collected packages: implicit
Successfully installed implicit-0.7.2


In [None]:
import pandas as pd
import numpy as np

# Load your datasets (single place to change the Drive folder)
DATA_DIR = "/content/drive/MyDrive/Colab Notebooks/RS_Mini"
meta = pd.read_csv(f"{DATA_DIR}/meta_All_Beauty.csv")
reviews = pd.read_csv(f"{DATA_DIR}/All_Beauty_5.csv")

# Keep relevant columns
reviews = reviews[['reviewerID', 'asin', 'unixReviewTime']]
# Keep one metadata row per asin: a repeated asin on the right side of a left
# merge would duplicate review rows and inflate the interaction counts.
meta = meta[['asin', 'title']].drop_duplicates(subset='asin')

# Merge metadata; validate= raises MergeError if asin is still not unique in meta
df = reviews.merge(meta, on="asin", how="left", validate="many_to_one")

# Encode users and items as contiguous integer ids (category codes)
df['user_idx'] = df['reviewerID'].astype('category').cat.codes
df['item_idx'] = df['asin'].astype('category').cat.codes

num_users = df.user_idx.nunique()
num_items = df.item_idx.nunique()

print("Users:", num_users, "| Items:", num_items)

# Sort by time. A stable sort keeps file order among equal timestamps, so the
# "last" interaction picked below is deterministic when a user has ties.
df = df.sort_values("unixReviewTime", kind="stable")

# Train–test split: last interaction of each user = test
# NOTE(review): if a user interacted with the held-out item earlier as well, that
# pair stays in train and the recommenders will mask it as "seen" — confirm
# whether repeated (user, item) rows should be deduplicated before splitting.
test_df = df.groupby('user_idx').tail(1)
train_df = df.drop(test_df.index)

print("Train size:", len(train_df), " | Test size:", len(test_df))


Users: 991 | Items: 85
Train size: 4776  | Test size: 991


BASELINE 2 — ITEM-BASED COLLABORATIVE FILTERING

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# Assemble the user x item matrix in CSR form: one unit entry per training row.
rows, cols = train_df['user_idx'], train_df['item_idx']
data = np.ones(len(train_df))
interaction_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(num_users, num_items),
)

# Transpose so every item is a row vector, then take pairwise cosine similarity.
item_sim = cosine_similarity(interaction_matrix.T)

print("Item similarity matrix:", item_sim.shape)


Item similarity matrix: (85, 85)


In [None]:
def recommend_itemcf(user_id, K=10):
    """Return up to K unseen item indices for a user, best first.

    Each item is scored as the similarity-weighted sum of the user's row in
    ``interaction_matrix`` (``item_sim @ user_vector``).

    Args:
        user_id: Row index of the user in ``interaction_matrix``.
        K: Maximum number of items to return.

    Returns:
        1-D numpy array of item indices. It holds fewer than K entries when
        the user has fewer than K unseen items.
    """
    user_vector = interaction_matrix[user_id].toarray().ravel()

    # Score = weighted similarity sum
    scores = item_sim.dot(user_vector)

    # Push already-seen items to the bottom of the ranking...
    seen = user_vector > 0
    scores[seen] = -np.inf

    ranked = scores.argsort()[::-1]
    # ...and drop them outright, so a large K can never hand back a seen item
    # (a plain sentinel score would still be returned once unseen items run out).
    ranked = ranked[~seen[ranked]]
    return ranked[:K]


In [None]:
def evaluate_itemcf(K=10):
    """Evaluate ``recommend_itemcf`` on ``test_df`` (one held-out row per entry).

    Args:
        K: Cutoff passed to the recommender.

    Returns:
        Tuple ``(precision, recall, accuracy, ndcg)``:
        precision is hits / (K * rows), i.e. the mean share of the K slots that
        are correct; recall and accuracy are both the hit rate (hits / rows)
        because each test row carries exactly one relevant item; ndcg is the
        mean of 1 / log2(rank + 2) over rows, counting 0 for misses.
    """
    total = len(test_df)
    if total == 0:
        # Nothing to evaluate; avoid dividing by zero.
        return 0.0, 0.0, 0.0, 0.0

    hits = 0
    ndcg_sum = 0.0

    # Iterate the two needed columns directly instead of materialising a
    # Series per row with iterrows().
    for user, true_item in zip(test_df['user_idx'], test_df['item_idx']):
        recs = list(recommend_itemcf(user, K))

        if true_item in recs:
            hits += 1
            rank = recs.index(true_item)
            ndcg_sum += 1 / np.log2(rank + 2)

    hit_rate = hits / total
    # Precision@K divides by the K recommended slots; previously this reported
    # the hit rate under the "precision" name.
    precision = hits / (total * K) if K > 0 else 0.0
    recall = hit_rate
    accuracy = hit_rate
    ndcg = ndcg_sum / total

    return precision, recall, accuracy, ndcg


In [None]:
# Score the item-based CF recommender at a cutoff of 10 and report each metric.
p, r, acc, n = evaluate_itemcf(K=10)

print("===== ITEM-BASED CF (BASELINE 2) =====")
for label, value in (
    ("Precision@10:", p),
    ("Recall@10:", r),
    ("Accuracy@10:", acc),
    ("NDCG@10:", n),
):
    print(label, value)


===== ITEM-BASED CF (BASELINE 2) =====
Precision@10: 0.4106962663975782
Recall@10: 0.4106962663975782
Accuracy@10: 0.4106962663975782
NDCG@10: 0.39089969315564355


BASELINE 3 — ALS MATRIX FACTORIZATION (TODO: no implementation follows this heading yet)

BASELINE 1: POPULARITY RECOMMENDER

In [None]:
import numpy as np

# Rank items by how many training rows mention them (value_counts sorts
# descending), then keep just the ordered item ids.
popularity_counts = train_df['item_idx'].value_counts()
item_popularity = popularity_counts.index.tolist()

def evaluate_popularity(K=10):
    """Evaluate the "recommend the K most popular items" baseline on ``test_df``.

    Every user receives the same list, ``item_popularity[:K]``.

    Args:
        K: Number of items recommended to each user.

    Returns:
        Tuple of per-user means ``(precision, recall, accuracy, ndcg)`` where
        precision = hits / K, recall = hits / number of held-out items,
        accuracy = 1 if any hit else 0, and ndcg = DCG / IDCG with binary
        relevance and a log2 position discount.
    """
    precisions, recalls, accs, ndcgs = [], [], [], []

    top_k_items = item_popularity[:K]
    top_k_set = set(top_k_items)
    # Position of each recommended item, for the rank-aware NDCG below.
    rank_of = {item: rank for rank, item in enumerate(top_k_items)}
    discounts = 1.0 / np.log2(np.arange(K) + 2)

    # One groupby pass instead of re-filtering test_df for every user.
    for _, user_rows in test_df.groupby('user_idx', sort=False)['item_idx']:
        true_items = user_rows.tolist()
        relevant = set(true_items)

        hits = len(top_k_set & relevant)

        precision = hits / K
        recall = hits / len(true_items)
        accuracy = 1 if hits > 0 else 0

        # True NDCG (previously hits / K, which ignores rank and equals precision).
        dcg = sum(discounts[rank_of[item]] for item in relevant if item in rank_of)
        idcg = discounts[:min(len(relevant), K)].sum()
        ndcg = dcg / idcg if idcg > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        accs.append(accuracy)
        ndcgs.append(ndcg)

    return (
        np.mean(precisions),
        np.mean(recalls),
        np.mean(accs),
        np.mean(ndcgs)
    )

# Run the popularity baseline at the default cutoff (K=10) and report it.
p, r, a, n = evaluate_popularity()

# Banner now matches this section's heading (popularity is baseline 1).
print("\n===== POPULARITY RECOMMENDER (BASELINE 1) =====")
print("Precision@10:", p)
print("Recall@10:   ", r)
print("Accuracy@10: ", a)
print("NDCG@10:     ", n)



===== POPULARITY RECOMMENDER (BASELINE 2) =====
Precision@10: 0.09253279515640768
Recall@10:    0.9253279515640767
Accuracy@10:  0.9253279515640767
NDCG@10:      0.09253279515640768


BASELINE 2: ITEM-ITEM COSINE SIMILARITY (KNN CF)

In [None]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix

# User x item matrix from the training rows, one unit entry per row.
rows, cols = train_df['user_idx'], train_df['item_idx']
data = np.ones(len(train_df))
train_matrix = csr_matrix(
    (data, (rows, cols)),
    shape=(num_users, num_items),
)

# Pairwise cosine similarity between item columns -> (num_items x num_items)
item_sim = cosine_similarity(train_matrix.T)

def recommend_knn(user, K):
    """Return up to K unseen item indices for a user via item-item similarity.

    Each candidate is scored by summing its similarity to every item the user
    has in ``train_matrix``. Users with no training items fall back to the
    most popular items.

    Args:
        user: Row index of the user in ``train_matrix``.
        K: Maximum number of items to return.

    Returns:
        List of item indices, best first. It is shorter than K when the user
        has fewer than K unseen items.
    """
    user_items = train_matrix[user].indices  # items user interacted with

    if len(user_items) == 0:
        return item_popularity[:K]  # fallback

    # Sum similarity scores from all items user interacted with
    scores = np.sum(item_sim[user_items], axis=0)

    # Mark already-seen items and push them to the bottom of the ranking.
    seen = np.zeros(scores.shape[0], dtype=bool)
    seen[user_items] = True
    scores[seen] = -np.inf

    ranked = np.argsort(scores)[::-1]
    # Drop seen items outright so they cannot be returned when K exceeds the
    # number of unseen items (a sentinel score alone would let them through).
    ranked = ranked[~seen[ranked]]
    return ranked[:K].tolist()

def evaluate_knn(K=10):
    """Evaluate ``recommend_knn`` on ``test_df``.

    Args:
        K: Cutoff passed to the recommender.

    Returns:
        Tuple of per-user means ``(precision, recall, accuracy, ndcg)`` where
        precision = hits / K, recall = hits / number of held-out items,
        accuracy = 1 if any hit else 0, and ndcg = DCG / IDCG with binary
        relevance and a log2 position discount.
    """
    precisions, recalls, accs, ndcgs = [], [], [], []
    discounts = 1.0 / np.log2(np.arange(K) + 2)

    # One groupby pass instead of re-filtering test_df for every user.
    for user, user_rows in test_df.groupby('user_idx', sort=False)['item_idx']:
        true_items = user_rows.tolist()
        relevant = set(true_items)

        pred_items = recommend_knn(user, K)

        hits = len(set(pred_items) & relevant)

        precision = hits / K
        recall = hits / len(true_items)
        accuracy = 1 if hits > 0 else 0

        # True NDCG (previously hits / K, which ignores rank and equals precision).
        dcg = sum(
            discounts[rank]
            for rank, item in enumerate(pred_items[:K])
            if item in relevant
        )
        idcg = discounts[:min(len(relevant), K)].sum()
        ndcg = dcg / idcg if idcg > 0 else 0.0

        precisions.append(precision)
        recalls.append(recall)
        accs.append(accuracy)
        ndcgs.append(ndcg)

    return (
        np.mean(precisions),
        np.mean(recalls),
        np.mean(accs),
        np.mean(ndcgs)
    )

# Run the item-item KNN baseline at K=10 and report it.
p, r, a, n = evaluate_knn(K=10)

# Banner now matches this section's heading (item-item KNN is baseline 2).
print("\n===== ITEM-ITEM KNN COLLABORATIVE FILTERING (BASELINE 2) =====")
print("Precision@10:", p)
print("Recall@10:   ", r)
print("Accuracy@10: ", a)
print("NDCG@10:     ", n)



===== ITEM-ITEM KNN COLLABORATIVE FILTERING (BASELINE 3) =====
Precision@10: 0.04096871846619576
Recall@10:    0.40968718466195764
Accuracy@10:  0.40968718466195764
NDCG@10:      0.04096871846619576


Final Baseline Code


BASELINE 4 — ITEM–ITEM SIMILARITY (COSINE SIMILARITY)

In [None]:
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix
from sklearn.metrics.pairwise import cosine_similarity
from tqdm import tqdm

# ========== BUILD USER–ITEM MATRIX ==========
# Matrix dimensions come from the distinct encoded ids in the full frame.
num_users = df.user_idx.nunique()
num_items = df.item_idx.nunique()

interaction_values = np.ones(len(train_df))
interaction_coords = (train_df.user_idx, train_df.item_idx)
train_matrix = csr_matrix(interaction_values_and_coords := (interaction_values, interaction_coords),
                          shape=(num_users, num_items)) if False else csr_matrix(
    (interaction_values, interaction_coords),
    shape=(num_users, num_items),
)

# ========== ITEM–ITEM COSINE SIMILARITY ==========
print("Computing item-item similarity matrix...")
# Items become rows after the transpose, giving an items × items result.
item_sim = cosine_similarity(train_matrix.T)

print("Item similarity matrix ready:", item_sim.shape)

# ========== RECOMMENDATION FUNCTION ==========
def recommend_items(user_id, K=10):
    """Return up to K unseen item indices for a user via item-item similarity.

    Each candidate is scored by summing its similarity to every item the user
    has in ``train_matrix``.

    Args:
        user_id: Row index of the user in ``train_matrix``.
        K: Maximum number of items to return.

    Returns:
        List of item indices, best first. It is empty for a cold-start user
        (no training items) and shorter than K when fewer than K unseen items
        exist.
    """
    # items the user already interacted with
    user_items = train_matrix[user_id].indices

    if len(user_items) == 0:
        return []  # cold start

    # compute score = sum of similarities for all items user interacted with
    scores = item_sim[user_items].sum(axis=0)

    # mark seen items and push them to the bottom of the ranking
    seen = np.zeros(scores.shape[0], dtype=bool)
    seen[user_items] = True
    scores[seen] = -np.inf

    ranked = np.argsort(scores)[::-1]
    # Drop seen items outright so they cannot be returned when K exceeds the
    # number of unseen items (a sentinel score alone would let them through).
    ranked = ranked[~seen[ranked]]
    return ranked[:K].tolist()

# ========== METRIC FUNCTIONS ==========
def precision_at_k(pred, true, k):
    """Share of the k slots filled by a relevant item.

    Counts the distinct items among the first k predictions that also appear
    in ``true`` and divides by k.
    """
    top_k = set(pred[:k])
    relevant = set(true)
    matched = top_k.intersection(relevant)
    return len(matched) / k

def recall_at_k(pred, true, k):
    """Share of the relevant items found among the first k predictions.

    Returns 0 when ``true`` is empty.
    """
    if len(true) == 0:
        return 0
    matched = set(pred[:k]).intersection(set(true))
    return len(matched) / len(true)

def accuracy_at_k(pred, true):
    """Hit indicator: 1 if any predicted item is relevant, otherwise 0."""
    overlap = set(pred).intersection(set(true))
    if overlap:
        return 1
    return 0

def ndcg_at_k(pred, true, k):
    """Normalised discounted cumulative gain at cutoff k (binary relevance).

    Args:
        pred: Ranked list of predicted items, best first.
        true: Collection of relevant items.
        k: Cutoff; only ``pred[:k]`` is scored.

    Returns:
        DCG / IDCG in [0, 1] for duplicate-free ``pred``. Returns 0.0 when
        there are no relevant items or k <= 0.
    """
    relevant = set(true)  # O(1) membership checks
    # The ideal ranking puts every relevant item first, capped at k slots.
    # With a single relevant item this is 1.0, matching the earlier constant.
    n_ideal = min(len(relevant), k)
    if n_ideal <= 0:
        return 0.0

    dcg = 0.0
    for i, p in enumerate(pred[:k]):
        if p in relevant:
            dcg += 1 / np.log2(i + 2)

    idcg = sum(1 / np.log2(i + 2) for i in range(n_ideal))
    return dcg / idcg

# ========== EVALUATION ==========
def evaluate_item_item(K=10):
    """Evaluate ``recommend_items`` on ``test_df``.

    Users for whom the recommender returns nothing (cold start) are skipped,
    so the averages cover only users that received recommendations.

    Args:
        K: Cutoff passed to the recommender and the metric helpers.

    Returns:
        Tuple of per-user means ``(precision, recall, accuracy, ndcg)``; all
        0.0 when no user could be evaluated.
    """
    precisions, recalls, accuracies, ndcgs = [], [], [], []

    # Collect each user's held-out items in one pass instead of re-filtering
    # test_df inside the loop; sort=False keeps first-appearance order.
    true_by_user = {
        user: items.tolist()
        for user, items in test_df.groupby('user_idx', sort=False)['item_idx']
    }

    print("\nEvaluating Item–Item Collaborative Filtering...")
    for user, true_items in tqdm(true_by_user.items()):

        pred_items = recommend_items(user, K)

        if len(pred_items) == 0:
            continue

        precisions.append(precision_at_k(pred_items, true_items, K))
        recalls.append(recall_at_k(pred_items, true_items, K))
        accuracies.append(accuracy_at_k(pred_items, true_items))
        ndcgs.append(ndcg_at_k(pred_items, true_items, K))

    if not precisions:
        # np.mean([]) would emit a RuntimeWarning and return NaN.
        return 0.0, 0.0, 0.0, 0.0

    return (
        np.mean(precisions),
        np.mean(recalls),
        np.mean(accuracies),
        np.mean(ndcgs)
    )

# RUN BASELINE 4: evaluate at a cutoff of 10, then report each metric in turn.
p, r, a, n = evaluate_item_item(K=10)

print("\n===== BASELINE 4: ITEM-ITEM COLLABORATIVE FILTERING =====")
for label, value in (
    ("Precision@10:", p),
    ("Recall@10:   ", r),
    ("Accuracy@10: ", a),
    ("NDCG@10:     ", n),
):
    print(label, value)

Computing item-item similarity matrix...
Item similarity matrix ready: (85, 85)

Evaluating Item–Item Collaborative Filtering...


100%|██████████| 991/991 [00:00<00:00, 1869.89it/s]


===== BASELINE 4: ITEM-ITEM COLLABORATIVE FILTERING =====
Precision@10: 0.04096871846619576
Recall@10:    0.40968718466195764
Accuracy@10:  0.40968718466195764
NDCG@10:      0.3922410305512074



