# Recommender Systems - ITEM-ITEM COLLABORATIVE FILTERING

In [1]:
# Uses MovieLens 100K (subset of 1,000 ratings only for speed)
# Builds a User–Item matrix.
# Implements User–User CF with cosine similarity.
# Predicts missing ratings using top-k similar users.
# Evaluates with RMSE.
# Outputs Top-N recommendations per user.

In [2]:
# ==============================
# ITEM-ITEM COLLABORATIVE FILTERING LAB
# Dataset: MovieLens 100k (subset for speed)
# ==============================
import pandas as pd
import numpy as np
import requests
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
import math

In [3]:
# ------------------------------------------
# 1. Load Dataset (First 1000 rows for speed)
# ------------------------------------------
print("Downloading MovieLens 100k sample...")
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
df = pd.read_csv(url, sep="\t", names=["user_id", "item_id", "rating", "timestamp"])
df = df.iloc[:1000, [0, 1, 2]] # Keep first 1000 rows
print("\nSample of the dataset:")
print(df.head())

Downloading MovieLens 100k sample...

Sample of the dataset:
   user_id  item_id  rating
0      196      242       3
1      186      302       3
2       22      377       1
3      244       51       2
4      166      346       1


In [4]:
# ------------------------------------------
# 2. Train-Test Split and Create User-Item Matrix
# ------------------------------------------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
def build_matrix(ratings):
 return ratings.pivot_table(index="user_id", columns="item_id", values="rating")
ui_train = build_matrix(train_df)
ui_test = build_matrix(test_df)
users = sorted(df.user_id.unique())
items = sorted(df.item_id.unique())
ui_train = ui_train.reindex(index=users, columns=items)
ui_test = ui_test.reindex(index=users, columns=items)
print("\nUser-Item Matrix (Training):")
print(ui_train.head())


User-Item Matrix (Training):
item_id  1     2     3     4     5     7     8     9     10    11    ...  \
user_id                                                              ...   
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   3.0   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  1286  1288  1295  1336  1393  1428  1444  1451  1462  1497  
user_id                                                              
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
2         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
6         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
7         NaN   N

In [6]:
# ------------------------------------------
# 3. Compute Item-Item Similarity (Cosine)
# ------------------------------------------
item_centered = ui_train.sub(ui_train.mean(axis=0), axis=1)
item_sim = cosine_similarity(item_centered.fillna(0).T)
item_sim_df = pd.DataFrame(item_sim, index=items, columns=items)
print("\nItem-Item Similarity Matrix:")
print(item_sim_df.round(2).head())

# ------------------------------------------
# 4. Predict Ratings using Item-Item CF
# ------------------------------------------
def predict_item_item(ui, item_sim, k=5):
      preds = ui.copy()
      for u in ui.index:
          for i in ui.columns:
              if not math.isnan(ui.loc[u, i]):
                    continue
              rated_items = ui.loc[u][~ui.loc[u].isna()].index
              if len(rated_items) == 0:
                    preds.loc[u, i] = ui.mean().mean()
                    continue
              sims = item_sim.loc[i, rated_items].sort_values(ascending=False)
              topk = sims.head(k)
              num = sum(sim * ui.loc[u, itm] for itm, sim in topk.items())
              den = sum(abs(sim) for sim in topk)
              preds.loc[u, i] = num / den if den else ui.mean().mean()
          return preds

preds_item = predict_item_item(ui_train, item_sim_df, k=5)


Item-Item Similarity Matrix:
   1     2     3     4     5     7     8     9     10    11    ...  1286  \
1   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
2   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
4   0.0   0.0   0.0   1.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   
5   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  ...   0.0   

   1288  1295  1336  1393  1428  1444  1451  1462  1497  
1   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
2   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
3   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
4   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  
5   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0   0.0  

[5 rows x 551 columns]


In [7]:
# ------------------------------------------
# 5. Evaluate with RMSE
# ------------------------------------------
mask = ~ui_test.isna()
rmse = np.sqrt(np.nanmean((ui_test[mask] - preds_item[mask]) ** 2))
print(f"\nRMSE on test set: {rmse:.4f}")


RMSE on test set: 1.2064


In [9]:
# ------------------------------------------
# 6. Top-N Recommendations per User
# ------------------------------------------
def top_n_recommendations(ui, preds, n=3):
    recs = {}
    for u in ui.index:
        missing = ui.loc[u][ui.loc[u].isna()].index
        scores = preds.loc[u, missing].sort_values(ascending=False).head(n)
        recs[u] = list(scores.items())
    return recs

recommendations = top_n_recommendations(ui_train, preds_item, n=3)
print("\nTop-3 Recommendations for first 5 users:")

for u in list(recommendations.keys())[:5]:
    print(f"User {u}: {recommendations[u]}")


Top-3 Recommendations for first 5 users:
User 1: [(56, 4.0), (508, 4.0), (228, 4.0)]
User 2: [(1, nan), (2, nan), (3, nan)]
User 5: [(1, nan), (3, nan), (4, nan)]
User 6: [(1, nan), (2, nan), (3, nan)]
User 7: [(1, nan), (2, nan), (3, nan)]
