#Recommender Systems - USER-TO-USER COLLABORATIVE FILTERING

In [3]:
# Requirements:
# pip install pandas numpy scikit-learn requests
import pandas as pd
import numpy as np
import requests
import math
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [4]:
"""
Data Mining Lab: User-User Collaborative Filtering (Simple & Fast)
Dataset: MovieLens 100K (sample of 1000 rows)
Steps:
 1. Load small external dataset
 2. Build user-item matrix
 3. Compute user-user similarity (cosine)
 4. Predict ratings using top-k similar users
 5. Evaluate RMSE
 6. Show Top-N recommendations
"""

'\nData Mining Lab: User-User Collaborative Filtering (Simple & Fast)\nDataset: MovieLens 100K (sample of 1000 rows)\nSteps:\n 1. Load small external dataset\n 2. Build user-item matrix\n 3. Compute user-user similarity (cosine)\n 4. Predict ratings using top-k similar users\n 5. Evaluate RMSE\n 6. Show Top-N recommendations\n'

In [5]:
# -------------------------------
# 1. LOAD SIMPLE EXTERNAL DATASET
# -------------------------------
url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
print("Downloading small sample from MovieLens...")
df = pd.read_csv(url, sep="\\t", names=["user_id", "item_id", "rating", "timestamp"], engine="python")
# Use only first 1000 rows for speed
df = df.iloc[:500, [0, 1, 2]]
print("Sample data:")
print(df.head())

Downloading small sample from MovieLens...
Sample data:
   user_id  item_id  rating
0      196      242       3
1      186      302       3
2       22      377       1
3      244       51       2
4      166      346       1


In [6]:
# -------------------------------
# 2. TRAIN/TEST SPLIT
# -------------------------------
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
#Converts data into a user-item rating matrix:
#index = users
#Columns = items
#Values = ratings (can be explicit ratings like 1–5 or implicit like 0/1).
def build_matrix(ratings):
 return ratings.pivot_table(index="user_id", columns="item_id", values="rating")
ui_train = build_matrix(train_df)
ui_test = build_matrix(test_df)
# Align both matrices to same users/items
#Ensures both matrices have the same shape with all users and items (fills missing values with NaN).
users = sorted(df.user_id.unique())
items = sorted(df.item_id.unique())
ui_train = ui_train.reindex(index=users, columns=items)
ui_test = ui_test.reindex(index=users, columns=items)
print("\nUser-Item Matrix (train):")
print(ui_train.head())


User-Item Matrix (train):
item_id  1     2     3     4     5     7     10    11    12    14    ...  \
user_id                                                              ...   
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
6         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   5.0  ...   
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   
8         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  ...   

item_id  1211  1217  1224  1240  1267  1295  1336  1393  1444  1451  
user_id                                                              
1         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
5         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
6         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
7         NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN   NaN  
8         NaN   NaN 

In [7]:
# -------------------------------
# 3. USER-USER SIMILARITY
# -------------------------------
#Each user's ratings are centered by subtracting their mean rating:
#Prevents bias from users who rate higher/lower on average.
#Example: If a user rates everything ~4.5, subtracting the mean normalizes their ratings.
def mean_center_rows(matrix):
 row_means = matrix.mean(axis=1)
 centered = matrix.sub(row_means, axis=0)
 return centered, row_means

#Cosine Similarity is applied:
#Treats each user's centered rating vector as a point in high-dimensional space.
#Ratings are filled with 0 where missing (no rating).
#Result: user_sim is a square matrix where each cell (u, v) indicates how similar two users are.
centered, user_means = mean_center_rows(ui_train)
similarity = cosine_similarity(centered.fillna(0))
user_sim = pd.DataFrame(similarity, index=users, columns=users)
print("\nUser-User Similarity Matrix:")
print(user_sim.round(2).head())


User-User Similarity Matrix:
   1    5    6    7    8    10   11   13   14   15   ...  296  297  298  299  \
1  1.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
5  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
6  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
7  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   
8  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  ...  0.0  0.0  0.0  0.0   

   301  302  303  305  307  308  
1  0.0  0.0  0.0  0.0  0.0  0.0  
5  0.0  0.0  0.0  0.0  0.0  0.0  
6  0.0  0.0  0.0  0.0  0.0  0.0  
7  0.0  0.0  0.0  0.0  0.0  0.0  
8  0.0  0.0  0.0  0.0  0.0  0.0  

[5 rows x 195 columns]


In [11]:
# -------------------------------
# 4. PREDICT RATINGS (TOP-k NEIGHBORS)
# -------------------------------
#ui – User-Item rating matrix (training data).
#user_sim – User-User similarity matrix (cosine similarity).
#user_means – Mean rating per user (for mean-centering adjustment).
#k – Number of top neighbors to consider when predicting.
def predict_user_user(ui, user_sim, user_means, k=3):
 #preds will store predicted ratings.
 preds = ui.copy()
 #rated is a boolean matrix indicating which ratings already exist (to avoid overwriting them).
 # rated = ~ui
 rated = ui.notna()   # same as ~ui.isna() but avoids dtype issues

 for u in ui.index:
      #Find potential neighbors for user u.
      #Skip rated items – predictions are only for missing ratings.
      neighbors = user_sim.loc[u].drop(u).sort_values(ascending=False)
      for i in ui.columns:
          if not math.isnan(ui.loc[u, i]): # skip rated items
              continue

          # Find neighbors who rated i
          #raters = Users (neighbors) who have rated item i.
          #topk = Selects top k most similar users among raters.
          raters = [n for n in neighbors.index if not math.isnan(ui.loc[n, i])]
          topk = neighbors.loc[raters].head(k)


          #If no similar users have rated the item, fallback to user's average rating.
          #Otherwise:
          #Compute a weighted sum of neighbors’ deviations from their own means.
          #Add this adjustment to the target user’s mean rating.
          if topk.empty:
              preds.loc[u, i] = user_means.loc[u] # fallback: user's mean
          else:
              num = 0.0
              den = 0.0
              for n, sim in topk.items():
                  num += sim * (ui.loc[n, i] - user_means.loc[n])
                  den += abs(sim)
              preds.loc[u, i] = user_means.loc[u] + (num / den if den else 0.0)
      return preds

#Produces a matrix of predicted ratings where missing values have been filled using collaborative filtering.
preds = predict_user_user(ui_train, user_sim, user_means, k=3)

In [12]:
# -------------------------------
# 5. EVALUATION (RMSE)
# -------------------------------
mask = ~ui_test.isna()
rmse = np.sqrt(np.nanmean((ui_test[mask] - preds[mask]) ** 2))
print(f"\nRMSE on test set: {rmse:.4f}")


RMSE on test set: 0.3333


In [15]:
# -------------------------------
# 6. TOP-N RECOMMENDATIONS
# -------------------------------
#ui – Original user-item rating matrix (with missing ratings).
#preds – Predicted rating matrix (from collaborative filtering).
#n – Number of recommendations to return for each user (default 3).
def top_n_recommendations(ui, preds, n=3):
 recs = {}
 #For user u, identify items they have not rated yet (NaN values in ui).
 for u in ui.index:
      missing = ui.loc[u][ui.loc[u].isna()].index

      #Look up predicted ratings from preds for only those missing items.
      #Sort them in descending order to prioritize higher predicted ratings.
      #Select top n items.
      scores = preds.loc[u, missing].sort_values(ascending=False).head(n)

      #Converts the pandas Series into a list of (item_id, predicted_score) tuples.
      #Adds it to the recs dictionary for user u
      recs[u] = list(scores.items())

 return recs

print("\nTop-3 Recommendations for first 5 users:")
#Prints Top-3 recommended items for the first five users.
# recommendations = top_n_recommendations(ui_train, preds, n=3)
# for u in list(recommendations.keys())[:5]:
#     print(f"User {u}: {recommendations[u]}")

recommendations = top_n_recommendations(ui_train, preds, n=5)
for u in list(recommendations.keys())[:10]:
    print(f"User {u}: {recommendations[u]}")



Top-3 Recommendations for first 5 users:
User 1: [(1451, 3.6666666666666665), (1, 3.6666666666666665), (2, 3.6666666666666665), (1134, 3.6666666666666665), (1115, 3.6666666666666665)]
User 5: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 6: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 7: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 8: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 10: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 11: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 13: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 14: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
User 15: [(1, nan), (2, nan), (3, nan), (4, nan), (5, nan)]
