In [62]:
import numpy as np
import pandas as pd
from surprise import Dataset, Reader, KNNBasic, SVD
from surprise.model_selection import train_test_split
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
import warnings
warnings.filterwarnings("ignore")

In [33]:
ratings = pd.read_csv("./ml-1m/ratings.dat", sep="::", engine='python',
                      names=["UserID", "MovieID", "Rating", "Timestamp"])

users = pd.read_csv("./ml-1m/users.dat", sep="::", engine='python',
                    names=["UserID", "Gender", "Age", "Occupation", "Zip-code"])

movies = pd.read_csv("./ml-1m/movies.dat", sep="::", engine='python',
                     names=["MovieID", "Title", "Genres"], encoding='latin-1')

ratings.head()

Unnamed: 0,UserID,MovieID,Rating,Timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [34]:
users.head()

Unnamed: 0,UserID,Gender,Age,Occupation,Zip-code
0,1,F,1,10,48067
1,2,M,56,16,70072
2,3,M,25,15,55117
3,4,M,45,7,2460
4,5,M,25,20,55455


In [35]:
movies.head()

Unnamed: 0,MovieID,Title,Genres
0,1,Toy Story (1995),Animation|Children's|Comedy
1,2,Jumanji (1995),Adventure|Children's|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama
4,5,Father of the Bride Part II (1995),Comedy


## Partial Cold Start Users:

In [36]:
random_seed = 10701
cold_start_user_portion = 0.2
cold_start_user_ratings = 5

# Sample random users as "cold-start" users
cold_start_users = ratings['UserID'].drop_duplicates().sample(frac=cold_start_user_portion, 
                                                              random_state=random_seed)

train_data = []
test_data = []

for uid in cold_start_users:
    user_ratings = ratings[ratings['UserID'] == uid]

    # Skip user if too few ratings to split
    if len(user_ratings) > 2 * cold_start_user_ratings:
        revealed = user_ratings.sample(n=cold_start_user_ratings, random_state=random_seed)
        held_out = user_ratings.drop(revealed.index)
        train_data.append(revealed)
        test_data.append(held_out)
    
# Add all other users' ratings to train
# The cold_start users (with their actual ratings) are used as test set
other_users = ratings[~ratings['UserID'].isin(cold_start_users)]
train_data.append(other_users)

train_df = pd.concat(train_data)
test_df = pd.concat(test_data)

## Baselines

In [37]:
reader = Reader(rating_scale=(1, 5))
trainset = Dataset.load_from_df(train_df[['UserID', 'MovieID', 'Rating']], reader).build_full_trainset()

algo_userknn = KNNBasic(sim_options={'user_based': True})
algo_userknn.fit(trainset)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x1b2239cf590>

In [38]:
algo_svd = SVD()
algo_svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x1b223bc4390>

In [None]:
all_items = train_df['MovieID'].unique()
train_users = pd.merge(train_df[['UserID']], users, on='UserID').drop_duplicates('UserID')

svd_recs = {}

for uid in test_df['UserID'].unique():
    known_items = set(train_df[train_df['UserID'] == uid]['MovieID'])  # already seen
    candidates = [iid for iid in all_items if iid not in known_items]

    # Predict scores
    predictions = [
        (iid, algo_svd.predict(uid, iid).est)
        for iid in candidates
    ]

    # Sort by predicted rating
    top_n = sorted(predictions, key=lambda x: x[1], reverse=True)[:10]
    svd_recs[uid] = [iid for iid, _ in top_n]

### Clustering (No User Data)

In [40]:
n_clusters = 10

# Extract user factors from SVD
user_factors = algo_svd.pu  # shape: (n_users, n_factors)
user_ids = [trainset.to_raw_uid(i) for i in range(len(user_factors))]

user_df = pd.DataFrame(user_factors, index=user_ids)

# Fit k-means
kmeans = KMeans(n_clusters=n_clusters, random_state=random_seed)
user_df['Cluster'] = kmeans.fit_predict(user_df)

# For each cluster, compute Top-N
cluster_topN = {}
for cluster_id in user_df['Cluster'].unique():
    cluster_users = user_df[user_df['Cluster'] == cluster_id].index
    cluster_ratings = train_df[train_df['UserID'].isin(cluster_users)]
    
    top_items = (cluster_ratings
                 .groupby('MovieID')['Rating']
                 .mean()
                 .sort_values(ascending=False)
                 .head(10)
                 .index.tolist())
    
    cluster_topN[cluster_id] = top_items


In [41]:
def compute_user_vector(uid, svd_model, train_df):
    user_data = train_df[train_df['UserID'] == uid]
    
    q_list, r_list = [], []
    for _, row in user_data.iterrows():
        try:
            inner_iid = svd_model.trainset.to_inner_iid(row['MovieID'])
            q_list.append(svd_model.qi[inner_iid])
            r_list.append(row['Rating'])
        except:
            continue  # skip items not in training

    if not q_list:
        return None

    Q = np.stack(q_list)
    r = np.array(r_list).reshape(-1, 1)

    # Solve for user vector p in least-squares sense
    p = np.linalg.pinv(Q.T @ Q) @ Q.T @ r
    return p.ravel()


In [42]:
user_clusters = {}

for uid in test_df['UserID'].unique():
    vec = compute_user_vector(uid, algo_svd, train_df)
    if vec is not None:
        cluster = kmeans.predict([vec])[0]
        user_clusters[uid] = cluster

cluster_recs = {}

for uid in test_df['UserID'].unique():
    cluster_id = user_clusters.get(uid)
    if cluster_id is None:
        print(f"User {uid} is not assigned to any cluster.")
        continue  # fallback or skip if user wasn't assigned

    top_items = cluster_topN.get(cluster_id, [])
    seen = set(train_df[train_df['UserID'] == uid]['MovieID'])

    cluster_recs[uid] = [iid for iid in top_items if iid not in seen][:10]


### Clustering (with user data)

In [57]:
train_users = pd.merge(train_df[['UserID']], users, on='UserID').drop_duplicates('UserID')

# Ensure Gender is binary
meta = train_users[['Gender', 'Age', 'Occupation']].copy()
meta['Gender'] = meta['Gender'].map({'M': 0, 'F': 1})

# Convert Age and Occupation to strings (column names must be consistent)
meta[['Age', 'Occupation']] = meta[['Age', 'Occupation']].astype(str)

encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded = encoder.fit_transform(meta[['Age', 'Occupation']])

# Fix column names
encoded_df = pd.DataFrame(encoded, index=meta.index)
encoded_df.columns = [str(col) for col in encoded_df.columns]  # ensure string column names

# Final metadata features
X_meta = pd.concat([meta[['Gender']].reset_index(drop=True), encoded_df.reset_index(drop=True)], axis=1)


In [58]:
# KMeans clustering
n_clusters = 10
kmeans_meta = KMeans(n_clusters=n_clusters, random_state=42)
meta_clusters = kmeans_meta.fit_predict(X_meta)

# Store cluster assignment
metadata_user_clusters = dict(zip(train_users['UserID'].values, meta_clusters))

In [59]:
cluster_topN_meta = {}

for cid in range(n_clusters):
    cluster_user_ids = [uid for uid, c in metadata_user_clusters.items() if c == cid]
    cluster_ratings = train_df[train_df['UserID'].isin(cluster_user_ids)]
    
    top_items = (cluster_ratings.groupby('MovieID')['Rating']
                              .mean()
                              .sort_values(ascending=False)
                              .head(50)  # to allow filtering later
                              .index.tolist())
    
    cluster_topN_meta[cid] = top_items


In [None]:
test_users = pd.merge(test_df[['UserID']], users, on='UserID').drop_duplicates('UserID')

cluster_recs_meta = {}

for _, row in test_users.iterrows():
    # Prepare metadata row
    meta_vec = pd.DataFrame([{
        'Gender': 0 if row['Gender'] == 'M' else 1,
        'Age': str(row['Age']),
        'Occupation': str(row['Occupation'])
    }])

    # Encode features
    encoded_meta = encoder.transform(meta_vec[['Age', 'Occupation']])
    gender_val = meta_vec[['Gender']].values
    X_input = np.hstack([gender_val, encoded_meta])

    # Predict cluster
    cluster_id = kmeans_meta.predict(X_input)[0]
    top_items = cluster_topN_meta.get(cluster_id, [])

    seen = set(train_df[train_df['UserID'] == row['UserID']]['MovieID'])
    recs = [iid for iid in top_items if iid not in seen][:10]

    cluster_recs_meta[row['UserID']] = recs


## Evaluation

In [43]:
def precision_at_k(predicted, relevant, k):
    predicted = predicted[:k]
    return len(set(predicted) & set(relevant)) / k if k else 0

def recall_at_k(predicted, relevant, k):
    predicted = predicted[:k]
    return len(set(predicted) & set(relevant)) / len(relevant) if relevant else 0

def ndcg_at_k(predicted, relevant, k):
    predicted = predicted[:k]
    dcg = 0.0
    for i, p in enumerate(predicted):
        if p in relevant:
            dcg += 1 / np.log2(i + 2)  # rank starts at 1
    idcg = sum(1 / np.log2(i + 2) for i in range(min(len(relevant), k)))
    return dcg / idcg if idcg else 0

In [44]:
# Test_df has ground truth ratings
relevance_threshold = 4

# Build a dictionary of relevant items per user
ground_truth = (
    test_df[test_df['Rating'] >= relevance_threshold]
    .groupby('UserID')['MovieID']
    .apply(set)
    .to_dict()
)


In [64]:
results = []

for uid in ground_truth:
    relevant_items = ground_truth.get(uid, set())
    
    svd_pred = svd_recs.get(uid, [])
    clust_pred = cluster_recs.get(uid, [])
    cluster_meta_pred = cluster_recs_meta.get(uid, [])

    results.append({
        'UserID': uid,
        'Precision@10_SVD': precision_at_k(svd_pred, relevant_items, 10),
        'Recall@10_SVD': recall_at_k(svd_pred, relevant_items, 10),
        'NDCG@10_SVD': ndcg_at_k(svd_pred, relevant_items, 10),
        'Precision@10_Cluster': precision_at_k(clust_pred, relevant_items, 10),
        'Recall@10_Cluster': recall_at_k(clust_pred, relevant_items, 10),
        'NDCG@10_Cluster': ndcg_at_k(clust_pred, relevant_items, 10),
        'Precision@10_Metadata': precision_at_k(cluster_meta_pred, relevant_items, 10),
        'Recall@10_Metadata': recall_at_k(cluster_meta_pred, relevant_items, 10),
        'NDCG@10_Metadata': ndcg_at_k(cluster_meta_pred, relevant_items, 10)
    })

eval_df = pd.DataFrame(results)
eval_df.mean()

UserID                   3022.840232
Precision@10_SVD            0.173924
Recall@10_SVD               0.024854
NDCG@10_SVD                 0.175361
Precision@10_Cluster        0.003642
Recall@10_Cluster           0.000261
NDCG@10_Cluster             0.004024
Precision@10_Metadata       0.001325
Recall@10_Metadata          0.000178
NDCG@10_Metadata            0.001437
dtype: float64