In [56]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score

In [57]:
import zipfile, urllib.request, os

url = "http://files.grouplens.org/datasets/movielens/ml-100k.zip"
zip_path = "ml-100k.zip"

if not os.path.exists("ml-100k"):
    urllib.request.urlretrieve(url, zip_path)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(".")
    os.remove(zip_path)

ratings = pd.read_csv("ml-100k/u.data", sep="\t", names=["userId","movieId","rating","timestamp"])
movies = pd.read_csv("ml-100k/u.item", sep="|", names=["movieId","title"]+list(range(22)), encoding="latin-1")[["movieId","title"]]

In [58]:
train, test = train_test_split(ratings, test_size=0.2, random_state=42)


In [59]:
user_item_matrix = train.pivot(index="userId", columns="movieId", values="rating").fillna(0)

In [60]:
user_ratings_mean = user_item_matrix.mean(axis=1)
matrix_normalized = user_item_matrix.sub(user_ratings_mean, axis=0)

In [61]:
user_similarity = cosine_similarity(matrix_normalized)
user_similarity_df = pd.DataFrame(user_similarity, index=user_item_matrix.index, columns=user_item_matrix.index)

In [62]:
item_similarity = cosine_similarity(matrix_normalized.T)
item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)


In [63]:
def recommend_user_based(user_id, top_n=5):
    sim_scores = user_similarity_df.loc[user_id].drop(user_id)
    similar_users = sim_scores.sort_values(ascending=False).head(10).index

    # Weighted ratings
    weighted_ratings = user_item_matrix.loc[similar_users].T.dot(sim_scores.loc[similar_users])
    norm = sim_scores.loc[similar_users].sum()
    scores = weighted_ratings / (norm+1e-8)

    # Remove already seen movies
    seen = set(train[train["userId"] == user_id]["movieId"])
    scores = scores.drop(labels=seen, errors="ignore")

    return scores.sort_values(ascending=False).head(top_n)

def recommend_item_based(user_id, top_n=5):
    user_ratings = user_item_matrix.loc[user_id]
    scores = item_similarity_df.dot(user_ratings)
    scores = scores / (item_similarity_df.sum(axis=1)+1e-8)

    # Remove already seen movies
    seen = set(train[train["userId"] == user_id]["movieId"])
    scores = scores.drop(labels=seen, errors="ignore")

    return scores.sort_values(ascending=False).head(top_n)

In [64]:
def precision_at_k(user_id, k=5, method="user"):
    # Ground truth: movies in TEST set for this user
    true_items = set(test[test["userId"] == user_id]["movieId"])
    if len(true_items) == 0:
        return None  # skip users with no test data

    # Recommendations
    if method == "user":
        recs = recommend_user_based(user_id, k).index
    else:
        recs = recommend_item_based(user_id, k).index

    # Precision@K
    hit_count = len(set(recs).intersection(true_items))
    return hit_count / k

In [65]:
users = test["userId"].unique()
precisions_user = [precision_at_k(u, k=5, method="user") for u in users]
precisions_item = [precision_at_k(u, k=5, method="item") for u in users]

precisions_user = [p for p in precisions_user if p is not None]
precisions_item = [p for p in precisions_item if p is not None]

print("🔹 Example recommendations for User 1 (User-based):")
print(recommend_user_based(1, 5).map(lambda x: round(x,2)))

print("\n🔹 Example recommendations for User 1 (Item-based):")
print(recommend_item_based(1, 5).map(lambda x: round(x,2)))

print(f"\n✅ Average Precision@5 (User-based, {len(precisions_user)} users): {np.mean(precisions_user):.3f}")
print(f"✅ Average Precision@5 (Item-based, {len(precisions_item)} users): {np.mean(precisions_item):.3f}")

🔹 Example recommendations for User 1 (User-based):
movieId
100    3.50
181    3.22
4      3.22
64     3.11
82     3.07
dtype: float64

🔹 Example recommendations for User 1 (Item-based):
movieId
895    10.05
676     5.85
945     2.16
354     1.91
331     1.78
dtype: float64

✅ Average Precision@5 (User-based, 940 users): 0.349
✅ Average Precision@5 (Item-based, 940 users): 0.020
