### Movie Recommendation System (User-Based Collaborative Filtering)

- **Dataset**: MovieLens 100K (from Kaggle)
- **Task**: Recommend movies based on user similarity
- **Approach**:
  - Build a User-Item Matrix
  - Compute similarity scores between users
  - Recommend top-rated unseen movies
  - Evaluate with Precision@K

In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import precision_score

In [2]:
# Load ratings file (userId, itemId, rating, timestamp)
ratings = pd.read_csv("u.data", sep="\t", names=["userId", "movieId", "rating", "timestamp"])

# Load movie info file
movies = pd.read_csv("u.item", sep="|", encoding="latin-1", 
                     names=["movieId", "title"] + [str(i) for i in range(22)], 
                     usecols=[0,1])

ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [3]:
# Merge ratings with movie titles
data = pd.merge(ratings, movies, on="movieId")
data.head()

Unnamed: 0,userId,movieId,rating,timestamp,title
0,196,242,3,881250949,Kolya (1996)
1,186,302,3,891717742,L.A. Confidential (1997)
2,22,377,1,878887116,Heavyweights (1994)
3,244,51,2,880606923,Legends of the Fall (1994)
4,166,346,1,886397596,Jackie Brown (1997)


In [4]:
# Create User-Item Matrix
user_item_matrix = data.pivot_table(index="userId", columns="title", values="rating")
user_item_matrix.fillna(0, inplace=True)
user_item_matrix.head()

title,'Til There Was You (1997),1-900 (1994),101 Dalmatians (1996),12 Angry Men (1957),187 (1997),2 Days in the Valley (1996),"20,000 Leagues Under the Sea (1954)",2001: A Space Odyssey (1968),3 Ninjas: High Noon At Mega Mountain (1998),"39 Steps, The (1935)",...,Yankee Zulu (1994),Year of the Horse (1997),You So Crazy (1994),Young Frankenstein (1974),Young Guns (1988),Young Guns II (1990),"Young Poisoner's Handbook, The (1995)",Zeus and Roxanne (1997),unknown,Á köldum klaka (Cold Fever) (1994)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,5.0,0.0,0.0,3.0,4.0,0.0,0.0,...,0.0,0.0,0.0,5.0,3.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,2.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,4.0,0.0


In [5]:
# Compute cosine similarity between users
user_similarity = cosine_similarity(user_item_matrix)

# Convert to DataFrame for better visualization
user_similarity_df = pd.DataFrame(user_similarity, 
                                  index=user_item_matrix.index, 
                                  columns=user_item_matrix.index)

user_similarity_df.head()

userId,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.168937,0.048388,0.064561,0.37967,0.429682,0.443097,0.320079,0.078385,0.377733,...,0.372213,0.11986,0.26986,0.193343,0.197949,0.118722,0.315064,0.149086,0.181612,0.399432
2,0.168937,1.0,0.113393,0.179694,0.073623,0.242106,0.108604,0.104257,0.16247,0.161273,...,0.147095,0.310661,0.363328,0.410725,0.322713,0.231096,0.228793,0.162911,0.175273,0.106732
3,0.048388,0.113393,1.0,0.349781,0.021592,0.074018,0.067423,0.084419,0.062039,0.066217,...,0.033885,0.043453,0.16714,0.071288,0.126278,0.026758,0.164539,0.102899,0.136757,0.02699
4,0.064561,0.179694,0.349781,1.0,0.031804,0.068431,0.091507,0.18806,0.101284,0.060859,...,0.054615,0.036784,0.133619,0.196561,0.146058,0.030202,0.196858,0.152041,0.171538,0.058752
5,0.37967,0.073623,0.021592,0.031804,1.0,0.238636,0.374733,0.24893,0.056847,0.201427,...,0.340183,0.08058,0.095284,0.081053,0.148607,0.071612,0.239955,0.139595,0.153799,0.313941


In [6]:
def recommend_movies(user_id, user_item_matrix, user_similarity_df, top_n=5):
    # Find similar users
    similar_users = user_similarity_df[user_id].sort_values(ascending=False)
    similar_users = similar_users.drop(user_id) 
    
    # Take top 5 similar users
    top_users = similar_users.head(5).index
    
    # Movies rated by similar users
    similar_users_ratings = user_item_matrix.loc[top_users]
    
    # Average ratings
    avg_ratings = similar_users_ratings.mean().sort_values(ascending=False)
    
    # Exclude already watched movies
    watched = user_item_matrix.loc[user_id]
    recommendations = avg_ratings[watched == 0].head(top_n)
    
    return recommendations

In [7]:
# Try Recommendations: Recommend for user 1
recommendations = recommend_movies(1, user_item_matrix, user_similarity_df, top_n=5)
print("Top Recommendations for User 1:\n")
print(recommendations)

Top Recommendations for User 1:

title
Dr. Strangelove or: How I Learned to Stop Worrying and Love the Bomb (1963)    4.4
Casablanca (1942)                                                              4.0
Stand by Me (1986)                                                             4.0
Heathers (1989)                                                                4.0
Piano, The (1993)                                                              3.8
dtype: float64


In [8]:
# Evaluate with Precision@K
def precision_at_k(user_id, k=5):
    recommended = recommend_movies(user_id, user_item_matrix, user_similarity_df, top_n=k).index
    
    # actual liked movies (rating >= 4)
    actual = data[(data.userId == user_id) & (data.rating >= 4)]["title"].values
    
    hits = len(set(recommended) & set(actual))
    return hits / k

# Test for user 1
print("Precision@5 for User 1:", precision_at_k(1, k=5))

Precision@5 for User 1: 0.0


#### Bonus 1: Item-Based Collaborative Filtering

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pivot table (user-item matrix)
user_item_matrix = ratings.pivot(index="userId", columns="movieId", values="rating")

# Fill NaNs with 0 for similarity
item_matrix = user_item_matrix.T.fillna(0)

# Compute item similarity
item_similarity = cosine_similarity(item_matrix)

# Convert to DataFrame for easier lookup
item_similarity_df = pd.DataFrame(item_similarity, 
                                  index=item_matrix.index, 
                                  columns=item_matrix.index)

item_similarity_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.402382,0.330245,0.454938,0.286714,0.116344,0.620979,0.481114,0.496288,0.273935,...,0.035387,0.0,0.0,0.0,0.035387,0.0,0.0,0.0,0.047183,0.047183
2,0.402382,1.0,0.273069,0.502571,0.318836,0.083563,0.383403,0.337002,0.255252,0.171082,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.078299,0.078299
3,0.330245,0.273069,1.0,0.324866,0.212957,0.106722,0.372921,0.200794,0.273669,0.158104,...,0.0,0.0,0.0,0.0,0.032292,0.0,0.0,0.0,0.0,0.096875
4,0.454938,0.502571,0.324866,1.0,0.334239,0.090308,0.489283,0.490236,0.419044,0.252561,...,0.0,0.0,0.094022,0.094022,0.037609,0.0,0.0,0.0,0.056413,0.075218
5,0.286714,0.318836,0.212957,0.334239,1.0,0.037299,0.334769,0.259161,0.272448,0.055453,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.094211


#### Bonus 2: Matrix Factorization (SVD)

In [12]:
from sklearn.decomposition import TruncatedSVD

# Pivot table (user-item matrix)
user_item_matrix = ratings.pivot(index="userId", columns="movieId", values="rating").fillna(0)

# Apply Truncated SVD (reduce to 20 latent features)
svd = TruncatedSVD(n_components=20, random_state=42)
matrix_reduced = svd.fit_transform(user_item_matrix)

# Reconstruct the approximated ratings matrix
approx_ratings = svd.inverse_transform(matrix_reduced)

# Convert back to DataFrame
approx_ratings_df = pd.DataFrame(approx_ratings, 
                                 index=user_item_matrix.index, 
                                 columns=user_item_matrix.columns)

approx_ratings_df.head()

movieId,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.228012,2.096937,1.276615,3.139637,0.551456,0.568562,4.419147,2.79892,3.134902,2.193979,...,-0.024103,0.008582,0.01456,0.009707,0.022546,-0.001981,-0.005944,-0.003963,0.031395,0.074819
2,2.024309,-0.008309,0.033837,0.277159,-0.008177,0.341411,1.627962,0.440471,2.541393,0.619027,...,0.000615,-0.021051,-0.008494,-0.005663,-0.00052,0.004875,0.014625,0.00975,-0.004853,-0.02822
3,-0.122395,-0.063842,0.169859,-0.205504,-0.097037,0.016595,-0.293257,-0.073747,-0.435728,0.096075,...,0.004596,-0.010258,0.023943,0.015962,-0.002083,0.011123,0.03337,0.022247,0.003056,0.002247
4,0.44912,-0.178459,0.092678,-0.07323,0.041396,-0.005179,0.338694,-0.103576,-0.099695,-0.169013,...,0.001569,-0.008776,-0.007678,-0.005119,-0.002431,0.005236,0.015708,0.010472,-0.002409,0.000591
5,3.697199,1.322204,0.353221,1.524847,0.507998,-0.143364,2.799689,1.310824,-0.428948,0.384931,...,-0.012855,0.003692,-0.032325,-0.02155,-0.014456,-0.001497,-0.00449,-0.002993,-0.000667,-0.015408
