In [1]:
# =========================================
# HYBRID RECOMMENDER SYSTEM LAB
# Combines Collaborative Filtering + Content-Based
# Dataset: MovieLens 100k (subset)
# =========================================
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import math


In [2]:
# ------------------------------------------
# 1. Load Datasets
# ------------------------------------------
print("Downloading MovieLens data...")
ratings_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.data"
movies_url = "https://files.grouplens.org/datasets/movielens/ml-100k/u.item"
# Ratings
ratings = pd.read_csv(ratings_url, sep="\t", names=["user_id","movie_id","rating","timestamp"])
ratings = ratings.iloc[:2000, [0, 1, 2]] # first 2000 rows for speed
# Movies metadata
columns = ["movie_id","title","release_date","video_release","IMDb_URL",
 "unknown","Action","Adventure","Animation","Children","Comedy","Crime",
 "Documentary","Drama","Fantasy","Film-Noir","Horror","Musical","Mystery",
 "Romance","Sci-Fi","Thriller","War","Western"]
movies = pd.read_csv(movies_url, sep="|", names=columns, encoding="latin-1")
movies = movies.iloc[:500] # first 500 movies
genre_cols = columns[5:]
movies["genres"] = movies[genre_cols].apply(lambda x: " ".join([g for g,v in zip(genre_cols,x) if v==1]), axis=1)
print("\nSample Ratings:")
print(ratings.head())
print("\nSample Movies:")
print(movies[["movie_id","title","genres"]].head())
# Merge ratings and movies for filtering
ratings = ratings[ratings.movie_id.isin(movies.movie_id)]

Downloading MovieLens data...

Sample Ratings:
   user_id  movie_id  rating
0      196       242       3
1      186       302       3
2       22       377       1
3      244        51       2
4      166       346       1

Sample Movies:
   movie_id              title                     genres
0         1   Toy Story (1995)  Animation Children Comedy
1         2   GoldenEye (1995)  Action Adventure Thriller
2         3  Four Rooms (1995)                   Thriller
3         4  Get Shorty (1995)        Action Comedy Drama
4         5     Copycat (1995)       Crime Drama Thriller


In [4]:
# ------------------------------------------
# 2. Content-Based Similarity (TF-IDF on Genres)
# ------------------------------------------
tfidf = TfidfVectorizer(stop_words="english")
tfidf_matrix = tfidf.fit_transform(movies["genres"])
content_sim = cosine_similarity(tfidf_matrix, tfidf_matrix)
content_sim_df = pd.DataFrame(content_sim, index=movies.movie_id, columns=movies.movie_id)
# ------------------------------------------
# 3. Collaborative Filtering (Item-Item CF)
# ------------------------------------------
train_df, test_df = train_test_split(ratings, test_size=0.2, random_state=42)
def build_matrix(ratings):
 return ratings.pivot_table(index="user_id", columns="movie_id", values="rating")
ui_train = build_matrix(train_df)
ui_test = build_matrix(test_df)
users = sorted(ratings.user_id.unique())
items = sorted(movies.movie_id.unique())
ui_train = ui_train.reindex(index=users, columns=items)
ui_test = ui_test.reindex(index=users, columns=items)
item_centered = ui_train.sub(ui_train.mean(axis=0), axis=1)
item_sim = cosine_similarity(item_centered.fillna(0).T)
item_sim_df = pd.DataFrame(item_sim, index=items, columns=items)
# Collaborative prediction
def predict_cf(ui, item_sim, k=5):
    preds = ui.copy()
    for u in ui.index:
        for i in ui.columns:
            if not math.isnan(ui.loc[u, i]):
                continue
            rated_items = ui.loc[u][~ui.loc[u].isna()].index
            if len(rated_items) == 0:
                preds.loc[u, i] = ui.mean().mean()
                continue
            sims = item_sim.loc[i, rated_items].sort_values(ascending=False)
            topk = sims.head(k)
            num = sum(sim * ui.loc[u, itm] for itm, sim in topk.items())
            den = sum(abs(sim) for sim in topk)
            preds.loc[u, i] = num / den if den else ui.mean().mean()
    return preds

preds_cf = predict_cf(ui_train, item_sim_df, k=5)

In [5]:
# ------------------------------------------
# 4. Hybrid Score = α * CF + (1-α) * Content
# ------------------------------------------
def hybrid_score(user_id, alpha=0.7, top_n=5):
    if user_id not in ui_train.index:
        return []

    rated_items = ui_train.loc[user_id][~ui_train.loc[user_id].isna()].index
    scores = {}
    for i in ui_train.columns:
        if not math.isnan(ui_train.loc[user_id, i]):
           continue
        cf_score = preds_cf.loc[user_id, i]
        content_score = 0
        for r in rated_items:
            content_score += content_sim_df.loc[i, r]
        content_score = content_score / len(rated_items) if len(rated_items) else 0
        scores[i] = alpha * cf_score + (1 - alpha) * content_score

    top_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    return movies[movies.movie_id.isin([i[0] for i in top_items])][["title","genres"]]


In [11]:
# ------------------------------------------
# 5. Test Hybrid Recommendations
# ------------------------------------------
print("\nHybrid Recommendations for User 8:")
print(hybrid_score(8, alpha=0.7, top_n=5))


Hybrid Recommendations for User 8:
                                     title                            genres
8                  Dead Man Walking (1995)                             Drama
233                            Jaws (1975)                     Action Horror
236                   Jerry Maguire (1996)                     Drama Romance
251  Lost World: Jurassic Park, The (1997)  Action Adventure Sci-Fi Thriller
477         Philadelphia Story, The (1940)                    Comedy Romance
