<a href="https://colab.research.google.com/github/hawa1983/DATA-612/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Purpose**

The purpose of this script is to enrich the MovieLens movie dataset (`movies.dat`) with detailed movie metadata from The Movie Database (TMDB) API. This metadata includes movie overviews, genres, poster and backdrop image URLs, cast and director information, keywords, user ratings, and trailer links. The enriched dataset will serve as the foundation for building content-based, collaborative, and hybrid recommender systems.

### **Methodology**

1. **Load MovieLens Movie Data**
   The script loads the `movies.dat` file, which contains basic movie information including `movieId`, `title`, and `genres`.

2. **Clean Titles and Extract Years**
   It processes the movie titles to remove the year from the title string and separately extracts the release year to improve search accuracy when querying TMDB.

3. **Query TMDB API**
   For each movie, it sends a search request to TMDB using the cleaned title and release year. If a match is found, it retrieves the movie’s TMDB ID.

4. **Retrieve Detailed Metadata**
   Using the TMDB ID, the script fetches:

   * Overview (plot summary)
   * Poster and backdrop image paths
   * Genre IDs, which are then mapped to readable genre names
   * Top 3 cast members
   * Director(s)
   * Associated keywords
   * YouTube trailer link (if available)

5. **Construct and Save Enriched Dataset**
   All metadata is compiled into a structured format and merged with the original MovieLens data. The final dataset is saved as `movies_enriched_full.csv` for downstream use in recommendation models.


In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# ---------------------------------------
# CONFIG
# ---------------------------------------
BASE_URL = "https://api.themoviedb.org/3"
IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

# Use your TMDB Bearer Token (v4)
HEADERS = {
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIyZGZlNjMwMGMzYjIzMjc2NzExNjQ0N2JhNzhiMjM5MyIsIm5iZiI6MTc1MTkyMjA3Ni4xMzUsInN1YiI6IjY4NmMzNTljMzc4NjllOGEyNDUxZTM0OSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.S773ddH3FiIHtokPW4sYpJog0mXWS1o4OPov1KZneUw"
}

# TMDB genre ID to name mapping
GENRE_ID_TO_NAME = {
    28: "Action", 12: "Adventure", 16: "Animation", 35: "Comedy", 80: "Crime",
    99: "Documentary", 18: "Drama", 10751: "Family", 14: "Fantasy", 36: "History",
    27: "Horror", 10402: "Music", 9648: "Mystery", 10749: "Romance", 878: "Science Fiction",
    10770: "TV Movie", 53: "Thriller", 10752: "War", 37: "Western"
}

# ---------------------------------------
# STEP 1: Load MovieLens .dat Files
# ---------------------------------------

# Load movies.dat - format: MovieID::Title::Genres
movies_df = pd.read_csv("movies.dat", sep="::", engine='python', header=None, names=["movieId", "title", "genres"], encoding="latin-1")

# ---------------------------------------
# STEP 2: Clean Movie Titles and Extract Year
# ---------------------------------------

def extract_year(title):
    if "(" in title:
        try:
            return int(title.strip()[-5:-1])
        except:
            return None
    return None

def clean_title(title):
    if "(" in title:
        return title[:title.rfind("(")].strip()
    return title.strip()

movies_df["year"] = movies_df["title"].apply(extract_year)
movies_df["clean_title"] = movies_df["title"].apply(clean_title)

# ---------------------------------------
# STEP 3: TMDB Metadata Functions
# ---------------------------------------

# Search for movie in TMDB
def search_tmdb(title, year):
    url = f"{BASE_URL}/search/movie"
    params = {"query": title, "year": year}
    response = requests.get(url, headers=HEADERS, params=params)
    r = response.json()
    if r.get("results"):
        return r["results"][0]
    return None

# Get full metadata from TMDB
def get_full_tmdb_metadata(tmdb_id):
    metadata = {}

    # Credits (cast, crew)
    credits = requests.get(f"{BASE_URL}/movie/{tmdb_id}/credits", headers=HEADERS).json()
    cast = [c["name"] for c in credits.get("cast", [])[:3]]
    directors = [c["name"] for c in credits.get("crew", []) if c.get("job") == "Director"]

    # Keywords
    keywords = requests.get(f"{BASE_URL}/movie/{tmdb_id}/keywords", headers=HEADERS).json()
    keyword_list = [k["name"] for k in keywords.get("keywords", [])]

    # Videos (trailers)
    videos = requests.get(f"{BASE_URL}/movie/{tmdb_id}/videos", headers=HEADERS).json()
    trailer_links = [
        f"https://www.youtube.com/watch?v={v['key']}"
        for v in videos.get("results", [])
        if v["site"] == "YouTube" and v["type"] == "Trailer"
    ]

    # Final metadata dictionary
    metadata["top_3_cast"] = ", ".join(cast)
    metadata["directors"] = ", ".join(directors)
    metadata["keywords"] = ", ".join(keyword_list)
    metadata["trailer_link"] = trailer_links[0] if trailer_links else None

    return metadata

# ---------------------------------------
# STEP 4: Enrich Movie Data
# ---------------------------------------

enriched = []

for _, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
    movie_data = search_tmdb(row["clean_title"], row["year"])

    if movie_data:
        tmdb_id = movie_data["id"]
        extra = get_full_tmdb_metadata(tmdb_id)

        genre_ids = movie_data.get("genre_ids", [])
        genre_names = [GENRE_ID_TO_NAME.get(gid, str(gid)) for gid in genre_ids]

        enriched.append({
            "tmdb_id": tmdb_id,
            "overview": movie_data.get("overview", ""),
            "poster_path": IMAGE_BASE + movie_data.get("poster_path", "") if movie_data.get("poster_path") else None,
            "backdrop_path": IMAGE_BASE + movie_data.get("backdrop_path", "") if movie_data.get("backdrop_path") else None,
            "vote_average": movie_data.get("vote_average", None),
            "vote_count": movie_data.get("vote_count", None),
            "tmdb_genres": ", ".join(genre_names),
            **extra
        })
    else:
        enriched.append({
            "tmdb_id": None,
            "overview": None,
            "poster_path": None,
            "backdrop_path": None,
            "vote_average": None,
            "vote_count": None,
            "tmdb_genres": None,
            "top_3_cast": None,
            "directors": None,
            "keywords": None,
            "trailer_link": None
        })

    time.sleep(0.25)  # Respect TMDB API rate limits

# ---------------------------------------
# STEP 5: Save Final Dataset
# ---------------------------------------

enriched_df = pd.DataFrame(enriched)
final_df = pd.concat([movies_df, enriched_df], axis=1)
final_df.to_csv("movies_enriched_full.csv", index=False)

print("DONE: Saved as 'movies_enriched_full.csv'")


FileNotFoundError: [Errno 2] No such file or directory: 'movies.dat'

## **Personalized Content-Based Movie Recommendation System**

This Python script implements a **Content-Based Filtering (CBF)** system enhanced with **personalized recommendations** using user-specific rating profiles. Built using the MovieLens 1M dataset and enriched metadata, the pipeline performs vectorization, similarity computation, and profile-based predictions.

**What This Script Does**

* **Module 1–2**: Load essential libraries and enriched movie data.
* **Module 3**: Load user ratings and demographics.
* **Module 4**: Engineer features combining genres, cast, crew, keywords, and movie overviews.
* **Module 5**: Transform content into TF-IDF, Count, or Binary vectors, and compute pairwise similarities using Cosine or Jaccard metrics.
* **Module 6**: Construct a weighted content profile per user based on past ratings.
* **Module 7**: Recommend top-N movies similar to the user profile, excluding already seen titles.

**Techniques Used**

* **Text Vectorization**: TF-IDF, CountVectorizer, Binary Count
* **Similarity Metrics**: Cosine Similarity, Jaccard Similarity
* **Personalization**: Weighted vector averaging based on each user’s rated items
* **Parallelization**: Speeds up Jaccard similarity computation using joblib

**Use Cases**

* Personalized recommendations for new users with a few ratings (cold-start)
* Improving diversity and relevance in suggested movies
* Generating fallback content suggestions in hybrid recommender systems

In [3]:
!pip install -r requirements.txt



### **Personalized Content-Based Movie Recommendation System Using Hybrid Textual Metadata and Multiple Similarity Models**

**Purpose**

The goal of this project is to build a personalized movie recommendation system that leverages content-based filtering techniques using enriched movie metadata. By incorporating user rating data and multiple text-based similarity strategies, the system aims to generate relevant and diverse movie suggestions tailored to individual user preferences—especially in cold-start or sparsely rated scenarios.

**Methodology**

1. **Data Loading & Preparation**

   * Movie metadata is loaded from an enriched dataset containing genres, keywords, cast, director, overview, and release year.
   * User ratings and demographic data are loaded and used to personalize recommendations.

2. **Feature Engineering**

   * A composite text field (`cbf_features`) is created for each movie by concatenating cleaned metadata fields: genres, keywords, cast, director, overview, and year.

3. **Vectorization**

   * Three representations of movie content are generated:

     * **TF-IDF Vectors**: Capture term importance within documents.
     * **Count Vectors**: Raw term frequencies without weighting.
     * **Binary Genre-Like Vectors**: For Jaccard similarity (1 if feature exists).

4. **Similarity Computation**

   * Cosine similarity is computed for TF-IDF and Count vectors.
   * Jaccard similarity is computed for binary vectors using pairwise intersection-over-union.

5. **User Profiling & Recommendation**

   * For **TF-IDF** and **Count** models:

     * A personalized **user profile vector** is created using a weighted average of vectors from rated movies.
     * Recommendations are generated by finding unseen movies most similar to the user’s profile.
   * For the **Binary + Jaccard** model:

     * The average Jaccard similarity is computed between each unseen movie and the user’s seen movies.

6. **Result Generation & Tagging**

   * Top 50 movie recommendations are produced per user for each model.
   * Each output is tagged with the model name: `"TF-IDF + Cosine"`, `"Count + Cosine"`, or `"Binary + Jaccard"`.

7. **Output Consolidation**

   * All recommendation outputs are combined into one labeled DataFrame for comparative analysis and visualization.

In [10]:
# ==============================
# STEP 1: Imports and Configuration
# ==============================
# Purpose: Import all necessary libraries required for data manipulation, text vectorization, similarity computation,
# evaluation, and parallel processing. These are core components for implementing a content-based recommender system.

import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.model_selection import train_test_split
from joblib import Parallel, delayed
from tqdm import tqdm

# ==============================
# STEP 2: Load Movie and User Data
# ==============================
# Purpose: Load movie metadata (e.g., genres, overview, cast, etc.) and user rating data from local CSV/Dat files.
# These datasets are essential for both feature creation (movie side) and personalization (user side).

def load_movie_data(filepath):
    df = pd.read_csv(filepath)
    print(f"Loaded {len(df)} movies.")
    return df

def load_user_data(ratings_path, users_path):
    ratings = pd.read_csv(ratings_path, sep="::", engine="python",
                          names=["userId", "movieId", "rating", "timestamp"])
    users = pd.read_csv(users_path, sep="::", engine="python",
                        names=["userId", "gender", "age", "occupation", "zip"])
    print(f"Loaded {len(ratings)} ratings and {len(users)} users.")
    return ratings, users

# ==============================
# STEP 3: Feature Engineering
# ==============================
# Purpose: Create a single text string (cbf_features) per movie by combining genres, keywords, cast, director,
# overview, and year. Deduplicate genres and clean other fields to improve vectorization quality.

def create_feature_string(df):
    # Extract and clean genre strings from both columns
    genre_list_1 = df['genres'].fillna('').str.replace(r'\s+', '', regex=True).str.split('|')
    genre_list_2 = df['tmdb_genres'].fillna('').str.replace(r'\s+', '', regex=True).str.split(',')

    # Merge and deduplicate genres
    merged_genres = [
        ' '.join(sorted(set((g1 or []) + (g2 or []))))
        for g1, g2 in zip(genre_list_1, genre_list_2)
    ]

    # Clean other textual fields
    def clean(col):
        return df[col].fillna('').astype(str).str.replace(',', ' ').str.replace(r'\s+', ' ', regex=True)

    keywords = clean('keywords')
    cast = clean('top_3_cast') if 'top_3_cast' in df else ''
    directors = clean('directors') if 'directors' in df else ''
    overview = df['overview'].fillna('').astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True)
    year = df['year'].astype(str).fillna('')

    # Concatenate all fields into one text string per movie
    df['cbf_features'] = [
        f"{genres} {kw} {c} {d} {o} {y}".strip()
        for genres, kw, c, d, o, y in zip(merged_genres, keywords, cast, directors, overview, year)
    ]
    return df

# ==============================
# STEP 4: Vectorization and Similarity Computation
# ==============================
# Purpose: Convert text features into numeric vectors using TF-IDF, CountVectorizer, and Binary encoding.
# Then compute cosine and Jaccard similarity matrices to compare movies.

def vectorize_features(feature_series, method='tfidf'):
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(stop_words='english')
    elif method == 'count':
        vectorizer = CountVectorizer(stop_words='english')
    else:
        raise ValueError("Method must be 'tfidf' or 'count'")
    matrix = vectorizer.fit_transform(feature_series)
    return matrix, vectorizer

def binary_vectorize(feature_series):
    token_lists = feature_series.apply(lambda x: x.split())
    mlb = MultiLabelBinarizer(sparse_output=True)
    matrix = mlb.fit_transform(token_lists)
    return matrix, mlb

def compute_cosine_similarity(matrix):
    return cosine_similarity(matrix)

def jaccard_pairwise_parallel(matrix):
    """
    Compute the Jaccard similarity matrix in parallel using matrix operations.
    Args:
        matrix (csr_matrix): A binary sparse matrix (e.g., multi-hot genre vectors)
    Returns:
        ndarray: Pairwise Jaccard similarity matrix
    """
    A = matrix.astype(bool).astype(int)
    intersection = A @ A.T
    row_sums = np.asarray(A.sum(axis=1)).ravel()  # Fix: convert to ndarray
    union = row_sums[:, None] + row_sums[None, :] - intersection.toarray()
    jaccard = intersection.toarray() / np.maximum(union, 1e-10)  # Avoid division by zero
    return jaccard


# ==============================
# STEP 5: Build User Profile
# ==============================
# Purpose: Construct a personalized user profile vector by averaging the vectors of movies they've rated,
# weighted by the user's actual ratings.

def build_user_profile(user_id, ratings, tfidf_matrix, movie_df):
    user_ratings = ratings[ratings['userId'] == user_id]
    rated_movies = movie_df[movie_df['movieId'].isin(user_ratings['movieId'])]
    indices = rated_movies.index.tolist()
    weights = user_ratings.set_index('movieId').loc[rated_movies['movieId']]['rating'].values
    profile = np.average(tfidf_matrix[indices].toarray(), axis=0, weights=weights)
    return profile.reshape(1, -1)

# ==============================
# STEP 6: Recommendation Functions
# ==============================
# Purpose: Recommend movies to users by comparing either user profile to movie features,
# or movies-to-movies using similarity matrices.

def recommend_from_profile(user_id, ratings, tfidf_matrix, movie_df, model_label, top_n=50):
    user_profile = build_user_profile(user_id, ratings, tfidf_matrix, movie_df)
    sims = cosine_similarity(user_profile, tfidf_matrix).flatten()
    user_seen = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    unseen_indices = movie_df[~movie_df['movieId'].isin(user_seen)].index
    top_indices = unseen_indices[np.argsort(sims[unseen_indices])[-top_n:][::-1]]

    return pd.DataFrame({
        'movieId': movie_df.iloc[top_indices]['movieId'].values,
        'title': movie_df.iloc[top_indices]['title'].values,
        'score': sims[top_indices],
        'model': model_label
    })

def recommend_from_similarity_matrix(user_id, ratings, sim_matrix, movie_df, model_label, top_n=50):
    seen_movie_ids = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    seen_indices = movie_df[movie_df['movieId'].isin(seen_movie_ids)].index.tolist()
    unseen_indices = movie_df[~movie_df['movieId'].isin(seen_movie_ids)].index.tolist()

    if not seen_indices:
        print(f"No ratings found for user {user_id}.")
        return pd.DataFrame(columns=['movieId', 'title', 'score', 'model'])

    mean_sims = sim_matrix[unseen_indices][:, seen_indices].mean(axis=1)
    top_indices = np.argsort(mean_sims)[-top_n:][::-1]
    top_movie_indices = np.array(unseen_indices)[top_indices]

    return pd.DataFrame({
        'movieId': movie_df.iloc[top_movie_indices]['movieId'].values,
        'title': movie_df.iloc[top_movie_indices]['title'].values,
        'score': mean_sims[top_indices],
        'model': model_label
    })

# ==============================
# STEP 7: Run Full Pipeline
# ==============================
# Purpose: Execute the end-to-end CBF pipeline: Load data, engineer features, vectorize them,
# compute similarity matrices, and generate personalized movie recommendations.

movie_df = load_movie_data("movies_enriched_full.csv")
ratings, users = load_user_data("ratings.dat", "users.dat")
movie_df.drop(columns=['trailer_link_y', 'backdrop_path_y', 'poster_path_y'], errors='ignore', inplace=True)

# Create the final feature string used for vectorization
movie_df = create_feature_string(movie_df)

# Vectorize features using three different techniques
tfidf_matrix, _ = vectorize_features(movie_df['cbf_features'], method='tfidf')
count_matrix, _ = vectorize_features(movie_df['cbf_features'], method='count')
binary_matrix, _ = binary_vectorize(movie_df['cbf_features'])

# Compute similarity matrices
sim_matrix_tfidf_cosine = compute_cosine_similarity(tfidf_matrix)
sim_matrix_count_cosine = compute_cosine_similarity(count_matrix)
sim_matrix_binary_jaccard = jaccard_pairwise_parallel(binary_matrix)

# Generate recommendations for a sample user
user_id = 5549
df_tfidf_cosine = recommend_from_profile(user_id, ratings, tfidf_matrix, movie_df, "TF-IDF + Cosine")
df_count_cosine = recommend_from_profile(user_id, ratings, count_matrix, movie_df, "Count + Cosine")
df_binary_jaccard = recommend_from_similarity_matrix(user_id, ratings, sim_matrix_binary_jaccard, movie_df, "Binary + Jaccard")

# Combine all model recommendations for analysis or output
all_recommendations = pd.concat([df_tfidf_cosine, df_count_cosine, df_binary_jaccard], ignore_index=True)

# Preview top recommendations
print("Final Combined Recommendations:")
print(all_recommendations.head())


Loaded 3883 movies.
Loaded 1000209 ratings and 6040 users.
Final Combined Recommendations:
   movieId                                                     title  \
0     1221                            Godfather: Part II, The (1974)   
1      293  Professional, The (a.k.a. Leon: The Professional) (1994)   
2     3540                                    Passion of Mind (1999)   
3     3568                      Smiling Fish and Goat on Fire (1999)   
4     3907                        Prince of Central Park, The (1999)   

      score            model  
0  0.246621  TF-IDF + Cosine  
1  0.226811  TF-IDF + Cosine  
2  0.203520  TF-IDF + Cosine  
3  0.191359  TF-IDF + Cosine  
4  0.191359  TF-IDF + Cosine  


### **Memory-based collaborative filtering module (UBCF, IBCF)**

***Purpose:***

This module implements **memory-based collaborative filtering** using **user-user** or **item-item** similarity. It addresses **user bias** by normalizing ratings through mean-centering and optionally **rescaling predictions** to the original rating scale for interpretability.

***Methodology:***

1. **Rating Matrix Construction**:

   * A user-item matrix is built from raw MovieLens-style ratings data.
   * For `kind='user'`, ratings are mean-centered per user to reduce bias from lenient or strict raters.
   * For `kind='item'`, raw ratings are used directly (no normalization), as the algorithm focuses on item similarities based on a single user's input.

2. **Similarity Computation**:

   * Cosine similarity is computed either:

     * **Across users** for user-based CF (`kind='user'`)
     * **Across items** for item-based CF (`kind='item'`)
   * `sklearn.metrics.pairwise_distances` is used to derive similarity as `1 - cosine_distance`.

3. **Prediction Generation**:

   * For **user-based CF**:

     * Ratings from similar users are weighted by similarity and averaged.
     * The user’s mean rating is **added back** to restore predictions to the original scale (e.g., 1–5).
   * For **item-based CF**:

     * A user’s own ratings are used to compute scores for similar items.
     * No mean is added back, since predictions are already on the correct scale.

4. **Top-N Recommendations**:

   * The system filters out movies the user has already rated.
   * It ranks unseen movies by predicted score and returns the top-N recommendations.
   * Each recommendation is labeled with the model type (`User-Based CF` or `Item-Based CF`) for downstream tracking.

In [5]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# === STEP 1: Load Ratings ===
# Purpose: Import user-movie ratings dataset and split it into training and test sets.
# This allows us to train the model on one portion and evaluate on unseen data.
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# === STEP 2: Create Bias-Adjusted User-Item Matrix ===
# Purpose: Create a user-item ratings matrix and adjust for biases by removing the global mean,
# user bias (tendency to rate high/low), and item bias (popularity effects).
def create_bias_adjusted_matrix(ratings_df):
    matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
    global_mean = ratings_df['rating'].mean()
    user_bias = matrix.sub(global_mean, axis=0).mean(axis=1)
    item_bias = matrix.sub(global_mean, axis=0).sub(user_bias, axis=0).mean(axis=0)
    adjusted = matrix.sub(global_mean).sub(user_bias, axis=0).sub(item_bias, axis=1)
    return adjusted.fillna(0), global_mean, user_bias, item_bias

user_item_matrix, global_mean, user_bias, item_bias = create_bias_adjusted_matrix(train_ratings)

# === STEP 3: Compute Similarity Matrices ===
# Purpose: Measure similarity between users or items using cosine similarity on the bias-adjusted matrix.
# These similarity scores will later help generate personalized recommendations.
user_sim_matrix = cosine_similarity(user_item_matrix)
item_sim_matrix = cosine_similarity(user_item_matrix.T)

# === STEP 4: Recommender Function (Top-N or Full Scores) ===
# Purpose: Generate movie recommendations using memory-based collaborative filtering.
# Predict scores for unseen items using either user-based or item-based similarity and adjust with biases.
def recommend_memory_based(user_id, matrix, global_mean, user_bias, item_bias, sim_matrix, kind='user', top_n=50, return_full=False):
    if kind == 'user':
        sim_scores = sim_matrix[matrix.index.get_loc(user_id)]
        weighted = sim_scores @ matrix.values
        norm = np.abs(sim_scores).sum()
        preds = weighted / norm if norm != 0 else np.zeros_like(weighted)
        preds += global_mean + user_bias.loc[user_id]
    else:
        user_vector = matrix.loc[user_id]
        weighted = user_vector @ sim_matrix
        norm = (user_vector != 0) @ np.abs(sim_matrix)
        with np.errstate(divide='ignore', invalid='ignore'):
            preds = np.true_divide(weighted, norm)
            preds[norm == 0] = 0
        preds += global_mean + user_bias.loc[user_id] + item_bias.values

    preds = np.clip(preds, 1.0, 5.0)
    pred_series = pd.Series(preds, index=matrix.columns)
    seen = train_ratings[train_ratings['userId'] == user_id]['movieId'].tolist()
    pred_series = pred_series.drop(labels=seen, errors='ignore')

    if return_full:
        return pred_series
    else:
        top_preds = pred_series.sort_values(ascending=False).head(top_n)
        return pd.DataFrame({
            'userId': user_id,
            'movieId': top_preds.index,
            'score': top_preds.values
        })

# === STEP 5: Evaluation Function ===
# Purpose: Evaluate model performance using RMSE by comparing predicted scores to actual ratings
# in the test set for multiple users. Measures how accurate the recommender is overall.
def evaluate_model(test_df, matrix, global_mean, user_bias, item_bias, sim_matrix, kind='user'):
    all_preds = []
    for uid in test_df['userId'].unique():
        if uid not in matrix.index:
            continue
        recs = recommend_memory_based(uid, matrix, global_mean, user_bias, item_bias, sim_matrix, kind, top_n=1000, return_full=True)
        actual = test_df[test_df['userId'] == uid]
        merged = pd.merge(actual, recs.rename("score"), on="movieId")
        all_preds.append(merged)

    all_preds_df = pd.concat(all_preds, ignore_index=True)
    rmse = np.sqrt(mean_squared_error(all_preds_df['rating'], all_preds_df['score'])) if not all_preds_df.empty else np.nan
    return rmse

# === STEP 6: Run Evaluation ===
# Purpose: Calculate RMSE for user-based CF, item-based CF, and a dummy predictor
# that always predicts the global mean rating.
user_rmse = evaluate_model(test_ratings, user_item_matrix, global_mean, user_bias, item_bias, user_sim_matrix, 'user')
item_rmse = evaluate_model(test_ratings, user_item_matrix, global_mean, user_bias, item_bias, item_sim_matrix, 'item')
dummy_rmse = np.sqrt(mean_squared_error(test_ratings['rating'], [global_mean] * len(test_ratings)))

print(f"User-Based CF RMSE: {user_rmse:.4f}")
print(f"Item-Based CF RMSE: {item_rmse:.4f}")
print(f"Dummy Predictor RMSE: {dummy_rmse:.4f}")

# === STEP 7: Get Recommendations for a Specific User (Optional) ===
# Purpose: Generate and display the top-N recommended movies for a target user using both models.
# This is useful for presenting personalized suggestions.
user_id = 5549
movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
user_recs = recommend_memory_based(user_id, user_item_matrix, global_mean, user_bias, item_bias, user_sim_matrix, 'user', top_n=50)
item_recs = recommend_memory_based(user_id, user_item_matrix, global_mean, user_bias, item_bias, item_sim_matrix, 'item', top_n=50)

user_recs = user_recs.merge(movies, on='movieId', how='left')
item_recs = item_recs.merge(movies, on='movieId', how='left')

print("\nTop 10 User-Based CF Recommendations:")
print(user_recs[['movieId', 'title', 'score']])

print("\nTop 10 Item-Based CF Recommendations:")
print(item_recs[['movieId', 'title', 'score']])


User-Based CF RMSE: 1.0336
Item-Based CF RMSE: 0.8796
Dummy Predictor RMSE: 1.1197

Top 10 User-Based CF Recommendations:
    movieId                                          title     score
0      2701                          Wild Wild West (1999)  3.608871
1      1917                              Armageddon (1998)  3.608796
2      1721                                 Titanic (1997)  3.607812
3      3753                            Patriot, The (2000)  3.605514
4      2881                         Double Jeopardy (1999)  3.604429
5      2722                           Deep Blue Sea (1999)  3.602775
6       736                                 Twister (1996)  3.602065
7      3113                             End of Days (1999)  3.601182
8       780                  Independence Day (ID4) (1996)  3.600492
9      1101                                 Top Gun (1986)  3.600417
10     2724                           Runaway Bride (1999)  3.600414
11     2605                              Entrapmen

### Hybrid CBF and UBCF Model

In [27]:
# ==============================
# Hybrid Recommender: UBCF + CBF
# ==============================

def hybrid_ubcf_cbf(user_id, user_item_matrix, user_means, user_sim_matrix,
                    tfidf_matrix, ratings, movie_df,
                    w_cf=0.2, w_cbf=0.8, top_n=50):
    """
    Combine UBCF and CBF scores via weighted average.

    Parameters:
    - user_id: int
    - user_item_matrix: pd.DataFrame (mean-centered matrix)
    - user_means: pd.Series
    - user_sim_matrix: np.array
    - tfidf_matrix: sparse matrix from TF-IDF
    - ratings: pd.DataFrame
    - movie_df: pd.DataFrame with movieId, title
    - w_cf: float, weight for UBCF
    - w_cbf: float, weight for CBF
    - top_n: int

    Returns:
    - pd.DataFrame with movieId, title, hybrid_score, model
    """
    # --- UBCF predictions ---
    ubcf_df = recommend_memory_based(
        user_id=user_id,
        user_item_matrix=user_item_matrix,
        user_means=user_means,
        similarity_matrix=user_sim_matrix,
        kind='user',
        top_n=1000  # keep more to allow intersection
    )

    # --- CBF predictions ---
    cbf_df = recommend_from_profile(
        user_id=user_id,
        ratings=ratings,
        tfidf_matrix=tfidf_matrix,
        movie_df=movie_df,
        model_label='CBF',  # temporary label
        top_n=1000
    ).rename(columns={'score': 'cbf_score'})

    # --- Merge ---
    ubcf_df = ubcf_df.rename(columns={'score': 'ubcf_score'})
    merged = pd.merge(ubcf_df, cbf_df, on='movieId')

    # --- Combine Scores ---
    merged['hybrid_score'] = w_cf * merged['ubcf_score'] + w_cbf * merged['cbf_score']
    hybrid_df = merged[['movieId', 'title', 'hybrid_score']].copy()
    hybrid_df['model'] = 'Hybrid (UBCF + CBF)'

    return hybrid_df.sort_values(by='hybrid_score', ascending=False).head(top_n)[
        ['movieId', 'title', 'hybrid_score', 'model']
    ]

# ==============================
# Generate Hybrid Recommendations for User 5549
# ==============================

hybrid_recs = hybrid_ubcf_cbf(
    user_id=5549,
    user_item_matrix=user_item_matrix,
    user_means=user_means,
    user_sim_matrix=user_sim_matrix,
    tfidf_matrix=tfidf_matrix_tfidf,
    ratings=ratings,
    movie_df=movie_df,
    w_cf=0.5,
    w_cbf=0.5,
    top_n=50
)

# ==============================
# Display Output
# ==============================

print("\nTop 50 Hybrid Recommendations for User 5549:")
print(hybrid_recs.head())



Top 50 Hybrid Recommendations for User 5549:
     movieId                              title  hybrid_score  \
0       1221     Godfather: Part II, The (1974)      1.894670   
192     3457             Waking the Dead (1999)      1.866184   
155     3177                 Next Friday (1999)      1.845677   
215     3721                      Trixie (1999)      1.844950   
37      2995  House on Haunted Hill, The (1999)      1.831576   

                   model  
0    Hybrid (UBCF + CBF)  
192  Hybrid (UBCF + CBF)  
155  Hybrid (UBCF + CBF)  
215  Hybrid (UBCF + CBF)  
37   Hybrid (UBCF + CBF)  


## **Model-Based Filtering:**

  * *SVD (Surprise)*: Learns latent features from the rating matrix.
  * *ALS (PySpark)*: Scalable factorization method for large datasets.


### **Module 9: Model-Based Collaborative Filtering (SVD using Surprise)**

**Purpose:**
Use matrix factorization (SVD) to learn latent user/item features from the rating matrix.

**Application:**
- Accurate, scalable recommendations for sparse datasets using user/item embeddings.
- Suitable for small to medium datasets.
- Optimized via `GridSearchCV` for hyperparameter tuning.
- Good interpretability of latent factors per user and item.



In [13]:
# ==============================
# Module 9: Model-Based Collaborative Filtering (SVD using Surprise)
# ==============================

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd
from tqdm import tqdm

# ==============================
# Prepare Surprise Dataset
# ==============================

def prepare_surprise_data(ratings):
    reader = Reader(rating_scale=(0.5, 5.0))
    return Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# ==============================
# Tune SVD Model with Grid Search
# ==============================

def tune_svd_model(data):
    param_grid = {
        'n_factors': [50, 100],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02, 0.1]
    }
    print("Tuning SVD model with GridSearchCV...")
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=0)

    with tqdm(total=1, desc="GridSearchCV") as pbar:
        gs.fit(data)
        pbar.update(1)

    print(f"Best RMSE: {gs.best_score['rmse']} with params: {gs.best_params['rmse']}")
    return gs.best_estimator['rmse']

# ==============================
# Train and Evaluate SVD
# ==============================

def evaluate_svd(model, data, model_label='SVD (Surprise)'):
    trainset, testset = train_test_split(data, test_size=0.2)
    model.fit(trainset)

    print("Making predictions...")
    predictions = [model.predict(item[0], item[1], r_ui=item[2]) for item in tqdm(testset, desc="Predicting")]

    score = rmse(predictions)

    pred_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
    pred_df = pred_df.rename(columns={'uid': 'userId', 'iid': 'movieId', 'rui': 'true_rating', 'est': 'pred_rating'})
    pred_df['model'] = model_label
    return pred_df[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']], score

# ==============================
# Main Execution
# ==============================

# Step 1: Load ratings
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

# Step 2: Prepare Surprise data
data = prepare_surprise_data(ratings)

# Step 3: Tune model
best_svd_model = tune_svd_model(data)

# Step 4: Evaluate model
pred_df, rmse_score = evaluate_svd(best_svd_model, data)

# Step 5: Output
print(pred_df.head())
print(f"Final RMSE: {rmse_score:.4f}")

# Step 6: Top-50 Predictions for User 5549
target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = [mid for mid in all_movie_ids if mid not in rated_movie_ids]

print(f"\nGenerating predictions for User {target_user}...")
top_preds = [(movie_id, best_svd_model.predict(target_user, movie_id).est)
             for movie_id in tqdm(unrated_movie_ids, desc="Predicting for user")]

top_50_df = pd.DataFrame(top_preds, columns=['movieId', 'pred_rating'])
top_50_df = top_50_df.sort_values(by='pred_rating', ascending=False).head(50)
top_50_df['userId'] = target_user
top_50_df['model'] = 'SVD (Surprise)'
top_50_df = top_50_df[['userId', 'movieId', 'pred_rating', 'model']]

# Step 7: Merge with movie titles only
movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_df = top_50_df.merge(movies, on='movieId', how='left')

# Step 8: Final Output
print("\nTop 50 Recommendations for User 5549:")
print(top_50_df[['movieId', 'title', 'pred_rating']].head(10))


Tuning SVD model with GridSearchCV...


GridSearchCV: 100%|██████████| 1/1 [06:38<00:00, 398.79s/it]


Best RMSE: 0.8820577922491172 with params: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.02}
Making predictions...


Predicting: 100%|██████████| 200042/200042 [00:02<00:00, 95750.70it/s]


RMSE: 0.8701
   userId  movieId  true_rating  pred_rating           model
0    1470     2873          1.0     2.022161  SVD (Surprise)
1    1974     3201          4.0     4.217697  SVD (Surprise)
2    2825     2384          5.0     3.837053  SVD (Surprise)
3     462     2640          3.0     3.146119  SVD (Surprise)
4    1937      858          5.0     4.142713  SVD (Surprise)
Final RMSE: 0.8701

Generating predictions for User 5549...


Predicting for user: 100%|██████████| 3673/3673 [00:00<00:00, 173473.70it/s]


Top 50 Recommendations for User 5549:
   movieId  \
0      911   
1     2905   
2     1262   
3     1207   
4     2019   
5      920   
6     1272   
7      913   
8     3338   
9      318   

                                                                 title  \
0                                                       Charade (1963)   
1                                                       Sanjuro (1962)   
2                                             Great Escape, The (1963)   
3                                         To Kill a Mockingbird (1962)   
4  Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)   
5                                            Gone with the Wind (1939)   
6                                                        Patton (1970)   
7                                           Maltese Falcon, The (1941)   
8                                               For All Mankind (1989)   
9                                     Shawshank Redemption, The (1




### **Model-Based Collaborative Filtering (ALS using PySpark)**

**Purpose:**
Use Alternating Least Squares (ALS) to learn latent user/item features at scale.

**Application:**
- Distributed recommendation system for large-scale datasets.
- Runs on Apache Spark for horizontal scalability.
- Handles sparsity well using factorization.
- Suited for real-time, production-level systems with massive data.


In [14]:
# ==============================
# Module 10: Model-Based Collaborative Filtering (ALS using PySpark)
# ==============================

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import Row
import pandas as pd

# --- Start Spark Session ---
spark = SparkSession.builder \
    .appName("ALSModel") \
    .getOrCreate()

# --- Load Ratings ---
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])
ratings_df = spark.createDataFrame(ratings[['userId', 'movieId', 'rating']])

# --- Train ALS Model ---
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    rank=10, maxIter=10, regParam=0.1,
    coldStartStrategy="drop", nonnegative=True
)
als_model = als.fit(ratings_df)

# --- Evaluate ALS Model ---
predictions = als_model.transform(ratings_df)
pred_pd = predictions.select('userId', 'movieId', 'rating', 'prediction').toPandas()
pred_pd = pred_pd.rename(columns={'rating': 'true_rating', 'prediction': 'pred_rating'})
pred_pd['model'] = 'ALS (PySpark)'

# --- Evaluate ALS RMSE ---
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction'
)
rmse_score = evaluator.evaluate(predictions)

# --- Output Evaluation ---
print(pred_pd[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']].head())
print(f"\nFinal RMSE: {rmse_score:.4f}")

# ==============================
# Step 6: Top-50 Predictions for User 5549
# ==============================

target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))

# Create Spark DataFrame of userId + unrated movieId pairs
user_unrated_pairs = spark.createDataFrame([Row(userId=target_user, movieId=int(mid)) for mid in unrated_movie_ids])

# Predict ratings using ALS model
print(f"\nGenerating Top-50 recommendations for User {target_user}...")
top_preds_df = als_model.transform(user_unrated_pairs).dropna()

# Get top-50 highest predicted ratings
top_50_preds = top_preds_df.orderBy(col("prediction").desc()).limit(50)
top_50_pd = top_50_preds.select("userId", "movieId", "prediction").toPandas()
top_50_pd['model'] = "ALS (PySpark)"
top_50_pd = top_50_pd.rename(columns={'prediction': 'pred_rating'})

# ==============================
# Step 7: Merge with Movie Titles Only
# ==============================

movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_pd = top_50_pd.merge(movies, on='movieId', how='left')

# ==============================
# Step 8: Output Top-50
# ==============================

print("\nTop 50 Recommendations for User 5549:")
print(top_50_pd[['movieId', 'title', 'pred_rating']].head(10))


   userId  movieId  true_rating  pred_rating          model
0     148     2122            4     2.768286  ALS (PySpark)
1     148     2142            4     3.385553  ALS (PySpark)
2     148     2366            5     3.547921  ALS (PySpark)
3     148     3175            5     3.873491  ALS (PySpark)
4     148     1580            4     4.024727  ALS (PySpark)

Final RMSE: 0.8357

Generating Top-50 recommendations for User 5549...

Top 50 Recommendations for User 5549:
   movieId                            title  pred_rating
0      572           Foreign Student (1994)     5.054198
1     1471               Boys Life 2 (1997)     4.919178
2     2760  Gambler, The (A Játékos) (1997)     4.464029
3      953     It's a Wonderful Life (1946)     4.359691
4     1519            Broken English (1996)     4.351626
5     2503          Apple, The (Sib) (1998)     4.329711
6     2129     Saltmen of Tibet, The (1997)     4.324127
7      912                Casablanca (1942)     4.317692
8      751      