<a href="https://colab.research.google.com/github/hawa1983/DATA-612/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Purpose**

The purpose of this script is to enrich the MovieLens movie dataset (`movies.dat`) with detailed movie metadata from The Movie Database (TMDB) API. This metadata includes movie overviews, genres, poster and backdrop image URLs, cast and director information, keywords, user ratings, and trailer links. The enriched dataset will serve as the foundation for building content-based, collaborative, and hybrid recommender systems.

### **Methodology**

1. **Load MovieLens Movie Data**
   The script loads the `movies.dat` file, which contains basic movie information including `movieId`, `title`, and `genres`.

2. **Clean Titles and Extract Years**
   It processes the movie titles to remove the year from the title string and separately extracts the release year to improve search accuracy when querying TMDB.

3. **Query TMDB API**
   For each movie, it sends a search request to TMDB using the cleaned title and release year. If a match is found, it retrieves the movie’s TMDB ID.

4. **Retrieve Detailed Metadata**
   Using the TMDB ID, the script fetches:

   * Overview (plot summary)
   * Poster and backdrop image paths
   * Genre IDs, which are then mapped to readable genre names
   * Top 3 cast members
   * Director(s)
   * Associated keywords
   * YouTube trailer link (if available)

5. **Construct and Save Enriched Dataset**
   All metadata is compiled into a structured format and merged with the original MovieLens data. The final dataset is saved as `movies_enriched_full.csv` for downstream use in recommendation models.


In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# ---------------------------------------
# CONFIG
# ---------------------------------------
BASE_URL = "https://api.themoviedb.org/3"
IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

# Use your TMDB Bearer Token (v4)
HEADERS = {
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIyZGZlNjMwMGMzYjIzMjc2NzExNjQ0N2JhNzhiMjM5MyIsIm5iZiI6MTc1MTkyMjA3Ni4xMzUsInN1YiI6IjY4NmMzNTljMzc4NjllOGEyNDUxZTM0OSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.S773ddH3FiIHtokPW4sYpJog0mXWS1o4OPov1KZneUw"
}

# TMDB genre ID to name mapping
GENRE_ID_TO_NAME = {
    28: "Action", 12: "Adventure", 16: "Animation", 35: "Comedy", 80: "Crime",
    99: "Documentary", 18: "Drama", 10751: "Family", 14: "Fantasy", 36: "History",
    27: "Horror", 10402: "Music", 9648: "Mystery", 10749: "Romance", 878: "Science Fiction",
    10770: "TV Movie", 53: "Thriller", 10752: "War", 37: "Western"
}

# ---------------------------------------
# STEP 1: Load MovieLens .dat Files
# ---------------------------------------

# Load movies.dat - format: MovieID::Title::Genres
movies_df = pd.read_csv("movies.dat", sep="::", engine='python', header=None, names=["movieId", "title", "genres"], encoding="latin-1")

# ---------------------------------------
# STEP 2: Clean Movie Titles and Extract Year
# ---------------------------------------

def extract_year(title):
    if "(" in title:
        try:
            return int(title.strip()[-5:-1])
        except:
            return None
    return None

def clean_title(title):
    if "(" in title:
        return title[:title.rfind("(")].strip()
    return title.strip()

movies_df["year"] = movies_df["title"].apply(extract_year)
movies_df["clean_title"] = movies_df["title"].apply(clean_title)

# ---------------------------------------
# STEP 3: TMDB Metadata Functions
# ---------------------------------------

# Search for movie in TMDB
def search_tmdb(title, year):
    url = f"{BASE_URL}/search/movie"
    params = {"query": title, "year": year}
    response = requests.get(url, headers=HEADERS, params=params)
    r = response.json()
    if r.get("results"):
        return r["results"][0]
    return None

# Get full metadata from TMDB
def get_full_tmdb_metadata(tmdb_id):
    metadata = {}

    # Credits (cast, crew)
    credits = requests.get(f"{BASE_URL}/movie/{tmdb_id}/credits", headers=HEADERS).json()
    cast = [c["name"] for c in credits.get("cast", [])[:3]]
    directors = [c["name"] for c in credits.get("crew", []) if c.get("job") == "Director"]

    # Keywords
    keywords = requests.get(f"{BASE_URL}/movie/{tmdb_id}/keywords", headers=HEADERS).json()
    keyword_list = [k["name"] for k in keywords.get("keywords", [])]

    # Videos (trailers)
    videos = requests.get(f"{BASE_URL}/movie/{tmdb_id}/videos", headers=HEADERS).json()
    trailer_links = [
        f"https://www.youtube.com/watch?v={v['key']}"
        for v in videos.get("results", [])
        if v["site"] == "YouTube" and v["type"] == "Trailer"
    ]

    # Final metadata dictionary
    metadata["top_3_cast"] = ", ".join(cast)
    metadata["directors"] = ", ".join(directors)
    metadata["keywords"] = ", ".join(keyword_list)
    metadata["trailer_link"] = trailer_links[0] if trailer_links else None

    return metadata

# ---------------------------------------
# STEP 4: Enrich Movie Data
# ---------------------------------------

enriched = []

for _, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
    movie_data = search_tmdb(row["clean_title"], row["year"])

    if movie_data:
        tmdb_id = movie_data["id"]
        extra = get_full_tmdb_metadata(tmdb_id)

        genre_ids = movie_data.get("genre_ids", [])
        genre_names = [GENRE_ID_TO_NAME.get(gid, str(gid)) for gid in genre_ids]

        enriched.append({
            "tmdb_id": tmdb_id,
            "overview": movie_data.get("overview", ""),
            "poster_path": IMAGE_BASE + movie_data.get("poster_path", "") if movie_data.get("poster_path") else None,
            "backdrop_path": IMAGE_BASE + movie_data.get("backdrop_path", "") if movie_data.get("backdrop_path") else None,
            "vote_average": movie_data.get("vote_average", None),
            "vote_count": movie_data.get("vote_count", None),
            "tmdb_genres": ", ".join(genre_names),
            **extra
        })
    else:
        enriched.append({
            "tmdb_id": None,
            "overview": None,
            "poster_path": None,
            "backdrop_path": None,
            "vote_average": None,
            "vote_count": None,
            "tmdb_genres": None,
            "top_3_cast": None,
            "directors": None,
            "keywords": None,
            "trailer_link": None
        })

    time.sleep(0.25)  # Respect TMDB API rate limits

# ---------------------------------------
# STEP 5: Save Final Dataset
# ---------------------------------------

enriched_df = pd.DataFrame(enriched)
final_df = pd.concat([movies_df, enriched_df], axis=1)
final_df.to_csv("movies_enriched_full.csv", index=False)

print("DONE: Saved as 'movies_enriched_full.csv'")


## **Personalized Content-Based Movie Recommendation System**

This Python script implements a **Content-Based Filtering (CBF)** system enhanced with **personalized recommendations** using user-specific rating profiles. Built using the MovieLens 1M dataset and enriched metadata, the pipeline performs vectorization, similarity computation, and profile-based predictions.

**What This Script Does**

* **Module 1–2**: Load essential libraries and enriched movie data.
* **Module 3**: Load user ratings and demographics.
* **Module 4**: Engineer features combining genres, cast, crew, keywords, and movie overviews.
* **Module 5**: Transform content into TF-IDF, Count, or Binary vectors, and compute pairwise similarities using Cosine or Jaccard metrics.
* **Module 6**: Construct a weighted content profile per user based on past ratings.
* **Module 7**: Recommend top-N movies similar to the user profile, excluding already seen titles.

**Techniques Used**

* **Text Vectorization**: TF-IDF, CountVectorizer, Binary Count
* **Similarity Metrics**: Cosine Similarity, Jaccard Similarity
* **Personalization**: Weighted vector averaging based on each user’s rated items
* **Parallelization**: Speeds up Jaccard similarity computation using joblib

**Use Cases**

* Personalized recommendations for new users with a few ratings (cold-start)
* Improving diversity and relevance in suggested movies
* Generating fallback content suggestions in hybrid recommender systems

In [2]:
!pip install -r requirements.txt



### **Personalized Content-Based Movie Recommendation System Using Hybrid Textual Metadata and Multiple Similarity Models**

**Purpose**

The goal of this project is to build a personalized movie recommendation system that leverages content-based filtering techniques using enriched movie metadata. By incorporating user rating data and multiple text-based similarity strategies, the system aims to generate relevant and diverse movie suggestions tailored to individual user preferences—especially in cold-start or sparsely rated scenarios.

**Methodology**

1. **Data Loading & Preparation**

   * Movie metadata is loaded from an enriched dataset containing genres, keywords, cast, director, overview, and release year.
   * User ratings and demographic data are loaded and used to personalize recommendations.

2. **Feature Engineering**

   * A composite text field (`cbf_features`) is created for each movie by concatenating cleaned metadata fields: genres, keywords, cast, director, overview, and year.

3. **Vectorization**

   * Three representations of movie content are generated:

     * **TF-IDF Vectors**: Capture term importance within documents.
     * **Count Vectors**: Raw term frequencies without weighting.
     * **Binary Genre-Like Vectors**: For Jaccard similarity (1 if feature exists).

4. **Similarity Computation**

   * Cosine similarity is computed for TF-IDF and Count vectors.
   * Jaccard similarity is computed for binary vectors using pairwise intersection-over-union.

5. **User Profiling & Recommendation**

   * For **TF-IDF** and **Count** models:

     * A personalized **user profile vector** is created using a weighted average of vectors from rated movies.
     * Recommendations are generated by finding unseen movies most similar to the user’s profile.
   * For the **Binary + Jaccard** model:

     * The average Jaccard similarity is computed between each unseen movie and the user’s seen movies.

6. **Result Generation & Tagging**

   * Top 50 movie recommendations are produced per user for each model.
   * Each output is tagged with the model name: `"TF-IDF + Cosine"`, `"Count + Cosine"`, or `"Binary + Jaccard"`.

7. **Output Consolidation**

   * All recommendation outputs are combined into one labeled DataFrame for comparative analysis and visualization.

In [None]:
# ===============================
# Hybrid CBF Pipeline with RMSE, Top-N, and CSV Export
# ===============================
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from tqdm import tqdm
import warnings

warnings.filterwarnings("ignore")

# Load Data
movies = pd.read_csv("movies_enriched_full.csv")
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

# Feature Engineering
def create_feature_string(df):
    def clean(col):
        return df[col].fillna('').astype(str).str.replace(',', ' ').str.replace(r'\s+', ' ', regex=True)
    df['cbf_features'] = (
        clean('tmdb_genres') + ' ' +
        clean('keywords') + ' ' +
        clean('top_3_cast') + ' ' +
        clean('directors') + ' ' +
        df['overview'].fillna('').str.lower().str.replace(r'[^\w\s]', '', regex=True) + ' ' +
        df['year'].astype(str)
    )
    return df

movies = create_feature_string(movies)

# Train-Test Split Per User
def train_test_split_user(ratings, test_size=0.2):
    train_rows, test_rows = [], []
    for user_id, group in ratings.groupby('userId'):
        if len(group) >= 5:
            train, test = train_test_split(group, test_size=test_size, random_state=42)
            train_rows.append(train)
            test_rows.append(test)
        else:
            train_rows.append(group)
    return pd.concat(train_rows), pd.concat(test_rows)

train_ratings, test_ratings = train_test_split_user(ratings)

# Bias Terms
global_mean = train_ratings['rating'].mean()
user_bias = train_ratings.groupby('userId')['rating'].mean() - global_mean
item_bias = train_ratings.groupby('movieId')['rating'].mean() - global_mean

# Vectorizers
tfidf_matrix = TfidfVectorizer(stop_words='english').fit_transform(movies['cbf_features'])
count_matrix = CountVectorizer(stop_words='english').fit_transform(movies['cbf_features'])
binary_matrix = CountVectorizer(binary=True).fit_transform(movies['cbf_features'])

# Helper Functions
def build_user_profile(user_id, train_ratings, matrix, movies):
    user_train = train_ratings[train_ratings['userId'] == user_id]
    indices = movies[movies['movieId'].isin(user_train['movieId'])].index
    if len(indices) == 0:
        return None
    weights = user_train.set_index('movieId').loc[movies.iloc[indices]['movieId']]['rating'].values
    return np.average(matrix[indices].toarray(), axis=0, weights=weights).reshape(1, -1)

def evaluate_rmse_for_user(user_id, train_ratings, test_ratings, matrix, movies, sim_fn):
    profile = build_user_profile(user_id, train_ratings, matrix, movies)
    if profile is None:
        return None
    user_test = test_ratings[test_ratings['userId'] == user_id]
    test_movies = movies[movies['movieId'].isin(user_test['movieId'])]
    test_indices = test_movies.index
    if len(test_indices) == 0:
        return None
    sims = sim_fn(profile, matrix[test_indices]).flatten()
    b_u = user_bias.get(user_id, 0)
    b_i = item_bias.reindex(test_movies['movieId']).fillna(0).values
    preds = np.clip(global_mean + b_u + b_i + sims * 1.5, 0.5, 5.0)
    actual = user_test.set_index('movieId').loc[test_movies['movieId']]['rating'].values
    return np.sqrt(mean_squared_error(actual, preds))

def evaluate_rmse_all_users(train_ratings, test_ratings, matrix, movies, sim_fn):
    user_ids = test_ratings['userId'].unique()
    rmses = []
    for user_id in tqdm(user_ids, desc="Evaluating users"):
        rmse = evaluate_rmse_for_user(user_id, train_ratings, test_ratings, matrix, movies, sim_fn)
        if rmse is not None:
            rmses.append(rmse)
    return np.mean(rmses)

def recommend_top_n(user_id, train_ratings, matrix, movies, sim_fn, top_n=10):
    profile = build_user_profile(user_id, train_ratings, matrix, movies)
    if profile is None:
        return pd.DataFrame()
    seen = train_ratings[train_ratings['userId'] == user_id]['movieId']
    unseen = movies[~movies['movieId'].isin(seen)]
    sims = sim_fn(profile, matrix[unseen.index]).flatten()
    content_scores = sims * 1.5
    b_u = user_bias.get(user_id, 0)
    b_i = item_bias.reindex(unseen['movieId']).fillna(0).values
    preds = np.clip(global_mean + b_u + b_i + content_scores, 0.5, 5.0)
    top_idx = np.argsort(preds)[-top_n:][::-1]
    return unseen.iloc[top_idx][['movieId', 'title']].assign(predicted_rating=preds[top_idx])

# Save Predictions for Meta-Learner
def save_predictions(user_ids, matrix, sim_fn, label):
    dfs = []
    for user_id in tqdm(user_ids, desc=f"Scoring {label}"):
        profile = build_user_profile(user_id, train_ratings, matrix, movies)
        if profile is None:
            continue
        user_test = test_ratings[test_ratings['userId'] == user_id]
        test_movies = movies[movies['movieId'].isin(user_test['movieId'])]
        test_indices = test_movies.index
        if len(test_indices) == 0:
            continue
        sims = sim_fn(profile, matrix[test_indices]).flatten()
        b_u = user_bias.get(user_id, 0)
        b_i = item_bias.reindex(test_movies['movieId']).fillna(0).values
        preds = np.clip(global_mean + b_u + b_i + sims * 1.5, 0.5, 5.0)
        actual = user_test.set_index('movieId').loc[test_movies['movieId']]['rating'].values
        df = pd.DataFrame({
            'userId': user_id,
            'movieId': test_movies['movieId'].values,
            'true_rating': actual,
            f'{label}_score': preds
        })
        dfs.append(df)
    result = pd.concat(dfs)
    result.to_csv(f'cbf_predictions_{label}.csv', index=False)

# Run Evaluations and Save Predictions
rmse_tfidf = evaluate_rmse_all_users(train_ratings, test_ratings, tfidf_matrix, movies, cosine_similarity)
rmse_count = evaluate_rmse_all_users(train_ratings, test_ratings, count_matrix, movies, cosine_similarity)
rmse_binary = evaluate_rmse_all_users(train_ratings, test_ratings, binary_matrix.toarray(), movies,
                                      lambda x, y: 1 - pairwise_distances(x, y, metric='jaccard'))

print(f"\nTF-IDF + Cosine RMSE: {rmse_tfidf:.4f}")
print(f"Count + Cosine RMSE: {rmse_count:.4f}")
print(f"Binary + Jaccard RMSE: {rmse_binary:.4f}")

# Top-N Recommendations for User 5549
print("\nTop-N Recommendations for User 5549 — TF-IDF")
print(recommend_top_n(5549, train_ratings, tfidf_matrix, movies, cosine_similarity))

print("\nTop-N Recommendations for User 5549 — Count")
print(recommend_top_n(5549, train_ratings, count_matrix, movies, cosine_similarity))

print("\nTop-N Recommendations for User 5549 — Jaccard")
print(recommend_top_n(5549, train_ratings, binary_matrix, movies,
                      lambda x, y: 1 - pairwise_distances(x, y, metric='jaccard')))

# Save Predictions
save_predictions(test_ratings['userId'].unique(), tfidf_matrix, cosine_similarity, 'tfidf')
save_predictions(test_ratings['userId'].unique(), count_matrix, cosine_similarity, 'count')
save_predictions(test_ratings['userId'].unique(), binary_matrix.toarray(),
                 lambda x, y: 1 - pairwise_distances(x, y, metric='jaccard'), 'binary')


Evaluating users:   1%|          | 68/6040 [00:02<03:40, 27.10it/s]

In [3]:
# ============================
# Libraries and Imports
# ============================
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import pairwise_distances
from sklearn.metrics import mean_squared_error  # Replacing root_mean_squared_error for compatibility
from sklearn.model_selection import train_test_split
from tqdm import tqdm

# Suppress DataConversionWarning (especially from Jaccard)
import warnings
from sklearn.exceptions import DataConversionWarning
warnings.filterwarnings("ignore", category=DataConversionWarning)

# ============================
# Step 1: Load Data
# ============================
movies = pd.read_csv("movies_enriched_full.csv")
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

# ============================
# Step 2: Create CBF Features
# ============================
def create_feature_string(df):
    def clean(col):
        return df[col].fillna('').astype(str).str.replace(',', ' ').str.replace(r'\s+', ' ', regex=True)

    genres = df['tmdb_genres'].fillna('').str.replace(',', ' ')
    keywords = clean('keywords')
    cast = clean('top_3_cast')
    director = clean('directors')
    overview = df['overview'].fillna('').str.lower().str.replace(r'[^\w\s]', '', regex=True)
    year = df['year'].astype(str)

    df['cbf_features'] = genres + ' ' + keywords + ' ' + cast + ' ' + director + ' ' + overview + ' ' + year
    return df

movies = create_feature_string(movies)

# ============================
# Step 3: Per-User Train/Test Split
# ============================
def train_test_split_user(ratings, test_size=0.2):
    train_rows = []
    test_rows = []
    for user_id, group in ratings.groupby('userId'):
        if len(group) >= 5:
            train, test = train_test_split(group, test_size=test_size, random_state=42)
            train_rows.append(train)
            test_rows.append(test)
        else:
            train_rows.append(group)
    return pd.concat(train_rows), pd.concat(test_rows)

train_ratings, test_ratings = train_test_split_user(ratings)

# ============================
# Step 4: Compute Bias Terms
# ============================
global_mean = train_ratings['rating'].mean()
user_bias = train_ratings.groupby('userId')['rating'].mean() - global_mean
item_bias = train_ratings.groupby('movieId')['rating'].mean() - global_mean

# ============================
# Step 5: Build User Profile
# ============================
def build_user_profile(user_id, train_ratings, matrix, movies):
    user_train = train_ratings[train_ratings['userId'] == user_id]
    indices = movies[movies['movieId'].isin(user_train['movieId'])].index

    if len(indices) == 0:
        return None

    weights = user_train.set_index('movieId').loc[movies.iloc[indices]['movieId']]['rating'].values

    # Check if matrix is sparse
    row_vectors = matrix[indices].toarray() if hasattr(matrix, "toarray") else matrix[indices]

    return np.average(row_vectors, axis=0, weights=weights).reshape(1, -1)


# ============================
# Step 6: RMSE Evaluation
# ============================
def evaluate_rmse_for_user(user_id, train_ratings, test_ratings, matrix, movies, sim_fn):
    profile = build_user_profile(user_id, train_ratings, matrix, movies)
    if profile is None:
        return None
    user_test = test_ratings[test_ratings['userId'] == user_id]
    test_movies = movies[movies['movieId'].isin(user_test['movieId'])]
    test_indices = test_movies.index
    if len(test_indices) == 0:
        return None

    sims = sim_fn(profile, matrix[test_indices]).flatten()
    content_scores = sims * 1.5
    b_u = user_bias.get(user_id, 0)
    b_i = item_bias.reindex(test_movies['movieId']).fillna(0).values
    preds = np.clip(global_mean + b_u + b_i + content_scores, 0.5, 5.0)
    actual = user_test.set_index('movieId').loc[test_movies['movieId']]['rating'].values
    # return mean_squared_error(actual, preds, squared=False)
    return np.sqrt(mean_squared_error(actual, preds))

def evaluate_rmse_all_users(train_ratings, test_ratings, matrix, movies, sim_fn):
    user_ids = test_ratings['userId'].unique()
    rmse_list = []
    for user_id in tqdm(user_ids, desc="Evaluating users"):
        rmse = evaluate_rmse_for_user(user_id, train_ratings, test_ratings, matrix, movies, sim_fn)
        if rmse is not None:
            rmse_list.append(rmse)
    return np.mean(rmse_list) if rmse_list else None

# ============================
# Step 7: Top-N Recommendations
# ============================
def recommend_top_n(user_id, train_ratings, matrix, movies, sim_fn, top_n=10):
    profile = build_user_profile(user_id, train_ratings, matrix, movies)
    if profile is None:
        return pd.DataFrame()
    seen = train_ratings[train_ratings['userId'] == user_id]['movieId']
    unseen = movies[~movies['movieId'].isin(seen)]
    sims = sim_fn(profile, matrix[unseen.index]).flatten()
    content_scores = sims * 1.5
    b_u = user_bias.get(user_id, 0)
    b_i = item_bias.reindex(unseen['movieId']).fillna(0).values
    preds = np.clip(global_mean + b_u + b_i + content_scores, 0.5, 5.0)
    top_idx = np.argsort(preds)[-top_n:][::-1]
    return unseen.iloc[top_idx][['movieId', 'title']].assign(predicted_rating=preds[top_idx])

# ============================
# Step 8: Vectorization
# ============================
tfidf_vectorizer = TfidfVectorizer(stop_words='english')
count_vectorizer = CountVectorizer(stop_words='english')
binary_vectorizer = CountVectorizer(binary=True)

tfidf_matrix = tfidf_vectorizer.fit_transform(movies['cbf_features'])
count_matrix = count_vectorizer.fit_transform(movies['cbf_features'])
binary_matrix = binary_vectorizer.fit_transform(movies['cbf_features'])

# ============================
# Step 9: RMSE Evaluation
# ============================
rmse_tfidf = evaluate_rmse_all_users(train_ratings, test_ratings, tfidf_matrix, movies, cosine_similarity)
rmse_count = evaluate_rmse_all_users(train_ratings, test_ratings, count_matrix, movies, cosine_similarity)
rmse_binary = evaluate_rmse_all_users(train_ratings, test_ratings, binary_matrix.toarray(),  # Convert to dense
    movies,
    lambda x, y: 1 - pairwise_distances(x, y, metric='jaccard')
)

print(f"\nTF-IDF + Cosine RMSE: {rmse_tfidf:.4f}")
print(f"Count + Cosine RMSE: {rmse_count:.4f}")
print(f"Binary + Jaccard RMSE: {rmse_binary:.4f}")

# ============================
# Step 10: Show Top-N for User 5549
# ============================
print("\nTop-N Recommendations for User 5549 — TF-IDF")
print(recommend_top_n(5549, train_ratings, tfidf_matrix, movies, cosine_similarity))

print("\nTop-N Recommendations for User 5549 — Count")
print(recommend_top_n(5549, train_ratings, count_matrix, movies, cosine_similarity))

print("\nTop-N Recommendations for User 5549 — Jaccard")
print(recommend_top_n(5549, train_ratings, binary_matrix, movies,
                      lambda x, y: 1 - pairwise_distances(x, y, metric='jaccard')))


Evaluating users: 100%|██████████| 6040/6040 [03:47<00:00, 26.49it/s]
Evaluating users: 100%|██████████| 6040/6040 [03:45<00:00, 26.76it/s]
Evaluating users: 100%|██████████| 6040/6040 [04:10<00:00, 24.14it/s]



TF-IDF + Cosine RMSE: 0.9424
Count + Cosine RMSE: 0.9693
Binary + Jaccard RMSE: 0.9297

Top-N Recommendations for User 5549 — TF-IDF
      movieId                                              title  \
3164     3233                               Smashing Time (1967)   
1396     1420  Message to Love: The Isle of Wight Festival (1...   
3313     3382                             Song of Freedom (1936)   
3587     3656                                       Lured (1947)   
777       787                 Gate of Heavenly Peace, The (1995)   
568       572                             Foreign Student (1994)   
977       989          Schlafes Bruder (Brother of Sleep) (1995)   
3811     3881                           Bittersweet Motel (2000)   
1339     1360  Identification of a Woman (Identificazione di ...   
1762     1830                            Follow the Bitch (1998)   

      predicted_rating  
3164               5.0  
1396               5.0  
3313               5.0  
3587             

TypeError: scipy distance metrics do not support sparse matrices.

### **Memory-based collaborative filtering module (UBCF, IBCF)**

***Purpose:***

This module implements **memory-based collaborative filtering** using **user-user** or **item-item** similarity. It addresses **user bias** by normalizing ratings through mean-centering and optionally **rescaling predictions** to the original rating scale for interpretability.

***Methodology:***

1. **Rating Matrix Construction**:

   * A user-item matrix is built from raw MovieLens-style ratings data.
   * For `kind='user'`, ratings are mean-centered per user to reduce bias from lenient or strict raters.
   * For `kind='item'`, raw ratings are used directly (no normalization), as the algorithm focuses on item similarities based on a single user's input.

2. **Similarity Computation**:

   * Cosine similarity is computed either:

     * **Across users** for user-based CF (`kind='user'`)
     * **Across items** for item-based CF (`kind='item'`)
   * `sklearn.metrics.pairwise_distances` is used to derive similarity as `1 - cosine_distance`.

3. **Prediction Generation**:

   * For **user-based CF**:

     * Ratings from similar users are weighted by similarity and averaged.
     * The user’s mean rating is **added back** to restore predictions to the original scale (e.g., 1–5).
   * For **item-based CF**:

     * A user’s own ratings are used to compute scores for similar items.
     * No mean is added back, since predictions are already on the correct scale.

4. **Top-N Recommendations**:

   * The system filters out movies the user has already rated.
   * It ranks unseen movies by predicted score and returns the top-N recommendations.
   * Each recommendation is labeled with the model type (`User-Based CF` or `Item-Based CF`) for downstream tracking.

In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

# === STEP 1: Load Ratings ===
# Purpose: Import user-movie ratings dataset and split it into training and test sets.
# This allows us to train the model on one portion and evaluate on unseen data.
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# === STEP 2: Create Bias-Adjusted User-Item Matrix ===
# Purpose: Create a user-item ratings matrix and adjust for biases by removing the global mean,
# user bias (tendency to rate high/low), and item bias (popularity effects).
def create_bias_adjusted_matrix(ratings_df):
    matrix = ratings_df.pivot(index='userId', columns='movieId', values='rating')
    global_mean = ratings_df['rating'].mean()
    user_bias = matrix.sub(global_mean, axis=0).mean(axis=1)
    item_bias = matrix.sub(global_mean, axis=0).sub(user_bias, axis=0).mean(axis=0)
    adjusted = matrix.sub(global_mean).sub(user_bias, axis=0).sub(item_bias, axis=1)
    return adjusted.fillna(0), global_mean, user_bias, item_bias

user_item_matrix, global_mean, user_bias, item_bias = create_bias_adjusted_matrix(train_ratings)

# === STEP 3: Compute Similarity Matrices ===
# Purpose: Measure similarity between users or items using cosine similarity on the bias-adjusted matrix.
# These similarity scores will later help generate personalized recommendations.
user_sim_matrix = cosine_similarity(user_item_matrix)
item_sim_matrix = cosine_similarity(user_item_matrix.T)

# === STEP 4: Recommender Function (Top-N or Full Scores) ===
# Purpose: Generate movie recommendations using memory-based collaborative filtering.
# Predict scores for unseen items using either user-based or item-based similarity and adjust with biases.
def recommend_memory_based(user_id, matrix, global_mean, user_bias, item_bias, sim_matrix, kind='user', top_n=50, return_full=False):
    if kind == 'user':
        sim_scores = sim_matrix[matrix.index.get_loc(user_id)]
        weighted = sim_scores @ matrix.values
        norm = np.abs(sim_scores).sum()
        preds = weighted / norm if norm != 0 else np.zeros_like(weighted)
        preds += global_mean + user_bias.loc[user_id]
    else:
        user_vector = matrix.loc[user_id]
        weighted = user_vector @ sim_matrix
        norm = (user_vector != 0) @ np.abs(sim_matrix)
        with np.errstate(divide='ignore', invalid='ignore'):
            preds = np.true_divide(weighted, norm)
            preds[norm == 0] = 0
        preds += global_mean + user_bias.loc[user_id] + item_bias.values

    preds = np.clip(preds, 1.0, 5.0)
    pred_series = pd.Series(preds, index=matrix.columns)
    seen = train_ratings[train_ratings['userId'] == user_id]['movieId'].tolist()
    pred_series = pred_series.drop(labels=seen, errors='ignore')

    if return_full:
        return pred_series
    else:
        top_preds = pred_series.sort_values(ascending=False).head(top_n)
        return pd.DataFrame({
            'userId': user_id,
            'movieId': top_preds.index,
            'score': top_preds.values
        })

# === STEP 5: Evaluation Function ===
# Purpose: Evaluate model performance using RMSE by comparing predicted scores to actual ratings
# in the test set for multiple users. Measures how accurate the recommender is overall.
def evaluate_model(test_df, matrix, global_mean, user_bias, item_bias, sim_matrix, kind='user'):
    all_preds = []
    for uid in test_df['userId'].unique():
        if uid not in matrix.index:
            continue
        recs = recommend_memory_based(uid, matrix, global_mean, user_bias, item_bias, sim_matrix, kind, top_n=1000, return_full=True)
        actual = test_df[test_df['userId'] == uid]
        merged = pd.merge(actual, recs.rename("score"), on="movieId")
        all_preds.append(merged)

    all_preds_df = pd.concat(all_preds, ignore_index=True)
    rmse = np.sqrt(mean_squared_error(all_preds_df['rating'], all_preds_df['score'])) if not all_preds_df.empty else np.nan
    return rmse

# === STEP 6: Run Evaluation ===
# Purpose: Calculate RMSE for user-based CF, item-based CF, and a dummy predictor
# that always predicts the global mean rating.
user_rmse = evaluate_model(test_ratings, user_item_matrix, global_mean, user_bias, item_bias, user_sim_matrix, 'user')
item_rmse = evaluate_model(test_ratings, user_item_matrix, global_mean, user_bias, item_bias, item_sim_matrix, 'item')
dummy_rmse = np.sqrt(mean_squared_error(test_ratings['rating'], [global_mean] * len(test_ratings)))

print(f"User-Based CF RMSE: {user_rmse:.4f}")
print(f"Item-Based CF RMSE: {item_rmse:.4f}")
print(f"Dummy Predictor RMSE: {dummy_rmse:.4f}")

# === STEP 7: Get Recommendations for a Specific User (Optional) ===
# Purpose: Generate and display the top-N recommended movies for a target user using both models.
# This is useful for presenting personalized suggestions.
user_id = 5549
movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
user_recs = recommend_memory_based(user_id, user_item_matrix, global_mean, user_bias, item_bias, user_sim_matrix, 'user', top_n=50)
item_recs = recommend_memory_based(user_id, user_item_matrix, global_mean, user_bias, item_bias, item_sim_matrix, 'item', top_n=50)

user_recs = user_recs.merge(movies, on='movieId', how='left')
item_recs = item_recs.merge(movies, on='movieId', how='left')

print("\nTop 10 User-Based CF Recommendations:")
print(user_recs[['movieId', 'title', 'score']])

print("\nTop 10 Item-Based CF Recommendations:")
print(item_recs[['movieId', 'title', 'score']])

# === STEP 8: Hybrid Score Fusion (Weighted Average) ===
# Purpose: Combine UBCF and IBCF scores using a weighted average for improved recommendations.

# Merge UBCF and IBCF scores
merged = user_recs[['movieId', 'score']].rename(columns={'score': 'ubcf_score'}).merge(
    item_recs[['movieId', 'score']].rename(columns={'score': 'ibcf_score'}),
    on='movieId'
)

# Assign weights (adjust as needed)
ubcf_weight = 0.5
ibcf_weight = 0.5

# Compute hybrid score
merged['hybrid_score'] = ubcf_weight * merged['ubcf_score'] + ibcf_weight * merged['ibcf_score']
hybrid_recs = merged.merge(movies, on='movieId', how='left').sort_values('hybrid_score', ascending=False)

print("\nTop 10 Hybrid Recommendations:")
print(hybrid_recs[['movieId', 'title', 'hybrid_score']].head(10))



**Grid Search Optimization of Hybrid Recommender Weights (UBCF + IBCF + CBF)**

**Purpose:**

To determine the optimal combination of weights for the hybrid recommender system that blends User-Based Collaborative Filtering (UBCF), Item-Based Collaborative Filtering (IBCF), and Content-Based Filtering (CBF), in order to minimize prediction error (RMSE) and improve recommendation accuracy.

**Methodology:**

1. **Define Evaluation Metric:**
   Use Root Mean Squared Error (RMSE) to evaluate how well predicted ratings from the hybrid model match actual user ratings in the test set.

2. **Generate Weight Combinations:**
   Create a grid of possible weight combinations for UBCF, IBCF, and CBF using increments of 0.2, ensuring that all weights sum to 1.0.

3. **Calculate Hybrid Predictions:**
   For each user in the test set (or a subset for faster testing), generate hybrid recommendations using the current weight combination.

4. **Compute RMSE:**
   Compare predicted scores to actual ratings for that user and compute RMSE. Repeat for each user and take the mean RMSE across all users.

5. **Store and Rank Results:**
   Store all weight combinations along with their RMSEs, then sort the results to identify the combination that yields the lowest error.

6. **Visualize Results:**
   Generate a 3D scatter plot showing the relationship between UBCF, IBCF, and RMSE for visual insight into optimal regions of the weight space.


In [None]:
from itertools import product

def grid_search_hybrid_weights(user_id, user_item_matrix, user_means,
                                user_sim_matrix, item_sim_matrix,
                                tfidf_matrix, ratings, movie_df,
                                weight_step=0.2, k=10):
    """
    Grid search over hybrid weight combinations (UBCF, IBCF, CBF) where weights sum to 1.
    Returns the top-k hybrid combinations based on average hybrid score.
    """

    results = []

    # Generate all valid combinations of weights summing to 1
    steps = np.arange(0, 1 + weight_step, weight_step)
    for w1, w2, w3 in product(steps, repeat=3):
        if abs((w1 + w2 + w3) - 1.0) > 1e-5:
            continue

        try:
            recs = hybrid_ubcf_ibcf_cbf(
                user_id=user_id,
                user_item_matrix=user_item_matrix,
                user_means=user_means,
                user_sim_matrix=user_sim_matrix,
                item_sim_matrix=item_sim_matrix,
                tfidf_matrix=tfidf_matrix,
                ratings=ratings,
                movie_df=movie_df,
                w_ubcf=w1,
                w_ibcf=w2,
                w_cbf=w3,
                top_n=50
            )

            avg_score = recs['hybrid_score'].mean()
            results.append({
                'w_ubcf': w1,
                'w_ibcf': w2,
                'w_cbf': w3,
                'avg_hybrid_score': avg_score,
                'top_recs': recs
            })
        except Exception as e:
            print(f"Skipped combo ({w1}, {w2}, {w3}): {e}")
            continue

    results = sorted(results, key=lambda x: x['avg_hybrid_score'], reverse=True)
    return results[:k]

grid_results = grid_search_hybrid_weights(
    user_id=5549,
    user_item_matrix=user_item_matrix,
    user_means=user_means,
    user_sim_matrix=user_sim_matrix,
    item_sim_matrix=item_sim_matrix,
    tfidf_matrix=tfidf_matrix,
    ratings=ratings,
    movie_df=movies,
    weight_step=0.2,
    k=10
)

for res in grid_results:
    print(f"Weights -> UBCF: {res['w_ubcf']}, IBCF: {res['w_ibcf']}, CBF: {res['w_cbf']}")
    print(res['top_recs'][['title', 'hybrid_score']].head(5))
    print("-" * 40)


## **Model-Based Filtering:**

  * *SVD (Surprise)*: Learns latent features from the rating matrix.
  * *ALS (PySpark)*: Scalable factorization method for large datasets.


### **Module 9: Model-Based Collaborative Filtering (SVD using Surprise)**

**Purpose:**
Use matrix factorization (SVD) to learn latent user/item features from the rating matrix.

**Application:**
- Accurate, scalable recommendations for sparse datasets using user/item embeddings.
- Suitable for small to medium datasets.
- Optimized via `GridSearchCV` for hyperparameter tuning.
- Good interpretability of latent factors per user and item.



In [None]:
# ==============================
# Module 9: Model-Based Collaborative Filtering (SVD using Surprise)
# ==============================

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd
from tqdm import tqdm

# ==============================
# Prepare Surprise Dataset
# ==============================

def prepare_surprise_data(ratings):
    reader = Reader(rating_scale=(0.5, 5.0))
    return Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# ==============================
# Tune SVD Model with Grid Search
# ==============================

def tune_svd_model(data):
    param_grid = {
        'n_factors': [50, 100],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02, 0.1]
    }
    print("Tuning SVD model with GridSearchCV...")
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=0)

    with tqdm(total=1, desc="GridSearchCV") as pbar:
        gs.fit(data)
        pbar.update(1)

    print(f"Best RMSE: {gs.best_score['rmse']} with params: {gs.best_params['rmse']}")
    return gs.best_estimator['rmse']

# ==============================
# Train and Evaluate SVD
# ==============================

def evaluate_svd(model, data, model_label='SVD (Surprise)'):
    trainset, testset = train_test_split(data, test_size=0.2)
    model.fit(trainset)

    print("Making predictions...")
    predictions = [model.predict(item[0], item[1], r_ui=item[2]) for item in tqdm(testset, desc="Predicting")]

    score = rmse(predictions)

    pred_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
    pred_df = pred_df.rename(columns={'uid': 'userId', 'iid': 'movieId', 'rui': 'true_rating', 'est': 'pred_rating'})
    pred_df['model'] = model_label
    return pred_df[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']], score

# ==============================
# Main Execution
# ==============================

# Step 1: Load ratings
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

# Step 2: Prepare Surprise data
data = prepare_surprise_data(ratings)

# Step 3: Tune model
best_svd_model = tune_svd_model(data)

# Step 4: Evaluate model
pred_df, rmse_score = evaluate_svd(best_svd_model, data)

# Step 5: Output
print(pred_df.head())
print(f"Final RMSE: {rmse_score:.4f}")

# Step 6: Top-50 Predictions for User 5549
target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = [mid for mid in all_movie_ids if mid not in rated_movie_ids]

print(f"\nGenerating predictions for User {target_user}...")
top_preds = [(movie_id, best_svd_model.predict(target_user, movie_id).est)
             for movie_id in tqdm(unrated_movie_ids, desc="Predicting for user")]

top_50_df = pd.DataFrame(top_preds, columns=['movieId', 'pred_rating'])
top_50_df = top_50_df.sort_values(by='pred_rating', ascending=False).head(50)
top_50_df['userId'] = target_user
top_50_df['model'] = 'SVD (Surprise)'
top_50_df = top_50_df[['userId', 'movieId', 'pred_rating', 'model']]

# Step 7: Merge with movie titles only
movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_df = top_50_df.merge(movies, on='movieId', how='left')

# Step 8: Final Output
print("\nTop 50 Recommendations for User 5549:")
print(top_50_df[['movieId', 'title', 'pred_rating']].head(10))


### **Model-Based Collaborative Filtering (ALS using PySpark)**

**Purpose:**
Use Alternating Least Squares (ALS) to learn latent user/item features at scale.

**Application:**
- Distributed recommendation system for large-scale datasets.
- Runs on Apache Spark for horizontal scalability.
- Handles sparsity well using factorization.
- Suited for real-time, production-level systems with massive data.


In [None]:
# ==============================
# Module 10: Model-Based Collaborative Filtering (ALS using PySpark)
# ==============================

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import Row
import pandas as pd

# --- Start Spark Session ---
spark = SparkSession.builder \
    .appName("ALSModel") \
    .getOrCreate()

# --- Load Ratings ---
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])
ratings_df = spark.createDataFrame(ratings[['userId', 'movieId', 'rating']])

# --- Train ALS Model ---
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    rank=10, maxIter=10, regParam=0.1,
    coldStartStrategy="drop", nonnegative=True
)
als_model = als.fit(ratings_df)

# --- Evaluate ALS Model ---
predictions = als_model.transform(ratings_df)
pred_pd = predictions.select('userId', 'movieId', 'rating', 'prediction').toPandas()
pred_pd = pred_pd.rename(columns={'rating': 'true_rating', 'prediction': 'pred_rating'})
pred_pd['model'] = 'ALS (PySpark)'

# --- Evaluate ALS RMSE ---
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction'
)
rmse_score = evaluator.evaluate(predictions)

# --- Output Evaluation ---
print(pred_pd[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']].head())
print(f"\nFinal RMSE: {rmse_score:.4f}")

# ==============================
# Step 6: Top-50 Predictions for User 5549
# ==============================

target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))

# Create Spark DataFrame of userId + unrated movieId pairs
user_unrated_pairs = spark.createDataFrame([Row(userId=target_user, movieId=int(mid)) for mid in unrated_movie_ids])

# Predict ratings using ALS model
print(f"\nGenerating Top-50 recommendations for User {target_user}...")
top_preds_df = als_model.transform(user_unrated_pairs).dropna()

# Get top-50 highest predicted ratings
top_50_preds = top_preds_df.orderBy(col("prediction").desc()).limit(50)
top_50_pd = top_50_preds.select("userId", "movieId", "prediction").toPandas()
top_50_pd['model'] = "ALS (PySpark)"
top_50_pd = top_50_pd.rename(columns={'prediction': 'pred_rating'})

# ==============================
# Step 7: Merge with Movie Titles Only
# ==============================

movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_pd = top_50_pd.merge(movies, on='movieId', how='left')

# ==============================
# Step 8: Output Top-50
# ==============================

print("\nTop 50 Recommendations for User 5549:")
print(top_50_pd[['movieId', 'title', 'pred_rating']].head(10))
