<a href="https://colab.research.google.com/github/hawa1983/DATA-612/blob/main/Final_Project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### **Purpose**

The purpose of this script is to enrich the MovieLens movie dataset (`movies.dat`) with detailed movie metadata from The Movie Database (TMDB) API. This metadata includes movie overviews, genres, poster and backdrop image URLs, cast and director information, keywords, user ratings, and trailer links. The enriched dataset will serve as the foundation for building content-based, collaborative, and hybrid recommender systems.

### **Methodology**

1. **Load MovieLens Movie Data**
   The script loads the `movies.dat` file, which contains basic movie information including `movieId`, `title`, and `genres`.

2. **Clean Titles and Extract Years**
   It processes the movie titles to remove the year from the title string and separately extracts the release year to improve search accuracy when querying TMDB.

3. **Query TMDB API**
   For each movie, it sends a search request to TMDB using the cleaned title and release year. If a match is found, it retrieves the movie’s TMDB ID.

4. **Retrieve Detailed Metadata**
   Using the TMDB ID, the script fetches:

   * Overview (plot summary)
   * Poster and backdrop image paths
   * Genre IDs, which are then mapped to readable genre names
   * Top 3 cast members
   * Director(s)
   * Associated keywords
   * YouTube trailer link (if available)

5. **Construct and Save Enriched Dataset**
   All metadata is compiled into a structured format and merged with the original MovieLens data. The final dataset is saved as `movies_enriched_full.csv` for downstream use in recommendation models.


In [None]:
import pandas as pd
import requests
from tqdm import tqdm
import time

# ---------------------------------------
# CONFIG
# ---------------------------------------
BASE_URL = "https://api.themoviedb.org/3"
IMAGE_BASE = "https://image.tmdb.org/t/p/w500"

# Use your TMDB Bearer Token (v4)
HEADERS = {
    "Authorization": "Bearer eyJhbGciOiJIUzI1NiJ9.eyJhdWQiOiIyZGZlNjMwMGMzYjIzMjc2NzExNjQ0N2JhNzhiMjM5MyIsIm5iZiI6MTc1MTkyMjA3Ni4xMzUsInN1YiI6IjY4NmMzNTljMzc4NjllOGEyNDUxZTM0OSIsInNjb3BlcyI6WyJhcGlfcmVhZCJdLCJ2ZXJzaW9uIjoxfQ.S773ddH3FiIHtokPW4sYpJog0mXWS1o4OPov1KZneUw"
}

# TMDB genre ID to name mapping
GENRE_ID_TO_NAME = {
    28: "Action", 12: "Adventure", 16: "Animation", 35: "Comedy", 80: "Crime",
    99: "Documentary", 18: "Drama", 10751: "Family", 14: "Fantasy", 36: "History",
    27: "Horror", 10402: "Music", 9648: "Mystery", 10749: "Romance", 878: "Science Fiction",
    10770: "TV Movie", 53: "Thriller", 10752: "War", 37: "Western"
}

# ---------------------------------------
# STEP 1: Load MovieLens .dat Files
# ---------------------------------------

# Load movies.dat - format: MovieID::Title::Genres
movies_df = pd.read_csv("movies.dat", sep="::", engine='python', header=None, names=["movieId", "title", "genres"], encoding="latin-1")

# ---------------------------------------
# STEP 2: Clean Movie Titles and Extract Year
# ---------------------------------------

def extract_year(title):
    if "(" in title:
        try:
            return int(title.strip()[-5:-1])
        except:
            return None
    return None

def clean_title(title):
    if "(" in title:
        return title[:title.rfind("(")].strip()
    return title.strip()

movies_df["year"] = movies_df["title"].apply(extract_year)
movies_df["clean_title"] = movies_df["title"].apply(clean_title)

# ---------------------------------------
# STEP 3: TMDB Metadata Functions
# ---------------------------------------

# Search for movie in TMDB
def search_tmdb(title, year):
    url = f"{BASE_URL}/search/movie"
    params = {"query": title, "year": year}
    response = requests.get(url, headers=HEADERS, params=params)
    r = response.json()
    if r.get("results"):
        return r["results"][0]
    return None

# Get full metadata from TMDB
def get_full_tmdb_metadata(tmdb_id):
    metadata = {}

    # Credits (cast, crew)
    credits = requests.get(f"{BASE_URL}/movie/{tmdb_id}/credits", headers=HEADERS).json()
    cast = [c["name"] for c in credits.get("cast", [])[:3]]
    directors = [c["name"] for c in credits.get("crew", []) if c.get("job") == "Director"]

    # Keywords
    keywords = requests.get(f"{BASE_URL}/movie/{tmdb_id}/keywords", headers=HEADERS).json()
    keyword_list = [k["name"] for k in keywords.get("keywords", [])]

    # Videos (trailers)
    videos = requests.get(f"{BASE_URL}/movie/{tmdb_id}/videos", headers=HEADERS).json()
    trailer_links = [
        f"https://www.youtube.com/watch?v={v['key']}"
        for v in videos.get("results", [])
        if v["site"] == "YouTube" and v["type"] == "Trailer"
    ]

    # Final metadata dictionary
    metadata["top_3_cast"] = ", ".join(cast)
    metadata["directors"] = ", ".join(directors)
    metadata["keywords"] = ", ".join(keyword_list)
    metadata["trailer_link"] = trailer_links[0] if trailer_links else None

    return metadata

# ---------------------------------------
# STEP 4: Enrich Movie Data
# ---------------------------------------

enriched = []

for _, row in tqdm(movies_df.iterrows(), total=len(movies_df)):
    movie_data = search_tmdb(row["clean_title"], row["year"])

    if movie_data:
        tmdb_id = movie_data["id"]
        extra = get_full_tmdb_metadata(tmdb_id)

        genre_ids = movie_data.get("genre_ids", [])
        genre_names = [GENRE_ID_TO_NAME.get(gid, str(gid)) for gid in genre_ids]

        enriched.append({
            "tmdb_id": tmdb_id,
            "overview": movie_data.get("overview", ""),
            "poster_path": IMAGE_BASE + movie_data.get("poster_path", "") if movie_data.get("poster_path") else None,
            "backdrop_path": IMAGE_BASE + movie_data.get("backdrop_path", "") if movie_data.get("backdrop_path") else None,
            "vote_average": movie_data.get("vote_average", None),
            "vote_count": movie_data.get("vote_count", None),
            "tmdb_genres": ", ".join(genre_names),
            **extra
        })
    else:
        enriched.append({
            "tmdb_id": None,
            "overview": None,
            "poster_path": None,
            "backdrop_path": None,
            "vote_average": None,
            "vote_count": None,
            "tmdb_genres": None,
            "top_3_cast": None,
            "directors": None,
            "keywords": None,
            "trailer_link": None
        })

    time.sleep(0.25)  # Respect TMDB API rate limits

# ---------------------------------------
# STEP 5: Save Final Dataset
# ---------------------------------------

enriched_df = pd.DataFrame(enriched)
final_df = pd.concat([movies_df, enriched_df], axis=1)
final_df.to_csv("movies_enriched_full.csv", index=False)

print("DONE: Saved as 'movies_enriched_full.csv'")


FileNotFoundError: [Errno 2] No such file or directory: 'movies.dat'

## **Personalized Content-Based Movie Recommendation System**

This Python script implements a **Content-Based Filtering (CBF)** system enhanced with **personalized recommendations** using user-specific rating profiles. Built using the MovieLens 1M dataset and enriched metadata, the pipeline performs vectorization, similarity computation, and profile-based predictions.

**What This Script Does**

* **Module 1–2**: Load essential libraries and enriched movie data.
* **Module 3**: Load user ratings and demographics.
* **Module 4**: Engineer features combining genres, cast, crew, keywords, and movie overviews.
* **Module 5**: Transform content into TF-IDF, Count, or Binary vectors, and compute pairwise similarities using Cosine or Jaccard metrics.
* **Module 6**: Construct a weighted content profile per user based on past ratings.
* **Module 7**: Recommend top-N movies similar to the user profile, excluding already seen titles.

**Techniques Used**

* **Text Vectorization**: TF-IDF, CountVectorizer, Binary Count
* **Similarity Metrics**: Cosine Similarity, Jaccard Similarity
* **Personalization**: Weighted vector averaging based on each user’s rated items
* **Parallelization**: Speeds up Jaccard similarity computation using joblib

**Use Cases**

* Personalized recommendations for new users with a few ratings (cold-start)
* Improving diversity and relevance in suggested movies
* Generating fallback content suggestions in hybrid recommender systems

In [4]:
!pip install -r requirements.txt

Collecting numpy==1.26.4 (from -r requirements.txt (line 1))
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m2.2 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-surprise==1.1.4 (from -r requirements.txt (line 3))
  Downloading scikit_surprise-1.1.4.tar.gz (154 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/154.4 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m154.4/154.4 kB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting scikit-learn==1.4.2 (from -r requirements.txt (line 4))


In [46]:
# ==============================
# Module 1: Imports & Configuration
# ==============================
import pandas as pd
import numpy as np
import pickle
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from joblib import Parallel, delayed
from tqdm import tqdm
pd.set_option('display.max_colwidth', None)

# ==============================
# Module 2: Load Movie Data
# ==============================
def load_movie_data(filepath):
    df = pd.read_csv(filepath)
    print(f"Loaded {len(df)} movies.")
    return df

# ==============================
# Module 3: Load User Ratings and Demographics
# ==============================
def load_user_data(ratings_path, users_path):
    ratings = pd.read_csv(ratings_path, sep="::", engine="python",
                          names=["userId", "movieId", "rating", "timestamp"])
    users = pd.read_csv(users_path, sep="::", engine="python",
                        names=["userId", "gender", "age", "occupation", "zip"])
    print(f"Loaded {len(ratings)} ratings and {len(users)} users.")
    return ratings, users

# ==============================
# Module 4: Feature Engineering
# ==============================
def create_feature_string(df):
    def split_and_clean(col, delimiter='|'):
        return col.fillna('').str.replace(r'\s+', '', regex=True).str.split(delimiter)

    genre_list_1 = split_and_clean(df['genres'], delimiter='|')
    genre_list_2 = split_and_clean(df['tmdb_genres'], delimiter=',')
    merged_genres = [
        ' '.join(sorted(set(g1 or []) | set(g2 or [])))
        for g1, g2 in zip(genre_list_1, genre_list_2)
    ]

    def clean_text(col):
        return col.fillna('').str.replace(r'\s+', '', regex=True).str.replace(',', ' ')

    overview_clean = df['overview'].fillna('').str.lower().str.replace('[^\w\s]', '', regex=True)
    year_str = df['year'].astype(str).fillna('')

    df['cbf_features'] = (
        pd.Series(merged_genres) + ' ' +
        clean_text(df['keywords']) + ' ' +
        clean_text(df['top_3_cast']) + ' ' +
        clean_text(df['directors']) + ' ' +
        overview_clean + ' ' +
        year_str
    )

    return df[['movieId', 'title', 'cbf_features']]

# ==============================
# Module 5: Vectorization & Similarity
# ==============================
def vectorize_features(text_series, method='tfidf'):
    if method == 'tfidf':
        vectorizer = TfidfVectorizer(stop_words='english')
    elif method == 'count':
        vectorizer = CountVectorizer(stop_words='english')
    else:
        raise ValueError("Method must be 'tfidf' or 'count'")
    matrix = vectorizer.fit_transform(text_series)
    print(f"{method.upper()} vectorization complete. Shape: {matrix.shape}")
    return matrix, vectorizer

def binary_vectorize(text_series):
    vectorizer = CountVectorizer(binary=True, stop_words='english')
    matrix = vectorizer.fit_transform(text_series)
    print(f"Binary Count vectorization complete. Shape: {matrix.shape}")
    return matrix.toarray(), vectorizer

def compute_cosine_similarity(matrix):
    sim = cosine_similarity(matrix)
    print("Cosine similarity computed.")
    return sim

def jaccard_pairwise_parallel(matrix):
    n = matrix.shape[0]
    sim_matrix = np.zeros((n, n))

    def jaccard_row(i):
        a = matrix[i]
        row_sim = np.zeros(n)
        for j in range(i, n):
            b = matrix[j]
            intersection = np.logical_and(a, b).sum()
            union = np.logical_or(a, b).sum()
            score = intersection / union if union > 0 else 0.0
            row_sim[j] = score
        return i, row_sim

    results = Parallel(n_jobs=-1)(
        delayed(jaccard_row)(i) for i in tqdm(range(n), desc="Jaccard Similarity")
    )

    for i, row in results:
        sim_matrix[i, i:] = row[i:]
        sim_matrix[i:, i] = row[i:]

    print("Jaccard similarity matrix built.")
    return sim_matrix

def jaccard_pairwise_parallel(A):
    A = A.astype(bool).astype(int)  # Ensure binary
    intersection = A @ A.T
    row_sums = A.sum(axis=1).A1  # Convert to 1D array
    union = row_sums[:, None] + row_sums[None, :] - intersection
    jaccard = intersection / np.maximum(union, 1e-10)  # Prevent divide by zero
    return jaccard

def save_matrix(matrix, filename):
    with open(filename, 'wb') as f:
        pickle.dump(matrix, f)
    print(f"Saved similarity matrix to: {filename}")

# ==============================
# Module 6: Build User Profile
# ==============================
def build_user_profile(user_id, ratings, tfidf_matrix, movie_df):
    user_ratings = ratings[ratings['userId'] == user_id]
    rated_movies = movie_df[movie_df['movieId'].isin(user_ratings['movieId'])]
    indices = rated_movies.index.tolist()
    weights = user_ratings.set_index('movieId').loc[rated_movies['movieId']]['rating'].values
    profile = np.average(tfidf_matrix[indices].toarray(), axis=0, weights=weights)
    return profile.reshape(1, -1)

# ==============================
# Module 7: Personalized Recommendation
# ==============================
def recommend_movies(user_id, ratings, tfidf_matrix, movie_df, top_n=50):
    user_profile = build_user_profile(user_id, ratings, tfidf_matrix, movie_df)
    sims = cosine_similarity(user_profile, tfidf_matrix).flatten()
    user_seen = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    unseen_indices = movie_df[~movie_df['movieId'].isin(user_seen)].index
    top_indices = unseen_indices[np.argsort(sims[unseen_indices])[-top_n:][::-1]]
    return movie_df.iloc[top_indices][['movieId', 'title', 'year']], sims[top_indices]


***Content-Based Similarity Recommendations***

Purpose:
Generate item recommendations using multiple content-based similarity strategies. Each set of recommendations is labeled by model type for downstream evaluation and comparison.

Methodology:
1. Load enriched movie metadata and user ratings.
2. Create combined feature strings using genres, keywords, cast, directors, and overview.
3. Vectorize the features using three methods: TF-IDF, Count, and Binary.
4. Compute pairwise similarity:
   - Cosine similarity for TF-IDF and Count vectors
   - Jaccard similarity for binary vectors
5. For a given user, identify previously seen movies and score unseen ones based on average similarity to the seen set.
6. Return top-N recommendations as labeled DataFrames including: movieId, title, predicted score, and model name.



In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import joblib
import os

# ==============================
# Load Data
# ==============================
movie_df = load_movie_data("movies_enriched_full.csv")
ratings, users = load_user_data("ratings.dat", "users.dat")





movie_df.drop(columns=['trailer_link_y', 'backdrop_path_y', 'poster_path_y'], errors='ignore', inplace=True)

# Rename to expected column names
# movie_df.rename(columns={'top_3_cast': 'cast', 'directors': 'director'}, inplace=True)

# Now safely create feature string
movie_df = create_feature_string(movie_df)

# ==============================
# Split Data
# ==============================
train_ratings, test_ratings = train_test_split(ratings, test_size=0.2, random_state=42)

# ==============================
# Vectorize
# ==============================
tfidf_matrix_tfidf, _ = vectorize_features(movie_df['cbf_features'], method='tfidf')
count_matrix_count, _ = vectorize_features(movie_df['cbf_features'], method='count')
binary_matrix_binary, _ = binary_vectorize(movie_df['cbf_features'])
sim_matrix_binary_jaccard = jaccard_pairwise_parallel(binary_matrix_binary)

# ==============================
# Recommender Functions
# ==============================
def predict_and_evaluate(model_label, user_id, sim_matrix=None, feature_matrix=None):
    if sim_matrix is not None:
        recs = recommend_from_similarity_matrix(user_id, train_ratings, sim_matrix, movie_df, model_label, top_n=50)
    else:
        recs = recommend_from_profile(user_id, train_ratings, feature_matrix, movie_df, model_label, top_n=50)

    merged = pd.merge(recs, test_ratings[test_ratings['userId'] == user_id][['movieId', 'rating']],
                      on='movieId', how='inner')
    merged['userId'] = user_id  # Add userId explicitly
    merged.rename(columns={'rating': 'true_rating', 'score': 'pred_rating'}, inplace=True)


    if not merged.empty:
        rmse = np.sqrt(mean_squared_error(merged['true_rating'], merged['pred_rating']))
        mae = mean_absolute_error(merged['true_rating'], merged['pred_rating'])
    else:
        rmse = np.nan
        mae = np.nan

    merged['model'] = model_label
    return merged[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']], rmse, mae

# ==============================
# Run Evaluation for Each Model
# ==============================
user_id = 5549
results = {}

# TF-IDF + Cosine
tfidf_df, tfidf_rmse, tfidf_mae = predict_and_evaluate("TF-IDF + Cosine", user_id, feature_matrix=tfidf_matrix_tfidf)
results['TF-IDF + Cosine'] = (tfidf_df, tfidf_rmse, tfidf_mae)

# Count + Cosine
count_df, count_rmse, count_mae = predict_and_evaluate("Count + Cosine", user_id, feature_matrix=count_matrix_count)
results['Count + Cosine'] = (count_df, count_rmse, count_mae)

# Binary + Jaccard
jaccard_df, jaccard_rmse, jaccard_mae = predict_and_evaluate("Binary + Jaccard", user_id, sim_matrix=sim_matrix_binary_jaccard)
results['Binary + Jaccard'] = (jaccard_df, jaccard_rmse, jaccard_mae)

# ==============================
# Save Outputs
# ==============================
output_dir = "cbf_outputs"
os.makedirs(output_dir, exist_ok=True)

for model_name, (df, rmse, mae) in results.items():
    print(f"\nModel: {model_name}")
    print(f"  RMSE: {rmse:.4f}" if not np.isnan(rmse) else "  RMSE: N/A")
    print(f"  MAE : {mae:.4f}" if not np.isnan(mae) else "  MAE : N/A")

    filename = model_name.lower().replace(" + ", "_").replace(" ", "_") + "_predictions.csv"
    df.to_csv(os.path.join(output_dir, filename), index=False)


Loaded 3883 movies.
Loaded 1000209 ratings and 6040 users.
TFIDF vectorization complete. Shape: (3883, 33433)
COUNT vectorization complete. Shape: (3883, 33433)
Binary Count vectorization complete. Shape: (3883, 33433)


In [44]:
print(movie_df.columns.tolist())


['movieId', 'title', 'genres', 'year', 'clean_title', 'tmdb_id', 'overview', 'poster_path', 'backdrop_path', 'vote_average', 'vote_count', 'tmdb_genres', 'cast', 'director', 'keywords', 'trailer_link']


In [32]:
import os
import joblib
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.sparse import csr_matrix
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MultiLabelBinarizer

# ========================
# Utility: Save or Load File
# ========================
def save_or_load_similarity(file_path, compute_func, *args, **kwargs):
    if os.path.exists(file_path):
        print(f"✅ Loaded existing similarity matrix: {file_path}")
        return joblib.load(file_path)
    else:
        print(f"⚙️ Computing and saving similarity matrix: {file_path}")
        sim = compute_func(*args, **kwargs)
        joblib.dump(sim, file_path)
        return sim

# ========================
# Vectorize Functions
# ========================
def vectorize_features(feature_series, method='tfidf'):
    if method == 'tfidf':
        vectorizer = TfidfVectorizer()
    elif method == 'count':
        vectorizer = CountVectorizer()
    else:
        raise ValueError("Method must be 'tfidf' or 'count'")

    matrix = vectorizer.fit_transform(feature_series)
    return matrix, vectorizer

def binary_vectorize(feature_series):
    token_lists = feature_series.apply(lambda x: x.split())
    mlb = MultiLabelBinarizer(sparse_output=True)
    matrix = mlb.fit_transform(token_lists)
    return matrix, mlb

def compute_cosine_similarity(matrix):
    return cosine_similarity(matrix)

def jaccard_pairwise_parallel(matrix):
    # Jaccard similarity for sparse binary matrix
    A = matrix.astype(bool).astype(int)
    intersection = A @ A.T
    row_sums = A.sum(axis=1).A1
    union = row_sums[:, None] + row_sums[None, :] - intersection.A
    jaccard = intersection.A / np.maximum(union, 1e-10)
    return jaccard

# ========================
# Load Data and Features
# ========================
movie_df = pd.read_csv("movies_enriched_full.csv")
movie_df.drop(columns=['trailer_link_y', 'backdrop_path_y', 'poster_path_y'], errors='ignore', inplace=True)

def create_feature_string(df):
    df['cbf_features'] = df[['genres', 'keywords', 'cast', 'director']].fillna('').agg(' '.join, axis=1)
    return df

movie_df = create_feature_string(movie_df)

# ========================
# TF-IDF + Cosine Similarity
# ========================
tfidf_matrix, vectorizer_tfidf = vectorize_features(movie_df['cbf_features'], method='tfidf')
sim_matrix_tfidf_cosine = save_or_load_similarity(
    "sim_matrix_tfidf_cosine.pkl",
    compute_cosine_similarity,
    tfidf_matrix
)

# ========================
# Count + Cosine Similarity
# ========================
count_matrix, vectorizer_count = vectorize_features(movie_df['cbf_features'], method='count')
sim_matrix_count_cosine = save_or_load_similarity(
    "sim_matrix_count_cosine.pkl",
    compute_cosine_similarity,
    count_matrix
)

# ========================
# Binary + Jaccard Similarity
# ========================
binary_matrix, _ = binary_vectorize(movie_df['cbf_features'])
sim_matrix_binary_jaccard = save_or_load_similarity(
    "sim_matrix_binary_jaccard.pkl",
    jaccard_pairwise_parallel,
    binary_matrix
)


KeyError: "['cast', 'director'] not in index"

In [33]:
print(movie_df.head())

   movieId                               title                        genres  \
0        1                    Toy Story (1995)   Animation|Children's|Comedy   
1        2                      Jumanji (1995)  Adventure|Children's|Fantasy   
2        3             Grumpier Old Men (1995)                Comedy|Romance   
3        4            Waiting to Exhale (1995)                  Comedy|Drama   
4        5  Father of the Bride Part II (1995)                        Comedy   

   year                  clean_title  tmdb_id  \
0  1995                    Toy Story    862.0   
1  1995                      Jumanji   8844.0   
2  1995             Grumpier Old Men  15602.0   
3  1995            Waiting to Exhale  31357.0   
4  1995  Father of the Bride Part II  11862.0   

                                                                                                                                                                                                                                

In [2]:
# ==============================
# Module 8: Content-Based Similarity Recommendations (Multi-Model)
# ==============================

# ==============================
# Step 1: Load Movie & User Data
# ==============================

movie_df = load_movie_data("movies_enriched_full.csv")
ratings, users = load_user_data("ratings.dat", "users.dat")

# Remove redundant columns if they exist
movie_df.drop(columns=['trailer_link_y', 'backdrop_path_y', 'poster_path_y'], errors='ignore', inplace=True)

# Recreate CBF Features
movie_df = create_feature_string(movie_df)

# ==============================
# Step 2: Vectorize & Compute Similarities
# ==============================

# --- TF-IDF + Cosine ---
tfidf_matrix_tfidf, vectorizer_tfidf = vectorize_features(movie_df['cbf_features'], method='tfidf')
sim_matrix_tfidf_cosine = compute_cosine_similarity(tfidf_matrix_tfidf)

# --- Count + Cosine ---
count_matrix_count, vectorizer_count = vectorize_features(movie_df['cbf_features'], method='count')
sim_matrix_count_cosine = compute_cosine_similarity(count_matrix_count)

# --- Binary + Jaccard ---
binary_matrix_binary, vectorizer_binary = binary_vectorize(movie_df['cbf_features'])
sim_matrix_binary_jaccard = jaccard_pairwise_parallel(binary_matrix_binary)

# ==============================
# Step 3: Recommendation Functions
# ==============================

def recommend_from_similarity_matrix(user_id, ratings, sim_matrix, movie_df, model_label, top_n=50):
    seen_movie_ids = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    seen_indices = movie_df[movie_df['movieId'].isin(seen_movie_ids)].index.tolist()
    unseen_indices = movie_df[~movie_df['movieId'].isin(seen_movie_ids)].index.tolist()

    if not seen_indices:
        print(f"No ratings found for user {user_id}.")
        return pd.DataFrame(columns=['movieId', 'title', 'score', 'model'])

    mean_sims = sim_matrix[unseen_indices][:, seen_indices].mean(axis=1)
    top_indices = np.argsort(mean_sims)[-top_n:][::-1]
    top_movie_indices = np.array(unseen_indices)[top_indices]

    return pd.DataFrame({
        'movieId': movie_df.iloc[top_movie_indices]['movieId'].values,
        'title': movie_df.iloc[top_movie_indices]['title'].values,
        'score': mean_sims[top_indices],
        'model': model_label
    })

def recommend_from_profile(user_id, ratings, tfidf_matrix, movie_df, model_label, top_n=50):
    user_profile = build_user_profile(user_id, ratings, tfidf_matrix, movie_df)
    sims = cosine_similarity(user_profile, tfidf_matrix).flatten()
    user_seen = ratings[ratings['userId'] == user_id]['movieId'].tolist()
    unseen_indices = movie_df[~movie_df['movieId'].isin(user_seen)].index
    top_indices = unseen_indices[np.argsort(sims[unseen_indices])[-top_n:][::-1]]

    return pd.DataFrame({
        'movieId': movie_df.iloc[top_indices]['movieId'].values,
        'title': movie_df.iloc[top_indices]['title'].values,
        'score': sims[top_indices],
        'model': model_label
    })

# ==============================
# Step 4: Generate Recommendations (Labeled Outputs)
# ==============================

user_id = 5549

df_tfidf_cosine = recommend_from_profile(
    user_id, ratings, tfidf_matrix_tfidf, movie_df,
    model_label='TF-IDF + Cosine', top_n=50
)

df_count_cosine = recommend_from_profile(
    user_id, ratings, count_matrix_count, movie_df,
    model_label='Count + Cosine', top_n=50
)

df_binary_jaccard = recommend_from_similarity_matrix(
    user_id, ratings, sim_matrix_binary_jaccard, movie_df,
    model_label='Binary + Jaccard', top_n=50
)

# ==============================
# Step 5: Combine All Model Outputs
# ==============================

all_recommendations_combined = pd.concat([
    df_tfidf_cosine,
    df_count_cosine,
    df_binary_jaccard
], ignore_index=True)

print("Recommendation generation complete. Combined shape:", all_recommendations_combined.shape)
print("\nRecommendation generation complete. Combined shape:\n", all_recommendations_combined)

Loaded 3883 movies.
Loaded 1000209 ratings and 6040 users.
TFIDF vectorization complete. Shape: (3883, 33433)
Cosine similarity computed.
COUNT vectorization complete. Shape: (3883, 33433)
Cosine similarity computed.
Binary Count vectorization complete. Shape: (3883, 33433)


Jaccard Similarity: 100%|██████████| 3883/3883 [17:40<00:00,  3.66it/s]


Jaccard similarity matrix built.
Recommendation generation complete. Combined shape: (150, 4)


### **Memory-based collaborative filtering module (UBCF, IBCF)**

***Purpose:***

This module implements **memory-based collaborative filtering** using **user-user** or **item-item** similarity. It addresses **user bias** by normalizing ratings through mean-centering and optionally **rescaling predictions** to the original rating scale for interpretability.

***Methodology:***

1. **Rating Matrix Construction**:

   * A user-item matrix is built from raw MovieLens-style ratings data.
   * For `kind='user'`, ratings are mean-centered per user to reduce bias from lenient or strict raters.
   * For `kind='item'`, raw ratings are used directly (no normalization), as the algorithm focuses on item similarities based on a single user's input.

2. **Similarity Computation**:

   * Cosine similarity is computed either:

     * **Across users** for user-based CF (`kind='user'`)
     * **Across items** for item-based CF (`kind='item'`)
   * `sklearn.metrics.pairwise_distances` is used to derive similarity as `1 - cosine_distance`.

3. **Prediction Generation**:

   * For **user-based CF**:

     * Ratings from similar users are weighted by similarity and averaged.
     * The user’s mean rating is **added back** to restore predictions to the original scale (e.g., 1–5).
   * For **item-based CF**:

     * A user’s own ratings are used to compute scores for similar items.
     * No mean is added back, since predictions are already on the correct scale.

4. **Top-N Recommendations**:

   * The system filters out movies the user has already rated.
   * It ranks unseen movies by predicted score and returns the top-N recommendations.
   * Each recommendation is labeled with the model type (`User-Based CF` or `Item-Based CF`) for downstream tracking.

In [3]:
# ==============================
# Module 8: Memory-Based Collaborative Filtering (Bias-Normalized)
# ==============================
# Purpose: Compute user-user or item-item similarity from the rating matrix.
# Application: Real-time, interpretable recommendations with optional bias correction.

from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import pandas as pd

# --- Create Mean-Centered User-Item Matrix ---
def create_normalized_user_item_matrix(ratings):
    """
    Purpose: Create a user-item matrix with ratings mean-centered per user.
    Application: Reduces bias from generous or harsh raters.
    """
    matrix = ratings.pivot(index='userId', columns='movieId', values='rating')
    user_means = matrix.mean(axis=1)
    return matrix.sub(user_means, axis=0).fillna(0), user_means

# --- Compute Cosine Similarity ---
def compute_similarity(matrix, kind='user'):
    """
    Purpose: Compute pairwise cosine similarity between users or items.
    Application: Support for User-User or Item-Item collaborative filtering.
    """
    if kind == 'user':
        sim = 1 - pairwise_distances(matrix, metric='cosine')
    elif kind == 'item':
        sim = 1 - pairwise_distances(matrix.T, metric='cosine')
    else:
        raise ValueError("kind must be 'user' or 'item'")

    print(f"{kind.title()}-based similarity computed. Shape: {sim.shape}")
    return sim

# --- Generate Top-N Recommendations with Rescaled Predictions ---
def recommend_memory_based(user_id, user_item_matrix, user_means, similarity_matrix, kind='user', top_n=50):
    """
    Purpose: Recommend items using normalized ratings and return predictions on original scale.
    Application: User-User or Item-Item CF with appropriate bias handling.
    """
    model_label = f"{kind.title()}-Based CF"

    if kind == 'user':
        # User-based: normalize ratings and add back mean after prediction
        user_sim_scores = similarity_matrix[user_id - 1]
        normalized_ratings = user_item_matrix.values

        weighted_scores = user_sim_scores @ normalized_ratings
        sum_weights = np.abs(user_sim_scores).sum()

        if sum_weights == 0:
            print("No similar users found.")
            return pd.DataFrame(columns=['movieId', 'score', 'model'])

        predicted_ratings = weighted_scores / sum_weights
        user_seen = user_item_matrix.loc[user_id]
        unseen_mask = user_seen == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
            .sort_values(ascending=False).head(top_n)

        # Re-center predictions to original scale
        recs += user_means.loc[user_id]

    elif kind == 'item':
        # Item-based: do NOT add back user mean
        user_ratings = user_item_matrix.loc[user_id]
        scores = user_ratings @ similarity_matrix
        sum_weights = (user_ratings != 0) @ np.abs(similarity_matrix)

        with np.errstate(divide='ignore', invalid='ignore'):
            predicted_ratings = np.true_divide(scores, sum_weights)
            predicted_ratings[sum_weights == 0] = 0

        unseen_mask = user_ratings == 0
        recs = pd.Series(predicted_ratings, index=user_item_matrix.columns)[unseen_mask]\
            .sort_values(ascending=False).head(top_n)

    else:
        raise ValueError("kind must be 'user' or 'item'")

    return pd.DataFrame({
        'movieId': recs.index,
        'score': recs.values,
        'model': model_label
    })


***Application of UBCF and IBCF***

In [12]:
# ==============================
# Step 1: Create Bias-Normalized Matrix
# ==============================

user_item_matrix, user_means = create_normalized_user_item_matrix(ratings)

# ==============================
# Step 2: Compute Similarity Matrices
# ==============================

user_sim_matrix = compute_similarity(user_item_matrix, kind='user')
item_sim_matrix = compute_similarity(user_item_matrix, kind='item')

# ==============================
# Step 3: Generate Recommendations
# ==============================

user_cf_recs = recommend_memory_based(
    5549,                    # user_id
    user_item_matrix,
    user_means,
    user_sim_matrix,
    kind='user',
    top_n=50
)

item_cf_recs = recommend_memory_based(
    5549,                    # user_id
    user_item_matrix,
    user_means,
    item_sim_matrix,
    kind='item',
    top_n=50
)

# ==============================
# Step 4: Merge with Movie Titles Only
# ==============================

movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]  # Retain only movieId and title
user_cf_recs = user_cf_recs.merge(movies, on="movieId", how="left")
item_cf_recs = item_cf_recs.merge(movies, on="movieId", how="left")

# ==============================
# Step 5: Display Output
# ==============================

print("\nTop 50 User-Based CF Recommendations for User 5549:")
print(user_cf_recs[['movieId', 'title', 'score']].head())

print("\nTop 50 Item-Based CF Recommendations for User 5549:")
print(item_cf_recs[['movieId', 'title', 'score']].head())


User-based similarity computed. Shape: (6040, 6040)
Item-based similarity computed. Shape: (3706, 3706)

Top 50 User-Based CF Recommendations for User 5549:
   movieId                                      title     score
0     1221             Godfather: Part II, The (1974)  3.600872
1     1304  Butch Cassidy and the Sundance Kid (1969)  3.567786
2      919                   Wizard of Oz, The (1939)  3.566165
3     1207               To Kill a Mockingbird (1962)  3.563622
4     1262                   Great Escape, The (1963)  3.558363

Top 50 Item-Based CF Recommendations for User 5549:
   movieId  \
0     3297   
1     3209   
2     1316   
3     2591   
4     1555   

                                                                title  \
0                                  With Byrd at the South Pole (1930)   
1                                         Loves of Carmen, The (1948)   
2                                                         Anna (1996)   
3  Jeanne and the Perfect Guy

### Hybrid CBF and UBCF Model

In [27]:
# ==============================
# Hybrid Recommender: UBCF + CBF
# ==============================

def hybrid_ubcf_cbf(user_id, user_item_matrix, user_means, user_sim_matrix,
                    tfidf_matrix, ratings, movie_df,
                    w_cf=0.2, w_cbf=0.8, top_n=50):
    """
    Combine UBCF and CBF scores via weighted average.

    Parameters:
    - user_id: int
    - user_item_matrix: pd.DataFrame (mean-centered matrix)
    - user_means: pd.Series
    - user_sim_matrix: np.array
    - tfidf_matrix: sparse matrix from TF-IDF
    - ratings: pd.DataFrame
    - movie_df: pd.DataFrame with movieId, title
    - w_cf: float, weight for UBCF
    - w_cbf: float, weight for CBF
    - top_n: int

    Returns:
    - pd.DataFrame with movieId, title, hybrid_score, model
    """
    # --- UBCF predictions ---
    ubcf_df = recommend_memory_based(
        user_id=user_id,
        user_item_matrix=user_item_matrix,
        user_means=user_means,
        similarity_matrix=user_sim_matrix,
        kind='user',
        top_n=1000  # keep more to allow intersection
    )

    # --- CBF predictions ---
    cbf_df = recommend_from_profile(
        user_id=user_id,
        ratings=ratings,
        tfidf_matrix=tfidf_matrix,
        movie_df=movie_df,
        model_label='CBF',  # temporary label
        top_n=1000
    ).rename(columns={'score': 'cbf_score'})

    # --- Merge ---
    ubcf_df = ubcf_df.rename(columns={'score': 'ubcf_score'})
    merged = pd.merge(ubcf_df, cbf_df, on='movieId')

    # --- Combine Scores ---
    merged['hybrid_score'] = w_cf * merged['ubcf_score'] + w_cbf * merged['cbf_score']
    hybrid_df = merged[['movieId', 'title', 'hybrid_score']].copy()
    hybrid_df['model'] = 'Hybrid (UBCF + CBF)'

    return hybrid_df.sort_values(by='hybrid_score', ascending=False).head(top_n)[
        ['movieId', 'title', 'hybrid_score', 'model']
    ]

# ==============================
# Generate Hybrid Recommendations for User 5549
# ==============================

hybrid_recs = hybrid_ubcf_cbf(
    user_id=5549,
    user_item_matrix=user_item_matrix,
    user_means=user_means,
    user_sim_matrix=user_sim_matrix,
    tfidf_matrix=tfidf_matrix_tfidf,
    ratings=ratings,
    movie_df=movie_df,
    w_cf=0.5,
    w_cbf=0.5,
    top_n=50
)

# ==============================
# Display Output
# ==============================

print("\nTop 50 Hybrid Recommendations for User 5549:")
print(hybrid_recs.head())



Top 50 Hybrid Recommendations for User 5549:
     movieId                              title  hybrid_score  \
0       1221     Godfather: Part II, The (1974)      1.894670   
192     3457             Waking the Dead (1999)      1.866184   
155     3177                 Next Friday (1999)      1.845677   
215     3721                      Trixie (1999)      1.844950   
37      2995  House on Haunted Hill, The (1999)      1.831576   

                   model  
0    Hybrid (UBCF + CBF)  
192  Hybrid (UBCF + CBF)  
155  Hybrid (UBCF + CBF)  
215  Hybrid (UBCF + CBF)  
37   Hybrid (UBCF + CBF)  


## **Model-Based Filtering:**

  * *SVD (Surprise)*: Learns latent features from the rating matrix.
  * *ALS (PySpark)*: Scalable factorization method for large datasets.


### **Module 9: Model-Based Collaborative Filtering (SVD using Surprise)**

**Purpose:**
Use matrix factorization (SVD) to learn latent user/item features from the rating matrix.

**Application:**
- Accurate, scalable recommendations for sparse datasets using user/item embeddings.
- Suitable for small to medium datasets.
- Optimized via `GridSearchCV` for hyperparameter tuning.
- Good interpretability of latent factors per user and item.



In [13]:
# ==============================
# Module 9: Model-Based Collaborative Filtering (SVD using Surprise)
# ==============================

from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd
from tqdm import tqdm

# ==============================
# Prepare Surprise Dataset
# ==============================

def prepare_surprise_data(ratings):
    reader = Reader(rating_scale=(0.5, 5.0))
    return Dataset.load_from_df(ratings[['userId', 'movieId', 'rating']], reader)

# ==============================
# Tune SVD Model with Grid Search
# ==============================

def tune_svd_model(data):
    param_grid = {
        'n_factors': [50, 100],
        'lr_all': [0.005, 0.01],
        'reg_all': [0.02, 0.1]
    }
    print("Tuning SVD model with GridSearchCV...")
    gs = GridSearchCV(SVD, param_grid, measures=['rmse'], cv=3, joblib_verbose=0)

    with tqdm(total=1, desc="GridSearchCV") as pbar:
        gs.fit(data)
        pbar.update(1)

    print(f"Best RMSE: {gs.best_score['rmse']} with params: {gs.best_params['rmse']}")
    return gs.best_estimator['rmse']

# ==============================
# Train and Evaluate SVD
# ==============================

def evaluate_svd(model, data, model_label='SVD (Surprise)'):
    trainset, testset = train_test_split(data, test_size=0.2)
    model.fit(trainset)

    print("Making predictions...")
    predictions = [model.predict(item[0], item[1], r_ui=item[2]) for item in tqdm(testset, desc="Predicting")]

    score = rmse(predictions)

    pred_df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])
    pred_df = pred_df.rename(columns={'uid': 'userId', 'iid': 'movieId', 'rui': 'true_rating', 'est': 'pred_rating'})
    pred_df['model'] = model_label
    return pred_df[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']], score

# ==============================
# Main Execution
# ==============================

# Step 1: Load ratings
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])

# Step 2: Prepare Surprise data
data = prepare_surprise_data(ratings)

# Step 3: Tune model
best_svd_model = tune_svd_model(data)

# Step 4: Evaluate model
pred_df, rmse_score = evaluate_svd(best_svd_model, data)

# Step 5: Output
print(pred_df.head())
print(f"Final RMSE: {rmse_score:.4f}")

# Step 6: Top-50 Predictions for User 5549
target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = [mid for mid in all_movie_ids if mid not in rated_movie_ids]

print(f"\nGenerating predictions for User {target_user}...")
top_preds = [(movie_id, best_svd_model.predict(target_user, movie_id).est)
             for movie_id in tqdm(unrated_movie_ids, desc="Predicting for user")]

top_50_df = pd.DataFrame(top_preds, columns=['movieId', 'pred_rating'])
top_50_df = top_50_df.sort_values(by='pred_rating', ascending=False).head(50)
top_50_df['userId'] = target_user
top_50_df['model'] = 'SVD (Surprise)'
top_50_df = top_50_df[['userId', 'movieId', 'pred_rating', 'model']]

# Step 7: Merge with movie titles only
movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_df = top_50_df.merge(movies, on='movieId', how='left')

# Step 8: Final Output
print("\nTop 50 Recommendations for User 5549:")
print(top_50_df[['movieId', 'title', 'pred_rating']].head(10))


Tuning SVD model with GridSearchCV...


GridSearchCV: 100%|██████████| 1/1 [06:38<00:00, 398.79s/it]


Best RMSE: 0.8820577922491172 with params: {'n_factors': 50, 'lr_all': 0.005, 'reg_all': 0.02}
Making predictions...


Predicting: 100%|██████████| 200042/200042 [00:02<00:00, 95750.70it/s]


RMSE: 0.8701
   userId  movieId  true_rating  pred_rating           model
0    1470     2873          1.0     2.022161  SVD (Surprise)
1    1974     3201          4.0     4.217697  SVD (Surprise)
2    2825     2384          5.0     3.837053  SVD (Surprise)
3     462     2640          3.0     3.146119  SVD (Surprise)
4    1937      858          5.0     4.142713  SVD (Surprise)
Final RMSE: 0.8701

Generating predictions for User 5549...


Predicting for user: 100%|██████████| 3673/3673 [00:00<00:00, 173473.70it/s]


Top 50 Recommendations for User 5549:
   movieId  \
0      911   
1     2905   
2     1262   
3     1207   
4     2019   
5      920   
6     1272   
7      913   
8     3338   
9      318   

                                                                 title  \
0                                                       Charade (1963)   
1                                                       Sanjuro (1962)   
2                                             Great Escape, The (1963)   
3                                         To Kill a Mockingbird (1962)   
4  Seven Samurai (The Magnificent Seven) (Shichinin no samurai) (1954)   
5                                            Gone with the Wind (1939)   
6                                                        Patton (1970)   
7                                           Maltese Falcon, The (1941)   
8                                               For All Mankind (1989)   
9                                     Shawshank Redemption, The (1




### **Model-Based Collaborative Filtering (ALS using PySpark)**

**Purpose:**
Use Alternating Least Squares (ALS) to learn latent user/item features at scale.

**Application:**
- Distributed recommendation system for large-scale datasets.
- Runs on Apache Spark for horizontal scalability.
- Handles sparsity well using factorization.
- Suited for real-time, production-level systems with massive data.


In [14]:
# ==============================
# Module 10: Model-Based Collaborative Filtering (ALS using PySpark)
# ==============================

from pyspark.sql import SparkSession
from pyspark.ml.recommendation import ALS
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.sql.functions import col
from pyspark.sql import Row
import pandas as pd

# --- Start Spark Session ---
spark = SparkSession.builder \
    .appName("ALSModel") \
    .getOrCreate()

# --- Load Ratings ---
ratings = pd.read_csv("ratings.dat", sep="::", engine="python",
                      names=["userId", "movieId", "rating", "timestamp"])
ratings_df = spark.createDataFrame(ratings[['userId', 'movieId', 'rating']])

# --- Train ALS Model ---
als = ALS(
    userCol="userId", itemCol="movieId", ratingCol="rating",
    rank=10, maxIter=10, regParam=0.1,
    coldStartStrategy="drop", nonnegative=True
)
als_model = als.fit(ratings_df)

# --- Evaluate ALS Model ---
predictions = als_model.transform(ratings_df)
pred_pd = predictions.select('userId', 'movieId', 'rating', 'prediction').toPandas()
pred_pd = pred_pd.rename(columns={'rating': 'true_rating', 'prediction': 'pred_rating'})
pred_pd['model'] = 'ALS (PySpark)'

# --- Evaluate ALS RMSE ---
evaluator = RegressionEvaluator(
    metricName='rmse',
    labelCol='rating',
    predictionCol='prediction'
)
rmse_score = evaluator.evaluate(predictions)

# --- Output Evaluation ---
print(pred_pd[['userId', 'movieId', 'true_rating', 'pred_rating', 'model']].head())
print(f"\nFinal RMSE: {rmse_score:.4f}")

# ==============================
# Step 6: Top-50 Predictions for User 5549
# ==============================

target_user = 5549
all_movie_ids = ratings['movieId'].unique()
rated_movie_ids = ratings[ratings['userId'] == target_user]['movieId'].unique()
unrated_movie_ids = list(set(all_movie_ids) - set(rated_movie_ids))

# Create Spark DataFrame of userId + unrated movieId pairs
user_unrated_pairs = spark.createDataFrame([Row(userId=target_user, movieId=int(mid)) for mid in unrated_movie_ids])

# Predict ratings using ALS model
print(f"\nGenerating Top-50 recommendations for User {target_user}...")
top_preds_df = als_model.transform(user_unrated_pairs).dropna()

# Get top-50 highest predicted ratings
top_50_preds = top_preds_df.orderBy(col("prediction").desc()).limit(50)
top_50_pd = top_50_preds.select("userId", "movieId", "prediction").toPandas()
top_50_pd['model'] = "ALS (PySpark)"
top_50_pd = top_50_pd.rename(columns={'prediction': 'pred_rating'})

# ==============================
# Step 7: Merge with Movie Titles Only
# ==============================

movies = pd.read_csv("movies_enriched_full.csv")[['movieId', 'title']]
top_50_pd = top_50_pd.merge(movies, on='movieId', how='left')

# ==============================
# Step 8: Output Top-50
# ==============================

print("\nTop 50 Recommendations for User 5549:")
print(top_50_pd[['movieId', 'title', 'pred_rating']].head(10))


   userId  movieId  true_rating  pred_rating          model
0     148     2122            4     2.768286  ALS (PySpark)
1     148     2142            4     3.385553  ALS (PySpark)
2     148     2366            5     3.547921  ALS (PySpark)
3     148     3175            5     3.873491  ALS (PySpark)
4     148     1580            4     4.024727  ALS (PySpark)

Final RMSE: 0.8357

Generating Top-50 recommendations for User 5549...

Top 50 Recommendations for User 5549:
   movieId                            title  pred_rating
0      572           Foreign Student (1994)     5.054198
1     1471               Boys Life 2 (1997)     4.919178
2     2760  Gambler, The (A Játékos) (1997)     4.464029
3      953     It's a Wonderful Life (1946)     4.359691
4     1519            Broken English (1996)     4.351626
5     2503          Apple, The (Sib) (1998)     4.329711
6     2129     Saltmen of Tibet, The (1997)     4.324127
7      912                Casablanca (1942)     4.317692
8      751      