<a href="https://colab.research.google.com/github/hawa1983/DATA-612/blob/main/Project_2_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Import Required Libraries

In [1]:
# This section imports all necessary libraries for data processing, similarity computation, evaluation, and visualization.
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity, pairwise_distances
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import normalize
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split


In [None]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import jaccard_score
from sklearn.preprocessing import normalize
from tqdm import tqdm

# --- Step 1: Load datasets ---
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv")
movies = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv")
print(f"Loaded {ratings['userId'].nunique()} users, {ratings['movieId'].nunique()} movies.")

# --- Step 2: Sample 10,000 users and save ratings ---
sampled_user_ids = ratings['userId'].drop_duplicates().sample(n=10000, random_state=42)
ratings_sampled = ratings[ratings['userId'].isin(sampled_user_ids)]
ratings_sampled.to_csv("ratings_sampled.csv", index=False)

# --- Step 3: Create user-movie matrix ---
user_movie_matrix = ratings_sampled.pivot(index='userId', columns='movieId', values='rating')
user_ids = user_movie_matrix.index.tolist()
user_means = user_movie_matrix.mean(axis=1)

# --- Step 4a: Cosine Similarity (User-Based) ---
user_movie_centered = user_movie_matrix.sub(user_means, axis=0).fillna(0)
cosine_sim_matrix = cosine_similarity(user_movie_centered.values)
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=user_ids, columns=user_ids)
cosine_sim_df.to_csv("cosine_user_similarity_sampled.csv")
print("Saved cosine user-user similarity matrix.")

# --- Step 4b: Jaccard Similarity (User-Based) ---
user_movie_binary = user_movie_matrix.notna().astype(int)
jaccard_sim_matrix = np.zeros((len(user_ids), len(user_ids)))
print("Computing Jaccard user-user similarity matrix...")

for i, user_i in enumerate(tqdm(user_ids)):
    for j in range(i, len(user_ids)):
        user_j = user_ids[j]
        sim = jaccard_score(user_movie_binary.loc[user_i], user_movie_binary.loc[user_j])
        jaccard_sim_matrix[i, j] = sim
        jaccard_sim_matrix[j, i] = sim

jaccard_sim_df = pd.DataFrame(jaccard_sim_matrix, index=user_ids, columns=user_ids)
jaccard_sim_df.to_csv("jaccard_user_similarity_sampled.csv")
print("Saved Jaccard user-user similarity matrix.")

# --- Step 5: Content-Based Similarity using Genre ---

# Identify genre columns (all one-hot columns after 'title' and 'movieId')
genre_cols = [col for col in movies.columns if col not in ['movieId', 'title']]
unique_movies = movies.copy().reset_index(drop=True)
movie_ids = unique_movies['movieId'].tolist()

# Normalize genre matrix for cosine similarity
genre_matrix = unique_movies[genre_cols].values
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

# --- Step 5a: Cosine Similarity (Content-Based) ---
cosine_content_sim = cosine_similarity(genre_matrix_normalized)
cosine_content_df = pd.DataFrame(cosine_content_sim, index=movie_ids, columns=movie_ids)
cosine_content_df.to_csv("cosine_content_similarity.csv")
print("Saved cosine content-based similarity matrix.")

# --- Step 5b: Jaccard Similarity (Content-Based) ---
jaccard_content_matrix = np.zeros((len(movie_ids), len(movie_ids)))
print("Computing Jaccard content-based similarity matrix...")

for i in tqdm(range(len(movie_ids))):
    for j in range(i, len(movie_ids)):
        sim = jaccard_score(genre_matrix[i], genre_matrix[j])
        jaccard_content_matrix[i, j] = sim
        jaccard_content_matrix[j, i] = sim

jaccard_content_df = pd.DataFrame(jaccard_content_matrix, index=movie_ids, columns=movie_ids)
jaccard_content_df.to_csv("jaccard_content_similarity.csv")
print("Saved Jaccard content-based similarity matrix.")


Loaded 59029 users, 11190 movies.
Saved cosine user-user similarity matrix.
Computing Jaccard user-user similarity matrix...


  0%|          | 2/10000 [01:03<87:52:28, 31.64s/it]

# 2. Load and Preprocess Data

- Downloads smaller, pre-filtered versions of the ratings and movies datasets from GitHub.
- These files contain fewer rows and are easier to work with in Colab (won’t crash memory).
- pd.read_csv() loads them into DataFrames named ratings and movies.

In [2]:
# Step 1: Load subset datasets
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv

ratings = pd.read_csv("ratings_subset.csv")
movies = pd.read_csv("movies_subset.csv")

# Step 2: Convert genres to multi-hot encoded format
# This block prepares genre data for content-based filtering.
# genres_list: Converts the genre string (e.g., 'Action|Adventure') into a Python list.
# all_genres: Builds a sorted list of all unique genres in the dataset.
# The loop creates a new column for each genre (multi-hot encoding):
# If a movie has that genre, it gets a 1, else 0.

movies['genres'] = movies['genres'].fillna('')
movies['genres_list'] = movies['genres'].apply(lambda x: x.split('|'))

all_genres = sorted(set(genre for sublist in movies['genres_list'] for genre in sublist))
for genre in all_genres:
    movies[genre] = movies['genres_list'].apply(lambda x: 1 if genre in x else 0)

# Step 3: Merge movie features with ratings
# Merges the processed movies DataFrame (now with genre vectors) with ratings.
# This results in movie_data, a dataset where each row contains:
    ## The user ID
    ## The movie's genre indicators (1s and 0s)
    ## The rating the user gave that movie

movie_data = pd.merge(movies.drop(columns=['genres_list']), ratings, on='movieId')

print("Shape of merged dataset:", movie_data.shape)
print(movie_data.head())


Shape of merged dataset: (100000, 26)
   movieId             title                                       genres  \
0        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
1        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
2        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
3        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   
4        1  Toy Story (1995)  Adventure|Animation|Children|Comedy|Fantasy   

   (no genres listed)  Action  Adventure  Animation  Children  Comedy  Crime  \
0                   0       0          1          1         1       1      0   
1                   0       0          1          1         1       1      0   
2                   0       0          1          1         1       1      0   
3                   0       0          1          1         1       1      0   
4                   0       0          1          1         1       1      0   

   ...  Musical  M

# 3. Content-Based Filtering Using Genre Vectors and Cosine Similarity

This code implements a **content-based recommender system** using movie genres. Each movie is represented as a binary (multi-hot) vector based on its associated genres (e.g., Action, Comedy, Drama). The steps include:

* Normalizing the genre vectors using **L2 norm** so that each vector has unit length.
* Calculating **cosine similarity** between movie vectors to measure how similar their genre compositions are.
* Creating a function that, given a movie title, returns the top-N most similar movies (excluding itself) based purely on genre similarity.

This technique does not rely on user ratings — instead, it recommends items that are similar in content (genre) to a given movie.

This code implements a **non-personalized content-based recommender system** using only movie genres. It does **not use user ratings or preferences**. Instead, it recommends movies that are **similar in genre** to a specified movie.

#### How It Works:

* Each movie is represented as a binary (multi-hot encoded) vector across genres (e.g., Action, Comedy, Drama).
* These vectors are **L2-normalized** so that all movies lie on a unit hypersphere — making **cosine similarity** an effective way to measure closeness.
* Given a movie title, the model:

  * Finds its genre vector.
  * Computes cosine similarity to all other movies.
  * Returns the top-N most similar movies (excluding itself).

#### What It Does Not Do:

* It does **not use any user data** (no `userId`, no ratings).
* There is **no personalization**. All users will get the same recommendations for a given movie.

#### Best Use Case:

This type of model is ideal when:

* You have **no user data** (cold start).
* You want to recommend movies **based on content alone** (e.g., genre-based similarity).
* You’re building a basic recommender system that can later be enhanced with collaborative filtering or hybrid techniques.




In [3]:
# --- Imports ---
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import random

# --- Step 1: Use unique movies for similarity computation ---
unique_movies = movies.copy().reset_index(drop=True)

# --- Step 2: Normalize genre matrix ---
genre_cols = all_genres  # Assumes 'all_genres' is your list of genre columns
genre_matrix = unique_movies[genre_cols].values
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

# --- Step 3: Create title-to-index mapping ---
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# --- Step 4: Recommendation Function Based on Genre Similarity ---
def get_recommendations(title, topN=20):
    if title not in movie_idx:
        return f"Movie '{title}' not found in dataset."

    idx = movie_idx[title]
    query_vector = genre_matrix_normalized[idx].reshape(1, -1)
    sim_scores = cosine_similarity(query_vector, genre_matrix_normalized)[0]

    # Rank and filter out the movie itself
    sim_scores = list(enumerate(sim_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:topN+1]

    # Output as list of (title, similarity score)
    recommendations = [(unique_movies['title'][i], score) for i, score in sim_scores]
    return recommendations

# --- Step 5: Use Fixed Target Movie ---
target_title = 'O.J.: Made in America (2016)'

# Print explanation
print("\nContent-based Recommendations using GENRE similarity (cosine distance):")

# Run recommendation
if target_title in movie_idx:
    print(f"\nTop 20 Movies Most Similar in Genre to '{target_title}':")
    for title, sim in get_recommendations(target_title):
        print(f"{title:<45} Similarity: {sim:.4f}")
else:
    print(f"Movie '{target_title}' not found in the dataset.")



Content-based Recommendations using GENRE similarity (cosine distance):

Top 20 Movies Most Similar in Genre to 'O.J.: Made in America (2016)':
Catwalk (1996)                                Similarity: 1.0000
Anne Frank Remembered (1995)                  Similarity: 1.0000
Man of the Year (1995)                        Similarity: 1.0000
Crumb (1994)                                  Similarity: 1.0000
Unzipped (1995)                               Similarity: 1.0000
Hoop Dreams (1994)                            Similarity: 1.0000
Wonderful, Horrible Life of Leni Riefenstahl, The (Macht der Bilder: Leni Riefenstahl, Die) (1993) Similarity: 1.0000
War Room, The (1993)                          Similarity: 1.0000
Celluloid Closet, The (1995)                  Similarity: 1.0000
Haunted World of Edward D. Wood Jr., The (1996) Similarity: 1.0000
Maya Lin: A Strong Clear Vision (1994)        Similarity: 1.0000
Synthetic Pleasures (1995)                    Similarity: 1.0000
Microcosmos (Microco

In [4]:
# Imports tools for normalizing feature vectors and computing similarity between them.
from sklearn.preprocessing import normalize
from sklearn.metrics.pairwise import cosine_similarity
import random

# Use only unique movie rows for similarity matrix
# Copies the movies DataFrame and resets the index to ensure each movie is uniquely indexed.
unique_movies = movies.copy().reset_index(drop=True)

# Normalize genre matrix
# Extracts the genre vectors for each movie (multi-hot encoded).
# Applies L2 normalization so that all genre vectors have a length of 1 (helps with cosine similarity).
genre_cols = all_genres
genre_matrix = unique_movies[genre_cols].values
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

# Create title-to-index map for unique movies
# Creates a dictionary-like mapping from movie titles to their corresponding row index — used to look up vector positions.
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# Define function to get recommendations
# Defines a function that takes a movie title and returns the top N most similar movies.
def get_recommendations(title, topN=20):
    if title not in movie_idx:
        return f"Movie '{title}' not found in dataset."

    idx = movie_idx[title]
    query_vector = genre_matrix_normalized[idx].reshape(1, -1)
    sim_scores = cosine_similarity(query_vector, genre_matrix_normalized)[0]

    # Enumerate and sort scores, excluding the movie itself
    sim_scores = list(enumerate(sim_scores))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:topN+1]

    # Format output: list of (title, similarity) tuples
    recommendations = [(unique_movies['title'][i], score) for i, score in sim_scores]
    return recommendations


# Sample 20 titles
print("Available sample titles:")
print(unique_movies['title'].sample(20, random_state=41).to_list())

# Randomly select a movie title from the available titles
random_title = random.choice(unique_movies['title'].to_list())

print(f"\n Randomly selected movie for recommendation: '{random_title}'")

# Explanation
print("\n Content-based Recommendations are based on GENRE similarity using cosine similarity between genre vectors.")

# Get recommendations
print(f"\nTop 20 Movies Most Similar in Genre to '{random_title}':")
for title, sim in get_recommendations(random_title):
    print(f"{title:<45} Similarity: {sim:.4f}")






Available sample titles:
['Atomica (2017)', 'Adventures in Babysitting (1987)', 'Big Picture, The (1989)', 'Annabelle (2014)', 'Beau Is Afraid (2023)', 'First Blood (Rambo: First Blood) (1982)', 'The Meg (2018)', 'Little Nemo: Adventures in Slumberland (1992)', 'Amen. (2002)', 'Danger: Diabolik (Diabolik) (1968)', 'Bugsy Malone (1976)', 'The Good Dinosaur (2015)', 'Goofy Movie, A (1995)', 'Man Called Horse, A (1970)', 'Terms and Conditions May Apply (2013)', 'StageFright: Aquarius (1987)', 'Shanghai Dreams (Qing hong) (2005)', 'I, Daniel Blake (2016)', "Amores Perros (Love's a Bitch) (2000)", "Cookie's Fortune (1999)"]

 Randomly selected movie for recommendation: 'Brave One, The (2007)'

 Content-based Recommendations are based on GENRE similarity using cosine similarity between genre vectors.

Top 20 Movies Most Similar in Genre to 'Brave One, The (2007)':
Amateur (1994)                                Similarity: 1.0000
Kiss of Death (1995)                          Similarity: 1.0000

# Hybrid Content-Based Rating Prediction Using Genre Similarity, User Behavior, and Fallback Handling

This code demonstrates a *hybrid recommendation system* that combines **content-based filtering using genre similarity** with **collaborative filtering using user-specific ratings**. The objective is to predict how much a user will like a movie they've never seen, based on the genres of that movie and their past rating behavior.

The prediction process incorporates a **fallback mechanism** and **debug printouts** to gracefully handle edge cases where standard hybrid predictions aren’t possible. These cases include users with no rating history, movies not present in the similarity matrix, or when no meaningful similarity is found.

#### How it Works:

1. **Genre Vector Normalization**:

   * The genre columns are multi-hot encoded (e.g., Action, Comedy, etc.).
   * Each movie’s genre vector is normalized using L2 norm so that cosine similarity is well-defined and scale-invariant.

2. **Genre-Based Similarity Matrix**:

   * Cosine similarity is computed between all pairs of movies based on genre vectors.

3. **Mapping Setup**:

   * The code builds lookup maps between `movieId` and its corresponding row index in the genre matrix to allow fast access.

4. **Hybrid Prediction Function**:

   * For a given `user_id` and `movie_id`, the function:

     * Retrieves all movies rated by the user.
     * Finds the top-K rated movies that are most genre-similar to the target movie.
     * Computes a **weighted average of the ratings**, where the weights are the genre similarity scores.
     * **If no such ratings or similarities are available**, the function **falls back to the global average rating of the movie**.
     * Each fallback trigger is logged with a `[Debug]` message.

5. **Application**:

   * The model is tested on a sample user and generates predicted ratings for movies that are most similar in genre to a reference movie (e.g., *Heat (1995)*).

This hybrid approach offers:

* Personalization from collaborative filtering.
* Interpretability from content-based features (genres).
* Robustness from fallback logic to handle cold-starts or sparse data situations.


In [5]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import random
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from sklearn.metrics.pairwise import cosine_similarity

# --- Step 1: Keep All Ratings (No User Filtering) ---
ratings_filtered = ratings.copy()

# Build user-movie matrix
user_movie_matrix = ratings_filtered.pivot(index='userId', columns='movieId', values='rating')

# --- Step 2: Prepare Genre Matrix ---
# Convert genre strings to lists
movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])

# Filter only movies present in ratings
valid_movie_ids = user_movie_matrix.columns
movies_filtered = movies[movies['movieId'].isin(valid_movie_ids)].copy()

# One-hot encode genres
mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_filtered['genres'])
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

# Create mappings
unique_movies = movies_filtered.reset_index(drop=True)
movieId_to_index = dict(zip(unique_movies['movieId'], unique_movies.index))
index_to_movieId = dict(zip(unique_movies.index, unique_movies['movieId']))
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# --- Step 3: Compute Cosine Similarity Between Genre Vectors ---
genre_sim_matrix = cosine_similarity(genre_matrix_normalized)

# --- Step 4: Hybrid Prediction Function with Fallbacks and Recommendation Message ---
def predict_rating_genre_weighted(user_id, target_movie_id, k=10):
    if user_id not in user_movie_matrix.index or target_movie_id not in movieId_to_index:
        print(f"[Debug] Invalid user_id {user_id} or movie_id {target_movie_id}. Returning NaN.")
        return np.nan

    user_ratings = user_movie_matrix.loc[user_id].dropna()
    if user_ratings.empty:
        print(f"[Fallback] User {user_id} has no ratings. Using global average for movieId {target_movie_id}.")
        pred = ratings_filtered[ratings_filtered['movieId'] == target_movie_id]['rating'].mean()
        print(f"[Recommendation] Predicted Rating: {pred:.2f} → {'Recommend' if pred >= 3.5 else 'Not Recommended'}")
        return pred

    target_idx = movieId_to_index[target_movie_id]
    rated_movie_indices = [movieId_to_index[mid] for mid in user_ratings.index if mid in movieId_to_index]

    if not rated_movie_indices:
        print(f"[Fallback] Rated movies not found for user {user_id}. Using global average.")
        pred = ratings_filtered[ratings_filtered['movieId'] == target_movie_id]['rating'].mean()
        print(f"[Recommendation] Predicted Rating: {pred:.2f} → {'Recommend' if pred >= 3.5 else 'Not Recommended'}")
        return pred

    sims = genre_sim_matrix[target_idx, rated_movie_indices]
    sims_series = pd.Series(sims, index=[index_to_movieId[i] for i in rated_movie_indices])

    top_similar = sims_series.sort_values(ascending=False).head(k)
    top_ratings = user_ratings[top_similar.index]

    # Debug logs for inspection
    print(f"\n[Debug] Similarity Weights for User {user_id} on Target Movie {target_movie_id}:")
    print(top_similar)
    print("[Debug] Corresponding Ratings:")
    print(top_ratings)

    weighted_sum = np.dot(top_similar.values, top_ratings.values)
    normalization = np.sum(top_similar.values)

    if normalization > 0:
        pred = weighted_sum / normalization
        print(f"[Prediction] Personalized prediction used for user {user_id} on movieId {target_movie_id}.")
        print(f"[Recommendation] Predicted Rating: {pred:.2f} → {'Recommend' if pred >= 3.5 else 'Not Recommended'}")
        return pred
    else:
        print(f"[Fallback] No similarity weights found. Using global average for movieId {target_movie_id}.")
        pred = ratings_filtered[ratings_filtered['movieId'] == target_movie_id]['rating'].mean()
        print(f"[Recommendation] Predicted Rating: {pred:.2f} → {'Recommend' if pred >= 3.5 else 'Not Recommended'}")
        return pred

# --- Step 5: Run with Fixed User and Movie ---

# Set static user and movie
user_id = 174949
target_movie = 'O.J.: Made in America (2016)'

print(f"Using user {user_id} for prediction.")
print(f"Target movie exists: '{target_movie}' →", target_movie in movie_idx)

if target_movie in movie_idx:
    idx = movie_idx[target_movie]
    sim_scores = cosine_similarity(genre_matrix_normalized[idx].reshape(1, -1), genre_matrix_normalized)[0]
    sim_indices = np.argsort(sim_scores)[::-1][1:11]  # Exclude the movie itself

    top_similar_movie_ids = unique_movies.loc[sim_indices, 'movieId']
    top_similar_titles = unique_movies.loc[sim_indices, 'title']

    print(f"\nTop 10 Genre-Similar Movies to '{target_movie}':")
    print(top_similar_titles)

    print(f"\nPredicted Ratings for User {user_id} Using Hybrid Genre-Based Model:\n")
    for movie_id, title in zip(top_similar_movie_ids, top_similar_titles):
        pred = predict_rating_genre_weighted(user_id=user_id, target_movie_id=movie_id, k=100)
        print(f"{title:<45} Predicted Rating: {pred:.2f}")
else:
    print("Target movie not found in index.")


Using user 174949 for prediction.
Target movie exists: 'O.J.: Made in America (2016)' → True

Top 10 Genre-Similar Movies to 'O.J.: Made in America (2016)':
11172    Indiana Jones: The Search for the Lost Golden ...
98                                          Catwalk (1996)
104                           Anne Frank Remembered (1995)
118                                 Man of the Year (1995)
139                                           Crumb (1994)
177                                        Unzipped (1995)
213                                     Hoop Dreams (1994)
315      Wonderful, Horrible Life of Leni Riefenstahl, ...
480                                   War Room, The (1993)
496                           Celluloid Closet, The (1995)
Name: title, dtype: object

Predicted Ratings for User 174949 Using Hybrid Genre-Based Model:


[Debug] Similarity Weights for User 174949 on Target Movie 287443:
1207    0.0
2671    0.0
dtype: float64
[Debug] Corresponding Ratings:
1207    5.0
2671    

**Hybrid Recommender: Genre-Weighted Collaborative Filtering**

This code predicts a user's rating for a movie by combining collaborative filtering and genre-based similarity. Here's how it works:

### **Step-by-Step Explanation**

**1. Data Preparation**

* It loads the `ratings` and `movies` datasets.
* The user-movie ratings matrix is built using `.pivot()` (rows = users, columns = movies, values = ratings).
* Movie genres are split and one-hot encoded using `MultiLabelBinarizer`.
* Genre vectors are normalized to enable cosine similarity comparison.

**2. Genre Similarity Calculation**

* Cosine similarity is computed between normalized genre vectors of all movies.
* This generates a matrix showing how similar each pair of movies is based on genre.

**3. `hybrid_predict()` Function:**
This is the main prediction function. Here's what it does:

* **Step 1**: Skips invalid user/movie inputs.
* **Step 2**: Loops through all other users (excluding the target user).
* **Step 3**: For each user, checks if they rated the target movie.
* **Step 4**: Collects that user's other rated movies and looks up genre similarity between those and the target movie.
* **Step 5**: Uses a weighted average of the other user's ratings on similar movies, weighted by genre similarity.
* **Step 6**: Averages all such weighted predictions from other users to generate the final prediction.
* **Fallback**: If no useful ratings are found, it falls back to the global average rating for the movie.

**4. Prediction Execution**

* The code sets `user_id = 174949` and `target_movie = 'O.J.: Made in America (2016)'`.
* It retrieves the `movieId` and runs the `hybrid_predict()` function.
* Finally, it prints the predicted rating for that user and movie.

*This approach combines user behavior (collaborative filtering) with genre-based content similarity to improve prediction accuracy, especially for sparse data or cold-start problems.*


# 1. Vectorized NumPy Logic – Genre-Based Hybrid Prediction

In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load subset datasets
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv

ratings = pd.read_csv("ratings_subset.csv")
movies = pd.read_csv("movies_subset.csv")

# Step 1: Prepare Data
ratings_filtered = ratings.copy()
user_movie_matrix = ratings_filtered.pivot(index='userId', columns='movieId', values='rating')

movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])
valid_movie_ids = user_movie_matrix.columns
movies_filtered = movies[movies['movieId'].isin(valid_movie_ids)].copy()

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_filtered['genres'])
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

unique_movies = movies_filtered.reset_index(drop=True)
movieId_to_index = dict(zip(unique_movies['movieId'], unique_movies.index))
index_to_movieId = dict(zip(unique_movies.index, unique_movies['movieId']))
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# Step 2: Vectorized Hybrid Prediction Function (Genre-only Weighted)
def vectorized_hybrid_predict(user_id, target_movie_id, k=10):
    if user_id not in user_movie_matrix.index or target_movie_id not in movieId_to_index:
        return np.nan

    target_idx = movieId_to_index[target_movie_id]
    sim_vector = cosine_similarity(genre_matrix_normalized[target_idx].reshape(1, -1), genre_matrix_normalized)[0]

    user_ratings = user_movie_matrix.loc[user_id].dropna()
    rated_movie_ids = user_ratings.index.intersection(user_movie_matrix.columns)
    rated_indices = [movieId_to_index[mid] for mid in rated_movie_ids if mid in movieId_to_index]

    sim_scores = sim_vector[rated_indices]
    ratings_values = user_ratings.loc[rated_movie_ids].values

    if len(sim_scores) == 0 or np.sum(sim_scores) == 0:
        return np.nan

    top_k_indices = np.argsort(sim_scores)[-k:]
    sim_top = sim_scores[top_k_indices]
    rating_top = ratings_values[top_k_indices]

    return np.dot(sim_top, rating_top) / np.sum(sim_top)

# Example Usage
# user_id = 174949
valid_user = None
for uid in user_movie_matrix.index:
    rated_movies = user_movie_matrix.loc[uid].dropna().index
    if rated_movies.intersection(movieId_to_index.keys()).any():
        valid_user = uid
        break  # Exit the loop immediately once a valid user is found

user_id = valid_user


target_movie = 'O.J.: Made in America (2016)'
target_movie_id = unique_movies.loc[movie_idx[target_movie], 'movieId']
pred = vectorized_hybrid_predict(user_id, target_movie_id, k=100)
print(f"Predicted rating for '{target_movie}' by user {user_id}: {pred:.2f}")


Predicted rating for 'O.J.: Made in America (2016)' by user 10: nan


# 2. Blended Hybrid (Genre + Collaborative Filtering) with Precomputed Hybrid Similarity

In [3]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer, normalize
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load subset datasets
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv
!wget -q https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv

# Step 1: Prepare Data
ratings_filtered = ratings.copy()
user_movie_matrix = ratings_filtered.pivot(index='userId', columns='movieId', values='rating')

movies['genres'] = movies['genres'].apply(lambda x: x.split('|') if isinstance(x, str) else [])
valid_movie_ids = user_movie_matrix.columns
movies_filtered = movies[movies['movieId'].isin(valid_movie_ids)].copy()

mlb = MultiLabelBinarizer()
genre_matrix = mlb.fit_transform(movies_filtered['genres'])
genre_matrix_normalized = normalize(genre_matrix, norm='l2')

unique_movies = movies_filtered.reset_index(drop=True)
movieId_to_index = dict(zip(unique_movies['movieId'], unique_movies.index))
index_to_movieId = dict(zip(unique_movies.index, unique_movies['movieId']))
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# Step 2: Compute Hybrid Similarity Matrix
genre_sim = cosine_similarity(genre_matrix_normalized)
user_movie_centered = user_movie_matrix.sub(user_movie_matrix.mean(axis=1), axis=0).fillna(0)
item_sim = cosine_similarity(user_movie_centered.T.fillna(0))

# Ensure both matrices are same shape
alpha = 0.5  # genre-collab blend weight
hybrid_sim = alpha * genre_sim + (1 - alpha) * item_sim

# Step 3: Prediction Function Using Hybrid Similarity
def blended_hybrid_predict(user_id, target_movie_id, k=10):
    if user_id not in user_movie_matrix.index or target_movie_id not in movieId_to_index:
        return np.nan

    target_idx = movieId_to_index[target_movie_id]
    sim_vector = hybrid_sim[target_idx]

    user_ratings = user_movie_matrix.loc[user_id].dropna()
    rated_movie_ids = user_ratings.index.intersection(user_movie_matrix.columns)
    rated_indices = [movieId_to_index[mid] for mid in rated_movie_ids if mid in movieId_to_index]

    sim_scores = sim_vector[rated_indices]
    ratings_values = user_ratings.loc[rated_movie_ids].values

    if len(sim_scores) == 0 or np.sum(sim_scores) == 0:
        return np.nan

    top_k_indices = np.argsort(sim_scores)[-k:]
    sim_top = sim_scores[top_k_indices]
    rating_top = ratings_values[top_k_indices]

    return np.dot(sim_top, rating_top) / np.sum(sim_top)

# Example Usage
user_id = 174949
target_movie = 'O.J.: Made in America (2016)'
target_movie_id = unique_movies.loc[movie_idx[target_movie], 'movieId']
pred = blended_hybrid_predict(user_id, target_movie_id, k=100)
print(f"Predicted rating for '{target_movie}' by user {user_id}: {pred:.2f}")


ValueError: Found array with 0 feature(s) (shape=(11190, 0)) while a minimum of 1 is required by the normalize function.

## Comparison of Overlapping and Divergent Recommendations

Both methods returned several overlapping recommendations, but also differed in meaningful ways:

#### Similar Recommendations from Both Methods

* **Assassins (1995)**
* **Net, The (1995)**

These consistent suggestions indicate that both the pure content-based and hybrid genre-weighted models identify core genre traits effectively.

#### Recommendations Unique to Each Method

**Only in Content-Based (Cosine Genre Similarity):**

* *Die Hard (1988)*
* *Batman (1989)*
* *U.S. Marshals (1998)*

**Only in Hybrid Model (Genre + Ratings Fallback):**

* *Sin City: A Dame to Kill For (2014)*
* *John Wick: Chapter Two (2017)*
* *Transporter 2 (2005)*

These differences show that the hybrid method is able to introduce newer or slightly more nuanced genre matches, even when rating data for a specific user is missing and fallback mechanisms are triggered.


# Optimized Jaccard Similarity for Content-Based Filtering

This block introduces a more efficient method for computing **Jaccard similarity** between movies based on their genre information. Unlike the traditional nested-loop approach, this implementation uses the `pdist()` function from `scipy.spatial.distance` to compute all pairwise Jaccard distances in a **fully vectorized** manner. The result is a symmetric similarity matrix, which is then used to identify the most similar movies to a given title. This optimization drastically reduces computation time and is highly recommended for medium-to-large datasets.

using `scipy.spatial.distance.pdist()` **does calculate all pairwise similarities**, but it does so much more efficiently than a manual loop.

Here’s how it works:

* `pdist(binary_matrix, metric='jaccard')` computes the **Jaccard distance** (which is `1 - Jaccard similarity`) between **all unique pairs** of rows (i.e., movies) in the binary genre matrix.
* The output is a **condensed distance matrix** — a flat array containing the upper triangle of the full pairwise distance matrix.
* This condensed matrix is converted back into a full square **symmetric matrix** using `squareform()`, giving us the distance between all pairs.
* We then compute similarity as `1 - distance`.

Every possible movie-to-movie similarity is calculated — but with optimized vectorized operations under the hood, which is much faster than nested Python loops.



In [5]:
from scipy.spatial.distance import pdist, squareform
import numpy as np
import pandas as pd

# Step 1: Prepare genre binary matrix
unique_movies = movies.copy().reset_index(drop=True)
genre_cols = all_genres
genre_matrix = unique_movies[genre_cols].astype(bool).astype(int).values  # ensure binary format

# Step 2: Compute Jaccard distance (1 - similarity)
# pdist returns a condensed distance matrix; squareform converts it to square form
jaccard_distance = pdist(genre_matrix, metric='jaccard')  # returns 1 - Jaccard similarity
jaccard_sim_matrix = 1 - squareform(jaccard_distance)      # convert to full similarity matrix

# Step 3: Create mapping from title to matrix index
movie_idx = pd.Series(unique_movies.index, index=unique_movies['title']).drop_duplicates()

# Step 4: Define recommendation function
def get_recommendations_jaccard(title, topN=10):
    if title not in movie_idx:
        return f"Movie '{title}' not found in dataset."

    idx = movie_idx[title]
    sim_scores = list(enumerate(jaccard_sim_matrix[idx]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:topN+1]  # exclude self
    top_indices = [i[0] for i in sim_scores]
    return unique_movies['title'].iloc[top_indices]

# Step 5: Try a sample movie
print("Sample titles:", unique_movies['title'].sample(5, random_state=42).to_list())
print(f"\nJaccard Recommendations for {random_title}:")
# print(get_recommendations_jaccard("Heat (1995)"))
print(get_recommendations_jaccard(random_title))


Sample titles: ['Murder on the Orient Express (2017)', 'Rhapsody in August (Hachi-gatsu no kyôshikyoku) (1991)', 'First Position (2011)', 'Wait Until Dark (1967)', 'Coffy (1973)']

Jaccard Recommendations for Youth (2015):
25                      Othello (1995)
30              Dangerous Minds (1995)
38     Cry, the Beloved Country (1995)
41                  Restoration (1995)
51                      Georgia (1995)
52        Home for the Holidays (1995)
57           Mr. Holland's Opus (1995)
62                     Two Bits (1995)
103           Margaret's Museum (1995)
108    Boys of St. Vincent, The (1992)
Name: title, dtype: object


# Comparison of Content-Based Recommendations: Cosine vs. Jaccard Similarity

Both the **cosine similarity** and **Jaccard similarity** methods returned *identical top-10 movie recommendations* for the query movie **"Heat (1995)"**. This indicates that in the context of the MovieLens genre-based content filtering:

* **Both methods effectively captured the same neighborhood of similar films**.
* The movies recommended (e.g., *Assassins*, *Die Hard*, *The Net*, *Natural Born Killers*) suggest that the genre combinations for these titles closely match those of *Heat (1995)*.
* While **cosine similarity** operates on normalized multi-hot vectors and measures angular proximity,
  **Jaccard similarity** measures the overlap in genre tags directly.

### Key Takeaway:

Despite their different mathematical underpinnings, both methods **produced the same results** because:

* The genre vectors are binary (multi-hot encoded), where normalization (in cosine) doesn’t distort information significantly.
* The dominant factor influencing similarity is the **overlap of genre labels**, which both metrics capture well.

However:

* **Cosine similarity is computationally faster** and more scalable.
* **Jaccard similarity is slower** when computed pairwise using loops, though vectorized solutions like `pdist()` improve it significantly.

You can safely use either in this binary genre context, but for large-scale systems, cosine is typically preferred for efficiency.


# User-User Collaborative Filtering with Bias Adjustment and Fallback Logic

This recommender system applies a user-user collaborative filtering approach enhanced with user and item bias adjustments and robust fallback logic to ensure stable and interpretable predictions. The method is designed to make personalized movie rating predictions even in cases of sparse data.

**1. Data Sampling and Matrix Construction**
A sample of 10,000 users is drawn randomly from the full ratings dataset to manage memory and computational requirements. A user-movie matrix is constructed using these ratings, where each cell represents the rating a user has given to a movie. The system computes average ratings per user (user bias) and per movie (item bias) to help model baseline tendencies.

**2. Centering and Similarity Calculation**
To isolate users' preferences from their general rating behavior, the user-movie matrix is centered by subtracting each user's average rating. This centered matrix is then used to calculate cosine similarity between users, generating a user-user similarity matrix that quantifies how closely users' preferences align.

**3. Predicting Ratings Using Top-k Neighbors**
To predict a rating for a given user and movie, the system:

* Identifies the top-k most similar users who have rated the target movie.
* Computes the deviation of these neighbors’ ratings from their respective means.
* Uses a weighted average of these deviations, weighted by similarity, and adds it to the target user’s mean to produce the prediction.

This formula is:

$$
\hat{r}_{u,i} = \mu_u + \frac{\sum_{v \in N(u)} \text{sim}(u,v) \cdot (r_{v,i} - \mu_v)}{\sum_{v \in N(u)} \text{sim}(u,v)}
$$

**4. Bias-Based Fallback Logic**
If the user has no similar neighbors who have rated the movie, or if the similarity weights sum to zero, the system falls back to a bias-based estimate:

$$
\hat{r}_{u,i} = \mu_u + \mu_i - \mu_{global}
$$

This combines the user’s and the item’s average rating, adjusted by subtracting the global mean to avoid double-counting. If either the user or item bias is unavailable, the system defaults to the global average rating.

**5. Clamping Predictions to Rating Scale**
All final predictions are clamped to the valid rating range \[0.5, 5.0] to ensure they remain realistic and consistent with actual rating values.

**6. Fallback Testing for Cold-Start Scenarios**
The system includes a test routine to simulate cold-start scenarios by selecting users who have not rated the target movie. This verifies that the fallback mechanism generates meaningful predictions even when minimal user-item interaction data is available.

This hybrid approach ensures personalized predictions while remaining resilient in sparse data conditions, making it suitable for practical recommender systems.


# Sampling and Computing Cosine Similarity for User-Based Collaborative Filtering

This code prepares a smaller, manageable dataset from a larger ratings file and computes a user-user cosine similarity matrix to be used in a recommender system. Each step has a clear purpose:

**1. Load Full Ratings Data**
*Purpose: To retrieve the entire dataset of user-movie ratings for processing.*
The code loads the full ratings dataset from a remote source and reports how many unique users and movies are present.

**2. Sample 10,000 Unique Users**
*Purpose: To reduce computational load by working with a representative subset of the data.*
The code randomly selects 10,000 unique users and filters the ratings dataset to include only those users. This sampled dataset is then saved for future use.

**3. Create User-Movie Matrix**
*Purpose: To structure the data into a matrix format suitable for similarity calculations.*
A pivot table is created where rows are users, columns are movies, and values are the corresponding ratings. This format allows for pairwise comparisons between users.

**4. Center Ratings**
*Purpose: To normalize user behavior by removing individual rating biases.*
The code subtracts each user's average rating from their rated movies. This centers the data around zero and ensures that similarity is based on rating patterns rather than absolute values.

**5. Compute Cosine Similarity**
*Purpose: To quantify how similar users are based on their centered rating patterns.*
Using the centered matrix, the cosine similarity is calculated between every pair of users. This measures how aligned users are in terms of their movie preferences.

**6. Save Similarity Matrix**
*Purpose: To preserve the computed similarity matrix for use in building and testing recommendation algorithms.*
The resulting cosine similarity matrix is converted into a labeled DataFrame and saved as a CSV file for later use in prediction models.

This process builds a scalable foundation for collaborative filtering by focusing on user similarity based on normalized preferences.


In [1]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import os
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load full ratings data
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv")
print(f"Loaded dataset with {ratings['userId'].nunique()} users and {ratings['movieId'].nunique()} movies.")

# Step 2: Sample 10,000 unique users
sampled_user_ids = ratings['userId'].drop_duplicates().sample(n=10000, random_state=42)
ratings_sampled = ratings[ratings['userId'].isin(sampled_user_ids)]
ratings_sampled.to_csv("ratings_sampled.csv", index=False)
print(f"Sampled dataset saved with {ratings_sampled['userId'].nunique()} users and {ratings_sampled['movieId'].nunique()} movies.")

# Step 3: Create user-movie matrix from sampled data
user_movie_matrix = ratings_sampled.pivot(index='userId', columns='movieId', values='rating')
user_means = user_movie_matrix.mean(axis=1)
user_ids = user_movie_matrix.index.tolist()

# Step 4: Center ratings
user_movie_centered = user_movie_matrix.sub(user_means, axis=0).fillna(0)

# Step 5: Compute cosine similarity
print("Computing cosine similarity matrix for sampled users...")
cosine_sim_matrix = cosine_similarity(user_movie_centered.values)

# Step 6: Convert to DataFrame and save
cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=user_ids, columns=user_ids)
cosine_sim_df.to_csv("cosine_user_similarity_sampled.csv")
print("Cosine similarity matrix saved as 'cosine_user_similarity_sampled.csv'.")


Loaded dataset with 59029 users and 11190 movies.
Sampled dataset saved with 10000 users and 4933 movies.
Computing cosine similarity matrix for sampled users...
Cosine similarity matrix saved as 'cosine_user_similarity_sampled.csv'.


In [2]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import os
from sklearn.metrics import jaccard_score
from tqdm import tqdm

# Step 1: Load full ratings data
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv")
print(f"Loaded dataset with {ratings['userId'].nunique()} users and {ratings['movieId'].nunique()} movies.")

# Step 2: Sample 10,000 unique users
sampled_user_ids = ratings['userId'].drop_duplicates().sample(n=10000, random_state=42)
ratings_sampled = ratings[ratings['userId'].isin(sampled_user_ids)]
ratings_sampled.to_csv("ratings_sampled.csv", index=False)
print(f"Sampled dataset saved with {ratings_sampled['userId'].nunique()} users and {ratings_sampled['movieId'].nunique()} movies.")

# Step 3: Create binary user-movie matrix (1 if rated, 0 if not)
user_movie_matrix = ratings_sampled.pivot(index='userId', columns='movieId', values='rating')
user_movie_binary = user_movie_matrix.notna().astype(int)
user_ids = user_movie_binary.index.tolist()

# Step 4: Compute Jaccard similarity
print("Computing Jaccard similarity matrix for sampled users...")

jaccard_sim_matrix = np.zeros((len(user_ids), len(user_ids)))

for i, user_i in enumerate(tqdm(user_ids)):
    for j in range(i, len(user_ids)):
        user_j = user_ids[j]
        sim = jaccard_score(user_movie_binary.loc[user_i], user_movie_binary.loc[user_j])
        jaccard_sim_matrix[i, j] = sim
        jaccard_sim_matrix[j, i] = sim  # symmetric

# Step 5: Convert to DataFrame and save
jaccard_sim_df = pd.DataFrame(jaccard_sim_matrix, index=user_ids, columns=user_ids)
jaccard_sim_df.to_csv("jaccard_user_similarity_sampled.csv")
print("Jaccard similarity matrix saved as 'jaccard_user_similarity_sampled.csv'.")


Loaded dataset with 59029 users and 11190 movies.
Sampled dataset saved with 10000 users and 4933 movies.
Computing Jaccard similarity matrix for sampled users...


  0%|          | 8/10000 [04:33<94:50:02, 34.17s/it]


KeyboardInterrupt: 

# User-Based Collaborative Filtering with Cosine Similarity and Bias Adjustment

This system predicts how a user might rate a movie they haven’t seen, using the behavior of similar users. It employs a user-based collaborative filtering approach, enhanced with cosine similarity and bias adjustment, and includes fallback logic to handle missing data. Below is a breakdown of the methodology with the purpose of each step.

**1. Data Preparation**
*Purpose: To structure the raw data into a usable format for similarity computation and rating prediction.*

* Loads the movie metadata and user ratings datasets.
* Constructs a user-movie matrix, where rows represent users and columns represent movies.
* Calculates:

  * Each user’s average rating (to normalize personal biases)
  * Each movie’s average rating (used in fallback logic)
  * The global average rating (used as a last-resort fallback)

**2. Similarity Matrix Handling**
*Purpose: To determine how similar each user is to every other user, based on shared rating behavior.*
This step ensures that a valid user-user cosine similarity matrix is available by following one of three approaches:

* **Check for a Local File:**
  If the matrix already exists on the local machine, it is loaded directly for efficiency.

* **Download from Cloud Storage:**
  If the local file is missing, the system attempts to download a precomputed matrix from Google Drive.

* **Compute Similarity Manually:**
  If downloading fails:

  * User ratings are centered by subtracting their individual means
  * Missing ratings are filled with zeros to allow matrix operations
  * Cosine similarity is computed between users
  * The resulting matrix is saved locally for future reuse

This three-step fallback ensures the system is flexible and always functional, regardless of file availability.

**3. Rating Prediction with Bias Adjustment**
*Purpose: To predict how a specific user would rate a specific movie using insights from similar users.*

* Identifies users who have rated the target movie.
* Measures similarity between the target user and those users using cosine similarity.
* Selects the top *k* most similar users (neighbors).
* Calculates how much each neighbor’s rating deviates from their average and weighs it by their similarity score.
* Adjusts the target user’s mean rating by the weighted deviation to produce a prediction.
* Applies fallback rules using movie mean or global mean if not enough neighbors are found or similarity is too low.
* Prediction is capped between 0.5 and 5.0 to stay within valid rating bounds.

**4. Random Test Pair Selection**
*Purpose: To automatically select a valid (user, movie) pair for prediction testing.*

* Randomly picks a user who has not rated a given movie.
* Ensures that at least *k* other users have rated the movie to allow meaningful prediction.
* Returns a user-movie pair for evaluation of the recommender system.

**5. Prediction Test and Fallback Demonstration**
*Purpose: To test and demonstrate the prediction capability and the fallback mechanism.*

* Predicts a rating for the selected user-movie pair using the cosine similarity method.
* Also tests a fallback scenario where a user has not rated the movie and may lack sufficient neighbor data.
* This helps verify that the system can return predictions even when data is sparse.

**Conclusion**
Each step in this system is designed to make the recommender engine both accurate and resilient. The approach prioritizes reusability and speed (by checking local files first), enhances prediction quality through bias correction, and ensures coverage with intelligent fallback strategies. The result is a scalable and dependable collaborative filtering system for personalized movie recommendations.


In [1]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import random
import os
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load full dataset (already sampled before upload)
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_sampled.csv")
movies = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv")
print("Data loaded successfully.")

# Step 2: Use all users (no sampling)
ratings_full = ratings.copy()
print(f"Using {ratings_full['userId'].nunique()} users and {ratings_full['movieId'].nunique()} movies.")

# Step 3: Create user-movie matrix
user_movie_matrix = ratings_full.pivot(index='userId', columns='movieId', values='rating')
user_means = user_movie_matrix.mean(axis=1)
item_means = user_movie_matrix.mean(axis=0)
global_mean = ratings_full['rating'].mean()
user_ids = user_movie_matrix.index.tolist()

# Step 4: Compute or load cosine similarity matrix
try:
    import gdown
except ImportError:
    import subprocess
    subprocess.check_call(["pip", "install", "gdown"])
    import gdown

cosine_sim_file_local = "cosine_user_similarity_sampled.csv"
cosine_file_drive_id = "1YMOWK5Acsf9hxDfPHng4k9T0AcO0aQtn"
gdown_url = f"https://drive.google.com/uc?id={cosine_file_drive_id}"

if not os.path.exists(cosine_sim_file_local):
    print("Cosine similarity file not found locally. Attempting download...")
    try:
        gdown.download(gdown_url, cosine_sim_file_local, quiet=False)
    except Exception as e:
        print("Download failed. Computing cosine similarity matrix...")
        user_movie_centered = user_movie_matrix.sub(user_means, axis=0).fillna(0)
        cosine_sim_matrix = cosine_similarity(user_movie_centered.values)
        cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=user_ids, columns=user_ids)
        cosine_sim_df.to_csv(cosine_sim_file_local)
        print("Cosine similarity matrix computed and saved locally.")
else:
    print("File already exists locally.")
    cosine_sim_df = pd.read_csv(cosine_sim_file_local, index_col=0)
    cosine_sim_df.columns = cosine_sim_df.columns.astype(int)
    cosine_sim_df.index = cosine_sim_df.index.astype(int)

# Load the full similarity matrix
cosine_sim_df = pd.read_csv(cosine_sim_file_local, index_col=0)
cosine_sim_df.columns = cosine_sim_df.columns.astype(int)
cosine_sim_df.index = cosine_sim_df.index.astype(int)

# Ensure matrix is restricted to the actual users in the data (in case of mismatches)
cosine_sim_df = cosine_sim_df.loc[user_ids, user_ids]

# Step 5: Define prediction function with bias fallback
def predict_user_user_cosine_with_bias(user_id, movie_id, k=10):
    if user_id not in user_movie_matrix.index or movie_id not in user_movie_matrix.columns:
        return global_mean

    user_mean = user_means[user_id]
    sims = cosine_sim_df[user_id]

    neighbors = user_movie_matrix[movie_id].dropna()
    neighbors = neighbors[neighbors.index != user_id]
    if neighbors.empty:
        return user_mean + item_means.get(movie_id, global_mean) - global_mean

    neighbor_sims = sims[neighbors.index]
    neighbor_means = user_means[neighbors.index]
    neighbor_ratings = neighbors

    top_neighbors = neighbor_sims.sort_values(ascending=False).head(k)
    top_ratings = neighbor_ratings[top_neighbors.index]
    top_means = neighbor_means[top_neighbors.index]

    deviations = top_ratings - top_means
    weighted_sum = np.dot(top_neighbors, deviations)
    sim_sum = np.abs(top_neighbors).sum()

    if sim_sum > 0:
        prediction = user_mean + (weighted_sum / sim_sum)
    else:
        prediction = user_mean + item_means.get(movie_id, global_mean) - global_mean

    return max(0.5, min(prediction, 5.0))

# Step 6: Find testable (user, movie) pair
def find_random_user_movie_pair(k=10):
    users = user_movie_matrix.index.tolist()
    random.shuffle(users)

    for user_id in users:
        rated = user_movie_matrix.loc[user_id].dropna().index
        unrated = user_movie_matrix.columns.difference(rated)
        unrated = unrated.tolist()
        random.shuffle(unrated)
        for movie_id in unrated:
            if user_movie_matrix[movie_id].count() > k:
                return user_id, movie_id
    return None, None


# Step 7: Run prediction test
user_id, movie_id = find_random_user_movie_pair(k=10)

if user_id and movie_id:
    pred = predict_user_user_cosine_with_bias(user_id, movie_id, k=10)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"\nPredicted rating for user {user_id} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")
else:
    print("No suitable user-movie pair found.")

# Step 8: Fallback test
def test_fallback_same_movie(movie_id, k=10):
    eligible_users = user_movie_matrix.index.difference(user_movie_matrix[movie_id].dropna().index)
    if eligible_users.empty:
        print("No eligible users for fallback.")
        return

    random_user = random.choice(eligible_users.tolist())
    pred = predict_user_user_cosine_with_bias(random_user, movie_id, k=k)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"[Fallback] Predicted rating for random user {random_user} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")

if movie_id:
    test_fallback_same_movie(movie_id, k=10)


Data loaded successfully.
Using 10000 users and 4933 movies.
Downloading cosine similarity matrix using gdown...


Downloading...
From (original): https://drive.google.com/uc?id=1YMOWK5Acsf9hxDfPHng4k9T0AcO0aQtn
From (redirected): https://drive.google.com/uc?id=1YMOWK5Acsf9hxDfPHng4k9T0AcO0aQtn&confirm=t&uuid=20e4f4d7-f6c4-4fda-bfdd-6496755249a0
To: /content/cosine_user_similarity_sampled.csv
100%|██████████| 401M/401M [00:03<00:00, 121MB/s]



Predicted rating for user 141770 on movie 'Outbreak (1995)' (movieId 292): 3.51
[Fallback] Predicted rating for random user 197720 on movie 'Outbreak (1995)' (movieId 292): 3.05


# Item-Item Collaborative Filtering Using Jaccard Similarity with Bias-Based Fallback

This methodology implements a recommender system based on **item-item collaborative filtering**. It leverages **Jaccard similarity** between items and incorporates a **bias-adjusted fallback mechanism** to produce robust rating predictions in sparse or cold-start scenarios. The approach focuses on whether users have interacted with items rather than how they rated them, making it suitable when explicit feedback is limited.

### 1. **Data Preparation and Sampling**

To reduce memory and computational overhead, a random sample of 10,000 users is extracted from the original ratings dataset. The system then constructs a **user-movie rating matrix**, where rows represent users, columns represent movies, and values represent ratings.

From this matrix, the following statistics are calculated:

* **User means** – average rating per user
* **Item means** – average rating per item
* **Global mean** – overall average rating in the dataset

These statistics serve as fallback predictors when sufficient similarity-based signals are not available.

### 2. **Binary Matrix and Jaccard Similarity**

A binary matrix is generated where:

* A value of 1 indicates that a user rated a movie.
* A value of 0 indicates no rating.

This binary matrix is transposed to form a **movie-user matrix**, which is used to compute **Jaccard similarity** between all pairs of movies:

$$
\text{Jaccard}(A, B) = \frac{|A \cap B|}{|A \cup B|}
$$

Where:

* $A$ and $B$ are sets of users who rated movies A and B, respectively.
* The intersection represents the number of users who rated both.
* The union represents users who rated either.

To improve efficiency:

* The similarity matrix is cached to a local file.
* If the file exists or is downloadable from Google Drive, it's reused to avoid recomputation.

### 3. **Item-Item Rating Prediction with Bias Adjustment**

To predict a user’s rating for a movie using **item-item collaborative filtering**, the algorithm follows these steps:

1. Retrieve all movies the user has rated.
2. Compute similarity scores between the target movie and these rated movies.
3. Select the top-k most similar movies.
4. Take a **similarity-weighted average** of the user's ratings for those movies:

$$
\hat{r}_{u,i} = \frac{\sum_{j \in N(i)} \text{sim}(i,j) \cdot r_{u,j}}{\sum_{j \in N(i)} \text{sim}(i,j)}
$$

If the denominator (sum of similarities) is zero, indicating no informative neighbors, the system falls back to a **bias-adjusted estimate**:

$$
\hat{r}_{u,i} = \mu_u + \mu_i - \mu_{\text{global}}
$$

This combines the user’s average rating ($\mu_u$) and the item’s average rating ($\mu_i$), offset by the global average to reduce bias accumulation.

All predictions are **clamped** to the valid rating range $0.5, 5.0$.

### 4. **Cold-Start and Fallback Simulation**

A fallback test is included to simulate **cold-start scenarios**, where a user has not rated the target movie. In such cases:

* A random user who hasn’t rated the movie is selected.
* The prediction function is run with fallback logic engaged.
* This ensures the system remains functional even in sparse user-item interaction environments.

### 5. **Conclusion**

This hybrid item-item collaborative filtering system blends **Jaccard-based similarity** with **statistical bias correction**, ensuring:

* Interpretability from co-engagement patterns.
* Resilience to sparse data.
* Compatibility with binary interaction datasets.

It is well-suited for systems where users’ presence or absence (rather than rating intensity) carries the signal of interest — such as click, view, or purchase histories.


In [52]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import random
import os
from sklearn.metrics import jaccard_score


# Step 1: Load data
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv")
movies = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv")
print("Data loaded successfully.")

# Step 2: Sample 10,000 users
sampled_user_ids = ratings['userId'].drop_duplicates().sample(n=10000, random_state=41)
ratings_small = ratings[ratings['userId'].isin(sampled_user_ids)]
print(f"Using {ratings_small['userId'].nunique()} users and {ratings_small['movieId'].nunique()} movies.")

# Step 3: Create user-movie matrix
user_movie_matrix = ratings_small.pivot(index='userId', columns='movieId', values='rating')
user_means = user_movie_matrix.mean(axis=1)
item_means = user_movie_matrix.mean(axis=0)
global_mean = ratings_small['rating'].mean()

# Step 4: Convert to binary matrix (rated=1, unrated=0), then transpose
movie_user_binary = user_movie_matrix.notna().astype(int).T
movie_ids = movie_user_binary.index.tolist()

# Step 5: Load or compute Jaccard similarity matrix
# sim_file = "jaccard_similarity.csv"

# Step 1: Try loading Jaccard similarity matrix from Google Drive
sim_file_drive_id = "1z-VAYMQF9ZQSuJgkflDg3vJH9m60jwr8"
sim_file_drive_url = f"https://drive.google.com/uc?export=download&id={sim_file_drive_id}"
sim_file_local = "jaccard_similarity.csv"

try:
    print("Trying to load Jaccard similarity matrix from Google Drive...")
    jaccard_sim_df = pd.read_csv(sim_file_drive_url, index_col=0)
    jaccard_sim_df.columns = jaccard_sim_df.columns.astype(str).astype(int)
    jaccard_sim_df.index = jaccard_sim_df.index.astype(str).astype(int)
    print("Successfully loaded Jaccard similarity matrix from Google Drive.")
except Exception as e:
    if os.path.exists(sim_file_local):
        print("Failed to load from Drive. Loading from local file...")
        jaccard_sim_df = pd.read_csv(sim_file_local, index_col=0)
        jaccard_sim_df.columns = jaccard_sim_df.columns.astype(str).astype(int)
        jaccard_sim_df.index = jaccard_sim_df.index.astype(str).astype(int)
    else:
        print("Computing Jaccard similarity matrix...")
        binary_array = movie_user_binary.values.astype(bool)
        intersection = np.dot(binary_array, binary_array.T)
        row_sums = binary_array.sum(axis=1, keepdims=True)
        union = row_sums + row_sums.T - intersection
        jaccard_sim_matrix = intersection / np.maximum(union, 1)
        jaccard_sim_df = pd.DataFrame(jaccard_sim_matrix, index=movie_ids, columns=movie_ids)
        jaccard_sim_df.to_csv(sim_file_local)
        print("Jaccard similarity matrix computed and saved locally.")


# if os.path.exists(sim_file):
#     print("Loading Jaccard similarity matrix from file...")
#     jaccard_sim_df = pd.read_csv(sim_file, index_col=0)
#     jaccard_sim_df.columns = jaccard_sim_df.columns.astype(str).astype(int)
#     jaccard_sim_df.index = jaccard_sim_df.index.astype(str).astype(int)
# else:
#     print("Computing Jaccard similarity matrix...")
#     binary_array = movie_user_binary.values.astype(bool)
#     intersection = np.dot(binary_array, binary_array.T)
#     row_sums = binary_array.sum(axis=1, keepdims=True)
#     union = row_sums + row_sums.T - intersection
#     jaccard_sim_matrix = intersection / np.maximum(union, 1)
#     jaccard_sim_df = pd.DataFrame(jaccard_sim_matrix, index=movie_ids, columns=movie_ids)
#     jaccard_sim_df.to_csv(sim_file)
#     print("Jaccard similarity matrix computed and saved to file.")

# Step 6: Define prediction function
def predict_item_item_jaccard_with_bias(user_id, movie_id, k=10):
    if user_id not in user_movie_matrix.index or movie_id not in user_movie_matrix.columns:
        return global_mean

    user_ratings = user_movie_matrix.loc[user_id].dropna()
    if user_ratings.empty or movie_id not in jaccard_sim_df.index:
        return global_mean

    sims = jaccard_sim_df.loc[movie_id, user_ratings.index]
    top_items = sims.sort_values(ascending=False).head(k)
    top_ratings = user_ratings[top_items.index]

    if top_items.sum() > 0:
        prediction = np.dot(top_items, top_ratings) / top_items.sum()
    else:
        prediction = user_means.get(user_id, global_mean) + item_means.get(movie_id, 0) - global_mean

    return max(0.5, min(prediction, 5.0))

# Step 7: Find a predictable pair
def find_predictable_pair(k=10):
    for user_id in user_movie_matrix.index:
        rated = user_movie_matrix.loc[user_id].dropna().index
        unrated = user_movie_matrix.columns.difference(rated)
        for movie_id in unrated:
            if user_movie_matrix[movie_id].count() > k:
                return user_id, movie_id
    return None, None

# Step 8: Test prediction
user_id, movie_id = find_predictable_pair(k=10)
if user_id and movie_id:
    pred = predict_item_item_jaccard_with_bias(user_id, movie_id, k=10)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"\nPredicted rating for user {user_id} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")
else:
    print("No suitable user-movie pair found.")

# Step 9: Fallback test
def test_bias_fallback_same_movie(movie_id, k=10):
    eligible_users = user_movie_matrix.index.difference(user_movie_matrix[movie_id].dropna().index)
    if eligible_users.empty:
        print("No eligible users for fallback.")
        return
    random_user = random.choice(eligible_users.tolist())
    pred = predict_item_item_jaccard_with_bias(random_user, movie_id, k=k)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"[Fallback] Predicted rating for random user {random_user} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")

if movie_id:
    test_bias_fallback_same_movie(movie_id, k=10)


Data loaded successfully.
Using 10000 users and 5036 movies.
Trying to load Jaccard similarity matrix from Google Drive...
Successfully loaded Jaccard similarity matrix from Google Drive.

Predicted rating for user 34 on movie 'Toy Story (1995)' (movieId 1): 4.09
[Fallback] Predicted rating for random user 146618 on movie 'Toy Story (1995)' (movieId 1): 3.59


# Item-Item Collaborative Filtering Using Cosine Similarity with Bias-Aware Fallback

This recommender system employs item-item collaborative filtering powered by cosine similarity and enhanced with bias-aware fallback logic. It is designed to deliver personalized movie rating predictions while ensuring robustness in cases of sparse data or cold-start users.

**1. Data Preparation and Sampling**
A random sample of 10,000 users is selected from the ratings dataset to reduce computational load. A user-movie matrix is then constructed with user IDs as rows, movie IDs as columns, and rating values as the matrix entries.
From this matrix, the following statistics are computed:

* User mean: the average rating each user gives
* Item mean: the average rating each movie receives
* Global mean: the overall average rating across all users and movies

These serve as the baseline for fallback predictions.

**2. Cosine Similarity Matrix Construction**
The user-movie matrix is transposed to obtain a movie-user matrix. Missing values are filled with zeros so that cosine similarity can be calculated between every pair of movies. Cosine similarity measures how similar two movies are based on users who rated them both, using the formula:

cosine(A, B) = (A ⋅ B) / (||A|| × ||B||)

Where A and B are rating vectors for two movies.
To avoid recomputation, the similarity matrix is saved locally or loaded from a Google Drive file when available.

**3. Predicting Ratings Using Top-k Similar Movies**
To estimate how a user would rate a movie they haven’t seen, the system:

* Identifies the set of movies the user has already rated
* Retrieves cosine similarities between the target movie and those rated movies
* Selects the top-k most similar movies
* Calculates a weighted average of the user’s ratings for those top-k movies using their similarity scores as weights

The predicted rating is computed as:

r̂(u,i) = ∑ sim(i,j) × r(u,j) / ∑ sim(i,j)

If the similarity weights sum to zero, the system triggers the fallback mechanism.

**4. Bias-Based Fallback Strategy**
When there are no similar movies rated by the user, or if similarity weights are zero, a bias-aware fallback formula is used:

r̂(u,i) = μ\_u + μ\_i − μ\_global

Where:

* μ\_u is the user’s average rating
* μ\_i is the movie’s average rating
* μ\_global is the global average rating

This ensures that even without similarity-based support, the model can make meaningful predictions. All predictions are clamped to the valid range \[0.5, 5.0].

**5. Cold-Start Testing and Prediction Validation**
The system includes functionality to identify suitable user-movie pairs for prediction testing, as well as simulate cold-start conditions by selecting users who haven’t rated a specific movie. This allows for evaluation of the fallback mechanism under realistic sparse data scenarios.

**Conclusion**
This item-item collaborative filtering approach with cosine similarity provides an interpretable and resilient recommendation system. By incorporating user, item, and global bias in its fallback logic, the model remains functional and reliable even in the absence of strong similarity signals.


In [51]:
# --- Required Libraries ---
import pandas as pd
import numpy as np
import random
import os
from sklearn.metrics.pairwise import cosine_similarity

# Step 1: Load data
ratings = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/ratings_subset.csv")
movies = pd.read_csv("https://raw.githubusercontent.com/hawa1983/DATA-612/refs/heads/main/movies_subset.csv")
print("Data loaded successfully.")

# Step 2: Sample 10,000 users
sampled_user_ids = ratings['userId'].drop_duplicates().sample(n=10000, random_state=41)
ratings_small = ratings[ratings['userId'].isin(sampled_user_ids)]
print(f"Using {ratings_small['userId'].nunique()} users and {ratings_small['movieId'].nunique()} movies.")

# Step 3: Create user-movie matrix
user_movie_matrix = ratings_small.pivot(index='userId', columns='movieId', values='rating')
user_means = user_movie_matrix.mean(axis=1)
item_means = user_movie_matrix.mean(axis=0)
global_mean = ratings_small['rating'].mean()

# Step 4: Transpose and fill NA with 0 for item-item similarity
movie_user_matrix = user_movie_matrix.T.fillna(0)
movie_ids = movie_user_matrix.index.tolist()

# Step 5: Load or compute cosine similarity matrix
# Google Drive File ID for cosine similarity matrix
cosine_file_drive_id = "1z-VAYMQF9ZQSuJgkflDg3vJH9m60jwr8"
cosine_file_drive_url = f"https://drive.google.com/uc?export=download&id={cosine_file_drive_id}"
cosine_sim_file_local = "cosine_similarity.csv"

try:
    print("Trying to load cosine similarity matrix from Google Drive...")
    cosine_sim_df = pd.read_csv(cosine_file_drive_url, index_col=0)
    cosine_sim_df.columns = cosine_sim_df.columns.astype(int)
    cosine_sim_df.index = cosine_sim_df.index.astype(int)
    print("Successfully loaded cosine similarity matrix from Google Drive.")
except Exception as e:
    if os.path.exists(cosine_sim_file_local):
        print("Failed to load from Drive. Loading from local file...")
        cosine_sim_df = pd.read_csv(cosine_sim_file_local, index_col=0)
        cosine_sim_df.columns = cosine_sim_df.columns.astype(int)
        cosine_sim_df.index = cosine_sim_df.index.astype(int)
    else:
        print("Computing cosine similarity matrix...")
        cosine_sim_matrix = cosine_similarity(movie_user_matrix.values)
        cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=movie_ids, columns=movie_ids)
        cosine_sim_df.to_csv(cosine_sim_file_local)
        print("Cosine similarity matrix computed and saved locally.")

# sim_file = "cosine_item_similarity.csv"
# if os.path.exists(sim_file):
#     print("Loading cosine similarity matrix from file...")
#     cosine_sim_df = pd.read_csv(sim_file, index_col=0)
#     cosine_sim_df.columns = cosine_sim_df.columns.astype(int)
#     cosine_sim_df.index = cosine_sim_df.index.astype(int)
# else:
#     print("Computing cosine similarity matrix...")
#     cosine_sim_matrix = cosine_similarity(movie_user_matrix.values)
#     cosine_sim_df = pd.DataFrame(cosine_sim_matrix, index=movie_ids, columns=movie_ids)
#     cosine_sim_df.to_csv(sim_file)
#     print("Cosine similarity matrix computed and saved to file.")

# Step 6: Define prediction function
def predict_item_item_cosine_with_bias(user_id, movie_id, k=10):
    if user_id not in user_movie_matrix.index or movie_id not in user_movie_matrix.columns:
        return global_mean

    user_ratings = user_movie_matrix.loc[user_id].dropna()
    if user_ratings.empty or movie_id not in cosine_sim_df.index:
        return global_mean

    sims = cosine_sim_df.loc[movie_id, user_ratings.index]
    top_items = sims.sort_values(ascending=False).head(k)
    top_ratings = user_ratings[top_items.index]

    if top_items.sum() > 0:
        prediction = np.dot(top_items, top_ratings) / top_items.sum()
    else:
        prediction = user_means.get(user_id, global_mean) + item_means.get(movie_id, 0) - global_mean

    return max(0.5, min(prediction, 5.0))

# Step 7: Find a predictable pair
def find_predictable_pair(k=10):
    for user_id in user_movie_matrix.index:
        rated = user_movie_matrix.loc[user_id].dropna().index
        unrated = user_movie_matrix.columns.difference(rated)
        for movie_id in unrated:
            if user_movie_matrix[movie_id].count() > k:
                return user_id, movie_id
    return None, None

# Step 8: Test prediction
user_id, movie_id = find_predictable_pair(k=10)
if user_id and movie_id:
    pred = predict_item_item_cosine_with_bias(user_id, movie_id, k=10)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"\nPredicted rating for user {user_id} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")
else:
    print("No suitable user-movie pair found.")

# Step 9: Fallback test
def test_bias_fallback_same_movie(movie_id, k=10):
    eligible_users = user_movie_matrix.index.difference(user_movie_matrix[movie_id].dropna().index)
    if eligible_users.empty:
        print("No eligible users for fallback.")
        return
    random_user = random.choice(eligible_users.tolist())
    pred = predict_item_item_cosine_with_bias(random_user, movie_id, k=k)
    movie_title = movies[movies['movieId'] == movie_id]['title'].values[0]
    print(f"[Fallback] Predicted rating for random user {random_user} on movie '{movie_title}' (movieId {movie_id}): {pred:.2f}")

if movie_id:
    test_bias_fallback_same_movie(movie_id, k=10)


Data loaded successfully.
Using 10000 users and 5036 movies.
Trying to load cosine similarity matrix from Google Drive...
Successfully loaded cosine similarity matrix from Google Drive.

Predicted rating for user 34 on movie 'Toy Story (1995)' (movieId 1): 4.09
[Fallback] Predicted rating for random user 110110 on movie 'Toy Story (1995)' (movieId 1): 4.09


# 6. Evaluation: RMSE Comparison

In [None]:
# Split data for evaluation
train, test = train_test_split(ratings, test_size=0.2, random_state=42)

# Predict ratings using both collaborative methods
user_preds = test.apply(lambda row: predict_user_user(row['userId'], row['movieId']), axis=1)
item_preds = test.apply(lambda row: predict_item_item(row['userId'], row['movieId']), axis=1)

# Calculate RMSE
user_rmse = np.sqrt(mean_squared_error(test['rating'].dropna(), user_preds.dropna()))
item_rmse = np.sqrt(mean_squared_error(test['rating'].dropna(), item_preds.dropna()))

# Plot RMSE comparison
plt.figure(figsize=(8, 4))
plt.bar(['User-User (Pearson)', 'Item-Item (Adjusted Cosine)'], [user_rmse, item_rmse], color=['blue', 'green'])
plt.title("RMSE Comparison of Collaborative Filtering Models")
plt.ylabel("RMSE")
plt.show()


# 7. Summary Output

In [None]:
print("\n--- Summary ---")
print(f"User-User RMSE (Pearson): {user_rmse:.4f}")
print(f"Item-Item RMSE (Adjusted Cosine): {item_rmse:.4f}")
print("Content-Based filtering used L2-normalized cosine similarity on genre vectors.")
print("User-user filtering used Pearson correlation and centered ratings.")
print("Item-item filtering used adjusted cosine similarity with user-centered item vectors.")
