# (PSL) Project 4: Movie Recommender System

Members:
- Amy Hwang (ahwang22)
- Christian Tam (cmtam2)
- Monil Kaneria (kaneria2)

Amy Hwang worked on all parts of the HTML file and application.

Monil Kaneria worked on the System I, System II, and the myIBCF function of the HTML file.

The web link to the movie recommendation application: https://cs598-psl-ahwang22-project4.streamlit.app

In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore", message="Thread 'MainThread': missing ScriptRunContext!")
from sklearn.metrics.pairwise import cosine_similarity
import streamlit as st

In [2]:
# Load the rating matrix from CSV, using the first CSV column as the row index.
# Rows are looked up by user id (e.g. "u1181") and columns by movie id
# (e.g. "m1613") later in this file; unrated entries are NaN.
rating_matrix = pd.read_csv("I-w9Wo-HSzmUGNNHw0pCzg_bc290b0e6b3a45c19f62b1b82b1699f1_Rmat.csv", index_col=0)

# System I: Recommendation Based on Popularity

In [3]:
def recommend_popular(rating_matrix, top_n=10):
    """Rank movies by popularity and return the best ``top_n``.

    Popularity of a movie is its mean observed rating multiplied by the
    number of observed (non-NaN) ratings in its column.

    Args:
        rating_matrix: DataFrame with one column per movie; NaN marks a
            missing rating.
        top_n: Number of movies to return.

    Returns:
        A tuple ``(movie_ids, scores)``: a list of column labels ordered from
        most to least popular, and a NumPy array of the matching scores.
    """
    def _popularity(ratings):
        # Mean over observed ratings, weighted by how many were observed.
        return ratings.mean(skipna=True) * ratings.notna().sum()

    scores_by_movie = rating_matrix.apply(_popularity, axis=0)
    ranked = scores_by_movie.sort_values(ascending=False)
    best = ranked.head(top_n)

    return best.index.tolist(), best.values

In [4]:
# Rank every movie by popularity and persist the ordering for the fallback
# used by myIBCF. The number of movies is the column count (shape[1]);
# shape[0] is the number of users and would silently truncate the ranking
# whenever there are fewer users than movies.
all_popular_ranking, scores = recommend_popular(rating_matrix, rating_matrix.shape[1])
all_rank_df = pd.DataFrame(all_popular_ranking)
all_rank_df.to_csv("all_popular_ranking.csv")

# System II: IBCF-based Recommendations

### Step 1: Normalize the rating matrix by centering each row.

In [5]:
# Normalize rows by subtracting the mean
def normalize_matrix(matrix):
    """Center each row of ``matrix`` on its own mean.

    Args:
        matrix: DataFrame whose rows are centered independently; NaN entries
            are skipped when computing the row mean and remain NaN.

    Returns:
        A new DataFrame of the same shape with the row mean subtracted from
        every entry of that row.
    """
    centered = matrix.sub(matrix.mean(axis=1, skipna=True), axis="index")
    return centered

### Step 2: Compute the cosine similarity. We ignore similarities computed from fewer than three users who rated both movies.

In [19]:
def compute_cosine_similarity_optimized(matrix):
    """Build the movie-by-movie similarity matrix from a rating matrix.

    The ratings are first row-centered with ``normalize_matrix``; each
    movie column is then compared against all columns with
    ``cosine_similarity_helper``. Diagonal entries (a movie with itself) are
    set to NaN.

    Args:
        matrix: Rating DataFrame with one column per movie.

    Returns:
        A square DataFrame indexed and labelled by the columns of ``matrix``.
    """
    matrix = normalize_matrix(matrix)
    movies = matrix.columns
    n_movies = len(movies)

    # Fill a plain NumPy buffer and wrap it once at the end. Writing through
    # ``DataFrame.values`` is not reliable (it is read-only under pandas
    # Copy-on-Write), and row-wise ``.loc`` assignment is much slower.
    scores = np.full((n_movies, n_movies), np.nan)
    for row_pos, col in enumerate(movies):
        cs = cosine_similarity_helper(matrix[col], matrix)
        # Align on movie labels, as the label-based assignment did before.
        scores[row_pos, :] = cs.reindex(movies).to_numpy(dtype=float)

    np.fill_diagonal(scores, np.nan)

    return pd.DataFrame(scores, index=movies, columns=movies)

In [7]:
def cosine_similarity_helper(movie, all_movies):
    """Compare one movie column against every column of ``all_movies``.

    For each column, only rows where both the given movie and that column
    are non-NaN are used. Columns sharing fewer than three such rows get no
    usable data, so their result is NaN (0 / 0). The cosine is rescaled with
    ``0.5 * (1 + cos)``.

    Args:
        movie: Series holding one movie's values, row-aligned with
            ``all_movies``.
        all_movies: DataFrame with one column per movie.

    Returns:
        A Series indexed by the columns of ``all_movies`` with the rescaled
        similarity to ``movie``.
    """
    # Repeat the single movie column so it has the same shape as all_movies.
    tiled = all_movies.copy()
    tiled[:] = movie.values[:, None]
    tiled.columns = all_movies.columns

    # Rows rated for both movies; drop whole columns with under 3 such rows.
    co_rated = tiled.notna().values & all_movies.notna().values
    too_few = co_rated.sum(axis=0) < 3
    co_rated[:, too_few] = False

    # Zero out everything outside the co-rated rows so it does not contribute.
    left = tiled.where(co_rated).fillna(0)
    right = all_movies.where(co_rated).fillna(0)

    numerator = (left * right).sum()
    left_norm = np.sqrt(np.sum(left**2, axis=0))
    right_norm = np.sqrt(np.sum(right**2, axis=0))

    return 0.5 * (1 + numerator / (left_norm * right_norm))

In [20]:
# Load the rating matrix (first CSV column becomes the row index)
rating_matrix = pd.read_csv("I-w9Wo-HSzmUGNNHw0pCzg_bc290b0e6b3a45c19f62b1b82b1699f1_Rmat.csv", index_col=0)

# Compute the full movie-by-movie similarity matrix (diagonal is NaN)
similarity_matrix = compute_cosine_similarity_optimized(rating_matrix)

# Save the untruncated similarity matrix to CSV
similarity_matrix.to_csv("similarity_matrix.csv")

#### Pairwise similarity values from the S matrix for these movies: "m1", "m10", "m100", "m1510", "m260", "m3212".

In [24]:
# Show floats with seven decimal places. NOTE: this changes the pandas display
# option globally, so it also affects anything printed after this point.
pd.set_option('display.float_format', '{:.7f}'.format)
# Print the 6x6 block of pairwise similarities for the selected movies.
print(similarity_matrix.loc[["m1", "m10", "m100", "m1510", "m260", "m3212"],["m1", "m10", "m100", "m1510", "m260", "m3212"]])

             m1       m10      m100  m1510      m260  m3212
m1          NaN 0.5121055 0.3919999    NaN 0.7411482    NaN
m10   0.5121055       NaN 0.5474583    NaN 0.5343338    NaN
m100  0.3919999 0.5474583       NaN    NaN 0.3296943    NaN
m1510       NaN       NaN       NaN    NaN       NaN    NaN
m260  0.7411482 0.5343338 0.3296943    NaN       NaN    NaN
m3212       NaN       NaN       NaN    NaN       NaN    NaN


### Step 3: Keep the top 30 similarities in each row, setting the rest to NA. 

In [22]:
def keep_top_n(row, n=30):
    """Keep the ``n`` largest values of ``row`` and blank out the rest.

    Args:
        row: Series of values; NaN entries are never selected.
        n: How many of the largest values to keep.

    Returns:
        A Series with the same index as ``row`` in which every entry outside
        the ``n`` largest is NaN.
    """
    winners = row.nlargest(n).index
    outside_top = ~row.index.isin(winners)

    # Replace everything that did not make the cut with NaN.
    return row.mask(outside_top, np.nan)

In [25]:
# Work on a copy so the full similarity matrix stays available unchanged.
similarity_matrix_copy = similarity_matrix.copy()

# Apply keep_top_n to each row (axis=1) with its default n=30, then persist
# the truncated matrix; myIBCF reads a file with this name from GitHub.
similarity_top30 = similarity_matrix_copy.apply(keep_top_n, axis=1)
similarity_top30.to_csv("similarity_matrix_top30.csv")

### Step 4: Define myIBCF

In [17]:
def myIBCF(new_user_ratings, similarity_matrix=None, popularity_ranks=None, top_n=10):
    """Recommend movies for one user with item-based collaborative filtering.

    For every movie the user has not rated, the prediction is the
    similarity-weighted average of the user's ratings over that movie's
    neighbours in the similarity matrix. The ``top_n`` highest predictions
    are returned; if fewer than ``top_n`` valid predictions exist, the list
    is padded with the most popular movies that are neither already
    recommended nor already rated by the user.

    Args:
        new_user_ratings: Series indexed by movie id; NaN marks an unrated
            movie. Movies missing from the index are treated as unrated.
        similarity_matrix: Optional movie-by-movie similarity DataFrame
            (NaN where no similarity is kept). When omitted it is downloaded
            from the project's GitHub repository.
        popularity_ranks: Optional movie ids ordered from most to least
            popular (DataFrame, Series or sequence). When omitted it is
            downloaded from the project's GitHub repository, and only if
            padding is actually needed.
        top_n: Number of recommendations to return.

    Returns:
        A NumPy array of movie ids (strings), best first. It has ``top_n``
        entries unless the popularity list runs out of candidates.
    """
    similarity_url = "https://raw.githubusercontent.com/hwangsamy1/CS598-PSL/refs/heads/main/Project4/similarity_matrix_top30.csv"
    popularity_ranks_url = "https://raw.githubusercontent.com/hwangsamy1/CS598-PSL/refs/heads/main/Project4/all_popular_ranking.csv"

    if top_n <= 0:
        return np.array([], dtype=str)

    if similarity_matrix is None:
        similarity_matrix = pd.read_csv(similarity_url, index_col=0)

    # The user's observed ratings do not change inside the loop; compute once.
    rated_movies = new_user_ratings.dropna()
    rated_ids = set(rated_movies.index)

    predictions = {}
    for movie in similarity_matrix.index:
        if movie in rated_ids:
            continue

        neighbours = similarity_matrix.loc[movie].dropna()
        shared = neighbours.index.intersection(rated_movies.index)
        if len(shared) == 0:
            continue

        weights = neighbours.loc[shared]
        total_weight = weights.sum()
        # A zero denominator would yield NaN, which breaks the sort below.
        if total_weight == 0:
            continue

        ratings = rated_movies.loc[shared]
        predictions[movie] = (weights * ratings).sum() / total_weight

    # Stable sort: ties keep the order of the similarity matrix index.
    ranked = sorted(predictions.items(), key=lambda item: item[1], reverse=True)
    recommendations = [movie for movie, _ in ranked[:top_n]]

    if len(recommendations) < top_n:
        if popularity_ranks is None:
            popularity_ranks = pd.read_csv(popularity_ranks_url, index_col=0)

        # Pad with popular movies, skipping ones already recommended or
        # already rated by the user.
        excluded = set(recommendations) | rated_ids
        for candidate in np.asarray(popularity_ranks).ravel():
            if candidate in excluded:
                continue
            recommendations.append(candidate)
            excluded.add(candidate)
            if len(recommendations) == top_n:
                break

    return np.array(recommendations, dtype=str)

# Test our function

In [18]:
# Test the implementation with user u1181
# Demo run: prints the popularity ranking and IBCF recommendations for one
# existing user and one hand-built user. Uses the module-level rating_matrix.
if __name__ == "__main__":
    # Popularity-based recommendations (default top_n=10)
    top_movies, scores = recommend_popular(rating_matrix)
    print("Top 10 Popular Movies:\n", top_movies)

    # User "u1181" input: that user's row of the rating matrix
    user_ratings_u1181 = rating_matrix.loc["u1181"]

    # IBCF-based recommendations for u1181
    ibcf_recommendations_u1181 = myIBCF(user_ratings_u1181)
    print("\nTop 10 IBCF Recommendations for u1181:\n", ibcf_recommendations_u1181)

    # Hypothetical user input: all movies unrated (NaN) except two
    user_ratings_hypothetical = pd.Series(index=rating_matrix.columns, dtype="float")
    user_ratings_hypothetical["m1613"] = 5
    user_ratings_hypothetical["m1755"] = 4

    # IBCF-based recommendations for hypothetical user
    ibcf_recommendations_hypothetical = myIBCF(user_ratings_hypothetical)
    print("\nTop 10 IBCF Recommendations for hypothetical user:\n", ibcf_recommendations_hypothetical)

Top 10 Popular Movies:
 ['m2858', 'm260', 'm1196', 'm1210', 'm2028', 'm1198', 'm593', 'm2571', 'm2762', 'm589']

Top 10 IBCF Recommendations for u1181:
 ['m3732' 'm749' 'm3899' 'm1039' 'm1235' 'm1253' 'm1734' 'm1914' 'm2082'
 'm2361']

Top 10 IBCF Recommendations for hypothetical user:
 ['m1017' 'm2805' 'm3269' 'm592' 'm691' 'm74' 'm765' 'm1100' 'm1468'
 'm1541']
