# John Zeiders Project 4 Submission

In [67]:
import pandas as pd
import numpy as np
from typing import List, Tuple, Dict
import os
from collections import defaultdict
import logging
from multiprocessing import Pool, cpu_count
from itertools import combinations

# Configure logging: INFO and above, timestamped, written to both
# 'recommender.log' and the console stream.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    handlers=[logging.FileHandler('recommender.log'), logging.StreamHandler()],
)
# Module-level logger used by the functions below.
logger = logging.getLogger(__name__)

In [68]:
# Load the ratings matrix; the first CSV column is used as the row index.
ratings_matrix = pd.read_csv('/Users/jzeiders/Documents/Code/Learnings/GraduateML/src/Project4/submission/rating_matrix.csv', index_col=0)

# Load movie data
# Each record is "<id>::<title>::<genre1|genre2|...>". Ids are stored with an
# "m" prefix so they can be looked up with the ratings-matrix column labels.
movie_data = {}
with open('/Users/jzeiders/Documents/Code/Learnings/GraduateML/src/Project4/submission/movies.dat', 'r', encoding='latin-1') as f:
    for line in f:
        record = line.strip()
        if not record:
            # A blank line (e.g. a trailing newline at the end of the file)
            # would otherwise break the three-way unpack below.
            continue
        movie_id, title, genres = record.split('::')
        movie_data[f"m{movie_id}"] = {
            'title': title,
            'genres': genres.split('|')
        }

print(f"Loaded ratings matrix with shape: {ratings_matrix.shape}")
print(f"Loaded {len(movie_data)} movies")

Loaded ratings matrix with shape: (6040, 3706)
Loaded 3883 movies


In [69]:
def compute_popularity(ratings_matrix: pd.DataFrame) -> List[Tuple[str, float]]:
    """
    Rank movies by a popularity score built from rating count and mean rating.

    A movie is eligible only if it has at least 10 ratings and its mean
    rating is strictly greater than 3.5. The score of an eligible movie is
    (mean_rating * number_of_ratings) / number_of_rows_in_the_matrix.

    Args:
        ratings_matrix: One row per user, one column per movie; missing
            ratings are NaN.

    Returns:
        (movie_id, score) tuples for eligible movies, highest score first.
        Movies with equal scores keep their column order.
    """
    total_rows = ratings_matrix.shape[0]
    scored = []

    for movie_id, column in ratings_matrix.items():
        n_rated = column.notna().sum()
        mean_rating = column.mean()

        # Guard clause: skip movies failing either eligibility criterion.
        # (A NaN mean compares False against 3.5 and is therefore skipped.)
        if n_rated < 10 or not mean_rating > 3.5:
            continue

        scored.append((movie_id, (mean_rating * n_rated) / total_rows))

    return sorted(scored, key=lambda pair: pair[1], reverse=True)
# System I: keep the ten highest-scoring movies from the popularity ranking.
popular_movies = compute_popularity(ratings_matrix)[:10]
print("\nSystem I: Top 10 Popular Movies")
print("-" * 80)
for movie_id, score in popular_movies:
    # Look up the display title for this movie id.
    title = movie_data[movie_id]['title']
    print(f"{movie_id}: {title} (Score: {score:.3f})")


System I: Top 10 Popular Movies
--------------------------------------------------------------------------------
m2858: American Beauty (1999) (Score: 2.450)
m260: Star Wars: Episode IV - A New Hope (1977) (Score: 2.205)
m1196: Star Wars: Episode V - The Empire Strikes Back (1980) (Score: 2.125)
m1210: Star Wars: Episode VI - Return of the Jedi (1983) (Score: 1.920)
m2028: Saving Private Ryan (1998) (Score: 1.905)
m1198: Raiders of the Lost Ark (1981) (Score: 1.864)
m593: Silence of the Lambs, The (1991) (Score: 1.857)
m2571: Matrix, The (1999) (Score: 1.851)
m2762: Sixth Sense, The (1999) (Score: 1.794)
m589: Terminator 2: Judgment Day (1991) (Score: 1.780)


# Part 2: System II - Item-Based Collaborative Filtering


In [70]:
def compute_similarity_chunk(args):
    """Compute transformed cosine similarity for a chunk of movie pairs.

    Args:
        args: A single tuple ``(chunk, R, rated_mask)`` (packed into one
            argument so the function can be mapped over a process pool):
            chunk is a list of ``(i, j)`` column-index pairs, R is the 2-D
            ratings array (rows indexed by the mask, columns by movie), and
            rated_mask is a boolean array of the same shape that is True
            where a rating is present.

    Returns:
        A list of ``(i, j, sim)`` tuples with ``sim = 0.5 + 0.5 * cosine``.
        A pair is omitted when fewer than 3 rows have both ratings or when
        either restricted rating vector has zero norm.
    """
    chunk, R, rated_mask = args
    if len(chunk) == 0:
        # Nothing to compute; also avoids indexing chunk[0] in the progress
        # message below.
        return []

    results = []
    for i, j in chunk:
        # Find users who rated both movies
        common = rated_mask[:, i] & rated_mask[:, j]
        if np.sum(common) < 3:
            continue

        # Extract ratings from common users
        Ri = R[common, i]
        Rj = R[common, j]

        numerator = np.sum(Ri * Rj)
        denom = np.sqrt(np.sum(Ri**2)) * np.sqrt(np.sum(Rj**2))

        if denom != 0:
            cos_sim = numerator / denom
            # Transform similarity to be in [0, 1]
            results.append((i, j, 0.5 + 0.5 * cos_sim))

    # Progress message, labelled by the first movie index in the chunk.
    print(f"Processed {len(chunk)} pairs for chunk {chunk[0][0]}")
    return results

def compute_transformed_cosine_similarity(Rdf: pd.DataFrame) -> pd.DataFrame:
    """Compute transformed cosine similarities between movies in parallel.

    Every unordered pair of columns of ``Rdf`` is scored by
    ``compute_similarity_chunk`` in a process pool, and the results are
    written symmetrically into a square matrix. Afterwards each column is
    pruned to its 30 largest entries; everything else is NaN.

    Args:
        Rdf: Ratings with one column per movie; missing ratings are NaN.

    Returns:
        A square DataFrame indexed and labelled by ``Rdf.columns``. Pairs
        that were not scored, and the diagonal, are NaN.
    """
    logger.info("Computing similarity matrix...")
    R = np.asarray(Rdf)
    num_movies = R.shape[1]

    rated_mask = ~np.isnan(R)
    # Fill a plain array first; element-wise DataFrame.iloc writes are far
    # slower and give the same result.
    sim_values = np.full((num_movies, num_movies), np.nan, dtype=np.float64)

    all_pairs = list(combinations(range(num_movies), 2))
    n_cores = cpu_count()
    # At least 1, otherwise a small matrix (fewer pairs than n_cores * 4)
    # would make the range() step below zero and raise ValueError.
    chunk_size = max(1, len(all_pairs) // (n_cores * 4))
    chunks = [all_pairs[i:i + chunk_size] for i in range(0, len(all_pairs), chunk_size)]

    results = []
    if chunks:
        with Pool(processes=n_cores) as pool:
            results = pool.map(compute_similarity_chunk, [(chunk, R, rated_mask) for chunk in chunks])

    for chunk_results in results:
        for i, j, sim in chunk_results:
            sim_values[i, j] = sim
            sim_values[j, i] = sim

    similarities = pd.DataFrame(sim_values, index=Rdf.columns, columns=Rdf.columns)

    # Keep only top 30 similarities per movie
    # NOTE(review): this prunes per *column*, while `part3` later prunes per
    # *row*; applying both keeps fewer neighbours than either alone. Confirm
    # that pruning here is intended.
    for movie in similarities.columns:
        movie_similarities = similarities[movie].copy()
        movie_similarities[movie] = np.nan
        sorted_indices = movie_similarities.sort_values(ascending=False).index[:30]
        similarities.loc[~similarities.index.isin(sorted_indices), movie] = np.nan

    return similarities

# Center the ratings matrix
# Subtract each row's mean rating from that row (pandas skips NaN entries
# when averaging, so only observed ratings contribute to the mean).
user_means = ratings_matrix.mean(axis=1)
centered_matrix = ratings_matrix.sub(user_means, axis=0)

# Location of the on-disk cache read and written by get_similarity_matrix().
SIMILARITY_MATRIX_FILE = '/Users/jzeiders/Documents/Code/Learnings/GraduateML/src/Project4/submission/similarity_matrix.csv'
def get_similarity_matrix():
    """Return the movie similarity matrix, using the CSV cache when present.

    If SIMILARITY_MATRIX_FILE exists it is loaded and returned. Otherwise the
    matrix is computed from the centered ratings, written to that file, and
    the freshly computed matrix is returned.
    """
    # Caching because this takes a while
    if not os.path.exists(SIMILARITY_MATRIX_FILE):
        computed = compute_transformed_cosine_similarity(centered_matrix)
        computed.to_csv(SIMILARITY_MATRIX_FILE)
        return computed

    return pd.read_csv(SIMILARITY_MATRIX_FILE, index_col=0)

# Load the cached similarity matrix, or compute and cache it on first run.
similarity_matrix = get_similarity_matrix()

In [71]:
# Display similarities for specified movies
movies_to_check = ["m1", "m10", "m100", "m1510", "m260", "m3212"]
# Restrict both rows and columns to the movies of interest.
similarity_subset = similarity_matrix.loc[movies_to_check, movies_to_check]
print("\nPairwise Similarities for Specified Movies:")
# Round to 7 decimal places for display.
print(similarity_subset.round(7))


Pairwise Similarities for Specified Movies:
             m1       m10      m100  m1510      m260  m3212
m1          NaN  0.512105  0.392000    NaN  0.741148    NaN
m10    0.512105       NaN  0.547458    NaN  0.534334    NaN
m100   0.392000  0.547458       NaN    NaN  0.329694    NaN
m1510       NaN       NaN       NaN    NaN       NaN    NaN
m260   0.741148  0.534334  0.329694    NaN       NaN    NaN
m3212       NaN       NaN       NaN    NaN       NaN    NaN


In [89]:
def part3(similarity_matrix: pd.DataFrame, top_k: int = 30) -> pd.DataFrame:
    """Keep only the ``top_k`` largest similarities in each row.

    For every row, the entry in the column with the same label as the row
    (the self-similarity) is ignored for ranking, the remaining entries are
    sorted in descending order (NaN last), and every column outside the
    first ``top_k`` of that ordering is set to NaN.

    Args:
        similarity_matrix: Square similarity matrix whose index and columns
            carry the same movie labels. It is not modified.
        top_k: Number of neighbours to keep per row. Defaults to 30.

    Returns:
        A pruned copy of ``similarity_matrix``.

    Raises:
        ValueError: If ``top_k`` is negative.
    """
    if top_k < 0:
        raise ValueError("top_k must be non-negative")

    out_matrix = similarity_matrix.copy()
    for movie in out_matrix.index:
        # Get similarities for current movie
        movie_similarities = out_matrix.loc[movie].copy()
        # Set self-similarity to NaN so a movie never ranks as its own neighbour
        movie_similarities[movie] = np.nan
        # Sort with a stable algorithm: equal similarities are common, and an
        # unstable sort would make the neighbours that survive the cut
        # depend on the sort implementation.
        sorted_indices = movie_similarities.sort_values(
            ascending=False, kind='mergesort'
        ).index
        # Keep only the top_k best-ranked labels
        keep_indices = sorted_indices[:top_k]
        # Set all other similarities to NaN
        out_matrix.loc[movie, ~out_matrix.columns.isin(keep_indices)] = np.nan
    return out_matrix


def myIBCF(similarity_matrix: pd.DataFrame, newuser: pd.Series) -> List[str]:
    """Generate IBCF recommendations for a new user.

    The similarity matrix is first pruned with ``part3``. For every movie the
    user has not rated, a rating is predicted from the user's ratings of that
    movie's remaining neighbours. The ten highest predictions are returned;
    if fewer than ten movies can be predicted, the list is padded with
    unrated movies from the popularity ranking.

    Args:
        similarity_matrix: Movie-by-movie similarity matrix (NaN = no
            similarity available).
        newuser: The user's ratings indexed by movie id, NaN where unrated.

    Returns:
        Up to ten movie ids, best first.
    """
    similarity_matrix = part3(similarity_matrix)
    user_mean = newuser[newuser.notna()].mean()
    predictions = []

    for movie_i in similarity_matrix.index:
        if not pd.isna(newuser[movie_i]):
            # Already rated by the user; nothing to predict.
            continue

        similar_movies = similarity_matrix.loc[movie_i].dropna()
        rated_similar = similar_movies[newuser[similar_movies.index].notna()]
        if len(rated_similar) == 0:
            continue

        weights = rated_similar.values
        ratings = newuser[rated_similar.index].values
        # Center the ratings using user's mean
        centered_ratings = ratings - user_mean
        # Compute prediction with baseline adjustment. When all weights are
        # non-negative this is algebraically the similarity-weighted average
        # of the user's ratings.
        weighted_sum = np.sum(weights * centered_ratings)
        weight_sum = np.sum(np.abs(weights))  # Use absolute weights for normalization
        if weight_sum > 0:  # Avoid division by zero
            prediction = user_mean + (weighted_sum / weight_sum)
            # Clip predictions to valid rating range (1-5)
            prediction = np.clip(prediction, 1, 5)
            predictions.append((movie_i, prediction))
    predictions.sort(key=lambda x: x[1], reverse=True)

    if len(predictions) < 10:
        # Pad with popular movies that are neither predicted nor rated.
        # The predicted ids go into a set once, rather than being rebuilt as a
        # list for every candidate.
        predicted_ids = {movie_id for movie_id, _ in predictions}
        popular_movies = compute_popularity(ratings_matrix)
        new_predictions = [
            entry for entry in popular_movies
            if entry[0] not in predicted_ids and pd.isna(newuser[entry[0]])
        ]
        predictions.extend(new_predictions[:10 - len(predictions)])

    return [p[0] for p in predictions[:10]]

## Part 3: Testing and Validation

def validate_recommendations(user_id: str, recommendations: List[str], movie_data: Dict):
    """Print how many of the known-correct movies appear in the recommendations.

    For user 'u1181' the first 3 recommendations are checked against three
    required movies; for any other user id the first 10 are checked against
    a fixed list of ten expected movies. Results are printed, not returned.

    Args:
        user_id: Identifier selecting which reference list to use.
        recommendations: Recommended movie ids, best first.
        movie_data: Movie metadata (not used by this function).
    """
    is_u1181 = user_id == 'u1181'
    if is_u1181:
        targets = ['m3732', 'm749', 'm3899']
    else:
        targets = ['m1017', 'm2805', 'm3269', 'm691', 'm74',
                   'm765', 'm1100', 'm1468', 'm1541', 'm158']
    expected_label = "Required top 3" if is_u1181 else "Expected top 10"
    top_k = 3 if is_u1181 else 10

    # Only the first top_k recommendations count towards a hit.
    head = recommendations[:top_k]
    hits = [movie for movie in targets if movie in head]

    print(f"\nValidation Results for {user_id}:")
    print(f"{expected_label}: {', '.join(targets)}")
    print(f"Found in top {top_k}: {', '.join(hits)}")
    print(f"Top {top_k} accuracy: {len(hits)}/{len(targets)}")

In [88]:
# Test case 1: existing user u1181, using that user's row of the ratings matrix
print("\nTesting recommendations for user u1181:")
user_ratings = ratings_matrix.loc['u1181']
recommendations = myIBCF(similarity_matrix, user_ratings)
print(recommendations)

# Test case 2: Hypothetical user
print("\nTesting recommendations for hypothetical user (m1613:5, m1755:4):")
# Start from an all-NaN (nothing rated) profile over every movie column,
# then fill in the two known ratings.
hypo_user = pd.Series(np.nan, index=ratings_matrix.columns)
hypo_user['m1613'] = 5
hypo_user['m1755'] = 4
recommendations = myIBCF(similarity_matrix, hypo_user)
print(recommendations)


Testing recommendations for user u1181:
['m2858', 'm318', 'm1968', 'm2804', 'm3418', 'm1073', 'm364', 'm3897', 'm36', 'm1912']

Testing recommendations for hypothetical user (m1613:5, m1755:4):
['m2858', 'm260', 'm1196', 'm1210', 'm2028', 'm1198', 'm593', 'm2571', 'm2762', 'm589']
