# 🎯 Movie Recommendation System using Pearson Correlation

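This notebook builds a user-based collaborative-filtering recommendation system: it uses Pearson correlation over co-rated movies to find the users most similar to a target user, then suggests movies those similar users rated that the target user has not rated yet.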

In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import csr_matrix

#### Load Data

In [3]:
# Load the complete ratings table into memory. The file is large
# (the shape cell below reports roughly 100 million rows), so this cell is slow.
ratings_csv_path = '../data/Full_Data.csv'
full_data = pd.read_csv(ratings_csv_path)

In [30]:
# Preview the first three rows to confirm which columns were loaded.
full_data.head(3)

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date,Year_Release,Name
0,1488844,1,3,2005-09-06,2003.0,Dinosaur Planet
1,822109,1,5,2005-05-13,2003.0,Dinosaur Planet
2,885013,1,4,2005-10-19,2003.0,Dinosaur Planet


In [32]:
# (number of rows, number of columns) of the loaded table.
full_data.shape

(100480507, 6)

#### Unique Users and Movies

In [35]:
# Distinct customer ids and distinct movie titles. Their positions in these
# arrays become the row / column indices of the rating matrix built later.
# NOTE(review): movies are identified by title ('Name'), not by 'Movie_Id';
# if two different movies share a title they collapse into one entry - confirm.
users = pd.unique(full_data['Cust_Id'])
movies = pd.unique(full_data['Name'])

#### Mapping

In [37]:
# Forward lookups: customer id / movie title -> matrix row / column index.
user_map = dict(zip(users, range(len(users))))
movie_map = dict(zip(movies, range(len(movies))))

# Reverse lookups: matrix row / column index -> customer id / movie title.
reverse_user_map = dict(enumerate(users))
reverse_movie_map = dict(enumerate(movies))

In [38]:
# Translate every rating record into a (row index, column index, value) triplet.
# A key missing from the lookup dict maps to NaN, which the next cell filters out.
rows = full_data['Cust_Id'].map(user_map).to_numpy()
cols = full_data['Name'].map(movie_map).to_numpy()
ratings = full_data['Rating'].to_numpy()

In [41]:
# Keep only records whose user AND movie both resolved to an index
# (i.e. neither coordinate is NaN), then cast the coordinates to integers.
valid_indices = ~(np.isnan(rows) | np.isnan(cols))
rows = rows[valid_indices].astype(int)
cols = cols[valid_indices].astype(int)
ratings = ratings[valid_indices]

#### Sparse Matrix

In [43]:
# Build the (n_users x n_movies) rating matrix in CSR format.
#
# Gotcha: the (data, (row, col)) constructor SUMS all entries that share the
# same coordinate. Columns are keyed by movie title, so if one user rated two
# records that map to the same title, the stored value would be the sum of
# those ratings and could fall outside the rating scale. When that happens we
# replace each summed cell by the mean of its contributing ratings.
matrix_shape = (len(users), len(movies))
user_movie_matrix = csr_matrix((ratings, (rows, cols)), shape=matrix_shape)

# Fewer stored cells than input records means at least one coordinate repeated.
if user_movie_matrix.nnz < len(ratings):
    rating_counts = csr_matrix(
        (np.ones(len(ratings), dtype=np.float64), (rows, cols)),
        shape=matrix_shape,
    )
    # Same sparsity pattern as the sums; turn the counts into 1/count so an
    # element-wise product yields the per-cell average rating (float dtype).
    rating_counts.data = 1.0 / rating_counts.data
    user_movie_matrix = user_movie_matrix.multiply(rating_counts).tocsr()
    del rating_counts  # free the temporary; it is as large as the matrix itself

#### Recommend Movies

In [69]:
def get_recommendations(target_user_id, num_recommendations=5,
                        min_common=3, num_neighbors=10):
    """Recommend unseen movies to a user via user-based collaborative filtering.

    Similarity between the target and every other user is a Pearson-style
    correlation: each user's ratings are centred on that user's mean over ALL
    movies they rated, and the sums run over the movies both users rated.
    The `num_neighbors` most similar users (with positive similarity) then
    vote for movies the target has not rated; a movie's score is the
    similarity-weighted average of the neighbours' ratings for it.

    Reads the module-level `user_map`, `user_movie_matrix` and
    `reverse_movie_map`. A stored rating counts as "rated" only if it is > 0.
    Progress information is printed to stdout.

    Parameters
    ----------
    target_user_id
        Key into `user_map` identifying the user to recommend for.
    num_recommendations : int, default 5
        Maximum number of movies to return. Values <= 0 return an empty list.
    min_common : int, default 3
        Minimum number of co-rated movies required before a similarity is
        computed for another user.
    num_neighbors : int, default 10
        How many of the most similar users contribute to the movie scores.

    Returns
    -------
    list of (movie, score) tuples
        Best-scored movies first; only movies with a score > 0 are included.
        Empty if the target user has no ratings or no neighbour qualifies.

    Raises
    ------
    KeyError
        If `target_user_id` is not present in `user_map`.
    """
    target_idx = user_map[target_user_id]

    # A non-positive request previously fell into the `[-0:]` slicing trap and
    # returned every scored movie; there is nothing to recommend in that case.
    if num_recommendations <= 0:
        return []

    # Work on the raw CSR arrays. That is only valid when each row stores a
    # column at most once, so canonicalise a private copy if necessary.
    matrix = user_movie_matrix.tocsr()
    if not matrix.has_canonical_format:
        matrix = matrix.copy()
        matrix.sum_duplicates()
    indptr, indices, data = matrix.indptr, matrix.indices, matrix.data
    n_users, n_movies = matrix.shape

    target_ratings = matrix[target_idx].toarray().ravel().astype(float)
    target_watched = target_ratings > 0

    print(f"User has rated {np.sum(target_watched)} movies")

    if not np.any(target_watched):
        return []

    target_mean = float(target_ratings[target_watched].mean())
    print(f"User's average rating: {target_mean:.2f}")

    # Mean-centred target ratings; loop-invariant, so computed once up front.
    target_norm = np.where(target_watched, target_ratings - target_mean, 0.0)

    similarities = np.zeros(n_users)

    print("Finding similar users...")
    similar_count = 0
    for other_idx in range(n_users):
        if other_idx == target_idx:
            continue

        # Sparse view of this user's row: only the movies they have an entry for.
        start, stop = indptr[other_idx], indptr[other_idx + 1]
        other_movies = indices[start:stop]
        other_values = data[start:stop]

        other_rated = other_values > 0
        common = other_rated & target_watched[other_movies]
        common_count = np.count_nonzero(common)

        # Also skip zero overlap outright so the mean below is never taken
        # over an empty selection, even when min_common <= 0.
        if common_count == 0 or common_count < min_common:
            continue

        other_mean = float(other_values[other_rated].mean())

        target_common = target_norm[other_movies[common]]
        other_common = other_values[common] - other_mean

        numerator = np.sum(target_common * other_common)
        denominator = np.sqrt(np.sum(target_common ** 2) * np.sum(other_common ** 2))

        # A zero denominator means one side has no variance on the common
        # movies; the correlation is undefined, so the similarity stays 0.
        if denominator > 0:
            similarities[other_idx] = numerator / denominator
            similar_count += 1

    print(f"Found {similar_count} users with similar tastes")

    # Reverse first, then truncate: unlike `[-k:]` this is also correct for k == 0.
    top_similar = np.argsort(similarities)[::-1][:max(num_neighbors, 0)]
    top_scores = similarities[top_similar]

    print("Top 5 similarity scores:", top_scores[:5])

    movie_scores = np.zeros(n_movies)
    weight_sums = np.zeros(n_movies)
    unseen = ~target_watched

    potential_recs = 0

    for other_idx, weight in zip(top_similar, top_scores):
        if weight <= 0:
            continue

        other_ratings = matrix[other_idx].toarray().ravel()

        # Movies this neighbour rated that the target has not rated.
        candidates = unseen & (other_ratings > 0)
        movie_scores[candidates] += other_ratings[candidates] * weight
        weight_sums[candidates] += weight
        potential_recs += int(np.count_nonzero(candidates))

    print(f"Considering {potential_recs} potential movie recommendations")

    # Turn weighted sums into weighted averages where any neighbour voted.
    voted = weight_sums > 0
    movie_scores[voted] /= weight_sums[voted]

    top_movie_indices = np.argsort(movie_scores)[::-1][:num_recommendations]
    recommendations = [(reverse_movie_map[idx], movie_scores[idx])
                       for idx in top_movie_indices if movie_scores[idx] > 0]

    print(f"Found {len(recommendations)} final recommendations")
    return recommendations

#### Test

In [72]:
# Smoke test: ask for the five best suggestions for one known customer id.
sample_user_id = 1488844
recommended_movies = get_recommendations(sample_user_id, num_recommendations=5)

# One line per suggestion, with its predicted score.
print(f"Top recommended movies for User ID {sample_user_id}:")
for movie, score in recommended_movies:
    print(f"- {movie} (Score: {score:.2f})")

User has rated 2169 movies
User's average rating: 3.31
Finding similar users...
Found 470442 users with similar tastes
Top 5 similarity scores: [1. 1. 1. 1. 1.]
Considering 83 potential movie recommendations
Found 5 final recommendations
Top recommended movies for User ID 1488844:
- Million Dollar Baby (Score: 5.00)
- Mobile Suit Gundam SEED (Score: 5.00)
- Angel: Season 3 (Score: 5.00)
- Cowboy Bebop: The Movie (Score: 5.00)
- House (Score: 5.00)
