# 🎬 Day 10 – Movie Recommendation System

📌 Objectives:
* Build a simple movie recommender using item-based collaborative filtering.
* Compute cosine similarity between movies from the user–movie ratings matrix.
* Recommend movies whose rating patterns are most similar to a given movie.

In [3]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import StandardScaler

In [4]:
# Read the two MovieLens tables from the local "ml-latest-small" folder
movies, ratings = [
    pd.read_csv(csv_path)
    for csv_path in ("ml-latest-small/movies.csv", "ml-latest-small/ratings.csv")
]

# Sanity check: report (rows, columns) for each table
print("🎞️ Movies shape:", movies.shape)
print("⭐ Ratings shape:", ratings.shape)


🎞️ Movies shape: (9742, 3)
⭐ Ratings shape: (100836, 4)


In [7]:
# Preview the first five rows of the movies table
movies.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [8]:
# Preview the first five rows of the ratings table
ratings.head(5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [9]:
# Attach each movie's title and genres to every rating row by joining the
# two tables on their shared 'movieId' column (inner join, the default).
df = pd.merge(ratings, movies, on='movieId')

# Preview the first five rows of the joined table
df.head(5)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance
2,1,6,4.0,964982224,Heat (1995),Action|Crime|Thriller
3,1,47,5.0,964983815,Seven (a.k.a. Se7en) (1995),Mystery|Thriller
4,1,50,5.0,964982931,"Usual Suspects, The (1995)",Crime|Mystery|Thriller


In [10]:
# Reshape the merged ratings into a wide user x movie table:
# - one row per userId
# - one column per movie title
# - each cell holds that user's rating for that movie (NaN when unrated)
# NOTE(review): pivot_table aggregates with the mean by default, so distinct
# movieIds that share a title would be collapsed into a single column —
# confirm whether that is intended.
user_movie_matrix = pd.pivot_table(
    df,
    values='rating',
    index='userId',
    columns='title',
)

# Preview the first five rows of the wide table
user_movie_matrix.head(5)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [13]:
# Fill missing ratings with 0 so every movie has a complete numeric vector.
# Note that this treats "not rated" the same as a rating of 0 when the
# similarity is computed.
ratings_filled = user_movie_matrix.fillna(0)

# Flip the axes: movies become rows and users become columns
movie_user_matrix = ratings_filled.transpose()

# (number of movies, number of users)
movie_user_matrix.shape

(9719, 610)

In [14]:
# Plain-text preview of the first five movie rows
print(movie_user_matrix.head(5))

userId                                   1    2    3    4    5    6    7    \
title                                                                        
'71 (2014)                               0.0  0.0  0.0  0.0  0.0  0.0  0.0   
'Hellboy': The Seeds of Creation (2004)  0.0  0.0  0.0  0.0  0.0  0.0  0.0   
'Round Midnight (1986)                   0.0  0.0  0.0  0.0  0.0  0.0  0.0   
'Salem's Lot (2004)                      0.0  0.0  0.0  0.0  0.0  0.0  0.0   
'Til There Was You (1997)                0.0  0.0  0.0  0.0  0.0  0.0  0.0   

userId                                   8    9    10   ...  601  602  603  \
title                                                   ...                  
'71 (2014)                               0.0  0.0  0.0  ...  0.0  0.0  0.0   
'Hellboy': The Seeds of Creation (2004)  0.0  0.0  0.0  ...  0.0  0.0  0.0   
'Round Midnight (1986)                   0.0  0.0  0.0  ...  0.0  0.0  0.0   
'Salem's Lot (2004)                      0.0  0.0  0.0  ...  0.

### What we want for item-item collaborative filtering:
- We want to compare movies to movies — i.e., find similarity between movies.

- To do that, each movie should be represented as a vector of users’ ratings.

So rows = movies, columns = users.


### Why is this needed?

- To calculate similarity between movies, each movie should be a vector in a common space.

- That vector is the ratings given by all users to that movie.

So, each row = one movie’s ratings from all users.

In [16]:
# Pairwise cosine similarity between every pair of movie rows; the result is
# a square array with one row and one column per movie.
similarity_matrix = cosine_similarity(movie_user_matrix)
print(similarity_matrix)

# Label both axes with the movie titles so a score can be looked up by name
similarity_df = pd.DataFrame(
    data=similarity_matrix,
    index=movie_user_matrix.index,
    columns=movie_user_matrix.index,
)

# Preview the first five rows
similarity_df.head(5)

[[1.         0.         0.         ... 0.32732684 0.         0.        ]
 [0.         1.         0.70710678 ... 0.         0.         0.        ]
 [0.         0.70710678 1.         ... 0.         0.         0.        ]
 ...
 [0.32732684 0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         1.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,1.0,0.857493,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.857493,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
# Function to recommend top N similar movies based on cosine similarity
def recommend_movies(movie_name, top_n=5, similarity=None):
    """Return the movies whose rating vectors are most similar to ``movie_name``.

    Parameters
    ----------
    movie_name : str
        Title exactly as it appears in the similarity table's columns,
        e.g. ``"Toy Story (1995)"``.
    top_n : int, default 5
        Maximum number of recommendations to return. Values below 1 yield an
        empty result.
    similarity : pandas.DataFrame, optional
        Movie-by-movie similarity table whose rows and columns are labelled
        with movie titles. Defaults to the notebook-level ``similarity_df``.

    Returns
    -------
    pandas.DataFrame
        Columns ``"Recommended Movie"`` and ``"Similarity Score"``, ordered
        from most to least similar. Empty (same columns) when the title is
        not present in the table.
    """
    # Fall back to the notebook-level table when no table is passed in
    scores_table = similarity_df if similarity is None else similarity

    # Check if the movie exists in the similarity matrix
    if movie_name not in scores_table.columns:
        print(f"❌ Movie '{movie_name}' not found in the dataset.")
        return pd.DataFrame(columns=["Recommended Movie", "Similarity Score"])

    # A negative count would otherwise be interpreted as "all but the last n"
    limit = max(top_n, 0)

    # Remove the queried movie by label instead of assuming it sorts first:
    # another movie can tie with it at similarity 1.0, in which case slicing
    # off position 0 may discard the wrong row and recommend the movie to
    # itself. A stable sort keeps the order of tied scores reproducible.
    similar_scores = (
        scores_table[movie_name]
        .drop(labels=movie_name, errors="ignore")
        .sort_values(ascending=False, kind="mergesort")
        .head(limit)
    )

    # Create a clean DataFrame with proper column names
    recommendations = pd.DataFrame({
        "Recommended Movie": similar_scores.index,
        "Similarity Score": similar_scores.values
    })

    # Print a header; the table itself is returned for rich display
    print(f"🎬 Top {limit} movies similar to '{movie_name}':\n")
    return recommendations


In [32]:
recommend_movies("Toy Story (1995)")

🎬 Top 5 movies similar to 'Toy Story (1995)':



Unnamed: 0,Recommended Movie,Similarity Score
0,Toy Story 2 (1999),0.572601
1,Jurassic Park (1993),0.565637
2,Independence Day (a.k.a. ID4) (1996),0.564262
3,Star Wars: Episode IV - A New Hope (1977),0.557388
4,Forrest Gump (1994),0.547096
