In [2]:
import pandas as pd

In [3]:
# Load datasets and create a merged dataframe
ratings = pd.read_csv("../data/ratings.csv")
movies = pd.read_csv("../data/movies.csv")

data = pd.merge(ratings, movies, on="movieId")

In [4]:
# Create a user–item matrix
# Rows represent users, columns represent movies, values are ratings
# Sparse structure is expected in real-world recommender systems
movie_matrix = data.pivot_table(
    index="userId",
    columns="title",
    values="rating"
)

movie_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


In [5]:
# Select a reference movie
# Recommendations will be based on similarity to this movie
movie_name = "Toy Story (1995)"
movie_ratings = movie_matrix[movie_name]
movie_ratings.head()

userId
1    4.0
2    NaN
3    NaN
4    NaN
5    4.0
Name: Toy Story (1995), dtype: float64

In [6]:
# Compute Pearson correlation between the selected movie
# and all other movies based on user rating patterns
similar_movies = movie_matrix.corrwith(movie_ratings)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


In [19]:
corr_df = pd.DataFrame(similar_movies, columns=["correlation"])
corr_df.dropna(inplace=True)

corr_df.sort_values("correlation", ascending=False).head(10)

Unnamed: 0_level_0,correlation
title,Unnamed: 1_level_1
"Wolfman, The (2010)",1.0
"Big Tease, The (1999)",1.0
Persuasion (2007),1.0
"Perfect Candidate, A (1996)",1.0
Project X (1987),1.0
When in Rome (2010),1.0
Wheels on Meals (Kuai can che) (1984),1.0
Dirty Grandpa (2016),1.0
Escaflowne: The Movie (Escaflowne) (2000),1.0
Encounters at the End of the World (2008),1.0


In [8]:
ratings_count = data.groupby("title")["rating"].count()

ratings_count.head()

title
'71 (2014)                                 1
'Hellboy': The Seeds of Creation (2004)    1
'Round Midnight (1986)                     2
'Salem's Lot (2004)                        1
'Til There Was You (1997)                  2
Name: rating, dtype: int64

In [9]:
ratings_mean = data.groupby("title")["rating"].mean()

In [22]:
movie_stats = pd.DataFrame({
    "rating_count": ratings_count,
    "rating_mean": ratings_mean
})

movie_stats.head()

Unnamed: 0_level_0,rating_count,rating_mean
title,Unnamed: 1_level_1,Unnamed: 2_level_1
'71 (2014),1,4.0
'Hellboy': The Seeds of Creation (2004),1,4.0
'Round Midnight (1986),2,3.5
'Salem's Lot (2004),1,5.0
'Til There Was You (1997),2,4.0


In [7]:
def recommend_movies(movie_name, min_ratings=50):

    """
    Recommend movies similar to the given movie based on
    Pearson correlation of user ratings.

    Parameters:
    - movie_title (str): Reference movie
    - movie_matrix (DataFrame): User–item rating matrix
    - top_n (int): Number of recommendations to return

    Returns:
    - DataFrame with similar movies and correlation scores
    """
    
    movie_ratings = movie_matrix[movie_name]
    
    similar_movies = movie_matrix.corrwith(movie_ratings)
    
    corr_df = pd.DataFrame(similar_movies, columns=["correlation"])
    corr_df.dropna(inplace=True)
    
    # Including vote number
    corr_df = corr_df.join(movie_stats["rating_count"])
    
    # Deleting low-rated movies
    recommendations = corr_df[
        corr_df["rating_count"] > min_ratings
    ].sort_values("correlation", ascending=False)
    
    return recommendations.head(10)


In [25]:
recommend_movies(movie_name)

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)
  c *= np.true_divide(1, fact)
  c /= stddev[:, None]
  c /= stddev[None, :]


Unnamed: 0_level_0,correlation,rating_count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story (1995),1.0,215
Toy Story 2 (1999),0.699211,97
Arachnophobia (1990),0.652424,53
"Incredibles, The (2004)",0.643301,125
Finding Nemo (2003),0.618701,141
Aladdin (1992),0.611892,183
Erin Brockovich (2000),0.598016,70
Wallace & Gromit: The Wrong Trousers (1993),0.589625,56
Blazing Saddles (1974),0.585892,62
"Wolf of Wall Street, The (2013)",0.578479,54
