In [None]:
import numpy as np
import pandas as pd
# Base URL of the GitHub repository that hosts the CSV files used in this notebook.
DATA_URL = "https://raw.githubusercontent.com/icaromisquita/WBS-Project-8---Reccomenders/main"

# Load each table straight from the repository; every variable is named after its file.
movies = pd.read_csv(f"{DATA_URL}/movies.csv")
tags = pd.read_csv(f"{DATA_URL}/tags.csv")
links = pd.read_csv(f"{DATA_URL}/links.csv")
ratings = pd.read_csv(f"{DATA_URL}/ratings.csv")

Preparing data for Correlation

In [None]:
# Quick look at the first two rating records to check the column layout.
ratings.head(n=2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [None]:
# Build the user x movie matrix: one row per userId, one column per movieId.
# Each cell holds the mean of that user's ratings for the movie (pivot_table's
# default aggregation); it is NaN when the user never rated the movie.
movies_crosstab = ratings.pivot_table(index='userId', columns='movieId', values='rating')
movies_crosstab.head(10)

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,
6,,4.0,5.0,3.0,5.0,4.0,4.0,3.0,,3.0,...,,,,,,,,,,
7,4.5,,,,,,,,,,...,,,,,,,,,,
8,,4.0,,,,,,,,2.0,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,,,,...,,,,,,,,,,


Let's look at the most popular movie "Forrest Gump (1994)":

In [None]:
# movieId of "Forrest Gump (1994)", the reference movie every other title is compared against
top_popular_movieId = 356

In [None]:
# Column of the user x movie matrix holding each user's rating for the reference movie.
gump_ratings = movies_crosstab[top_popular_movieId]
# Show only the users who actually rated it; dropna() removes the NaN entries
# regardless of the rating scale.
gump_ratings.dropna()

userId
1      4.0
6      5.0
7      5.0
8      3.0
10     3.5
      ... 
605    3.0
606    4.0
608    3.0
609    4.0
610    3.0
Name: 356, Length: 329, dtype: float64

Evaluating Similarity Based on Correlation

In [None]:
# Pearson correlation between every movie column and the Forrest Gump ratings.
# Warnings are emitted while computing the coefficient on columns that contain NaNs;
# movies without a usable overlap come out as NaN and the remaining scores are still valid.
similar_to_gump = movies_crosstab.corrwith(other=gump_ratings, axis=0, method='pearson')
similar_to_gump

  c = cov(x, y, rowvar, dtype=dtype)
  c *= np.true_divide(1, fact)


movieId
1         0.303465
2         0.367247
3         0.534682
4         0.388514
5         0.349541
            ...   
193581         NaN
193583         NaN
193585         NaN
193587         NaN
193609         NaN
Length: 9724, dtype: float64

Many movies get a NaN because too few users rated both that movie and Forrest Gump. But some of them give us a correlation score. Let's drop the NaNs and look at the valid results:

In [None]:
# Keep only the movies with a defined correlation, as a one-column frame named 'PearsonR'.
corr_gump = similar_to_gump.dropna().to_frame(name='PearsonR')
corr_gump.head(10)

Unnamed: 0_level_0,PearsonR
movieId,Unnamed: 1_level_1
1,0.303465
2,0.367247
3,0.534682
4,0.388514
5,0.349541
6,0.137421
7,0.106567
8,0.65602
9,0.0
10,0.217441


In [None]:
# Peek at the movie metadata that will later label the results.
movies.head(n=5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [None]:
# Another look at the raw ratings before aggregating them per movie.
ratings.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [None]:
# Per-movie summary indexed by movieId: the mean rating ('rating') and the number
# of ratings the movie received ('rating_count').
rating = ratings.groupby('movieId')['rating'].agg(rating='mean', rating_count='count')

In [None]:
# Attach each movie's number of ratings to its correlation score, then remove the
# row of the reference movie (Forrest Gump) itself.
gump_corr_summary = (
    corr_gump
    .join(rating['rating_count'])
    .drop(index=top_popular_movieId)
)
gump_corr_summary

Unnamed: 0_level_0,PearsonR,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0.303465,215
2,0.367247,110
3,0.534682,52
4,0.388514,7
5,0.349541,49
...,...,...
185585,-1.000000,2
187541,1.000000,4
187593,-0.203519,12
187595,0.870388,5


Let's filter out movies with a rating count below 10.

Then, take the top 10 movies in terms of similarity to Forrest Gump:

In [None]:
# Keep movies with at least 10 ratings, then rank them by correlation with the
# reference movie and keep the ten highest.
# NOTE: 'rating_count' is a movie's total number of ratings, not the number of users
# who rated both it and Forrest Gump, so a high score can still rest on few shared raters.
top10 = (
    gump_corr_summary
    .loc[lambda summary: summary['rating_count'] >= 10]
    .sort_values(by='PearsonR', ascending=False)
    .head(10)
)
top10

Unnamed: 0_level_0,PearsonR,rating_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1295,0.932958,11
6793,0.885253,11
328,0.881682,10
4954,0.865633,11
911,0.850591,13
55721,0.799415,10
195,0.786428,10
181,0.785661,17
80906,0.782601,12
150548,0.776636,10


In [None]:
# Movies joined with their tags: a movie appears once per tag row and movies without
# any tag are absent. Not used for the title lookup below; left defined in case other
# cells refer to it.
merge_movies = movies.merge(tags, left_on='movieId', right_on='movieId')

# Title lookup taken straight from `movies`, so every movie is present and no title
# is repeated once per tag.
# Assumes `movies` holds a single row per movieId -- TODO confirm.
names = movies[['movieId', 'title']]
names.head()

Unnamed: 0,movieId,title
0,1,Toy Story (1995)
1,1,Toy Story (1995)
2,1,Toy Story (1995)
3,2,Jumanji (1995)
4,2,Jumanji (1995)


In [None]:
# Look up the title of each top-10 movie: the movieId index of `top10` is matched
# against the 'movieId' column of `names`. One output row is produced per matching
# row in `names`.
top10 = pd.merge(top10, names, left_index=True, right_on="movieId")
top10

Unnamed: 0,PearsonR,rating_count,movieId,title
616,0.850591,13,911,Charade (1963)
2883,0.799415,10,55721,Elite Squad (Tropa de Elite) (2007)
3175,0.782601,12,80906,Inside Job (2010)
3176,0.782601,12,80906,Inside Job (2010)
3177,0.782601,12,80906,Inside Job (2010)
3178,0.782601,12,80906,Inside Job (2010)
3179,0.782601,12,80906,Inside Job (2010)
3180,0.782601,12,80906,Inside Job (2010)


Let's look at the genres of these movies:

In [None]:
# Add the genres by joining on the columns shared with `movies` (movieId and title),
# then show the first five rows.
pd.merge(top10, movies, on=['movieId', 'title']).head()

Unnamed: 0,PearsonR,rating_count,movieId,title,genres
0,0.850591,13,911,Charade (1963),Comedy|Crime|Mystery|Romance|Thriller
1,0.799415,10,55721,Elite Squad (Tropa de Elite) (2007),Action|Crime|Drama|Thriller
2,0.782601,12,80906,Inside Job (2010),Documentary
3,0.782601,12,80906,Inside Job (2010),Documentary
4,0.782601,12,80906,Inside Job (2010),Documentary
