# Setup

The setup is mostly the same as in the user-user notebook, so the explanation of each step can be found there; only the differences are explained here.

In [19]:
import pandas as pd
import numpy as np
import scipy.stats as st

In [20]:
def crop(df, percent):
    """Keep only the most active users and, among them, the most rated movies.

    Users whose number of ratings is strictly above the `percent` quantile of
    the per-user rating counts are kept first. The per-movie rating counts are
    then recomputed on that reduced frame and the same strict quantile cut is
    applied to the movies.

    The number of distinct users and movies is printed before and after the
    filtering, together with the shape of the resulting frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings in long format with at least `userId` and `movieId` columns.
    percent : float
        Quantile (passed to `Series.quantile`) used as the cut-off for both
        the user counts and the movie counts.

    Returns
    -------
    pandas.DataFrame
        The rows of `df` that survive both filters, with the original index.
    """
    print("Users:", len(df["userId"].unique()))
    print("Movies:", len(df["movieId"].unique()))

    # First pass: users, ranked by how many ratings they gave.
    ratings_per_user = df["userId"].value_counts()
    user_cutoff = ratings_per_user.quantile(percent)
    kept_users = ratings_per_user.loc[ratings_per_user > user_cutoff].index
    df = df[df["userId"].isin(kept_users)]

    # Second pass: movies, counted only over the users that were kept.
    ratings_per_movie = df["movieId"].value_counts()
    movie_cutoff = ratings_per_movie.quantile(percent)
    kept_movies = ratings_per_movie.loc[ratings_per_movie > movie_cutoff].index
    df = df[df["movieId"].isin(kept_movies)]

    print(df.shape)
    print("Users:", len(df["userId"].unique()))
    print("Movies:", len(df["movieId"].unique()))
    return df

In [21]:
# Read the raw ratings, keep only the users and then the movies whose rating
# counts lie above the 97th percentile (see `crop`), and discard the timestamp
# column, which the user x movie pivot does not need.
df = crop(pd.read_csv("./rating.csv", sep=","), 0.97).drop(columns=["timestamp"])

Users: 138493
Movies: 26744
(1761018, 4)
Users: 4144
Movies: 770


In [22]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, each cell holding the rating (NaN where the
# user has not rated the movie).
matrix = df.pivot(index="userId", columns="movieId", values="rating")
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,4.0,3.0,3.0,4.0,5.0,2.0,2.0,,2.0,3.0,...,,,,,,,,,,
104,,,,,,,3.0,,3.0,,...,,,,,,,,,,
116,3.0,2.0,1.5,2.0,2.0,3.5,,2.5,3.5,2.0,...,,,,,,,,,,
156,5.0,5.0,4.0,4.0,5.0,4.0,4.0,,4.0,4.0,...,,,,,,,,,,
208,4.0,,,,3.0,1.5,5.0,,4.5,,...,4.0,,3.0,4.5,4.0,4.5,3.5,,3.5,


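Instead of computing each rating's deviation from the user's mean, we compute its deviation from the movie's mean, i.e. the ratings are centered per movie (column) rather than per user (row).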

In [23]:
# Mean rating of every movie (one value per column of `matrix`).
# `DataFrame.mean` skips NaN by default, so only the users who actually rated
# a movie contribute to its mean. This must be evaluated while `matrix` still
# holds the raw ratings: once the columns have been mean-centred, the same
# expression yields values that are all approximately zero.
movie_mean = matrix.mean(axis=0)
movie_mean.head()

movieId
1     3.874794
2     2.918415
6     3.781544
10    3.258226
11    3.407371
dtype: float64

In [24]:
# Centre every movie column on its own mean so each cell becomes the
# deviation of that rating from the movie's average; NaN cells stay NaN.
# The Series of column means is aligned with the columns of `matrix`.
# Re-running this cell is harmless: already-centred columns have a mean of
# approximately zero, so a second subtraction changes nothing meaningful.
matrix = matrix.sub(matrix.mean(axis=0), axis="columns")
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.125206,0.081585,-0.781544,0.741774,1.592629,-1.754585,-1.740595,,-1.45417,-0.045602,...,,,,,,,,,,
104,,,,,,,-0.740595,,-0.45417,,...,,,,,,,,,,
116,-0.874794,-0.918415,-2.281544,-1.258226,-1.407371,-0.254585,,0.112069,0.04583,-1.045602,...,,,,,,,,,,
156,1.125206,2.081585,0.218456,0.741774,1.592629,0.245415,0.259405,,0.54583,0.954398,...,,,,,,,,,,
208,0.125206,,,,-0.407371,-2.254585,1.259405,,1.04583,,...,0.192187,,-0.49674,0.698342,0.015837,0.687464,-0.641391,,-0.498801,


# Item-Item Collaborative Filtering 

## Correlation Matrix

In [25]:
# Movie-movie Pearson correlation, computed between the columns of `matrix`.
# `min_periods` is the minimum number of rows (users) that must have a value
# in both columns; movie pairs with fewer co-raters get NaN instead of a
# correlation.
# NOTE(review): despite its name, `min_common_items` therefore counts common
# users here, not common items.
min_common_items = 10
correlation_matrix = matrix.corr(method="pearson", min_periods=min_common_items)

In [26]:
# Preview the first rows of the movie x movie correlation matrix.
correlation_matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.276539,0.144652,0.252121,0.322261,0.108877,0.242555,0.094306,0.22855,0.208346,...,0.354184,0.219426,0.060716,0.213517,0.163525,0.230065,0.189269,0.248859,0.30366,0.136066
2,0.276539,1.0,0.135875,0.304847,0.269987,0.149691,0.185187,0.294449,0.168977,0.361049,...,0.182582,0.194943,0.227374,0.162258,0.006213,0.171186,0.148527,0.249273,0.158311,0.17264
6,0.144652,0.135875,1.0,0.258395,0.197951,0.401036,0.136173,0.098752,0.206873,0.157865,...,0.164174,0.14711,0.155239,0.245282,0.230089,0.127386,0.221842,0.189801,0.17874,0.241999
10,0.252121,0.304847,0.258395,1.0,0.328231,0.231987,0.193134,0.281455,0.212749,0.346872,...,0.177112,0.193046,0.210082,0.267276,0.07228,0.113344,0.231345,0.340094,0.141289,0.208618
11,0.322261,0.269987,0.197951,0.328231,1.0,0.096884,0.283914,0.089616,0.179375,0.407207,...,0.246624,0.141303,0.134532,0.246151,0.025721,0.204431,0.118324,0.236691,0.158988,0.126388


## Recommendation

In [41]:
def predict_ratings(user_id, top_n=20):
    """Predict a rating for every movie that `user_id` has not rated yet.

    Item-item collaborative filtering on the notebook-level globals `matrix`
    (user x movie ratings centred on each movie's mean), `correlation_matrix`
    (movie x movie Pearson correlations) and `movie_mean` (per-movie mean
    rating). For a target movie ``i`` the prediction is::

        movie_mean[i] + sum_j(corr[i, j] * matrix[user, j]) / sum_j(|corr[i, j]|)

    where ``j`` runs over the (at most) `top_n` movies rated by the user whose
    correlation with ``i`` is largest in absolute value. Movies whose
    correlation with ``i`` is NaN are never used as neighbours. When no usable
    neighbour exists (or all weights are zero) the prediction falls back to
    the movie's mean rating.

    Parameters
    ----------
    user_id : hashable
        Row label of the user in `matrix`.
    top_n : int, default 20
        Maximum number of neighbouring movies used per prediction.

    Returns
    -------
    pandas.DataFrame
        One row per unrated movie, with columns ``movie_id`` and
        ``predicted_rating`` (on the original rating scale).

    Raises
    ------
    KeyError
        If `user_id` is not a row label of `matrix`.
    """
    # Everything that depends only on the user is computed once, not per movie.
    user_deviations = matrix.loc[user_id]
    rated_deviations = user_deviations.dropna()
    rated_movies = rated_deviations.index
    unrated_movies = user_deviations[user_deviations.isna()].index

    def weighted_avg_rating(movie):
        """Return the predicted rating of a single unrated `movie`."""
        # Correlations between the target movie and the movies the user rated.
        # The target is unrated, so it is never among the candidates and no
        # "self" entry has to be skipped.
        similarities = correlation_matrix.loc[rated_movies, movie].dropna()
        neighbours = similarities.abs().nlargest(top_n).index
        weights = similarities.loc[neighbours]
        total_weight = weights.abs().sum()
        if total_weight == 0:
            # No informative neighbour: fall back to the movie's own mean.
            return movie_mean.loc[movie]
        deviation = (rated_deviations.loc[neighbours] * weights).sum() / total_weight
        # `matrix` stores deviations from the movie means, so the target
        # movie's mean is added back to obtain an actual rating.
        return movie_mean.loc[movie] + deviation

    # Call the helper once per unrated movie (element-wise, not per column).
    predicted_ratings = pd.DataFrame(
        {
            "movie_id": unrated_movies.to_numpy(),
            "predicted_rating": [weighted_avg_rating(movie) for movie in unrated_movies],
        }
    )

    # Keep a float column even when the user has no unrated movies.
    return predicted_ratings.astype({"predicted_rating": float})

In [55]:
# Inspect user 54's row of the mean-centred matrix; NaN marks the movies this
# user has not rated.
matrix.loc[54]

movieId
1        0.125206
2        0.081585
6       -0.781544
10       0.741774
11       1.592629
           ...   
56367         NaN
58559         NaN
59315         NaN
60069         NaN
79132         NaN
Name: 54, Length: 770, dtype: float64

In [51]:
# Scratch values for stepping through the logic of `predict_ratings` by hand:
# a sample user, one target movie and the neighbourhood size.
user_id = 54
movie=19
top_n = 20
# Labels of the movies for which this user's row of `matrix` is NaN.
unrated_movies = matrix.loc[user_id][matrix.loc[user_id].isna()].index

In [56]:
# Correlations between the target `movie` and every movie that `user_id` has
# rated: rows are selected with the user's "rated" mask, the column is the
# target movie.
correlation_matrix.loc[matrix.loc[user_id].notna(), movie]

movieId
1       0.094306
2       0.294449
6       0.098752
10      0.281455
11      0.089616
          ...   
3897    0.054327
3977    0.207379
4002    0.223861
4007    0.140418
5060    0.063697
Name: 19, Length: 366, dtype: float64

In [32]:
# Row of the mean-centred matrix for user 54, kept for the inspection cell
# that lists this user's unrated movies.
user_ratings = matrix.loc[54]

In [36]:
# Ids of the movies user 54 has not rated (NaN entries of `user_ratings`).
# NOTE(review): this prints the complete list, which makes for a very long
# cell output.
user_ratings.loc[user_ratings.isna()].index.tolist()

[19,
 44,
 48,
 62,
 104,
 141,
 145,
 158,
 168,
 172,
 173,
 180,
 198,
 208,
 216,
 231,
 253,
 266,
 293,
 317,
 333,
 353,
 355,
 364,
 420,
 431,
 432,
 434,
 441,
 466,
 471,
 485,
 497,
 500,
 508,
 509,
 520,
 553,
 555,
 588,
 594,
 595,
 596,
 628,
 653,
 673,
 762,
 778,
 784,
 802,
 805,
 852,
 899,
 902,
 903,
 904,
 908,
 910,
 912,
 913,
 914,
 919,
 920,
 923,
 953,
 969,
 1020,
 1022,
 1027,
 1029,
 1032,
 1035,
 1037,
 1059,
 1060,
 1061,
 1080,
 1093,
 1094,
 1095,
 1183,
 1193,
 1199,
 1206,
 1208,
 1220,
 1222,
 1225,
 1231,
 1233,
 1245,
 1249,
 1250,
 1261,
 1262,
 1263,
 1266,
 1271,
 1276,
 1282,
 1285,
 1287,
 1293,
 1339,
 1378,
 1380,
 1394,
 1500,
 1586,
 1644,
 1645,
 1673,
 1682,
 1704,
 1721,
 1747,
 1748,
 1882,
 1884,
 1907,
 1918,
 1921,
 1960,
 1961,
 1962,
 2000,
 2018,
 2019,
 2020,
 2023,
 2076,
 2078,
 2080,
 2081,
 2085,
 2087,
 2096,
 2140,
 2145,
 2150,
 2167,
 2231,
 2232,
 2268,
 2273,
 2289,
 2294,
 2302,
 2313,
 2329,
 2335,
 2340,
 2366,