# Setup

In [1]:
import pandas as pd
import numpy as np

In [2]:
def crop(df, percent):
    """Keep only the most active users and, among their ratings, the most rated movies.

    Users whose number of ratings is strictly above the ``percent`` quantile of
    per-user rating counts are kept first; the same rule is then applied to the
    per-movie rating counts of the remaining rows.

    Prints the number of distinct users and movies before and after filtering,
    plus the shape of the filtered frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Ratings with at least the columns ``userId`` and ``movieId``.
    percent : float
        Quantile (between 0 and 1) used as the cut-off for both filters.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` that survive both filters.
    """

    def _report(frame):
        # Same two summary lines are printed before and after cropping.
        print("Users:", len(frame["userId"].unique()))
        print("Movies:", len(frame["movieId"].unique()))

    def _keep_frequent(frame, column):
        # Retain rows whose `column` value occurs more often than the cut-off.
        counts = frame[column].value_counts()
        threshold = counts.quantile(percent)
        frequent = counts[counts > threshold]
        return frame[frame[column].isin(frequent.index)]

    _report(df)
    # Users are filtered first, so movie counts are taken over the kept users only.
    cropped = _keep_frequent(_keep_frequent(df, "userId"), "movieId")
    print(cropped.shape)
    _report(cropped)
    return cropped

In [3]:
# Read the raw ratings and immediately crop them to the densest part of the
# data: users and movies above the 97th percentile of rating counts.
df = crop(pd.read_csv("./rating.csv", sep=","), 0.97)

Users: 138493
Movies: 26744
(1761018, 4)
Users: 4144
Movies: 770


In [4]:
# The timestamp is not used by the recommender. Reassigning (instead of
# inplace=True) and ignoring a missing column keeps this cell re-runnable:
# a second execution is a no-op rather than a KeyError.
df = df.drop(columns=["timestamp"], errors="ignore")

In [5]:
# Preview the first rows of the cropped ratings.
df.head()

Unnamed: 0,userId,movieId,rating
5400,54,1,4.0
5401,54,2,3.0
5403,54,6,3.0
5404,54,10,4.0
5405,54,11,5.0


In [6]:
# Summary statistics of the cropped ratings.
df.describe()

Unnamed: 0,userId,movieId,rating
count,1761018.0,1761018.0,1761018.0
mean,68691.81,4245.153,3.472575
std,39817.28,8774.164,1.001159
min,54.0,1.0,0.5
25%,34402.0,1129.0,3.0
50%,69282.0,2100.0,3.5
75%,102599.0,3793.0,4.0
max,138437.0,79132.0,5.0


In [7]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, rating as the cell value. Cells are NaN
# where a user has not rated a movie.
matrix = df.pivot(index="userId", columns="movieId", values="rating")
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,4.0,3.0,3.0,4.0,5.0,2.0,2.0,,2.0,3.0,...,,,,,,,,,,
104,,,,,,,3.0,,3.0,,...,,,,,,,,,,
116,3.0,2.0,1.5,2.0,2.0,3.5,,2.5,3.5,2.0,...,,,,,,,,,,
156,5.0,5.0,4.0,4.0,5.0,4.0,4.0,,4.0,4.0,...,,,,,,,,,,
208,4.0,,,,3.0,1.5,5.0,,4.5,,...,4.0,,3.0,4.5,4.0,4.5,3.5,,3.5,


In [8]:
# Mean-centre every user's ratings: subtract the user's own average rating
# (NaNs are skipped by mean) so each cell becomes a deviation from that user's
# typical score. Vectorised subtraction replaces a row-by-row apply.
matrix = matrix.sub(matrix.mean(axis=1), axis=0)
matrix.head()

movieId,1,2,6,10,11,16,17,19,21,22,...,50872,51255,51662,54286,55820,56367,58559,59315,60069,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,0.289617,-0.710383,-0.710383,0.289617,1.289617,-1.710383,-1.710383,,-1.710383,-0.710383,...,,,,,,,,,,
104,,,,,,,0.027132,,0.027132,,...,,,,,,,,,,
116,0.43763,-0.56237,-1.06237,-0.56237,-0.56237,0.93763,,-0.06237,0.93763,-0.56237,...,,,,,,,,,,
156,0.913861,0.913861,-0.086139,-0.086139,0.913861,-0.086139,-0.086139,,-0.086139,-0.086139,...,,,,,,,,,,
208,0.381643,,,,-0.618357,-2.118357,1.381643,,0.881643,,...,0.381643,,-0.618357,0.881643,0.381643,0.881643,-0.118357,,-0.118357,


# User-User Collaborative Filtering 

## Correlation Matrix

In [10]:
# Minimum number of movies two users must both have rated for their
# correlation to be computed; pairs with fewer common ratings get NaN.
min_common_items = 10
# DataFrame.corr correlates columns, so transpose to put users on the columns
# and obtain a user x user Pearson correlation matrix.
correlation_matrix = matrix.T.corr(method="pearson", min_periods=min_common_items)

In [11]:
# Preview the user x user correlation matrix.
correlation_matrix.head()

userId,54,104,116,156,208,298,347,348,359,367,...,138208,138254,138270,138301,138307,138325,138382,138397,138406,138437
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
54,1.0,0.114553,0.269821,0.270802,0.172821,0.245273,0.223953,-0.070332,0.234784,0.116278,...,0.1562,0.235974,0.142072,0.173876,0.156422,0.249429,0.248317,0.173988,0.286939,0.190875
104,0.114553,1.0,0.202915,0.088366,0.30488,-0.099429,0.399825,0.220794,0.183705,0.156648,...,0.112484,0.15868,0.22171,0.224819,0.123594,0.312171,-0.109681,-0.092775,0.242078,0.14731
116,0.269821,0.202915,1.0,0.175049,0.215795,0.336205,0.345677,0.247348,0.2732,0.501648,...,0.317431,0.357426,0.280276,0.415804,0.145095,0.410156,0.152599,0.264419,0.393933,0.437245
156,0.270802,0.088366,0.175049,1.0,0.06158,0.256052,0.178544,-0.031419,0.132963,0.332801,...,0.19305,0.259178,0.121299,0.185104,0.074637,0.224374,0.245798,0.272123,0.176687,0.196021
208,0.172821,0.30488,0.215795,0.06158,1.0,0.005218,0.308763,0.118676,0.250497,0.132541,...,0.260259,0.194055,0.264405,0.142791,0.220351,0.366289,0.012213,-0.117251,0.336622,0.216713


## Recommendation

In [12]:
# Neighbourhood size for the recommender (same value as the default of the
# `k` parameter of infer_deviations).
k = 50

In [53]:
def deviation_per_movie(column, correlations):
    """Predict a rating deviation for one movie from the neighbours' deviations.

    Computes the correlation-weighted average

        sum_v(w_v * d_v) / sum_v(|w_v|)

    over the neighbours ``v`` that actually rated the movie, where ``d_v`` is
    the neighbour's value in ``column`` and ``w_v`` its entry in
    ``correlations``.

    Parameters
    ----------
    column : pandas.Series
        Values for a single movie, indexed by user; NaN marks "not rated".
    correlations : pandas.Series
        Correlation of each candidate neighbour with the target user, indexed
        by user. May contain more users than ``column``.

    Returns
    -------
    float
        The weighted average, or NaN when no neighbour with a usable weight
        rated the movie.
    """
    rated = column.dropna()
    # Only neighbours that rated the movie AND have a known correlation count.
    weights = correlations.reindex(rated.index).dropna()
    rated = rated.loc[weights.index]

    # Normalise by the sum of absolute weights, not the absolute value of the
    # sum: with mixed-sign correlations the latter can cancel towards zero and
    # blow the prediction up.
    normaliser = weights.abs().sum()
    if normaliser == 0:
        return float("nan")
    return (rated * weights).sum() / normaliser

In [54]:
def infer_deviations(userId, k=50):
    """Predict rating deviations for every movie ``userId`` has not rated.

    Uses the module-level ``correlation_matrix`` (user x user correlations)
    and ``matrix`` (user x movie values, NaN where unrated). The ``k`` users
    with the largest absolute correlation to ``userId`` are taken as
    neighbours, and ``deviation_per_movie`` is applied to each unrated movie.

    Parameters
    ----------
    userId : hashable
        Label of the target user in ``matrix`` / ``correlation_matrix``.
    k : int, default 50
        Maximum number of neighbours. If fewer users have a defined
        correlation with ``userId``, all of them are used.

    Returns
    -------
    pandas.Series
        Indexed by the movies ``userId`` has not rated; NaN where no
        prediction could be made.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be a positive integer")

    # Correlations with every other user; pairs without enough common items are NaN.
    correlation_user = correlation_matrix.loc[userId].drop(index=[userId]).dropna()
    unrated = matrix.loc[userId].isna().to_numpy()

    # argpartition raises if asked for more elements than exist, so clamp k.
    n_neighbours = min(k, len(correlation_user))
    if n_neighbours == 0:
        return pd.Series(np.nan, index=matrix.columns[unrated], dtype="float64")

    # Work on a plain ndarray so the result is a positional index array.
    strength = correlation_user.abs().to_numpy()
    similar_users = np.argpartition(strength, -n_neighbours)[-n_neighbours:]

    matrix_user = matrix.loc[correlation_user.index[similar_users]].loc[:, unrated]
    return matrix_user.apply(lambda x: deviation_per_movie(x, correlations=correlation_user), axis="index")

In [55]:
# Predicted deviations for user 54, using the neighbourhood size configured
# in `k` rather than silently relying on the function default.
infer_deviations(54, k=k)

movieId
19      -1.105431
44      -1.304264
48      -0.796795
62       0.106117
104     -0.255468
           ...   
56367    0.446418
58559    1.029973
59315    0.502597
60069    0.833276
79132    0.885760
Length: 404, dtype: float64