In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [2]:
rating_df = pd.read_csv('ratings_small.csv')

In [4]:
unique_user = rating_df['userId'].unique()
len(unique_user)

671

In [5]:
unique_movie = rating_df['movieId'].unique()
len(unique_movie)

9066

In [6]:
unique_rating = rating_df['rating'].unique()
len(unique_rating), sorted(unique_rating)

(10, [0.5, 1.0, 1.5, 2.0, 2.5, 3.0, 3.5, 4.0, 4.5, 5.0])

In [7]:
rating_df['rating'].value_counts()

4.0    28750
3.0    20064
5.0    15095
3.5    10538
4.5     7723
2.0     7271
2.5     4449
1.0     3326
1.5     1687
0.5     1101
Name: rating, dtype: int64

In [21]:
user_count_df = rating_df.groupby('userId').size().reset_index(name='user_rating_count')
user_count_df = user_count_df.sort_values(by=['user_rating_count'])
user_count_df.head()

Unnamed: 0,userId,user_rating_count
0,1,20
497,498,20
447,448,20
444,445,20
443,444,20


In [22]:
movie_count_df = rating_df.groupby('movieId').size().reset_index(name='movie_rating_count')
movie_count_df = movie_count_df.sort_values(by=['movie_rating_count'], ascending=False)
movie_count_df.head(3)

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311


In [17]:
user_limit, movie_limit = 100, 100

In [19]:
filtered_userId = list(user_count_df[user_count_df['user_rating_count']>user_limit]['userId'])

In [23]:
filtered_movieId = list(movie_count_df[movie_count_df['movie_rating_count']>movie_limit]['movieId'])

In [27]:
filtered = rating_df[rating_df['userId'].isin(filtered_userId)]
filtered = filtered[filtered['movieId'].isin(filtered_movieId)]

In [29]:
filtered.head(3)

Unnamed: 0,userId,movieId,rating,timestamp
147,4,10,4.0,949810645
148,4,34,5.0,949919556
151,4,153,4.0,949811346


In [35]:
matrix = filtered.pivot_table(values='rating',index=['userId'],columns=['movieId'],aggfunc=np.average,fill_value=0, dropna=False)
matrix.head(3)

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,...,4.0,0.0,5.0,4.0,4.0,4.0,0.0,4.5,0.0,0.0
15,2.0,2.0,4.0,3.0,3.0,4.0,3.0,1.0,2.5,5.0,...,1.0,3.5,1.0,1.5,5.0,0.5,2.0,4.5,4.5,5.0


In [37]:
matrix.T

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,2.0,0.0,3.0,0.0,0.0,3.0,5.0,4.0,...,4.0,0.0,0.0,5.0,0.0,0.0,0.0,3.5,0.0,5.0
2,0.0,0.0,2.0,0.0,3.0,0.0,0.0,0.0,0.0,2.0,...,0.0,0.0,0.0,3.0,4.0,0.0,0.0,0.0,3.0,0.0
6,0.0,0.0,4.0,4.5,3.0,0.0,0.0,3.5,0.0,4.0,...,4.0,4.0,0.0,0.0,0.0,0.0,3.0,4.0,0.0,0.0
10,4.0,0.0,3.0,0.0,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0
25,0.0,0.0,3.0,4.5,3.0,0.0,0.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0
32,0.0,5.0,4.0,4.5,3.0,4.0,4.5,4.0,4.5,2.0,...,0.0,0.0,0.0,5.0,4.0,0.0,4.0,5.0,4.0,0.0
34,5.0,0.0,3.0,0.0,4.0,4.0,0.0,3.5,0.0,4.0,...,5.0,0.0,0.0,4.5,0.0,0.0,0.0,0.0,2.0,0.0
36,0.0,0.0,1.0,4.5,3.0,3.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,4.5,0.0,0.0,4.0,0.0,0.0,4.0
39,0.0,0.0,2.5,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0
47,0.0,5.0,5.0,5.0,5.0,4.0,3.5,4.5,4.5,4.0,...,5.0,3.0,0.0,4.5,0.0,0.0,4.0,4.5,0.0,0.0


In [33]:
def cos_sim(vector_1, vector_2):
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx)==0:
        return 0
    
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    return 1 - spatial.distance.cosine(vector_1, vector_2)

In [34]:
a = np.array([1,0,5,0,1,2])
b = np.array([1,2,3,4,0,0])

cos_sim(a,b)

0.99227787671366774

In [40]:
def sim_mat(df, sim_func):
    index = df.index
    
    df = df.T
    
    matrix = []
    
    for idx_1, value_1 in df.items():
        row = []
        
        for idx_2, value_2 in df.items():
            row.append(sim_func(value_1,value_2))
        matrix.append(row)
        
    return pd.DataFrame(matrix, columns=index, index=index)

In [41]:
sim = sim_mat(matrix, cos_sim)

In [42]:
sim

userId,4,8,15,17,19,21,22,23,26,30,...,647,648,652,654,655,656,659,664,665,671
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4,1.000000,0.991116,0.956762,0.948457,0.985932,0.980286,0.981591,0.982744,0.986789,0.979119,...,0.979131,0.951088,0.986368,0.991149,0.983037,0.997707,0.970241,0.994377,0.968998,0.985579
8,0.991116,1.000000,0.914253,0.966828,0.972568,0.985269,0.964117,0.982010,0.984022,0.971471,...,0.974777,0.947942,0.970261,0.988689,0.979823,0.998645,0.972875,0.990196,0.974638,0.982713
15,0.956762,0.914253,1.000000,0.914953,0.950125,0.950927,0.906975,0.923247,0.888292,0.920392,...,0.957841,0.856947,0.893839,0.917356,0.900642,0.873927,0.938017,0.930106,0.903008,0.892096
17,0.948457,0.966828,0.914953,1.000000,0.949537,0.933276,0.939038,0.961024,0.966644,0.942020,...,0.963750,0.933889,0.869626,0.947757,0.964055,0.960849,0.932213,0.964792,0.933463,0.952986
19,0.985932,0.972568,0.950125,0.949537,1.000000,0.963805,0.955135,0.980127,0.954985,0.962846,...,0.971151,0.966500,0.980166,0.979269,0.957911,0.977106,0.962211,0.979273,0.954240,0.971782
21,0.980286,0.985269,0.950927,0.933276,0.963805,1.000000,0.971693,0.971804,0.968064,0.960822,...,0.967762,0.945211,0.981616,0.985406,0.982254,0.985212,0.981214,0.980579,0.966095,0.975929
22,0.981591,0.964117,0.906975,0.939038,0.955135,0.971693,1.000000,0.953184,0.971452,0.951718,...,0.977421,0.916472,0.941138,0.971286,0.956515,0.970108,0.972618,0.978374,0.953578,0.973081
23,0.982744,0.982010,0.923247,0.961024,0.980127,0.971804,0.953184,1.000000,0.956024,0.976404,...,0.978953,0.944953,0.975375,0.981476,0.979839,0.981914,0.974022,0.985208,0.967124,0.980022
26,0.986789,0.984022,0.888292,0.966644,0.954985,0.968064,0.971452,0.956024,1.000000,0.954793,...,0.974563,0.915747,0.890158,0.962669,0.942070,0.984443,0.930512,0.976470,0.972752,0.982440
30,0.979119,0.971471,0.920392,0.942020,0.962846,0.960822,0.951718,0.976404,0.954793,1.000000,...,0.968585,0.926989,0.984888,0.978836,0.963939,0.974163,0.960456,0.974926,0.951942,0.982680


In [43]:
def mean_score(df, sm_df, target, closer_count=10):
    
    sms_df = sm_df.drop(target)
    sms_df = sms_df.sort_values(target, ascending=False)
    sms_df = sms_df[target][:closer_count]
    
    smsw_df = df.loc[sms_df.index]
    
    ms_df = pd.DataFrame(columns = df.columns)
    ms_df.loc['user'] = df.loc[target]
    ms_df.loc['mean'] = smsw_df.mean()
    
    return ms_df

In [47]:
ms_df = mean_score(matrix, sim, 4, 5)

In [52]:
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,...,1.8,0.0,1.7,2.0,1.8,0.8,1.5,0.8,1.9,1.0


In [48]:
def recommend(ms_df):
    recommend_df = ms_df.T
    recommend_df = recommend_df[recommend_df['user']==0]
    recommend_df = recommend_df.sort_values('mean',ascending=False)
    
    return recommend_df, list(recommend_df.index)

In [49]:
rec, rec_list = recommend(ms_df)

In [51]:
rec.head(3)

Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
4226,0.0,3.0
2858,0.0,2.8
2959,0.0,2.7


In [58]:
def mae(value, pred):
    idx = value.nonzero()[0]
    value,pred = np.array(value)[idx], np.array(pred)[idx]
    
    idx = pred.nonzero()[0]
    value,pred = np.array(value)[idx], np.array(pred)[idx]
    
    return np.absolute(sum(value-pred))/len(idx)

In [59]:
mae(ms_df.loc['user'], ms_df.loc['mean'])

3.5749999999999997

In [60]:
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,0.0,0.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
mean,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.8,...,1.8,0.0,1.7,2.0,1.8,0.8,1.5,0.8,1.9,1.0


In [63]:
def evaluate(df, sm_df, algorithm, closer_count=10):
    users = df.index
    evaluate_list = []
    
    for target in users:
        result_df = mean_score(df, sm_df, target, closer_count)
        evaluate_list.append(algorithm(result_df.loc['user'], result_df.loc['mean']))
    return np.average(evaluate_list)

In [68]:
evaluate(matrix, sim, mae, 1000)

2.154441665652981

In [77]:
def euc_sim(vector_1, vector_2):
    idx = vector_1.nonzero()[0]
    if len(idx) == 0:
        return 0
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    idx = vector_2.nonzero()[0]
    if len(idx)==0:
        return 0
    
    vector_1, vector_2 = np.array(vector_1)[idx], np.array(vector_2)[idx]
    
    return np.linalg.norm(vector_1 - vector_2)
    

In [78]:
euc_sim(a,b)

2.0

In [80]:
def find_best(user_df, sim_func, closer_count = 10):
    
    sm_df = sim_mat(matrix, sim_func)
    return evaluate(user_df, sm_df, mae, closer_count)

In [81]:
find_best(matrix, euc_sim, 5)

1.5419005295189465

In [82]:
for count in range(1,6):
    print(find_best(matrix, euc_sim, count))

1.0059198272
1.33777298177
1.4675817548
1.50583303772
1.54190052952
