### 1. Dataset

In [1]:
import pandas as pd
import numpy as np
from scipy import spatial

In [5]:
# Load the ratings and drop the timestamp column before the analysis.
rating_df = pd.read_csv("ratings_small.csv").drop(columns="timestamp")
rating_df.tail(2)

Unnamed: 0,userId,movieId,rating
100002,671,6385,2.5
100003,671,6565,3.5


### 2. Check Dataset

In [6]:
# Distinct user ids present in the ratings, and how many there are.
unique_user = pd.unique(rating_df["userId"])
len(unique_user)

671

In [7]:
# Distinct movie ids present in the ratings, and how many there are.
unique_movie = pd.unique(rating_df["movieId"])
len(unique_movie)

9066

In [8]:
# Rating distribution: number of ratings given at each score.
rating_sizes = rating_df.groupby("rating").size()
rating_sizes.reset_index(name="rating_counts")

Unnamed: 0,rating,rating_counts
0,0.5,1101
1,1.0,3326
2,1.5,1687
3,2.0,7271
4,2.5,4449
5,3.0,20064
6,3.5,10538
7,4.0,28750
8,4.5,7723
9,5.0,15095


In [12]:
# User distribution: number of ratings per user, most active users first.
user_counts_df = (
    rating_df.groupby("userId")
    .size()
    .reset_index(name="user_rating_count")
    .sort_values("user_rating_count", ascending=False)
)
user_counts_df.head()

Unnamed: 0,userId,user_rating_count
546,547,2391
563,564,1868
623,624,1735
14,15,1700
72,73,1610


In [15]:
# Movie distribution: number of ratings per movie, most rated movies first.
movie_counts_df = (
    rating_df.groupby("movieId")
    .size()
    .reset_index(name="movie_rating_count")
    .sort_values("movie_rating_count", ascending=False)
)
movie_counts_df.head()

Unnamed: 0,movieId,movie_rating_count
321,356,341
266,296,324
284,318,311
525,593,304
232,260,291


### 3. Preprocessing

In [22]:
# Rating-count thresholds for users and for movies: an id is kept only when
# its rating count is strictly greater than the respective limit.
user_limit = 365
movie_limit = 100

In [23]:
# Ids of the users whose rating count exceeds `user_limit`.
is_active_user = user_counts_df["user_rating_count"] > user_limit
filtered_userId = list(user_counts_df.loc[is_active_user, "userId"])
len(filtered_userId), filtered_userId[:5]

(59, [547, 564, 624, 15, 73])

In [26]:
# Ids of the movies whose rating count exceeds `movie_limit`.
is_popular_movie = movie_counts_df["movie_rating_count"] > movie_limit
filtered_movieId = list(movie_counts_df.loc[is_popular_movie, "movieId"])
len(filtered_movieId), filtered_movieId[:5]

(149, [356, 296, 318, 593, 260])

In [36]:
# Keep only the ratings made by the retained users on the retained movies.
# Row count: 100004 (all ratings) -> 5570 (after both filters).
is_kept_user = rating_df["userId"].isin(filtered_userId)
is_kept_movie = rating_df["movieId"].isin(filtered_movieId)
filtered_df = rating_df[is_kept_user & is_kept_movie]
len(filtered_df)

5570

### 4. Pivot

In [47]:
# User x movie rating matrix; user/movie pairs without a rating are filled with 0.
user_df = pd.pivot_table(
    filtered_df,
    values="rating",
    index="userId",
    columns="movieId",
    aggfunc=np.average,
    fill_value=0,
    dropna=False,
)
user_df.head()

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
15,2.0,2.0,4.0,3.0,3.0,4.0,3.0,1.0,2.5,5.0,...,1.0,3.5,1.0,1.5,5.0,0.5,2.0,4.5,4.5,5.0
19,3.0,3.0,3.0,3.0,3.0,3.0,4.0,3.0,3.0,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
23,3.0,0.0,3.5,0.0,0.0,4.0,3.5,0.0,0.0,4.5,...,3.5,4.5,5.0,4.5,4.5,3.5,0.0,3.5,0.0,0.0
30,4.0,2.0,4.0,0.0,4.0,2.0,4.0,0.0,0.0,4.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
48,4.0,3.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,4.0,0.0,4.0,4.5,0.0,3.5,0.0,4.0,4.0


### 5. Functions

In [45]:
def cosine_similarity(vector_1, vector_2):
    """Cosine similarity of two rating vectors over their co-rated positions.

    A value of 0 is treated as "not rated": only the positions where both
    vectors are non-zero take part in the computation.

    Parameters
    ----------
    vector_1, vector_2 : array-like
        One-dimensional sequences of equal length (pandas Series, NumPy
        arrays or plain lists).

    Returns
    -------
    float
        ``1 - scipy.spatial.distance.cosine`` of the co-rated entries, or
        ``-1`` when there is no position that is non-zero in both vectors.
    """
    # Work on plain arrays: ``Series.nonzero()`` is gone from pandas >= 1.0,
    # and positional masks must not be confused with label-based indexing.
    vector_1, vector_2 = np.asarray(vector_1), np.asarray(vector_2)

    co_rated = (vector_1 != 0) & (vector_2 != 0)
    if not co_rated.any():
        # Nothing to compare; -1 is the sentinel for "no overlap".
        return -1

    return 1 - spatial.distance.cosine(vector_1[co_rated], vector_2[co_rated])

In [48]:
# Smoke test for cosine_similarity on the rating rows of users 15 and 19.
first_user, second_user = user_df.loc[15], user_df.loc[19]
cosine_similarity(first_user, second_user)

0.9501250301799182

In [49]:
def similarity_matrix(user_df, similarity_func):
    """Build the square row-by-row similarity table of ``user_df``.

    ``similarity_func(row_a, row_b)`` is evaluated for every ordered pair of
    rows of ``user_df``. The result is a DataFrame whose rows and columns are
    both labelled with ``user_df.index``; cell ``[a, b]`` holds the value
    returned for the pair ``(row a, row b)``.
    """
    # After transposing, each original row is a column that .items() yields.
    per_user = user_df.T

    scores = [
        [similarity_func(row_vec, col_vec) for _, col_vec in per_user.items()]
        for _, row_vec in per_user.items()
    ]

    return pd.DataFrame(scores, index=user_df.index, columns=user_df.index)

In [51]:
# Smoke test for similarity_matrix: pairwise cosine similarity between all users.
sm_df = similarity_matrix(user_df=user_df, similarity_func=cosine_similarity)
sm_df.tail(5)

userId,15,19,23,30,48,56,73,102,105,119,...,580,587,596,605,607,615,624,654,664,665
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
615,0.913701,0.986879,0.977971,0.976351,0.990701,0.986844,0.977249,0.977529,0.981156,0.975449,...,0.98418,0.982973,0.984113,0.945157,0.979301,1.0,0.977023,0.991736,0.991745,0.97756
624,0.933455,0.973056,0.974726,0.970812,0.977897,0.964744,0.968863,0.967962,0.974071,0.964382,...,0.967887,0.974954,0.970422,0.949962,0.968329,0.977023,1.0,0.978762,0.977336,0.959331
654,0.917356,0.979269,0.981476,0.978836,0.987746,0.977249,0.97678,0.976687,0.983877,0.977521,...,0.983758,0.97794,0.981081,0.955068,0.98024,0.991736,0.978762,1.0,0.993751,0.973894
664,0.930106,0.979273,0.985208,0.974926,0.993049,0.976032,0.982777,0.976597,0.982344,0.982795,...,0.98368,0.97312,0.988273,0.956637,0.988647,0.991745,0.977336,0.993751,1.0,0.974557
665,0.903008,0.95424,0.967124,0.951942,0.97652,0.94866,0.962269,0.937813,0.964187,0.967379,...,0.962619,0.955417,0.954341,0.92442,0.960054,0.97756,0.959331,0.973894,0.974557,1.0


In [52]:
def mean_score(user_df, sm_df, target, closer_count=10):
    """Compare a user's ratings with the mean ratings of the most similar users.

    Parameters
    ----------
    user_df : pandas.DataFrame
        Rating matrix with one row per user and one column per item.
    sm_df : pandas.DataFrame
        Square similarity table indexed by the same user labels on both axes.
    target : label
        The user to score; must be a label of ``user_df`` and ``sm_df``.
    closer_count : int, default 10
        Number of most similar users (excluding ``target``) to average over.

    Returns
    -------
    pandas.DataFrame
        Two rows over the columns of ``user_df``: ``"user"`` holds the
        target's own row and ``"mean"`` the column-wise mean of the selected
        neighbours' rows. The mean is a plain average, so entries equal to 0
        in the neighbours' rows count toward it.
    """
    # Rank every other user by similarity to the target, best first.
    neighbours = (
        sm_df[target]
        .drop(target)
        .sort_values(ascending=False)
        .index[:closer_count]
    )
    neighbour_ratings = user_df.loc[neighbours]

    # Build the result in one step instead of enlarging an empty frame row by
    # row, which would leave the columns with object dtype.
    own_row = np.asarray(user_df.loc[target], dtype=float)
    mean_row = np.asarray(
        neighbour_ratings.mean().reindex(user_df.columns), dtype=float
    )
    return pd.DataFrame(
        np.vstack([own_row, mean_row]),
        index=["user", "mean"],
        columns=user_df.columns,
    )

In [57]:
# Smoke test for mean_score: user 48 against its 10 most similar users.
ms_df = mean_score(user_df, sm_df, target=48, closer_count=10)
ms_df

movieId,1,2,6,10,25,32,34,36,39,47,...,6377,6539,6874,7153,7361,7438,8961,33794,58559,79132
user,4.0,3.5,0.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,...,4.0,4.0,0.0,4.0,4.5,0.0,3.5,0.0,4.0,4.0
mean,3.6,1.65,1.5,1.5,1.2,2.1,1.55,0.95,1.55,2.95,...,2.6,2.85,2.4,3.05,2.35,2.35,3.0,2.6,1.4,1.75


In [58]:
# recommend
def recommend(ms_df):
    """Rank the items whose ``"user"`` value is 0 by their ``"mean"`` value.

    ``ms_df`` is expected to have the rows ``"user"`` and ``"mean"``. Returns a
    tuple ``(frame, labels)``: the transposed frame restricted to items whose
    ``"user"`` entry equals 0, sorted by ``"mean"`` in descending order, and
    the list of its index labels in that order.
    """
    per_item = ms_df.transpose()
    unseen = per_item.loc[per_item["user"] == 0]
    ranked = unseen.sort_values(by="mean", ascending=False)
    return ranked, list(ranked.index)

In [61]:
# Smoke test for recommend: show the ten best-ranked ids and the head of the table.
recommend_df, recommend_list = recommend(ms_df)
top_ten = recommend_list[:10]
print(top_ten)
recommend_df.head()

[260, 1198, 1291, 318, 1036, 1196, 1210, 1265, 1136, 2028]


Unnamed: 0_level_0,user,mean
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
260,0.0,4.4
1198,0.0,4.35
1291,0.0,4.25
318,0.0,4.0
1036,0.0,3.95


In [62]:
# MAE
def mae(value, pred):
    """Mean absolute error over the positions where both inputs are non-zero.

    A value of 0 is treated as "missing" on either side, so a position only
    contributes when both ``value`` and ``pred`` are non-zero there.

    Parameters
    ----------
    value, pred : array-like
        One-dimensional numeric sequences of equal length (pandas Series,
        NumPy arrays or lists).

    Returns
    -------
    float
        Mean of ``|value - pred|`` over the comparable positions, or NaN when
        there is no position that is non-zero in both inputs.
    """
    # Plain float arrays: ``Series.nonzero()`` is gone from pandas >= 1.0 and
    # object-dtype rows would otherwise leak into the arithmetic.
    value = np.asarray(value, dtype=float)
    pred = np.asarray(pred, dtype=float)

    comparable = (value != 0) & (pred != 0)
    if not comparable.any():
        # Nothing to compare; avoid dividing by zero.
        return np.nan

    return np.mean(np.abs(value[comparable] - pred[comparable]))

In [63]:
# Smoke test for mae: the target user's own ratings against the neighbour means.
actual_row, predicted_row = ms_df.loc["user"], ms_df.loc["mean"]
mae(actual_row, predicted_row)

1.2492187500000003

In [68]:
def evaluate(user_df, sm_df, closer_count, algorithm):
    """Average an error metric over every user in ``user_df``.

    For each label in ``user_df.index`` the ``mean_score`` table is computed
    with ``closer_count`` neighbours, and ``algorithm`` is called with that
    table's ``"user"`` row and ``"mean"`` row. Returns the ``np.average`` of
    the per-user results.
    """

    def score_for(target):
        # Per-user metric: own ratings versus the neighbour means.
        scored = mean_score(user_df, sm_df, target, closer_count)
        return algorithm(scored.loc["user"], scored.loc["mean"])

    return np.average([score_for(target) for target in user_df.index])

In [70]:
# Smoke test for evaluate: average MAE over all users with 10 neighbours.
evaluate(user_df, sm_df, closer_count=10, algorithm=mae)

1.4807110633380405

In [74]:
# Sweep the neighbourhood size (both bounds inclusive) and print the averaged
# MAE for each setting.
start = 2
end = 30

for closer_count in range(start, end + 1):
    average_error = evaluate(user_df, sm_df, closer_count, mae)
    print(closer_count, average_error)

2 1.3048712324774463
3 1.4891618021647992
4 1.5568822047461945
5 1.5484965993474276
6 1.5375646916686694
7 1.5316502433082964
8 1.5144641478552185
9 1.4995695256828276
10 1.4807110633380405
11 1.4558345404661428
12 1.4378479960514683
13 1.4208386632721917
14 1.4161047676988603
15 1.3970124136329434
16 1.3866435766974392
17 1.3694303019200813
18 1.3593708078060438
19 1.353514097757527
20 1.3433332605017512
21 1.3290482949231113
22 1.3213094041644209
23 1.3128248157474336
24 1.3095068822032607
25 1.3054660665050735
26 1.2993295075445872
27 1.2993470519666857
28 1.2999265049990254
29 1.3031327558747277
30 1.2987115384834071
