In [1]:
%matplotlib inline
import pandas as pd

In [5]:
# 데이터 불러오기 (valid는 20% uniform random sampling)
metadata = pd.read_csv('metadata.csv')
ratings_train = pd.read_csv('ratings-train.csv')
ratings_valid = pd.read_csv('ratings-valid.csv').sample(frac=0.2, random_state=17)

In [6]:
metadata.head()

Unnamed: 0,itemid,title,genres,country,running_min
0,55396,시라노;연애조작단 (2010),로맨스/멜로/코미디,한국,121
1,42858,본 얼티메이텀 (2007),액션,미국,-1
2,121058,어느 가족 (2018),드라마,일본,121
3,4285,펀치 드렁크 러브 (2002),코미디,미국,95
4,36948,자토이치 (2003),액션/시대극,일본,115


In [18]:
ratings_train.head()

Unnamed: 0,userid,itemid,rating
0,MVdyRkY=,55396,9
1,TzZqVQ==,55396,7
2,REI4WTI=,55396,9
3,bDJ4aA==,55396,7
4,QktyMw==,55396,7


In [8]:
# RMSE
def rmse(expected, answer):
    merged = pd.merge(answer, expected, on=['userid', 'itemid'], how='left')
    merged['rating_y'] = merged['rating_y'].fillna(0)
    merged['square_error'] = (merged['rating_x'] - merged['rating_y']) ** 2
    return merged['square_error'].mean() ** 0.5

In [11]:
# jaccard similarity
# u, v => len(i_dict[u] & i_dict[v]) / len(i_dict[u] | i_dict[v])
# u와 v가 둘다 평점을 남긴 영화의 수 / u나 v가 평점을 남긴 영화의 수

all_users = ratings_train['userid'].unique()
i_dict = { u: set(ratings_train[ratings_train['userid'] == u]['itemid']) for u in all_users }

def sim(u,v):
    i_u = i_dict[u]
    i_v = i_dict[v]
    
    cup = i_dict[u] | i_dict[v]
    cap = i_dict[u] & i_dict[v]
    
    # 합집합이 공집합일 경우 -> 같이 본영화가 없음 -> similarity = 0
    if len(cup) == 0:
        return 0.0
    return len(cap) / len(cup)

In [13]:
sim('TERhUA==', 'Q1ladXM=')

0.08333333333333333

In [14]:
def similar_users(u, k):
    #모든 유저에 대한 jaccard similarity를 계산 (본인이 아닐경우에만)
    sims = [(sim(u,v), v) for v in all_users if u != v]
    
    #유사도 내림차순으로 정렬해서 top-k명
    sorted_sims = sorted(sims, reverse=True)
    topk_sims = sorted_sims[:k]
    topk_users = [v for s, v in topk_sims]
    
    return pd.DataFrame(topk_users, columns=['userid'])

In [15]:
similar_users('TERhUA==', 5)

Unnamed: 0,userid
0,YzkyQQ==
1,NGdmcVQ=
2,M2hETGQ=
3,V0NyaQ==
4,QTB5d0E=


In [17]:
def predict(u, i):
    # return <expected rating of i given u>
    return 8

# rmse함수에 predict결과를 넣어 validation값이랑 비교
expected = ratings_valid.copy()
expected['rating'] = expected.apply(lambda x: predict(x['userid'], x['itemid']), axis = 1)
rmse(expected, ratings_valid)

2.7423239390279464

In [84]:
def predict(u, i):
    topk_users = similar_users(u, 5)
    u_mean = ratings_train[ratings_train['userid'] == u]['rating'].mean()
    topk_users['mean'] = topk_users.apply(lambda x: ratings_train[ratings_train['userid'] == x['userid']]['rating'].mean(), axis = 1)
    topk_users['sim'] = topk_users.apply(lambda row: sim(u, row['userid']), axis=1)
    i_ratings = ratings_train[ratings_train['itemid'] == i]
    joined = pd.merge(topk_users, i_ratings, on='userid')
    joined['weighted_rating'] = joined['sim'] * (joined['rating'] - joined['mean'])
    return u_mean + (joined['weighted_rating'].sum() / joined['sim'].sum())

expected = ratings_valid.copy()
expected['rating'] = expected.apply(lambda x: predict(x['userid'], x['itemid']), axis = 1)
rmse(expected, ratings_valid)

  if __name__ == '__main__':


4.304172132614776

In [87]:
from surprise import Reader, Dataset
reader = Reader(rating_scale=(0,10))
train_ds = Dataset.load_from_df(ratings_train, reader).build_full_trainset()

In [88]:
from surprise import KNNBasic

model = KNNBasic (k = 5)
model.fit(train_ds)

Computing the msd similarity matrix...
Done computing similarity matrix.


<surprise.prediction_algorithms.knns.KNNBasic at 0x27877182948>

In [89]:
def predict(u, i):
    return model.predict(u, i).est

expected = ratings_valid.copy()
expected['rating'] = expected.apply(lambda x: predict(x['userid'], x['itemid']), axis = 1)
rmse(expected, ratings_valid)

2.275156622021938

In [None]:
r