In [1]:
import pandas as pd
import numpy as np

## LOAD data

In [5]:
datas=pd.read_csv('ratings.csv')
datas.head()

Unnamed: 0,사람,책,평점
0,민지,백설공주,5.0
1,민지,신데렐라,4.0
2,민지,어린왕자,1.0
3,민지,흥부전,3.0
4,현우,노인과바다,3.0


## pivoting method(교차표)

In [8]:
favor=pd.pivot(data=datas, index='사람',columns='책',values='평점')
favor

책,노인과바다,백설공주,신데렐라,어린왕자,콩쥐팥쥐,흥부전
사람,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
민수,3.0,4.0,4.0,3.0,4.0,
민지,,5.0,4.0,1.0,,3.0
지민,4.0,1.0,,5.0,2.0,3.0
지연,5.0,,3.0,4.0,3.0,3.0
현우,3.0,,2.0,,1.0,2.0


# 추천 시스템의 method = 결측 값에 대해서는 계산을 불허한다.
# method 파헤치기

In [9]:
#np.isfinite => 한정된 값만을 파악하는 함수

In [10]:
def get_pearson_correlation(u,v,epsilon):
    mask = ~np.isnan(u) & ~np.isnan(v)
    u=u[mask]
    u_mean=u.mean()
    v=v[mask]
    v_mean=v.mean()
    uvdot=((u-u_mean)*(v-v_mean)).sum()
    norm1=((u-u_mean)**2).sum()
    norm2=((v-v_mean)**2).sum()
    score=uvdot/(np.sqrt(norm1*norm2)+epsilon)
    return score

In [16]:
from numpy import dot
from numpy.linalg import norm

def get_cosine_similarity(u,v):
    mask=np.isfinite(u) & np.isfinite(v)
    u=u[mask]
    v=v[mask]
    return dot(u,v)/(norm(u)*norm(v))

In [12]:
from itertools import product

In [15]:
favor.index
list(product(favor.index,repeat=2))

[('민수', '민수'),
 ('민수', '민지'),
 ('민수', '지민'),
 ('민수', '지연'),
 ('민수', '현우'),
 ('민지', '민수'),
 ('민지', '민지'),
 ('민지', '지민'),
 ('민지', '지연'),
 ('민지', '현우'),
 ('지민', '민수'),
 ('지민', '민지'),
 ('지민', '지민'),
 ('지민', '지연'),
 ('지민', '현우'),
 ('지연', '민수'),
 ('지연', '민지'),
 ('지연', '지민'),
 ('지연', '지연'),
 ('지연', '현우'),
 ('현우', '민수'),
 ('현우', '민지'),
 ('현우', '지민'),
 ('현우', '지연'),
 ('현우', '현우')]

In [22]:
def get_cosine_correlation_table(favor, epsilon=0.0001):

    person_tuple=list(product(favor.index, repeat=2))
    list_similarity=[]
    for uname,vname in person_tuple:
        u=favor.loc[uname]
        v=favor.loc[vname]
        score=get_cosine_similarity(u,v)
        similarity={
        'u':uname,
        'v':vname,
        'score':score
        }
        list_similarity.append(similarity)
    df_similarity=pd.DataFrame(list_similarity)
    table_sim=pd.pivot_table(df_similarity,index='u',columns='v', values='score')

    return table_sim

In [23]:
tbl_sim=get_cosine_correlation_table(favor)

In [24]:
tbl_sim

v,민수,민지,지민,지연,현우
u,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
민수,1.0,0.939827,0.813206,0.938986,0.876523
민지,0.939827,1.0,0.542857,0.840841,0.989949
지민,0.813206,0.542857,1.0,0.974406,0.992583
지연,0.938986,0.840841,0.974406,1.0,0.980581
현우,0.876523,0.989949,0.992583,0.980581,1.0


In [31]:
def knn(user_name, item_name, favor, sim,k):
    r=favor[item_name].drop(index=user_name)
    s=tbl_sim[user_name].drop(index=user_name)

    mask=np.isfinite(r) & np.isfinite(s)
    r= r[mask]
    s=s[mask]

    list_nearst=s.sort_values(ascending=False).head(k).index

    return list_nearst

In [33]:
knn('민지','백설공주',favor,tbl_sim,k=2)

Index(['민수', '지민'], dtype='object', name='u')

In [34]:
favor

책,노인과바다,백설공주,신데렐라,어린왕자,콩쥐팥쥐,흥부전
사람,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
민수,3.0,4.0,4.0,3.0,4.0,
민지,,5.0,4.0,1.0,,3.0
지민,4.0,1.0,,5.0,2.0,3.0
지연,5.0,,3.0,4.0,3.0,3.0
현우,3.0,,2.0,,1.0,2.0


In [36]:
def avg_cor_predict_rating(user_name, item_name, pivoted_df, tbl_sim,k):
    nn=knn(user_name, item_name, pivoted_df, tbl_sim,k)
    if len(nn) == 0:
        return np.nan

    ur=favor.loc[user_name].mean()
    score1=0
    for user in nn:
        r=(favor.loc[user] - favor.loc[user].mean())
        s=tbl_sim.loc[user][user_name]
        srdot= (s*r).sum()
        norm=s.sum()
        score1=score1+(srdot/norm)

    score=ur+score1

    return score

In [41]:
avg_cor_predict_rating('지민', '신데렐라', favor, tbl_sim, k=2)

2.9999999999999996