# 추천시스템: 협업필터링

## 2. 최근접이웃필터링(Nearest Neighbor Filtering)
최근접이웃필터링에는 사용자 기반 필터링(나와 비슷한 고객이 00 상품을 구매)과 아이템 기반 필터링(이 상품을 선택한 다른 고객이 00 상품을 구매)이 있다. 일반적으로 사용자 기반 필터링보단 아이템 기반 필터링이 더 많이 쓰인다. 


In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import warnings
warnings.filterwarnings('ignore')

### (1) 데이터 불러오기
사용한 데이터는 [영화평점데이터](https://grouplens.org/datasets/movielens/latest/)이다. 여기서 **movies**와 **ratings** 데이터를 사용하여 최근접이웃필터링을 진행한다.

In [2]:
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [3]:
movies.head(2)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy


In [4]:
ratings.head(2)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247


In [6]:
ratings['rating'].describe()

count    100836.000000
mean          3.501557
std           1.042529
min           0.500000
25%           3.000000
50%           3.500000
75%           4.000000
max           5.000000
Name: rating, dtype: float64

### Step1: 사용자-아이템 평점행렬 생성
movies와 ratings 데이터를 movieId를 기준으로 merge를 한 후, `pd.pivot_table`을 이용하여 index는 userId, column은 영화제목, 값은 평점을 가지는 사용자-아이템 평점행렬을 생성한다.

In [10]:
# 사용자-아이템 평점행렬 생성
df = pd.merge(ratings, movies, how = 'left', on = 'movieId')
df.head(2)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,1,3,4.0,964981247,Grumpier Old Men (1995),Comedy|Romance


In [15]:
rating_mat = df.pivot_table('rating', index = 'userId', columns = 'title')
# fill NaN values with 0
rating_mat.fillna(0, inplace = True)
rating_mat

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
607,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,4.5,3.5,0.0,0.0,0.0
609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


하지만 위의 평점행렬의 경우, userId가 index에 위치하기 때문에 사용자 기반 필터링이다. 지금 구현하고자 하는 것은 아이템 기반 최근접이웃 필터링이기 때문에, transpose를 이용해 index와 column의 위치를 바꾸어 영화제목이 index로 가도록 한다.

In [17]:
# 아이템기반 사용자-아이템 평점 행렬
rating_mat_T = rating_mat.transpose()
rating_mat_T.head(3)

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.metrics.pairwise import cosine_similarity

### Step 2: 코사인 유사도 계산
`cosine_similarity`함수를 이용하여 영화간 평점 유사도를 측정한다.

In [22]:
item_sim = cosine_similarity(rating_mat_T, rating_mat_T)
item_sim = pd.DataFrame(item_sim, index = rating_mat_T.index, columns = rating_mat_T.index)
item_sim

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.141653,0.000000,...,0.000000,0.342055,0.543305,0.707107,0.0,0.000000,0.139431,0.327327,0.000000,0.0
'Hellboy': The Seeds of Creation (2004),0.000000,1.000000,0.707107,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Round Midnight (1986),0.000000,0.707107,1.000000,0.000000,0.000000,0.0,0.176777,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Salem's Lot (2004),0.000000,0.000000,0.000000,1.000000,0.857493,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
'Til There Was You (1997),0.000000,0.000000,0.000000,0.857493,1.000000,0.0,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
eXistenZ (1999),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.211467,0.216295,0.097935,0.132489,...,0.000000,0.000000,0.000000,0.000000,0.0,1.000000,0.192259,0.000000,0.170341,0.0
xXx (2002),0.139431,0.000000,0.000000,0.000000,0.000000,0.0,0.089634,0.000000,0.276512,0.019862,...,0.069716,0.305535,0.173151,0.246482,0.0,0.192259,1.000000,0.270034,0.100396,0.0
xXx: State of the Union (2005),0.327327,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.156764,0.000000,...,0.000000,0.382543,0.177838,0.231455,0.0,0.000000,0.270034,1.000000,0.000000,0.0
¡Three Amigos! (1986),0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.372876,0.180009,0.169385,0.249586,...,0.180009,0.000000,0.000000,0.000000,0.0,0.170341,0.100396,0.000000,1.000000,0.0


In [27]:
# 예시: 영화 인셉션과 코사인 유사도가 높은 영화들 출력
item_sim['Inception (2010)'].sort_values(ascending = False)[:4]

title
Inception (2010)               1.000000
Dark Knight, The (2008)        0.727263
Inglourious Basterds (2009)    0.646103
Shutter Island (2010)          0.617736
Name: Inception (2010), dtype: float64

## 개인화된 추천하기
전체 평점을 이용하여 영화를 추천할 경우 누락된 값들도 다수 포함되기 때문에 정확도가 다소 떨어질 수 있다. 따라서 이보다는 개인화된 추천을 하는 것이 더 정확도가 높을 수 있다. 본 코드에서 개인화된 예측 평점은 다음의 식으로 예측한다.

$$ \hat{R_{u, i}} = \sum(S_{i, N} * R_{u, N})/\sum(|S_{i, N}|) $$

 + $\hat{R_{u, i}}$: 사용자 u, 아이템 i의 개인화된 예측 평점 값
 + $S_{i, N}$: 아이템 i와 가장 유사도가 높은 top n개의 유사도 벡터
 + $R_{u, N}$: 사용자 u, 아이템 i와 가장 유사도가 높은 top n개 아이템에 대한 실제 평점 벡터

### Step 1: 개인화된 예측 평점 값을 계산하는 함수 생성
[알고리즘]
 1. 사용자-아이템 행렬과 shape이 똑같은 pred matrix를 생성한다(모두 0)
 2. 사용자-아이템 평점행렬의 column, 즉 영화제목 하나씩 for문을 수행한다.
  + `np.argsort`를 이용해 평점 코사인 유사도 행렬에서 기준 영화제목에 해당하는 데이터 행을 뽑은 뒤, 평점이 높은 20개에 해당하는 영화제목의 index를 추출한다.(top_n)
 3. 평점 코사인 유사도 행렬에서 기준 영화제목, top_n에 해당하는 데이터를 추출하고($S_{i,N}$) 이를 사용자 아이템 행렬에서 각 유저와 top n에 해당하는 데이터와 내적한다.
 4. 3에서 구한 값을 $S_{i,N}$의 absolute sum으로 나눠준다.

In [67]:
def predict_rating(ratings_arr, item_sim_arr, n = 20):
    pred = np.zeros(ratings_arr.shape)
    
    for col in range(ratings_arr.shape[1]):
        # col(영화)와 유사한 다른 영화 선택(20개)
        top_n = np.argsort(item_sim.iloc[:, col])[:-n-1:-1] # 맨뒤에서부터 20번째까지 1씩 감소시키며 indecing
        
        for row in range(ratings_arr.shape[0]):
            pred[row, col] = item_sim_arr[col, :][top_n].dot(ratings_arr[row, :][top_n].T) # 20 x 
            pred[row, col] /= np.sum(np.abs(item_sim_arr[col, :][top_n]))
    return pred

In [None]:
rating_pred = predict_rating(rating_mat.values, item_sim.values, n = 20)
rating_pred

### Step 2: 유저가 보지 않은 영화중에 추천하기
좀 더 정확한 추천을 위해서는 유저가 보지 않은 영화만 골라서 추천하는 것이 더 합리적이다. 보통 봤던 영화를 또 추천하지는 않으니!
<br>
유저가 보지 않은 영화 리스트를 뽑아내는 함수를 만들고 **Step 1**에서 만든 rating_pred를 인덱싱하면 아직 보지 않은 영화 중에서 추천이 가능하다.

In [73]:
# 안본 영화중에 추천
def get_unseen_list(ratings_matrix, userId):
    # 특정 유저의 유저-아이템 평점 행렬 데이터를 추출한다.
    user_rating = ratings_matrix.iloc[userId, :]
    # 아직 평점을 주지 않았으면 0으로 처리되어있기 때문에 평점이 0인 데이터가 아직 보지 않은 영화이다.
    seen_movies = user_rating[user_rating > 0].index.tolist()
    all_movies = ratings_matrix.columns.tolist()
    unseen_list = [movie for movie in all_movies if movie not in seen_movies]
    
    return unseen_list

In [74]:
# 3번 유저의 안 본 영화 리스트 추출
get_unseen_list(rating_mat, 3)

["'71 (2014)",
 "'Hellboy': The Seeds of Creation (2004)",
 "'Round Midnight (1986)",
 "'Salem's Lot (2004)",
 "'Til There Was You (1997)",
 "'Tis the Season for Love (2015)",
 "'burbs, The (1989)",
 "'night Mother (1986)",
 '(500) Days of Summer (2009)',
 '*batteries not included (1987)',
 '...All the Marbles (1981)',
 '...And Justice for All (1979)',
 '00 Schneider - Jagd auf Nihil Baxter (1994)',
 '1-900 (06) (1994)',
 '10 (1979)',
 '10 Cent Pistol (2015)',
 '10 Cloverfield Lane (2016)',
 '10 Items or Less (2006)',
 '10 Things I Hate About You (1999)',
 '10 Years (2011)',
 '10,000 BC (2008)',
 '100 Girls (2000)',
 '100 Streets (2016)',
 '101 Dalmatians (1996)',
 '101 Dalmatians (One Hundred and One Dalmatians) (1961)',
 "101 Dalmatians II: Patch's London Adventure (2003)",
 '101 Reykjavik (101 Reykjavík) (2000)',
 '102 Dalmatians (2000)',
 '10th & Wolf (2006)',
 '10th Kingdom, The (2000)',
 '10th Victim, The (La decima vittima) (1965)',
 '11\'09"01 - September 11 (2002)',
 '11:14 (2