# Item Based Filtering

협업 필터링에서 사용되는 사용자 기반, 아이템 기반 중 일반적으로 정확도가 더 높은 아이템 기반의 협업필터링을 구현해 볼 것이다. 

## Data Load

In [1]:
import pandas as pd
import numpy as np

# Load the movie metadata and the user rating tables from the
# ml-latest-small directory (paths are relative to the working directory).
movies=pd.read_csv('./ml-latest-small/movies.csv')
ratings=pd.read_csv('./ml-latest-small/ratings.csv')
# Print (rows, columns) of each table as a quick sanity check.
print(movies.shape)
print(ratings.shape)

(9742, 3)
(100836, 4)


In [2]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [3]:
movies.shape

(9742, 3)

In [4]:
# Attach title/genres to every rating row by joining the two tables on
# movieId (pd.merge defaults to an inner join), then preview the first 3 rows.
# NOTE(review): `a` is not referenced again in the visible cells.
a=pd.merge(ratings,movies,on='movieId')
a.head(3)

Unnamed: 0,userId,movieId,rating,timestamp,title,genres
0,1,1,4.0,964982703,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,847434962,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,1106635946,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [5]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [6]:
print('rating_max:',ratings['rating'].max(),'ratings_min:',ratings['rating'].min())

rating_max: 5.0 ratings_min: 0.5


아이템 기반 collaborative filtering 을 위해  
ratings data의 모든 사용자를 row로 영화를 column으로 바꿔야 한다.

In [7]:
# Reshape the long ratings table into a user x movie matrix: one row per
# userId, one column per movieId, each cell holding the rating (NaN where no
# rating exists for that user/movie pair).
ratings_matrix=ratings.pivot(index='userId', columns='movieId', values='rating')
ratings_matrix.tail()

movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
606,2.5,,,,,,2.5,,,,...,,,,,,,,,,
607,4.0,,,,,,,,,,...,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,...,,,,,,,,,,
609,3.0,,,,,,,,,4.0,...,,,,,,,,,,
610,5.0,,,,,5.0,,,,,...,,,,,,,,,,


rating의 최소 값은 0.5이므로 NaN 값은 모두 0으로 변환하겠다. 그리고 movieId가 숫자 값으로 할당되어 있어 사용자가 평점을 준 영화가 어떤 영화인지 파악할 수 없다. 가독성을 높이기 위해 칼럼명을 movieId가 아닌 영화명 title로 변경할 것이다.

In [8]:
# Join ratings with movies so that every rating row carries the title column.
ratings_movies=pd.merge(ratings,movies,on='movieId')

# Pivot with columns='title': rows are userId, columns are movie titles and
# cells are ratings. pivot_table aggregates duplicate (userId, title) pairs
# with its default aggfunc (mean).
ratings_matrix=ratings_movies.pivot_table('rating',index='userId',columns='title')

# Replace every NaN (no rating) with 0.
ratings_matrix=ratings_matrix.fillna(0)
ratings_matrix.head()

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## 영화 간 유사도 산출

이제 변환된 사용자-영화 평점 행렬 데이터 세트를 이용해 영화 간의 유사도를 측정하겠다.  
scikit-learn의 cosine_similarity()를 이용해 코사인 유사도를 측정할 것이다.
먼저, 영화 별(아이템 별) 유사도를 측정하기 위해 transpose를 수행한다.

In [9]:
# Transpose the user x title matrix so that each row is a movie title and
# each column a user; row-wise similarity then compares movies.
ratings_matrix_transposed = ratings_matrix.T

In [10]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between the rows (movies) of the transposed
# rating matrix; the result is a square movie x movie array.
item_sim=cosine_similarity(ratings_matrix_transposed,ratings_matrix_transposed)

# Wrap the numpy array returned by cosine_similarity() in a DataFrame whose
# index and columns are both labelled with the movie titles.
item_sim_df=pd.DataFrame(data=item_sim, index=ratings_matrix.columns,columns=ratings_matrix.columns)

In [11]:
print(item_sim_df.shape)
item_sim_df.head(3)

(9719, 9719)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
item_sim_df["'Hellboy': The Seeds of Creation (2004)"].sort_values(ascending=False)[:6]

title
'Hellboy': The Seeds of Creation (2004)                       1.000000
Monsters (2010)                                               1.000000
Space Battleship Yamato (2010)                                1.000000
All the Right Moves (1983)                                    0.780869
Hidden Fortress, The (Kakushi-toride no san-akunin) (1958)    0.747409
...And Justice for All (1979)                                 0.715542
Name: 'Hellboy': The Seeds of Creation (2004), dtype: float64

'Hellboy': The Seeds of Creation (2004)와 유사도가 높은 상위 6개 영화를 추출해 보았다. 주로 액션 영화를 추천해 준 결과를 확인할 수 있다.  
또한, ...And Justice for All (1979)와 같은 드라마/미스터리 장르의 영화도 추천한 것을 확인할 수 있다.

## Item based + 개인화된 영화 추천

앞 예제에서 만든 item based 영화 유사도 데이터는 모든 사용자의 평점을 기준으로 영화의 유사도를 생성하여 추천했다.  
하지만 이는 개인적인 취향을 반영할 수 없다. 그래서 이번에는 개인이 아직 관람하지 않은 영화에 대해서 아이템 유사도와  
기존에 관람한 영화의 평점 데이터를 기반으로 해 개인화가 가능한 추천을 해보겠다.

+ rating_arr.dot(item_sim_arr): 사용자-영화 평점 행렬과 영화 간 유사도 행렬의 내적으로, 사용자가 매긴 평점을 유사도로 가중합한 값이다. 이 값을 유사도 절댓값의 합으로 나누어 예측 평점을 구한다.

In [13]:
def predict_rating(rating_arr, item_sim_arr):
    """Predict every user's rating for every item as a similarity-weighted mean.

    Args:
        rating_arr: 2-D array (users x items) of observed ratings; the
            notebook stores 0 where a user has not rated an item.
        item_sim_arr: 2-D square array (items x items); entry [i, j] is the
            similarity between item i and item j.

    Returns:
        2-D array with the same shape as ``rating_arr``. Entry [u, j] equals
        sum_i(rating_arr[u, i] * item_sim_arr[i, j]) / sum_i(|item_sim_arr[i, j]|).
    """
    # Numerator: ratings weighted by similarity to each target item.
    weighted_sum = rating_arr.dot(item_sim_arr)
    # Column j of the product is weighted by item_sim_arr[:, j], so the
    # normaliser has to be the column-wise sum (axis=0). For a symmetric
    # similarity matrix this equals the row-wise sum used previously.
    normaliser = np.abs(item_sim_arr).sum(axis=0)
    ratings_pred = weighted_sum / normaliser
    return ratings_pred

In [14]:
ratings_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Predict ratings for all users and movies from the raw numpy arrays behind
# the rating matrix and the item-similarity DataFrame.
ratings_pred=predict_rating(ratings_matrix.values,item_sim_df.values)
# Re-attach the userId index and the title column labels to the predictions.
ratings_pred_matrix=pd.DataFrame(data=ratings_pred,index=ratings_matrix.index,columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501


예측 평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적(dot)한 값이기 때문에 기존에 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생한다. 또한, 예측 평점이 실제 평점에 비해 작을 수 있다. 이는 내적 결과를 코사인 유사도 벡터 합으로 나누었기 때문에 생기는 현상이다.

그렇다면 이 예측 결과가 원래의 실제 평점과 얼마나 차이가 있는지 확인해보자. 예측 평가 지표는 MSE(Mean Squared Error)를 사용한다.  
주의할 것은 우리는 관람하지 않은 영화 평점에 대해서 0으로 했지만, 이런 자료에 대해서도 예측을 수행했다.  
따라서, 실제로 관람을 한 영화에 대해서만 MSE를 구할 것이다.

In [16]:
from sklearn.metrics import mean_squared_error

# 사용자가 평점을 부여한 영화에 대해서만 예측 성능 평가 MSE를 구함
def get_mse(pred,actual):
    """Return the MSE between predictions and ratings the user actually gave.

    Only positions where ``actual`` is non-zero are scored; a zero in
    ``actual`` is treated as "not rated" and is excluded.

    Args:
        pred: array of predicted ratings, same shape as ``actual``.
        actual: array of observed ratings with 0 marking missing ratings.

    Returns:
        The value returned by ``mean_squared_error`` for the rated entries.
    """
    # nonzero() returns the index arrays of the non-zero entries; compute it
    # once and reuse it for both arrays so they stay aligned.
    rated = actual.nonzero()
    # Indexing with the index tuple already yields 1-D arrays.
    y_true = actual[rated]
    y_pred = pred[rated]
    # Pass arguments in the (y_true, y_pred) order of the sklearn signature.
    return mean_squared_error(y_true, y_pred)

print('아이템 기반 모든 최근접 이웃 MSE:',get_mse(ratings_pred,ratings_matrix.values))

아이템 기반 모든 최근접 이웃 MSE: 9.895354759094706


실제 값과 예측 값은 서로 스케일이 다르기 때문에 MSE가 클 수도 있다. 따라서 MSE를 감소시켜보자.  
predict_rating() 함수는 사용자별 영화의 예측 평점을 계산하기 위해 해당 영화와 다른 모든 영화 간의 유사도 벡터를 적용했다.  
많은 영화의 유사도 벡터를 이용하다 보니 상대적으로 평점이 떨어진 것이다.  
따라서, 특정 영화와 가장 비슷한 유사도를 가지는 영화에 대해서만 유사도 벡터를 적용하는 함수로 변경할 것이다.

In [73]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings using only the ``n`` most similar items per target item.

    Args:
        ratings_arr: 2-D array (users x items) of observed ratings, with 0
            where no rating exists.
        item_sim_arr: 2-D square array (items x items) of item similarities.
        n: number of most-similar items used for each prediction.

    Returns:
        2-D float array with the same shape as ``ratings_arr``. Entry [u, j]
        is the similarity-weighted mean of user u's ratings over the ``n``
        items most similar to item j.
    """
    # Prediction matrix initialised with zeros, same shape as the ratings.
    pred = np.zeros(ratings_arr.shape)

    # One pass per item (column of the user-item matrix).
    for col in range(ratings_arr.shape[1]):
        col_sims = item_sim_arr[:, col]
        # Indices of the n largest similarities, most similar first. This is
        # kept as a plain 1-D index array: wrapping it in a list triggers
        # NumPy's deprecated non-tuple sequence indexing.
        top_n_items = np.argsort(col_sims)[:-n - 1:-1]
        weights = col_sims[top_n_items]
        # Weighted mean for all users at once instead of a Python loop
        # over rows.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / np.sum(np.abs(weights))

    return pred
        
        

In [75]:
# Recompute the predictions using only the 20 most similar movies per target
# movie, then report the MSE over the movies each user actually rated.
ratings_pred=predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)
print('아이템 기반 최근접 top-20 이웃 MSE:',get_mse(ratings_pred,ratings_matrix.values))

# Rebuild the predicted ratings as a DataFrame labelled by userId and title.
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)

  # This is added back by InteractiveShellApp.init_path()
  del sys.path[0]


아이템 기반 최근접 top-20 이웃 MSE: 3.69501623729494


In [77]:
# Ratings given by the user with userId 9: keep only the rated movies
# (rating > 0) and list them from the highest rating down.
user_ratings_id=ratings_matrix.loc[9,:]
user_ratings_id[user_ratings_id>0].sort_values(ascending=False)

title
Adaptation (2002)                                                                 5.0
Austin Powers in Goldmember (2002)                                                5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Producers, The (1968)                                                             5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Back to the Future (1985)                                                         5.0
Glengarry Glen Ross (1992)                                                        4.0
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)                                     4.0
Last Seduction, The (1994)                                                        4.0
Pumpkin (2002)                                  

In [84]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the titles the given user has not rated yet.

    Args:
        ratings_matrix: DataFrame indexed by userId with one column per movie
            title; a value greater than 0 means the user rated that movie.
        userId: index label of the user to look up.

    Returns:
        List of column labels whose rating for ``userId`` is not greater than
        0, in the column order of ``ratings_matrix``.
    """
    # Row of this user's ratings: a Series indexed by movie title.
    user_rating = ratings_matrix.loc[userId, :]

    # Titles with a rating above 0 were already seen. A set gives O(1)
    # membership tests in the filter below instead of a list scan per title.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Every title in the matrix, in column order.
    movies_list = ratings_matrix.columns.tolist()

    # Keep only the titles the user has not seen.
    unseen_list = [movie for movie in movies_list if movie not in already_seen]

    return unseen_list

In [89]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` unseen movies with the highest predicted rating.

    Args:
        pred_df: DataFrame of predicted ratings, indexed by userId with one
            column per movie title.
        userId: index label of the user to recommend for.
        unseen_list: column labels (titles) to consider.
        top_n: maximum number of recommendations to return.

    Returns:
        Series of predicted ratings indexed by title, sorted in descending
        order and truncated to ``top_n`` entries.
    """
    # Predicted ratings of this user, restricted to the candidate titles.
    candidate_scores = pred_df.loc[userId, unseen_list]
    # Highest predicted rating first.
    ranked = candidate_scores.sort_values(ascending=False)
    return ranked[:top_n]

# Collect the titles that the user with userId 9 has not rated yet.
unseen_list= get_unseen_movies(ratings_matrix,9)

# Recommend movies via item-based nearest-neighbour collaborative filtering:
# the 10 unseen titles with the highest predicted rating.
recomm_movies= recomm_movie_by_userid(ratings_pred_matrix,9, unseen_list,top_n=10)

# Turn the resulting Series into a one-column DataFrame named 'pred_score'.
recomm_movies= pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711
