# 추천시스템 기초 실습

## 0. 데이터 불러오기 및 탐색

데이터는 movielens에서 제공하는 데이터 중 하나로, 가장 크기가 작은 버전으로 가져왔습니다.
- movies.csv: 영화 제목과 영화 장르에 대한 데이터
- ratings.csv: user(사용자)-movie(아이템) 선호도 데이터
- tags.csv: user가 해당 movie에 부여한 tag 데이터

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
DIR = '/content/drive/MyDrive/Colab Notebooks/tobigs15/'

## 1. Content-based Filtering 실습

content-based filtering은 movie(아이템) 간의 유사도를 계산하여 해당 movie와 유사한 movie를 추천해주는 간단한 추천 알고리즘

제공된 데이터에서 movie의 콘텐츠를 확인할 수 있는 데이터는 movies.csv의 장르와 tags.csv의 태그 정보가 있음

In [4]:
# 아이템 간 유사도 계산을 위한 데이터프레임 만들기
# 열은 영화 장르가 될 수도 있고, 영화 태그가 될 수도 있음 (영화 태그의 경우 모든 영화에 대한 태그는 없음)
movies = pd.read_csv(DIR+"ml-latest-small/movies.csv")
tags = pd.read_csv(DIR+"ml-latest-small/tags.csv")
ratings = pd.read_csv(DIR+"ml-latest-small/ratings.csv")
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


![image.png](attachment:image.png)

In [5]:
piv = pd.DataFrame(columns = ["movieId"])

for i in range(len(movies)):
    # print(m.movieId)
    genres = movies.loc[i, 'genres'].split("|") # genre를 |로 분리하고, 이에 해당하는 칸을 1로 설정하였다.
    piv.loc[i, 'movieId'] = movies.loc[i, 'movieId']
    piv.loc[i, genres] = 1
piv = piv.set_index("movieId")
piv

Unnamed: 0_level_0,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,War,Musical,Documentary,IMAX,Western,Film-Noir,(no genres listed)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,1.0,1.0,1.0,1.0,,,,,,,,,,,,,,,
2,1.0,,1.0,,1.0,,,,,,,,,,,,,,,
3,,,,1.0,,1.0,,,,,,,,,,,,,,
4,,,,1.0,,1.0,1.0,,,,,,,,,,,,,
5,,,,1.0,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,,1.0,,1.0,1.0,,,1.0,,,,,,,,,,,,
193583,,1.0,,1.0,1.0,,,,,,,,,,,,,,,
193585,,,,,,,1.0,,,,,,,,,,,,,
193587,,1.0,,,,,,1.0,,,,,,,,,,,,


In [6]:
# TF-IDF를 적용할 수 있지만 여기서는 skip
# TF-IDF를 적용하게 되면 다음과 같이 나오긴 합니당

In [7]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [8]:
movies['corpus'] = movies['genres'].apply(lambda x : ' '.join(x.split('|')))

In [9]:
tfidv = TfidfVectorizer().fit(movies['corpus'])

In [10]:
temp = pd.DataFrame(tfidv.transform(movies['corpus']).toarray())

In [11]:
temp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
0,0.000000,0.416846,0.516225,0.504845,0.267586,0.0,0.0,0.000000,0.482990,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.000000,0.512361,0.000000,0.620525,0.000000,0.0,0.0,0.000000,0.593662,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.000000,0.000000,0.000000,0.000000,0.570915,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.821009,0.0,0.0,0.0,0.0
3,0.000000,0.000000,0.000000,0.000000,0.505015,0.0,0.0,0.466405,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.726241,0.0,0.0,0.0,0.0
4,0.000000,0.000000,0.000000,0.000000,1.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9737,0.436010,0.000000,0.614603,0.000000,0.318581,0.0,0.0,0.000000,0.575034,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9738,0.000000,0.000000,0.682937,0.000000,0.354002,0.0,0.0,0.000000,0.638968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9739,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,1.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
9740,0.578606,0.000000,0.815607,0.000000,0.000000,0.0,0.0,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [12]:
# 아이템 간 유사도 구하기
from sklearn.metrics.pairwise import cosine_similarity

In [13]:
# 영화-영화의 유사도를 코사인 유사도를 사용해서 구해줍니당
CBF = pd.DataFrame(cosine_similarity(temp)) # cosine similarity 함수를 사용한 후 보기 편하도록 DataFrame으로 변환하였습니다.

In [14]:
# movie - moive 아이템 간 유사도 데이터프레임
CBF.index = piv.index
CBF.columns = piv.index
CBF

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,185135,185435,185473,185585,186587,187031,187541,187593,187595,187717,188189,188301,188675,188751,188797,188833,189043,189111,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.000000,0.813578,0.152769,0.135135,0.267586,0.000000,0.152769,0.654698,0.000000,0.262413,0.135135,0.136349,0.833737,0.000000,0.257331,0.000000,0.000000,0.267586,0.267586,0.094253,0.115861,0.000000,0.000000,0.000000,0.000000,0.000000,0.453434,0.000000,0.379305,0.000000,0.000000,0.0,0.453434,0.000000,0.571377,0.152769,0.000000,0.000000,0.000000,0.000000,...,0.0,0.157867,0.267586,0.264947,0.205390,0.182414,0.763343,0.091411,0.435076,0.0,0.361137,0.473788,0.000000,0.152769,0.267586,0.691840,0.196578,0.0,0.262413,0.000000,0.000000,0.128103,0.0,0.000000,0.267586,0.000000,0.000000,0.516225,0.0,0.276524,0.360397,0.465621,0.196578,0.516225,0.0,0.680258,0.755891,0.000000,0.421037,0.267586
2,0.813578,1.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.804715,0.000000,0.322542,0.000000,0.000000,0.631907,0.000000,0.316295,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.557333,0.000000,0.466219,0.000000,0.000000,0.0,0.557333,0.000000,0.548271,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.000000,0.325657,0.252453,0.224212,0.578554,0.000000,0.534769,0.0,0.339639,0.495234,0.000000,0.000000,0.000000,0.723156,0.000000,0.0,0.322542,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.240700,0.000000,0.000000,0.000000,0.000000,0.0,0.341376,0.379331,0.000000,0.000000,0.000000
3,0.152769,0.000000,1.000000,0.884571,0.570915,0.000000,1.000000,0.000000,0.000000,0.000000,0.884571,0.290910,0.000000,0.000000,0.467874,0.000000,0.690816,0.570915,0.570915,0.201096,0.247197,0.000000,0.000000,0.000000,0.690816,0.000000,0.000000,0.690816,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.267370,1.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.336821,0.570915,0.000000,0.000000,0.000000,0.000000,0.195033,0.000000,0.0,0.180957,0.151220,0.000000,1.000000,0.570915,0.220816,0.419413,0.0,0.000000,0.000000,0.000000,0.273316,0.0,0.690816,0.570915,0.000000,0.000000,0.000000,0.0,0.172171,0.162848,0.000000,0.419413,0.000000,0.0,0.181883,0.202105,0.000000,0.000000,0.570915
4,0.135135,0.000000,0.884571,1.000000,0.505015,0.000000,0.884571,0.000000,0.000000,0.000000,1.000000,0.257331,0.000000,0.466405,0.413868,0.234877,0.863110,0.505015,0.505015,0.329607,0.218663,0.124930,0.000000,0.168322,0.863110,0.466405,0.205061,0.863110,0.107408,0.234877,0.466405,0.0,0.205061,0.234877,0.236508,0.884571,0.466405,0.182705,0.188238,0.466405,...,0.0,0.297942,0.505015,0.000000,0.000000,0.121002,0.000000,0.172521,0.000000,0.0,0.160070,0.133765,0.234877,0.884571,0.505015,0.195327,0.687440,0.0,0.000000,0.000000,0.000000,0.447980,0.0,0.863110,0.505015,0.466405,0.466405,0.000000,0.0,0.152298,0.144051,0.201391,0.687440,0.000000,0.0,0.160888,0.178776,0.466405,0.000000,0.505015
5,0.267586,0.000000,0.570915,0.505015,1.000000,0.000000,0.570915,0.000000,0.000000,0.000000,0.505015,0.509550,0.000000,0.000000,0.000000,0.000000,0.000000,1.000000,1.000000,0.352234,0.432984,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.468319,0.570915,0.000000,0.000000,0.000000,0.000000,...,0.0,0.589966,1.000000,0.000000,0.000000,0.000000,0.000000,0.341614,0.000000,0.0,0.316960,0.264872,0.000000,0.570915,1.000000,0.386775,0.734632,0.0,0.000000,0.000000,0.000000,0.478734,0.0,0.000000,1.000000,0.000000,0.000000,0.000000,0.0,0.301570,0.285240,0.000000,0.734632,0.000000,0.0,0.318581,0.354002,0.000000,0.000000,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
193581,0.680258,0.341376,0.181883,0.160888,0.318581,0.239513,0.181883,0.000000,0.436010,0.241142,0.160888,0.162333,0.380544,0.000000,0.236471,0.000000,0.000000,0.318581,0.318581,0.322401,0.137940,0.000000,0.239513,0.000000,0.000000,0.000000,0.000000,0.000000,0.258812,0.000000,0.000000,0.0,0.000000,0.000000,0.149197,0.181883,0.000000,0.000000,0.260771,0.000000,...,0.0,0.539999,0.318581,0.496790,0.188741,0.167627,0.523761,0.312681,0.162077,0.0,0.429959,0.517358,0.000000,0.181883,0.318581,0.524664,0.234040,0.0,0.241142,0.239513,0.216898,0.152515,0.0,0.000000,0.318581,0.000000,0.000000,0.614603,0.0,0.276029,0.599288,0.554355,0.234040,0.614603,0.0,1.000000,0.899942,0.000000,0.753553,0.318581
193583,0.755891,0.379331,0.202105,0.178776,0.354002,0.000000,0.202105,0.000000,0.000000,0.000000,0.178776,0.180381,0.422854,0.000000,0.000000,0.000000,0.000000,0.354002,0.354002,0.124691,0.153277,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.287587,0.000000,0.000000,0.0,0.000000,0.000000,0.165786,0.202105,0.000000,0.000000,0.000000,0.000000,...,0.0,0.208849,0.354002,0.350510,0.000000,0.000000,0.387152,0.120932,0.000000,0.0,0.477763,0.399250,0.000000,0.202105,0.354002,0.582998,0.260061,0.0,0.000000,0.000000,0.000000,0.169472,0.0,0.000000,0.354002,0.000000,0.000000,0.682937,0.0,0.106756,0.476784,0.615990,0.260061,0.682937,0.0,0.899942,1.000000,0.000000,0.557008,0.354002
193585,0.000000,0.000000,0.000000,0.466405,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.466405,0.000000,0.000000,1.000000,0.000000,0.503590,0.540377,0.000000,0.000000,0.325304,0.000000,0.267858,0.000000,0.360892,0.540377,1.000000,0.439662,0.540377,0.230290,0.503590,1.000000,0.0,0.439662,0.503590,0.000000,0.000000,1.000000,0.391730,0.403594,1.000000,...,0.0,0.000000,0.000000,0.000000,0.000000,0.259436,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.503590,0.000000,0.000000,0.000000,0.678466,0.0,0.000000,0.000000,0.000000,0.442132,0.0,0.540377,0.000000,1.000000,1.000000,0.000000,0.0,0.000000,0.000000,0.431794,0.678466,0.000000,0.0,0.000000,0.000000,1.000000,0.000000,0.000000
193587,0.421037,0.000000,0.000000,0.000000,0.000000,0.317844,0.000000,0.000000,0.578606,0.320007,0.000000,0.000000,0.505000,0.000000,0.313808,0.000000,0.000000,0.000000,0.000000,0.278927,0.000000,0.000000,0.317844,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.346055,0.000000,...,0.0,0.467183,0.000000,0.240662,0.250469,0.222449,0.695056,0.270518,0.215083,0.0,0.000000,0.209747,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.320007,0.317844,0.287834,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.815607,0.0,0.238808,0.674692,0.735655,0.000000,0.815607,0.0,0.753553,0.557008,0.000000,1.000000,0.000000


In [15]:
# 각 영화와 다른 영화와의 유사도가 모두 계산되어 있고, 유사도가 높은 영화가 가장 비슷한 영화라고 할 수 있습니당
# 입력 아이템과 가장 유사한 아이템 TOP10개를 뽑아봅시당
def getCBF(movieId):
    print("입력 영화: {}".format(movies[movies.movieId==movieId].title.item()))
    print("입력 영화 장르: {}".format(movies[movies.movieId==movieId].genres.item()))
    movie_list = CBF.loc[movieId].nlargest(10) # 해당 movieId 행에서 유사도가 높은 순서대로 10개를 뽑는다.
    top10 = pd.DataFrame(columns = movies.columns)
    for i in movie_list.index:
        # print(movies[movies["movieId"]==i])
        top10 = pd.concat([top10, movies[movies["movieId"]==i]])    # top10 DataFrame에 유사도가 높은 movie를 하나씩 추가한다.
    print("=====Most similar List=====")
    return top10.set_index('movieId')

In [16]:
getCBF(193583)

입력 영화: No Game No Life: Zero (2017)
입력 영화 장르: Animation|Comedy|Fantasy
=====Most similar List=====


Unnamed: 0_level_0,title,genres,corpus
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
4158,Monkeybone (2001),Animation|Comedy|Fantasy,Animation Comedy Fantasy
6773,"Triplets of Belleville, The (Les triplettes de...",Animation|Comedy|Fantasy,Animation Comedy Fantasy
7228,Cool World (1992),Animation|Comedy|Fantasy,Animation Comedy Fantasy
72692,Mickey's Once Upon a Christmas (1999),Animation|Comedy|Fantasy,Animation Comedy Fantasy
79008,South Park: Imaginationland (2008),Animation|Comedy|Fantasy,Animation Comedy Fantasy
126577,"Daddy, I'm A Zombie (2012)",Animation|Comedy|Fantasy,Animation Comedy Fantasy
139855,Anomalisa (2015),Animation|Comedy|Fantasy,Animation Comedy Fantasy
182297,Porky in Wackyland (1938),Animation|Comedy|Fantasy,Animation Comedy Fantasy
193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,Animation Comedy Fantasy
1151,Lesson Faust (1994),Animation|Comedy|Drama|Fantasy,Animation Comedy Drama Fantasy


영화간의 유사도를 비교하여 추천 영화를 추출한 결과, 쿼리로 주어진 영화의 장르인 Animation, Comedy, Fantasy장르의 영화들이 결과로 나온 것을 확인할 수 있다.

## 2. Collaborative Filtering 실습

User-based CF를 실습해 봅시당

In [17]:
# User-Item 간 선호도 데이터프레임 만들기
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


![image.png](attachment:image.png)

In [28]:
# 해당 데이터프레임은 사용자-영화 선호도 데이터프레임입니당
piv = pd.pivot_table(ratings, index = 'userId', columns = 'movieId', values = 'rating')
for mov in np.unique(movies.movieId):
    if mov not in piv.columns:  # 어떤 사용자도 보지 않은 영화가 있다면
        piv[mov] = np.nan     # NaN 열을 추가한다.
piv

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609,1076,2939,3338,3456,4194,5721,6668,6849,7020,7792,8765,25855,26085,30892,32160,32371,34482,85565
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,4.0,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,2.5,,,,,,2.5,,,,2.5,,,,3.5,,4.0,4.0,2.0,,,,,,,,,3.5,4.5,,,4.0,,3.5,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
607,4.0,,,,,,,,,,3.0,,,,,,,,,,,,,,3.0,,,,,,,,3.0,4.0,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
608,2.5,2.0,2.0,,,,,,,4.0,,,,,,4.5,,,2.0,,3.5,,,2.0,,,,,,,3.0,3.5,3.5,,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
609,3.0,,,,,,,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [29]:
# 사용자 간 유사도 구하기
CF = pd.DataFrame(cosine_similarity(piv.fillna(0)))

In [30]:
# 사용자와 사용자의 유사도를 구할 때 꼭 코사인 유사도를 사용하지 않아도 됩니당
# 보완 코사인 유사도 혹은 피어슨 상관계수를 적용해보아유
CF.index = piv.index
CF.columns = piv.index
CF

userId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,40,...,571,572,573,574,575,576,577,578,579,580,581,582,583,584,585,586,587,588,589,590,591,592,593,594,595,596,597,598,599,600,601,602,603,604,605,606,607,608,609,610
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,1.000000,0.027283,0.059720,0.194395,0.129080,0.128152,0.158744,0.136968,0.064263,0.016875,0.132499,0.016458,0.092971,0.113238,0.160689,0.169858,0.264358,0.214868,0.325376,0.160969,0.153162,0.050691,0.106669,0.155193,0.099261,0.105791,0.238933,0.202866,0.145251,0.099781,0.164454,0.146211,0.151945,0.083544,0.074508,0.071673,0.144736,0.138588,0.329782,0.094755,...,0.090224,0.131072,0.253480,0.093450,0.062668,0.051803,0.317319,0.000000,0.141129,0.249326,0.057399,0.048914,0.052175,0.100406,0.114076,0.123280,0.122782,0.183922,0.118112,0.324766,0.136809,0.143934,0.174413,0.141960,0.110558,0.123713,0.312843,0.011280,0.282412,0.291272,0.080554,0.164455,0.221486,0.070669,0.153625,0.164191,0.269389,0.291097,0.093572,0.145321
2,0.027283,1.000000,0.000000,0.003726,0.016614,0.025333,0.027585,0.027257,0.000000,0.067445,0.044419,0.000000,0.043918,0.016901,0.119778,0.093728,0.103755,0.166253,0.012571,0.014137,0.090880,0.144635,0.013597,0.129902,0.226008,0.000000,0.000000,0.058145,0.104059,0.149324,0.000000,0.017807,0.032428,0.043299,0.000000,0.054012,0.028500,0.023147,0.000000,0.017446,...,0.000000,0.032675,0.167959,0.035531,0.000000,0.000000,0.014870,0.000000,0.000000,0.061953,0.136703,0.219315,0.043620,0.000000,0.127551,0.089562,0.000000,0.055900,0.026429,0.056348,0.036147,0.030684,0.062033,0.013125,0.000000,0.104568,0.011986,0.048508,0.098000,0.023248,0.202671,0.016866,0.011997,0.000000,0.000000,0.028429,0.012948,0.046211,0.027565,0.102427
3,0.059720,0.000000,1.000000,0.002251,0.005020,0.003936,0.000000,0.004941,0.000000,0.000000,0.000000,0.000000,0.000000,0.003064,0.017251,0.032299,0.009813,0.028241,0.019142,0.008756,0.004017,0.003070,0.003081,0.002592,0.005100,0.000000,0.002332,0.029771,0.005974,0.000000,0.003507,0.015494,0.007156,0.000736,0.000000,0.000000,0.000000,0.004196,0.003639,0.002530,...,0.020899,0.008056,0.000000,0.000000,0.000000,0.029830,0.039894,0.000000,0.005749,0.033121,0.008561,0.000000,0.000000,0.000000,0.000000,0.000986,0.015793,0.003923,0.004791,0.027418,0.000000,0.000000,0.006460,0.001983,0.000000,0.025873,0.028970,0.000000,0.039539,0.013143,0.005048,0.004892,0.024992,0.000000,0.010694,0.012993,0.019247,0.021128,0.000000,0.032119
4,0.194395,0.003726,0.002251,1.000000,0.128659,0.088491,0.115120,0.062969,0.011361,0.031163,0.054767,0.049945,0.076949,0.048989,0.071551,0.164761,0.145058,0.123217,0.206053,0.113755,0.053014,0.043403,0.093971,0.071690,0.032862,0.061832,0.128954,0.135593,0.061308,0.058140,0.126182,0.200597,0.167247,0.045571,0.050148,0.113807,0.061031,0.090125,0.240976,0.053436,...,0.018553,0.182857,0.125627,0.064559,0.061035,0.005932,0.169440,0.000000,0.098777,0.148584,0.043871,0.047619,0.048474,0.041246,0.088289,0.064095,0.138766,0.049146,0.064485,0.219298,0.116111,0.068196,0.217116,0.082614,0.091974,0.107908,0.275436,0.016054,0.201610,0.211921,0.085938,0.128273,0.307973,0.052985,0.084584,0.200395,0.131746,0.149858,0.032198,0.107683
5,0.129080,0.016614,0.005020,0.128659,1.000000,0.300349,0.108342,0.429075,0.000000,0.030611,0.183805,0.058860,0.017157,0.221711,0.110152,0.082171,0.162633,0.121313,0.098758,0.096474,0.058264,0.033074,0.066889,0.096215,0.040705,0.294282,0.100491,0.121894,0.068876,0.065534,0.068585,0.233932,0.282323,0.029953,0.311472,0.022065,0.303078,0.377773,0.152956,0.321077,...,0.020016,0.124806,0.129338,0.341347,0.049306,0.000000,0.122199,0.000000,0.182382,0.117199,0.057538,0.000000,0.124645,0.348215,0.029293,0.072713,0.154315,0.377256,0.230961,0.152971,0.000000,0.359595,0.171864,0.137990,0.073238,0.096181,0.116071,0.000000,0.098599,0.137053,0.068048,0.418747,0.110148,0.258773,0.148758,0.106435,0.152866,0.135535,0.261232,0.060792
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
606,0.164191,0.028429,0.012993,0.200395,0.106435,0.102123,0.200035,0.099388,0.075898,0.088963,0.072988,0.075012,0.078030,0.100258,0.176102,0.187991,0.228150,0.244718,0.197557,0.133759,0.154664,0.146306,0.180398,0.120349,0.073508,0.057984,0.116688,0.239141,0.109923,0.083593,0.076391,0.113624,0.183288,0.126329,0.050278,0.094634,0.057503,0.110744,0.150151,0.096332,...,0.066803,0.201933,0.172313,0.050543,0.056985,0.024948,0.161706,0.029230,0.113237,0.303959,0.152900,0.076863,0.056115,0.073394,0.102095,0.120445,0.181949,0.100667,0.107110,0.313587,0.104047,0.076050,0.201965,0.141106,0.089641,0.186620,0.247790,0.060730,0.307964,0.310161,0.178084,0.116534,0.300669,0.066032,0.148141,1.000000,0.153063,0.262558,0.069622,0.201104
607,0.269389,0.012948,0.019247,0.131746,0.152866,0.162182,0.186114,0.185142,0.011844,0.010451,0.269857,0.032357,0.041894,0.163128,0.172278,0.108235,0.232015,0.191474,0.267631,0.076034,0.126051,0.032362,0.060483,0.133275,0.068520,0.172314,0.145752,0.177644,0.104298,0.128971,0.146604,0.152270,0.178559,0.119460,0.054149,0.083398,0.175458,0.163489,0.250775,0.126054,...,0.157236,0.172598,0.230264,0.174995,0.063632,0.005566,0.277245,0.000000,0.142133,0.222980,0.089457,0.042290,0.018075,0.172968,0.037530,0.137029,0.156100,0.195432,0.172363,0.303766,0.096840,0.179560,0.159920,0.230269,0.039137,0.141012,0.261206,0.002461,0.229975,0.219444,0.092525,0.199910,0.203540,0.137834,0.118780,0.153063,1.000000,0.283081,0.149190,0.139114
608,0.291097,0.046211,0.021128,0.149858,0.135535,0.178809,0.323541,0.187233,0.100435,0.077424,0.165329,0.045540,0.150508,0.139580,0.239751,0.182810,0.226019,0.328848,0.357684,0.255559,0.241128,0.194950,0.157319,0.170871,0.077472,0.115403,0.143279,0.341066,0.108109,0.097244,0.124598,0.157475,0.182282,0.245362,0.079864,0.088450,0.137988,0.159527,0.249292,0.138185,...,0.090180,0.162648,0.366207,0.133860,0.084302,0.059132,0.205068,0.000000,0.158486,0.463412,0.178818,0.057913,0.099307,0.164377,0.182266,0.193141,0.135310,0.160833,0.113565,0.397702,0.180687,0.221534,0.256632,0.210040,0.048688,0.273697,0.247656,0.048990,0.427623,0.373028,0.158355,0.197514,0.232771,0.155306,0.178142,0.262558,0.283081,1.000000,0.121993,0.322055
609,0.093572,0.027565,0.000000,0.032198,0.261232,0.214234,0.090840,0.423993,0.000000,0.021766,0.243111,0.000000,0.000000,0.290085,0.092756,0.056137,0.178866,0.107490,0.101163,0.000000,0.066256,0.085359,0.026316,0.102989,0.024313,0.392579,0.065996,0.090873,0.081750,0.081547,0.074893,0.108999,0.186822,0.093544,0.213665,0.036608,0.369436,0.320054,0.072277,0.224892,...,0.033209,0.128006,0.133284,0.381254,0.049083,0.000000,0.126357,0.020983,0.105622,0.125743,0.068017,0.000000,0.062826,0.390869,0.029161,0.090871,0.078900,0.418791,0.255039,0.110791,0.031238,0.379788,0.110493,0.114996,0.000000,0.054495,0.092068,0.000000,0.102966,0.104708,0.035653,0.335231,0.061941,0.236601,0.097610,0.069622,0.149190,0.121993,1.000000,0.053225


In [31]:
# 입력 사용자와 가장 가까운 유저에서 선호도가 가장 높은 아이템 TOP5를 추천해봅시당!
# 이 때, 입력 사용자가 이미 시청한/평가한 영화는 제외해야하도록 추천 리스트를 만들어봅시당
import math

def getCF(userId):
    most_similar_user = np.argmax(CF.loc[userId])   # 입력사용자와 가장 유사한 유저를 뽑는다.
    top_ratings = np.argsort(piv.loc[most_similar_user].fillna(0))[::-1]    # 유사 사용자가 rating을 매긴 것 중 높은 순으로 movieId를 정렬한다.
    top5 = pd.DataFrame(columns = movies.columns)
    for i in top_ratings:
        if len(top5) ==5:
            break
        if math.isnan(piv.loc[userId][i]):  # 사용자가 이전에 rating을 매기지 않은 경우만
            top5 = pd.concat([top5, movies[movies["movieId"]==i]])    # top5 DataFrame에 rating이 높은 movie를 하나씩 추가한다.
    print("=====Recommendation List=====")
    return top5.set_index('movieId')

In [32]:
getCF(604)

=====Recommendation List=====


Unnamed: 0_level_0,title,genres,corpus
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
515,"Remains of the Day, The (1993)",Drama|Romance,Drama Romance
840,House Arrest (1996),Children|Comedy,Children Comedy
2144,Sixteen Candles (1984),Comedy|Romance,Comedy Romance
928,Rebecca (1940),Drama|Mystery|Romance|Thriller,Drama Mystery Romance Thriller
932,"Affair to Remember, An (1957)",Drama|Romance,Drama Romance


604번 사용자와 유사한 유저의 상위 rating 값 중 604번 사용자가 아직 보지 않은 영화를 추천한 결과이다. 앞선 결과와는 다르게, 다양한 장르의 영화가 추천된 것을 확인할 수 있다. 

## 3. Matrix Factorization 실습

같은 폴더에 있는 MF.py의 MatrixFactorization()을 적용해 봅시당

In [33]:
import sys
sys.path.append(DIR)

import MF

In [34]:
# 사용자-영화 선호도 데이터프레임입니당
piv.head()

movieId,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,34,36,38,39,40,41,42,43,...,189333,189381,189547,189713,190183,190207,190209,190213,190215,190219,190221,191005,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609,1076,2939,3338,3456,4194,5721,6668,6849,7020,7792,8765,25855,26085,30892,32160,32371,34482,85565
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
1,4.0,,4.0,,,4.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
3,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,0.5,,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,,,,,,,3.0,,,,,,,,,,,2.0,,,,,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,4.0,,,,,,,,,,,,,,,,,,,,4.0,,,,,,,,,,,,4.0,4.0,,3.0,,,,,...,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [35]:
# MF.py를 참고하여 인자를 입력해 보아유
factorizer = MF.MatrixFactorization(np.array(piv.fillna(0)), k=100, learning_rate=0.01, reg_param=0.01, epochs=200, verbose=True)
"""
R: user-Movie간의 rating 행렬
k: latent의 개수: 임의로 100으로 설정
"""
factorizer.fit()
R_reduced = factorizer.print_results()

Iteration: 10 ; cost = 0.0024
Iteration: 20 ; cost = 0.0021
Iteration: 30 ; cost = 0.0020
Iteration: 40 ; cost = 0.0018
Iteration: 50 ; cost = 0.0017
Iteration: 60 ; cost = 0.0016
Iteration: 70 ; cost = 0.0015
Iteration: 80 ; cost = 0.0014
Iteration: 90 ; cost = 0.0013
Iteration: 100 ; cost = 0.0013
Iteration: 110 ; cost = 0.0012
Iteration: 120 ; cost = 0.0011
Iteration: 130 ; cost = 0.0011
Iteration: 140 ; cost = 0.0011
Iteration: 150 ; cost = 0.0010
Iteration: 160 ; cost = 0.0010
Iteration: 170 ; cost = 0.0009
Iteration: 180 ; cost = 0.0009
Iteration: 190 ; cost = 0.0009
Iteration: 200 ; cost = 0.0009
User Latent P:
[[-0.12891329 -0.16660148  0.17200642 ... -0.31042386 -0.01662172
   0.09057833]
 [ 0.02861807  0.13351684 -1.04943078 ...  0.12289567 -0.17743283
   0.14507268]
 [ 0.23733206 -0.03484917  0.15468689 ... -0.2660257   0.36893373
  -0.22306497]
 ...
 [ 0.17244818  0.18736696 -0.28477706 ... -0.21038304  0.01091063
  -0.16414574]
 [-0.18276056 -0.38956692 -0.05870956 ...  0.

In [36]:
# Matrix Factorizaion으로 찾아낸 잠재요인을 기반으로 재생산한 행렬이네유
R_reduced = pd.DataFrame(R_reduced)
R_reduced

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,37,38,39,...,9702,9703,9704,9705,9706,9707,9708,9709,9710,9711,9712,9713,9714,9715,9716,9717,9718,9719,9720,9721,9722,9723,9724,9725,9726,9727,9728,9729,9730,9731,9732,9733,9734,9735,9736,9737,9738,9739,9740,9741
0,4.215796,3.224931,3.967980,4.175243,4.207083,3.889886,3.520897,5.332156,3.318095,3.828755,4.019672,2.301153,2.161214,4.660634,2.222532,4.420931,4.053109,1.613495,3.565180,4.009523,4.116057,4.150353,4.335226,4.722665,3.536366,2.305268,2.138831,3.371934,4.918367,6.837382,2.568568,5.300958,4.487545,4.559020,5.551047,4.295594,7.153220,3.921221,2.806274,2.550315,...,5.362567,5.350170,7.426979,2.784827,5.493251,2.121759,3.621504,8.548076,2.166154,2.440984,0.235826,4.412678,6.109358,2.021358,12.662430,5.139008,5.867409,3.353468,4.235571,5.220429,2.048535,1.758596,5.320268,3.621423,3.333775,4.580731,2.492370,1.224326,3.117415,5.845304,4.081709,7.950498,6.011818,6.439407,5.669621,7.566310,0.699369,6.734534,4.615412,-1.202474
1,3.933506,1.633423,3.580024,-0.209946,1.875307,2.769146,3.599119,7.514915,4.567758,4.148458,4.913494,1.580104,4.773358,5.528453,2.872181,2.776711,3.842303,4.624334,1.918359,4.368957,4.124190,2.972816,10.672645,0.347415,3.465599,0.555273,2.933355,-0.508017,1.323421,7.760276,3.217128,2.317727,2.460557,3.550561,2.788742,3.487609,1.051851,5.024926,2.166558,-1.200597,...,2.743818,8.991046,10.564579,5.609909,10.092589,5.649239,-4.532881,5.577908,3.535809,6.448157,3.620000,0.355600,1.382116,4.919337,6.080505,4.952143,8.546790,3.520068,2.954511,-1.899338,6.651771,10.165138,-5.817739,6.378915,6.632653,9.816340,6.234304,7.220383,7.879941,3.856402,4.058985,0.241255,13.007848,-1.541566,-1.868889,3.327208,6.987731,-2.223710,1.403589,2.634530
2,1.782991,3.206417,2.686962,5.607762,3.774146,2.864446,0.483633,-1.049318,3.664080,1.964150,1.624599,1.779673,1.612965,4.797946,4.860517,2.079212,2.837375,4.409586,2.699827,-1.074404,0.821140,1.630824,0.885151,3.971530,1.304131,6.101328,3.125426,2.107812,3.504363,2.616760,0.524494,5.442582,3.312533,3.864632,1.953104,2.238204,7.255297,5.015731,5.336366,-4.116234,...,3.064892,-1.684147,2.035446,-0.575801,9.556753,2.970176,-5.136025,3.121221,6.587433,1.697167,4.338676,3.009950,10.770665,0.679137,9.669778,4.999878,-1.066207,3.974397,-1.508075,-0.034419,1.898685,-2.166550,2.650992,4.974295,6.524428,3.881990,0.922115,2.434028,-0.612341,2.846622,6.680456,3.891442,3.144201,-0.791070,0.173875,7.539140,-1.935324,3.346611,1.406661,5.525329
3,2.353303,3.448305,4.425407,3.809167,3.147225,3.205806,3.874848,1.315606,0.072250,3.687749,3.452184,3.287196,-4.981555,6.635064,2.811001,3.924925,4.018668,3.628780,1.979763,3.132716,3.043876,6.119314,1.705743,-0.239782,4.117462,6.654443,0.704328,2.545425,2.783765,1.715702,3.186652,2.115488,3.648391,4.408465,1.355746,3.219409,2.986210,2.494614,-1.894799,4.316992,...,-3.662198,2.595452,9.939620,4.199160,-1.985753,4.140680,2.033172,2.998283,7.553637,2.571796,5.949241,4.817831,4.645110,3.196026,1.021791,2.865827,-0.582216,0.213841,3.189696,4.671468,0.828780,2.020631,-1.648916,5.764719,3.561376,-3.926314,5.345725,5.014603,2.519290,5.544603,4.980535,-1.547016,11.511192,6.583938,-0.165280,1.965773,4.343255,2.386791,2.217595,5.992108
4,4.026791,2.663772,4.328703,1.643235,3.512028,3.008795,3.965978,3.115013,4.294747,3.136258,2.750857,5.646312,3.047890,3.948188,5.884659,5.051237,3.272028,4.705826,2.249095,7.368861,3.874947,4.492914,2.965881,1.583175,4.051550,8.787897,1.347210,4.368228,3.573315,3.123475,2.201531,2.976060,3.920065,3.989119,5.025768,3.055421,6.278759,4.562359,1.710192,-0.840923,...,6.032731,1.534910,2.146349,4.092998,-2.548922,6.567513,0.337235,4.382758,0.448541,0.383365,-0.367445,8.340846,5.190435,3.135311,5.237238,-1.881708,7.752216,6.963967,5.934189,0.491408,1.745939,1.838984,-0.170547,9.357088,5.847352,7.176047,6.659457,10.066191,-0.664182,3.806433,2.604713,3.319296,7.356332,5.749020,1.684511,5.150289,2.864860,3.459954,-3.533274,5.353801
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
605,3.813111,3.198568,3.498028,3.581399,2.687193,4.127032,2.882528,2.922018,2.668466,3.511214,2.955723,3.394912,1.935384,3.316368,2.904136,4.022216,4.165402,3.775707,2.752378,2.981544,3.805204,3.089025,2.771021,3.452219,3.798318,4.499763,3.105503,3.685392,4.614708,4.176314,2.717922,3.741694,3.720497,3.862888,1.425901,3.600131,3.026177,3.149276,2.699197,3.850993,...,3.935981,2.298288,2.987135,3.475690,2.791825,2.803445,3.007161,2.996712,2.907628,2.232607,5.125713,3.290715,4.160991,2.384688,2.485670,2.871622,2.324908,4.904625,2.890466,3.852171,2.933367,2.834404,2.586838,2.212725,4.638245,3.208041,2.857853,1.842784,3.879876,3.260402,3.372627,3.293983,4.338795,3.595912,3.388992,3.939368,4.021370,4.747939,4.556772,5.389850
606,4.034244,3.881452,3.781950,2.913617,3.738151,2.619348,2.820255,4.443073,2.261923,2.830066,3.060975,2.967973,4.381863,4.863046,4.134380,3.609476,3.619936,3.677267,2.552884,3.474043,4.471214,3.867299,2.135015,4.079414,2.999414,3.812085,1.410244,4.735869,3.263067,1.437156,4.039825,3.582126,3.126532,4.004103,3.708759,3.090141,3.014384,4.658654,6.788207,4.329070,...,5.906500,4.117422,1.991116,0.943079,-0.390752,-0.059739,-1.289561,6.252219,0.421952,4.288732,1.416832,-0.108215,1.923895,3.880230,8.116074,6.413166,6.091307,6.286277,6.039220,-0.273568,1.868961,3.253638,-1.702838,5.429945,1.753851,4.177319,3.931603,2.018363,1.504547,1.448478,2.481910,4.750375,5.723424,4.464785,1.507346,7.570745,-0.038963,2.515614,2.730181,4.341918
607,2.767505,2.299443,1.995541,1.821202,1.674232,4.449608,2.443166,1.933170,1.960182,3.922536,3.036410,1.186305,4.168526,2.619620,1.701777,4.409437,3.623732,3.071483,2.061777,0.097469,3.437286,1.784405,2.868007,2.120723,2.635258,0.854677,3.999198,5.852479,3.534610,3.357166,2.997701,3.702820,3.495954,3.007983,1.726066,2.656082,3.704563,3.175593,6.036076,6.237026,...,4.436403,1.566981,0.821383,4.542758,8.334107,3.071353,2.216952,1.469040,3.212234,3.825082,4.757992,1.661272,3.102187,6.569770,2.091869,2.988057,4.988651,5.209114,3.216699,3.201772,3.593643,3.944571,3.237881,4.282469,3.410963,3.250888,2.211343,6.916819,3.084426,3.086067,1.707504,4.466227,6.655346,4.109215,6.749476,4.092143,-0.124570,3.936499,3.776629,1.399117
608,3.062740,2.932935,5.272092,3.535520,4.137499,5.233310,2.064524,2.285973,1.178270,3.899607,4.059402,5.445549,2.541075,4.802500,1.533070,3.507268,3.841952,1.870447,3.457280,2.785833,4.420583,3.398722,3.279640,2.028589,2.632411,2.449613,6.080650,-0.500824,4.701101,0.721942,3.215513,4.454340,4.877231,2.380020,-2.950715,5.160607,0.788462,0.459011,2.091469,0.909375,...,0.609536,2.579468,4.572991,0.527359,3.767056,2.191437,-2.550328,0.829553,1.269727,4.715175,1.531204,7.683144,3.076894,6.035475,5.346650,3.893821,8.742854,-0.209855,-3.164436,4.707911,2.062537,3.933787,1.671911,4.607062,7.816532,6.428270,2.293395,2.157627,8.487226,2.479884,5.507801,4.908486,8.439851,3.609813,3.624680,8.672776,-1.453882,7.573438,1.519334,5.219418


In [37]:
def recommend_by_MF(userId, rank=10):
    top_Rating = np.argsort(R_reduced[userId])[::-1]  # 재생산한 Rate Matrix에서 해당 user의 movie에 대한 rating값을 높은 순으로 정렬한다.
    top = pd.DataFrame(columns = movies.columns)
    for i in top_Rating:
        if len(top) ==rank:
            break
        if math.isnan(piv.loc[userId][i]):  # 사용자가 이전에 rating을 매기지 않은 경우만
            top = pd.concat([top, movies[movies["movieId"]==i]])    # top DataFrame에 rating이 높은 movie를 하나씩 추가한다.
    print("=====Recommendation List=====")
    return top.set_index('movieId')

recommend_by_MF(604, 5)

=====Recommendation List=====


Unnamed: 0_level_0,title,genres,corpus
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
548,Terminal Velocity (1994),Action|Mystery|Thriller,Action Mystery Thriller
441,Dazed and Confused (1993),Comedy,Comedy
430,Calendar Girl (1993),Comedy|Drama,Comedy Drama
206,Unzipped (1995),Documentary,Documentary
377,Speed (1994),Action|Romance|Thriller,Action Romance Thriller


MF를 통해 추정한 Rate Matrix를 통해 사용자의 id를 입력하면 영화를 추천해주는 함수를 구현하였다. 앞선 추천결과와는 상이한 결과를 보였지만 대부분의 장르가 Thiller, Comedy, Drama 등 604번 사용자가 선호하는 장르임을 확인할 수 있다.

## 4. 본인만의 추천 리스트 만들어보기

배운 내용 혹은 알고 있는 추천 알고리즘을 사용해서 user의 다음 추천 아이템에 대한 TOP10 추천 리스트를 만들어 봅시당

코드를 이해할 수 있도록 주석을 꼼꼼하게 달아주시면 감사하겠습니다!

In [38]:
!pip install scikit-surprise
import surprise

Collecting scikit-surprise
[?25l  Downloading https://files.pythonhosted.org/packages/97/37/5d334adaf5ddd65da99fc65f6507e0e4599d092ba048f4302fe8775619e8/scikit-surprise-1.1.1.tar.gz (11.8MB)
[K     |████████████████████████████████| 11.8MB 290kB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.1-cp37-cp37m-linux_x86_64.whl size=1617589 sha256=44e1b30334476ea67a4ee0b15cdb4440bd8f5c5b5219aeaf7595a1661c126a76
  Stored in directory: /root/.cache/pip/wheels/78/9c/3d/41b419c9d2aff5b6e2b4c0fc8d25c538202834058f9ed110d0
Successfully built scikit-surprise
Installing collected packages: scikit-surprise
Successfully installed scikit-surprise-1.1.1


추천시스템에 많이 사용되는 surprise 패키지를 import한다.

In [39]:
reader = surprise.Reader(rating_scale=(1,5))    # 데이터를 읽어들이기 위한 Reader 객체를 선언한다.
col_list = ['userId', 'movieId', 'rating']
data = surprise.Dataset.load_from_df(ratings[col_list], reader)   # dataframe으로부터 surprise용 dataset으로 변환한다.
trainset = data.build_full_trainset()   # data전체를 training에 사용한다.
algo = surprise.KNNBasic(sim_options = {'name': 'pearson'}).fit(trainset)  # Nearset Neigbor 방식으로 추천을 한다. 이 때, pearson을 계산하여 Nearest를 판단한다.

Computing the pearson similarity matrix...
Done computing similarity matrix.


In [40]:
def recommend_by_KNN(userId, k=5, rank=10):
    unseen = pd.DataFrame(columns = ['movieId', 'rating'])
    for movie in np.unique(piv.columns):
        if math.isnan(piv.loc[userId][movie]):  # 사용자가 이전에 rating을 매기지 않은 경우만
            rating = algo.predict(userId, movie)[3] # rating 값을 예측한다.
            unseen = unseen.append({'movieId': movie, 'rating': rating}, ignore_index=True) # unseen DataFrame에 예측한 rating값을 추가한다.
    unseen = unseen.sort_values(by=['rating'], axis=0, ascending=False) # 예측된 rating이 높은 순서대로 정렬한다.

    top = pd.DataFrame(columns = movies.columns)
    for i in unseen.movieId:
        if len(top) ==rank:
            break
        top = pd.concat([top, movies[movies["movieId"]==i]])    # top DataFrame에 rating이 높은 movie를 하나씩 추가한다.
    print("=====Recommendation List=====")
    return top.set_index('movieId')

recommend_by_KNN(604, rank = 5)

=====Recommendation List=====


Unnamed: 0_level_0,title,genres,corpus
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
173351,Wow! A Talking Fish! (1983),Animation|Children|Comedy|Fantasy,Animation Children Comedy Fantasy
5490,The Big Bus (1976),Action|Comedy,Action Comedy
6201,Lady Jane (1986),Drama|Romance,Drama Romance
5513,Martin Lawrence Live: Runteldat (2002),Comedy|Documentary,Comedy Documentary
6192,Open Hearts (Elsker dig for evigt) (2002),Romance,Romance


KNN을 이용하여 604번 사용자에게 추천하는 목록을 뽑는 함수를 구현하였다. 604번 사용자와 유사한 사용자들의 데이터를 각 영화에 대해 604번 사용자가 부여할 평점을 예측한다. 그리고, 604번 사용자가 기존에 보지 않은 영화 중에서 평점이 높은 순서대로 추천은 진행한다. 그 결과, 앞선 추천보다 훨씬 다양한 장르의 영화에 대한 추천을 얻을 수 있었다. 다만, "Wow! A Talking Fish!"와 같이 직관적으로 이해되지 않는 추천결과도 있었다. 아마 대중적으로 유명한 영화여서 추천결과에 포함된 것으로 추측된다.