 # 라이브러리 로드 및 데이터 불러오기

In [36]:
import pandas as pd
import numpy as np
import warnings
from ast import literal_eval

warnings.filterwarnings('ignore')
pd.set_option('max_colwidth', 100)



In [37]:
movies = pd.read_csv('./input/tmdb_5000_movies.csv')
print(movies.shape)
movies.head(3)


(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp...",en,Avatar,"In the 22nd century, a paraplegic Marine is dispatched to the moon Pandora on a unique mission, ...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289}, {""name"": ""Twentieth Century Fox Film Corporatio...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}, {""iso_3166_1"": ""GB"", ""name"": ""United ...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}]",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, has come back to life and is headed to the edge of t...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""name"": ""Jerry Bruckheimer Films"", ""id"": 130}, {""na...","[{""iso_3166_1"": ""US"", ""name"": ""United States of America""}]",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 80, ""name"": ""Crime""}]",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 4289, ""name"": ""secret...",en,Spectre,A cryptic message from Bond’s past sends him on a trail to uncover a sinister organization. Whil...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Danjaq"", ""id"": 10761}, {""name"": ""B24"", ""id"": ...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""}, {""iso_3166_1"": ""US"", ""name"": ""United States of ...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639...",Released,A Plan No One Escapes,Spectre,6.3,4466


 # 데이터 가공하기

In [3]:
# Keep only the columns used for content-based filtering. An explicit copy is
# taken because later cells assign new/converted columns into movies_df;
# without it pandas treats those writes as chained assignment on a frame
# derived from `movies` (SettingWithCopyWarning, silenced above by
# warnings.filterwarnings('ignore')).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count', 'popularity', 'keywords', 'overview']].copy()



In [4]:
movies_df[['genres', 'keywords']][:2]



Unnamed: 0,genres,keywords
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {...","[{""id"": 1463, ""name"": ""culture clash""}, {""id"": 2964, ""name"": ""future""}, {""id"": 3386, ""name"": ""sp..."
1,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {""id"": 28, ""name"": ""Action""}]","[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""name"": ""drug abuse""}, {""id"": 911, ""name"": ""exotic is..."


In [5]:
movies_df['genres'] = movies_df['genres'].apply(literal_eval)
movies_df['keywords'] = movies_df['keywords'].apply(literal_eval)



In [6]:
movies_df['genres'] = movies_df['genres'].apply(lambda x : [y['name'] for y in x])
movies_df['keywords'] = movies_df['keywords'].apply(lambda x : [y['name'] for y in x])
movies_df[['genres', 'keywords']][:1]


Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colony, society, space travel, futuristic, romance, spa..."


 # 장르 컨텐츠 유사도 측정


 만약 영화 A의 장르가 `[Action, Adventure, Fantasy, Science Fiction]`이고,
 B가 `[Adventure, Fantasy, Action]`으로 되어있다면 어떻게 장르별 유사도를 측정할까?

 여러가지 방법이 있을 수 있으나, 가장 간단한 방법은 genres를 문자열로 변경한 뒤,
 이를 CountVectorizer로 피처 벡터화한 행렬 데이터 값을 코사인 유사도로 비교하는 것입니다.
 genres 컬럼을 기반으로 하는 컨텐츠 기반 필터링은 다음 단계로 구현하겠습니다.

   1. 문자열로 변환된 genres 컬럼을 Count 기반으로 피처 벡터화 변환합니다.
   2. genres 문자열을 피처 벡터화 행렬로 변환한 데이터 셋을 코사인 유사도를 통해 비교합니다. 이를 위해 데이터 셋의 레코드별로 타 레코드와 장르에서 코사인 유사도 값을 가지는 객체를 생성합니다.
   3. 장르 유사도가 높은 영화 중에서 평점이 높은 순으로 영화를 추천합니다.

 장르 컬럼을 문자열로 변환한 뒤, 사이킷런의 CountVectorizer를 이용해 피처 벡터 행렬로 만들겠습니다.

In [7]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer needs plain text, so turn each genre list into one string
# whose words are separated by spaces.

movies_df['genres_literal'] = movies_df['genres'].apply(lambda x: (' ').join(x))
# min_df=1 keeps every term that occurs in at least one document, which is the
# same vocabulary the former integer min_df=0 produced, but it is also accepted
# by recent scikit-learn releases that validate integer min_df as >= 1.
count_vect = CountVectorizer(min_df=1, ngram_range=(1,2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)


(4803, 276)


 CountVectorizer로 변환해 4803개의 레코드와 276개의 개별 단어 피처로 구성된 피처 벡터 행렬을 만들었습니다.
 이렇게 생성된 행렬에 사이킷런의 cosine_similarity()를 이용해 코사인 유사도를 계산하겠습니다.

In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of genre_mat.
genre_sim = cosine_similarity(genre_mat, genre_mat)
# NOTE(review): this prints genre_mat.shape (same as the previous cell), not
# genre_sim.shape — confirm which one was meant to be shown here.
print(genre_mat.shape)
# Show the similarity rows of the first two records.
print(genre_sim[:2])


(4803, 276)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


 cosine_similarity() 호출로 생성된 genre_sim 객체는 movies_df 의 genres_literal 컬럼을 피처 벡터화한 행렬(genre_mat) 데이터의 행별로
 유사도 정보를 가지고 있으며, 결국은 movies_df DataFrame의 행별 장르 유사도 값을 가지고 있는 것입니다. movies_df를 장르 기준으로
 컨텐츠 기반 필터링을 수행하려면 movies_df의 개별 레코드에 대해서 가장 장르 유사도가 높은 순으로 다른 레코드를 추출해야 하는데,
 이를 위해 앞에서 생성한 genre_sim 객체를 이용합니다.

   genre_sim 객체의 기준 행별로 비교 대상이 되는 행의 유사도 값이 높은 순으로 정렬된 행렬의 위치 인덱스 값을 추출하면 됩니다. 값이 높은 순으로 정렬된 비교 대상 행의 유사도 값이 아니라
   비교 대상 행의 위치 인덱스임에 주의합시다. numpy의 argsort() 함수를 이용하여 argsort()[:, ::-1] 을 이용하면 유사도가 높은 순으로 정리된 genre_sim 객체의 비교 행 위치 인덱스 값을 간편하게 얻을 수 있습니다.
   genre_sim.argsort()[:, ::-1] 을 사용해 높은 순으로 정렬된 비교 행 위치 인덱스 값을 가져오고 그 중에 0번 레코드의 비교 행 위치 인덱스 값만 샘플로 추출해 보겠습니다.

In [9]:
# argsort() orders the column indexes of each row by ascending value;
# [:, ::-1] reverses the columns so every row runs from the largest
# similarity to the smallest.
genre_sim_sorted_ind = genre_sim.argsort()[:, ::-1]



In [10]:
genre_sim_sorted_ind


array([[   0, 3494,  813, ..., 3038, 3037, 2401],
       [ 262,    1,  129, ..., 3069, 3067, 2401],
       [   2, 1740, 1542, ..., 3000, 2999, 2401],
       ...,
       [4800, 3809, 1895, ..., 2229, 2230,    0],
       [4802, 1594, 1596, ..., 3204, 3205,    0],
       [4802, 4710, 4521, ..., 3140, 3141,    0]], dtype=int64)

 첫번째 값을 보면, 0번 레코드의 경우, 자신인 0번이 가장 가깝고, 그다음 3494번째가 가깝고, 그다음 813이 가깝다는 뜻.
 두번째 값을 보면, 이상하게도 자신의 값이 아닌 262가 가장 가깝다고 나옴. 장르 구성이 같아 유사도가 동일한(1.0) 영화가 여럿이면 argsort()의 동률 처리 순서에 따라 자기 자신이 맨 앞에 오지 않을 수 있기 때문으로 보임.

 ## 장르 컨텐츠 필터링을 이용한 영화추천

In [11]:
def find_sim_movie(df, sorted_ind, title_name, top_n = 10):
    """Return the rows of ``df`` listed first in ``sorted_ind`` for a given title.

    Args:
        df: DataFrame with a 'title' column. The index labels of the matching
            rows are used as row numbers into ``sorted_ind`` and the selected
            values are used as positions for ``df.iloc``.
        sorted_ind: 2-D array of row indexes; each row is expected to be
            ordered from most to least similar (as prepared by the caller).
        title_name: Exact title to look up in ``df['title']``.
        top_n: Number of leading columns of ``sorted_ind`` to keep.

    Returns:
        DataFrame with the rows of ``df`` at the selected positions, in the
        order they appear in ``sorted_ind``.
    """
    # Index labels of every row whose title matches exactly.
    anchor_rows = df.loc[df['title'] == title_name].index.values

    # Indexing with an array keeps the result 2-D: one row of candidates
    # per matching title.
    top_matches = sorted_ind[anchor_rows, :top_n]
    print(top_matches)

    # df.iloc needs a flat list of positions, so collapse to 1-D.
    flat_positions = top_matches.ravel()
    return df.iloc[flat_positions]



In [12]:
# Use the function above to recommend 10 movies whose genres are most similar to 'The Godfather'.

similar_movies = find_sim_movie(movies_df, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies[['title', 'vote_average']]


[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


 대부 2 추천으로 봐서 잘된것도 같지만, Light Sleeper 같이 이상한 영화도 추천함. 좀 더 개선이 필요함을 알 수 있음.
 게다가 Mi America의 경우 평점이 0점임을 알 수 있음.
 이번에는 일단 좀 더 많은 후보군을 선정한 뒤 영화의 평점에 따라 필터링해서 최종 추천하는 방식으로 변경하겠습니다.
 영화의 평점 정보인 'vote_average' 값을 이용하겠습니다.
 그런데 1명, 2명의 소수 관객이 특정 영화에 만점이나 매우 높은 평점을 부여해 왜곡된 데이터가 발생할 수 있습니다. 이를 확인하기 위해
 vote_average 기준 내림차순으로 정렬해 확인해 보겠습니다.

In [13]:
movies_df[['title', 'vote_average', 'vote_count']].sort_values('vote_average', ascending=False)[:10]


Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


 앞서 말했듯이 왜곡된 데이터가 존재함을 확인할 수 있습니다.
 이를 회피하기 위해 새로운 평가방식이 필요.
 평가횟수에 대한 가중치가 부여된 평점(Weighted Rating) 방식을 사용하겠습니다.

 가중 평점 = $\frac{v}{(v+m)}$ `*` R + $\frac{m}{(v+m)}$ `*` C

 각 변수의 의미
 - v: 개별 영화에 평점을 투표한 횟수, movies_df의 'vote_count'값
 - m: 평점을 부여하기 위한 최소 투표 횟수, 가중치를 직접 조절하는 역할을 하는데, m 값을 높이면 평점 투표횟수가 많은 영화에 더 많은 가중 평점을 부여, m 값은 전체 투표 횟수의 60% 분위수(quantile(0.6))에 해당하는 횟수를 기준으로 정하겠습니다.
 - R: 개별 영화에 대한 평균 평점, 'vote_average'
 - C: 전체 영화에 대한 평균 평점, movies_df['vote_average'].mean()



In [14]:
C = movies_df['vote_average'].mean()
m = movies_df['vote_count'].quantile(0.6)
print(C, m)


6.092171559442016 370.1999999999998


 기존 평점을 새로운 가중 평점으로 변경하는 함수를 생성하고 이를 이용해, 새로운 평점 정보인 'vote_weighted' 값을 만들겠습니다.
 함수명은 weighted_vote_average()입니다. 이 함수는 데이터 프레임의 레코드를 인자로 받아 이 레코드의 vote_count와 vote_average, 미리 추출된 m과 C 값을
 적용해 레코드별 가중 평점을 반환합니다.

In [15]:
percentile = 0.6
m = movies['vote_count'].quantile(percentile)
C = movies['vote_average'].mean()

def weighted_vote_average(record):
    """Return the weighted rating of one movie record.

    The record's own 'vote_average' and the module-level value ``C`` are
    blended; the weights come from the record's 'vote_count' and the
    module-level value ``m``.

    Args:
        record: Mapping-like row providing 'vote_count' and 'vote_average'.

    Returns:
        (v / (v + m)) * R + (m / (v + m)) * C for this record.
    """
    votes = record['vote_count']
    rating = record['vote_average']
    denominator = votes + m

    own_part = (votes / denominator) * rating
    prior_part = (m / denominator) * C
    return own_part + prior_part

movies['weighted_vote'] = movies.apply(weighted_vote_average, axis = 1)



In [16]:
movies['weighted_vote'] = movies.apply(weighted_vote_average, axis = 1)



In [17]:
movies.sort_values('weighted_vote', ascending=False)[:10]


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,weighted_vote
1881,25000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,278,"[{""id"": 378, ""name"": ""prison""}, {""id"": 417, ""name"": ""corruption""}, {""id"": 796, ""name"": ""police b...",en,The Shawshank Redemption,"Framed in the 1940s for the double murder of his wife and her lover, upstanding banker Andy Dufr...",136.747729,"[{""name"": ""Castle Rock Entertainment"", ""id"": 97}]",...,1994-09-23,28341469,142.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Fear can hold you prisoner. Hope can set you free.,The Shawshank Redemption,8.5,8205,8.396052
3337,6000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://www.thegodfather.com/,238,"[{""id"": 131, ""name"": ""italy""}, {""id"": 699, ""name"": ""love at first sight""}, {""id"": 1872, ""name"": ...",en,The Godfather,"Spanning the years 1945 to 1955, a chronicle of the fictional Italian-American Corleone crime fa...",143.659698,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""name"": ""Alfran Productions"", ""id"": 10211}]",...,1972-03-14,245066411,175.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ...",Released,An offer you can't refuse.,The Godfather,8.4,5893,8.263591
662,63000000,"[{""id"": 18, ""name"": ""Drama""}]",http://www.foxmovies.com/movies/fight-club,550,"[{""id"": 825, ""name"": ""support group""}, {""id"": 851, ""name"": ""dual identity""}, {""id"": 1541, ""name""...",en,Fight Club,A ticking-time-bomb insomniac and a slippery soap salesman channel primal male aggression into a...,146.757391,"[{""name"": ""Regency Enterprises"", ""id"": 508}, {""name"": ""Fox 2000 Pictures"", ""id"": 711}, {""name"": ...",...,1999-10-15,100853753,139.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Mischief. Mayhem. Soap.,Fight Club,8.3,9413,8.216455
3232,8000000,"[{""id"": 53, ""name"": ""Thriller""}, {""id"": 80, ""name"": ""Crime""}]",,680,"[{""id"": 396, ""name"": ""transporter""}, {""id"": 1411, ""name"": ""brothel""}, {""id"": 2231, ""name"": ""drug...",en,Pulp Fiction,"A burger-loving hit man, his philosophical partner, a drug-addled gangster's moll and a washed-u...",121.463076,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"": ""A Band Apart"", ""id"": 59}, {""name"": ""Jersey Films...",...,1994-10-08,213928762,154.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""es"", ""name"": ""Espa\u00f1ol""}, {""iso_639_...",Released,Just because you are a character doesn't mean you have character.,Pulp Fiction,8.3,8428,8.207102
65,185000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 28, ""name"": ""Action""}, {""id"": 80, ""name"": ""Crime""}, {""id"": ...",http://thedarkknight.warnerbros.com/dvdsite/,155,"[{""id"": 849, ""name"": ""dc comics""}, {""id"": 853, ""name"": ""crime fighter""}, {""id"": 1308, ""name"": ""s...",en,The Dark Knight,Batman raises the stakes in his war on crime. With the help of Lt. Jim Gordon and District Attor...,187.322927,"[{""name"": ""DC Comics"", ""id"": 429}, {""name"": ""Legendary Pictures"", ""id"": 923}, {""name"": ""Warner B...",...,2008-07-16,1004558444,152.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""zh"", ""name"": ""\u666e\u901a\u8bdd""}]",Released,Why So Serious?,The Dark Knight,8.2,12002,8.13693
1818,22000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 36, ""name"": ""History""}, {""id"": 10752, ""name"": ""War""}]",http://www.schindlerslist.com/,424,"[{""id"": 1382, ""name"": ""factory""}, {""id"": 1631, ""name"": ""concentration camp""}, {""id"": 1701, ""name...",en,Schindler's List,The true story of how businessman Oskar Schindler saved over a thousand Jewish lives from the Na...,104.469351,"[{""name"": ""Universal Pictures"", ""id"": 33}, {""name"": ""Amblin Entertainment"", ""id"": 56}]",...,1993-11-29,321365567,195.0,"[{""iso_639_1"": ""de"", ""name"": ""Deutsch""}, {""iso_639_1"": ""pl"", ""name"": ""Polski""}, {""iso_639_1"": ""h...",Released,"Whoever saves one life, saves the world entire.",Schindler's List,8.3,4329,8.126069
3865,3300000,"[{""id"": 18, ""name"": ""Drama""}]",http://sonyclassics.com/whiplash/,244786,"[{""id"": 1416, ""name"": ""jazz""}, {""id"": 1523, ""name"": ""obsession""}, {""id"": 1640, ""name"": ""conserva...",en,Whiplash,"Under the direction of a ruthless instructor, a talented young drummer begins to pursue perfecti...",192.528841,"[{""name"": ""Bold Films"", ""id"": 2266}, {""name"": ""Blumhouse Productions"", ""id"": 3172}, {""name"": ""Ri...",...,2014-10-10,13092000,105.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The road to greatness can take you to the edge.,Whiplash,8.3,4254,8.123248
809,55000000,"[{""id"": 35, ""name"": ""Comedy""}, {""id"": 18, ""name"": ""Drama""}, {""id"": 10749, ""name"": ""Romance""}]",,13,"[{""id"": 422, ""name"": ""vietnam veteran""}, {""id"": 458, ""name"": ""hippie""}, {""id"": 791, ""name"": ""men...",en,Forrest Gump,A man with a low IQ has accomplished great things in his life and been present during significan...,138.133331,"[{""name"": ""Paramount Pictures"", ""id"": 4}]",...,1994-07-06,677945399,142.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"The world will never be the same, once you've seen it through the eyes of Forrest Gump.",Forrest Gump,8.2,7927,8.105954
2294,15000000,"[{""id"": 14, ""name"": ""Fantasy""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 16, ""name"": ""Animation""}...",http://movies.disney.com/spirited-away,129,"[{""id"": 616, ""name"": ""witch""}, {""id"": 970, ""name"": ""parents kids relationship""}, {""id"": 2343, ""n...",ja,千と千尋の神隠し,A ten year old girl who wanders away from her parents along a path that leads to a world ruled b...,118.968562,"[{""name"": ""Studio Ghibli"", ""id"": 10342}]",...,2001-07-20,274925095,125.0,"[{""iso_639_1"": ""ja"", ""name"": ""\u65e5\u672c\u8a9e""}]",Released,The tunnel led Chihiro to a mysterious town...,Spirited Away,8.3,3840,8.105867
2731,13000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,240,"[{""id"": 700, ""name"": ""italo-american""}, {""id"": 701, ""name"": ""cuba""}, {""id"": 1950, ""name"": ""voror...",en,The Godfather: Part II,"In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily an...",105.792936,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""name"": ""The Coppola Company"", ""id"": 536}]",...,1974-12-20,47542841,200.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ...",Released,"I don't feel I have to wipe everybody out, Tom. Just my enemies.",The Godfather: Part II,8.3,3338,8.079586


 새롭게 정의된 평점 기준에 따라서 영화를 추천해 보겠습니다. 장르 유사성이 높은 영화를 top_n의 2배수 만큼 후보군으로 선정한 뒤,
 weighted_vote 컬럼 값이 높은 순으로 top_n만큼 추출하는 방식으로 find_sim_movie() 함수를 변경합니다.

In [18]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Recommend the best-rated movies among those most similar to a title.

    Twice ``top_n`` candidates are taken from ``sorted_ind`` for every row
    whose title matches, the matching rows themselves are removed, and the
    remaining candidates are ranked by their 'weighted_vote' value.

    Args:
        df: DataFrame with 'title' and 'weighted_vote' columns. Rows are
            addressed by position, so any index is accepted.
        sorted_ind: 2-D array of row positions; each row is expected to be
            ordered from most to least similar (as prepared by the caller).
        title_name: Exact title to look up in ``df['title']``.
        top_n: Maximum number of recommendations to return.

    Returns:
        DataFrame with at most ``top_n`` rows sorted by 'weighted_vote' in
        descending order. Empty when no row has the requested title.
    """
    # Row positions (not index labels) of every movie carrying this title;
    # sorted_ind and df.iloc are both positional.
    anchor_positions = np.flatnonzero((df['title'] == title_name).values)

    # Take 2 * top_n candidates per matching row and flatten them.
    candidates = sorted_ind[anchor_positions, :(top_n * 2)].reshape(-1)

    # Drop the reference movie(s). np.isin copes with several matching rows,
    # where an element-wise `!=` against the anchor array would not broadcast.
    candidates = candidates[np.isin(candidates, anchor_positions, invert=True)]

    # Several matching rows can share candidates; keep the first occurrence only.
    candidates = list(dict.fromkeys(candidates.tolist()))

    # Rank the candidate pool by weighted rating and keep the best top_n.
    return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

similar_movies = find_sim_movie(movies, genre_sim_sorted_ind, 'The Godfather', 10)
similar_movies.head(10)



Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,weighted_vote
2731,13000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,240,"[{""id"": 700, ""name"": ""italo-american""}, {""id"": 701, ""name"": ""cuba""}, {""id"": 1950, ""name"": ""voror...",en,The Godfather: Part II,"In the continuing saga of the Corleone crime family, a young Vito Corleone grows up in Sicily an...",105.792936,"[{""name"": ""Paramount Pictures"", ""id"": 4}, {""name"": ""The Coppola Company"", ""id"": 536}]",...,1974-12-20,47542841,200.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ...",Released,"I don't feel I have to wipe everybody out, Tom. Just my enemies.",The Godfather: Part II,8.3,3338,8.079586
1847,25000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://www.warnerbros.com/goodfellas,769,"[{""id"": 378, ""name"": ""prison""}, {""id"": 818, ""name"": ""based on novel""}, {""id"": 1196, ""name"": ""flo...",en,GoodFellas,"The true story of Henry Hill, a half-Irish, half-Sicilian Brooklyn kid who is adopted by neighbo...",63.654244,"[{""name"": ""Winkler Films"", ""id"": 8880}]",...,1990-09-12,46836394,145.0,"[{""iso_639_1"": ""it"", ""name"": ""Italiano""}, {""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Three Decades of Life in the Mafia.,GoodFellas,8.2,3128,7.976937
3866,3300000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://cidadededeus.globo.com/,598,"[{""id"": 255, ""name"": ""male nudity""}, {""id"": 542, ""name"": ""street gang""}, {""id"": 983, ""name"": ""br...",pt,Cidade de Deus,Cidade de Deus is a shantytown that started during the 1960s and became one of Rio de Janeiro’s ...,44.356711,"[{""name"": ""O2 Filmes"", ""id"": 345}, {""name"": ""VideoFilmes"", ""id"": 346}, {""name"": ""Wild Bunch"", ""i...",...,2002-02-05,30641770,130.0,"[{""iso_639_1"": ""pt"", ""name"": ""Portugu\u00eas""}]",Released,"If you run you're dead... if you stay, you're dead again. Period.",City of God,8.1,1814,7.759693
1663,30000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,311,"[{""id"": 314, ""name"": ""life and death""}, {""id"": 417, ""name"": ""corruption""}, {""id"": 542, ""name"": ""...",en,Once Upon a Time in America,A former Prohibition-era Jewish gangster returns to the Lower East Side of Manhattan over thirty...,49.336397,"[{""name"": ""Warner Bros."", ""id"": 6194}, {""name"": ""The Ladd Company"", ""id"": 7965}]",...,1984-02-16,0,229.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}, {""iso_639...",Released,"Crime, passion and lust for power - Sergio Leone's explosive saga of gangland America.",Once Upon a Time in America,8.2,1069,7.657811
883,52000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",https://www.facebook.com/CatchMeIfYouCanMovie,640,"[{""id"": 3202, ""name"": ""con man""}, {""id"": 5565, ""name"": ""biography""}, {""id"": 18525, ""name"": ""fbi ...",en,Catch Me If You Can,"A true story about Frank Abagnale Jr. who, before his 19th birthday, successfully conned million...",73.944049,"[{""name"": ""Kemp Company"", ""id"": 367}, {""name"": ""Splendid Pictures"", ""id"": 368}, {""name"": ""Parkes...",...,2002-12-25,352114312,141.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""}]",Released,The true story of a real fake.,Catch Me If You Can,7.7,3795,7.557097
281,100000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://www.americangangster.net/,4982,"[{""id"": 240, ""name"": ""underdog""}, {""id"": 577, ""name"": ""black people""}, {""id"": 2149, ""name"": ""dru...",en,American Gangster,"Following the death of his employer and mentor, Bumpy Johnson, Frank Lucas establishes himself a...",42.361215,"[{""name"": ""Imagine Entertainment"", ""id"": 23}, {""name"": ""Universal Pictures"", ""id"": 33}, {""name"":...",...,2007-11-02,266465037,157.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,There are two sides to the American dream,American Gangster,7.4,1502,7.141396
4041,2380000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://www.thisisenglandmovie.co.uk/,11798,"[{""id"": 65, ""name"": ""holiday""}, {""id"": 379, ""name"": ""skinhead""}, {""id"": 392, ""name"": ""england""},...",en,This Is England,"A story about a troubled boy growing up in England, set in 1983. He comes across a few skinheads...",8.395624,"[{""name"": ""EM Media"", ""id"": 1917}, {""name"": ""UK Film Council"", ""id"": 2452}, {""name"": ""Screen Yor...",...,2006-09-12,8176544,101.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Run with the crowd, stand alone, you decide.",This Is England,7.4,363,6.739664
1149,40000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",http://www.americanhustle-movie.com/,168672,"[{""id"": 10453, ""name"": ""con artist""}, {""id"": 11454, ""name"": ""scam""}, {""id"": 11578, ""name"": ""mobs...",en,American Hustle,"A con man, Irving Rosenfeld, along with his seductive partner Sydney Prosser, is forced to work ...",49.664128,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""name"": ""Atlas Entertainment"", ""id"": 507}, {""name"": ""A...",...,2013-12-12,251171807,138.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,Everyone Hustles To Survive,American Hustle,6.8,2807,6.717525
1243,500000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,203,"[{""id"": 1944, ""name"": ""epilepsy""}, {""id"": 2236, ""name"": ""protection money""}, {""id"": 7879, ""name""...",en,Mean Streets,"A small-time hood must choose from among love, friendship and the chance to rise within the mob.",17.002096,"[{""name"": ""Scorsese Productions"", ""id"": 120}]",...,1973-10-02,3000000,110.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso_639_1"": ""it"", ""name"": ""Italiano""}]",Released,You don't make up for your sins in church. You do it in the streets...,Mean Streets,7.2,345,6.626569
2839,12000000,"[{""id"": 18, ""name"": ""Drama""}, {""id"": 80, ""name"": ""Crime""}]",,10220,"[{""id"": 395, ""name"": ""gambling""}, {""id"": 3703, ""name"": ""law""}, {""id"": 5887, ""name"": ""compulsive ...",en,Rounders,A young man is a reformed gambler who must return to playing big stakes poker to help a friend p...,18.422008,"[{""name"": ""Miramax Films"", ""id"": 14}, {""name"": ""Spanky Pictures"", ""id"": 1566}]",...,1998-09-11,22921898,121.0,"[{""iso_639_1"": ""ru"", ""name"": ""P\u0443\u0441\u0441\u043a\u0438\u0439""}, {""iso_639_1"": ""en"", ""name...",Released,Trust everyone... But always cut the cards.,Rounders,6.9,439,6.530427


이전보다 훨씬 나은 영화가 추천되었으나,  
장르만으로 영화가 전달하는 많은 요소와 분위기, 그리고 개인이 좋아하는 성향을 반영하기가 부족할 수 있다.  
이에 좀 더 다양한 컨텐츠 기반으로 확장할 수 있으나, 여기까지 하고,  

이후, 아이템 기반 최근접 이웃 협업 필터링을 구현해 보겠다.


 ## 아이템 기반 최근접 이웃 협업 필터링 실습

In [19]:
import pandas as pd
import numpy as np

# Load the MovieLens movie metadata next to the ratings. `movies` is re-bound
# here: the following cells (head(), merge on 'movieId') need the MovieLens
# table with movieId/title/genres, not the TMDB table loaded earlier.
movies = pd.read_csv('./input/ml-latest-small/movies.csv')
ratings = pd.read_csv('./input/ml-latest-small/ratings.csv')
print(movies.shape, ratings.shape)



(9742, 3) (100836, 4)


In [20]:
movies.head()



Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [21]:
ratings.head()



Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [22]:
ratings = ratings[['userId', 'movieId', 'rating']]
ratings_matrix = ratings.pivot_table('rating', index='userId', columns='movieId')
ratings_matrix.head()



movieId,1,2,3,4,5,6,7,8,9,10,...,193565,193567,193571,193573,193579,193581,193583,193585,193587,193609
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,4.0,,4.0,,,4.0,,,,,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,,,,,,,,,,...,,,,,,,,,,


In [23]:
rating_movies = pd.merge(ratings, movies, on='movieId')
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')



In [24]:
ratings_matrix = ratings_matrix.fillna(0)
ratings_matrix.head(2)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


 ### 영화 간 유사도 산출

In [25]:
ratings_matrix_T = ratings_matrix.transpose()
ratings_matrix_T.head(3)



userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [26]:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise cosine similarity between every pair of rows of ratings_matrix_T.
item_sim = cosine_similarity(ratings_matrix_T, ratings_matrix_T)

# Wrap the NumPy matrix returned by cosine_similarity() in a DataFrame that is
# labelled with the movie titles (ratings_matrix.columns) on both axes.
item_sim_df = pd.DataFrame(data=item_sim, index=ratings_matrix.columns, columns=ratings_matrix.columns)

item_sim_df.head(3)



title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.141653,0.0,...,0.0,0.342055,0.543305,0.707107,0.0,0.0,0.139431,0.327327,0.0,0.0
'Hellboy': The Seeds of Creation (2004),0.0,1.0,0.707107,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.707107,1.0,0.0,0.0,0.0,0.176777,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [27]:
item_sim_df["Godfather, The (1972)"].sort_values(ascending=False)[:6]



title
Godfather, The (1972)                        1.000000
Godfather: Part II, The (1974)               0.821773
Goodfellas (1990)                            0.664841
One Flew Over the Cuckoo's Nest (1975)       0.620536
Star Wars: Episode IV - A New Hope (1977)    0.595317
Fargo (1996)                                 0.588614
Name: Godfather, The (1972), dtype: float64

In [28]:
item_sim_df["Inception (2010)"].sort_values(ascending=False)[1:6]


title
Dark Knight, The (2008)          0.727263
Inglourious Basterds (2009)      0.646103
Shutter Island (2010)            0.617736
Dark Knight Rises, The (2012)    0.617504
Fight Club (1999)                0.615417
Name: Inception (2010), dtype: float64

 ## 아이템 기반 최근접 이웃 협업 필터링으로 개인화된 영화 추천

 아이템 기반의 영화 유사도 데이터는 모든 사용자의 평점을 기준으로 영화의 유사도를 생성했고,
 이를 이용해 꽤 훌륭한 영화를 추천할 수 있었습니다.
 하지만 이는 개인적 취향을 반영하지 못했습니다. 오직 영화 간의 유사도만을 가지고 추천한 것입니다.
 이번 절에서는 영화 유사도 데이터를 이용해 최근접 이웃 협업 필터링으로 개인에게 최적화된 영화 추천을 구현해보겠습니다. 개인화된 영화 추천의 가장 큰 특징은
 개인이 아직 관람하지 않은 영화를 추천한다는 것입니다. 아직 관람하지 않은 영화에 대해서 아이템 유사도와 기존에 관람한 영화의 평점 데이터를 기반으로
 새롭게 모든 영화의 예측 평점을 계산한 후 높은 예측 평점을 가진 영화를 추천하는 방식입니다.

 위 필터링에서 개인화된 예측 평점은 다음 식으로 구할 수 있습니다.
 $\hat{R}_{u,i} = \displaystyle\sum_{}^{N} (S_{i,N} * R_{u,N}) / \displaystyle\sum_{}^{N} ( |S_{i,N}| )$

 - $\hat{R}_{u,i}$: 사용자 u, 아이템 i의 개인화된 예측 평점 값
 - ${S}_{i,N}$: 아이템 i와 가장 유사도가 높은 Top-N 개 아이템의 유사도 벡터
 - ${R}_{u,N}$: 사용자 u의 아이템 i와 가장 유사도가 높은 Top-N개 아이템에 대한 실제 평점 벡터

 위의 N 값은 아이템의 최근접 이웃 범위 계수(item neighbor)를 의미합니다. 이는 특정 아이템과 유사도가 가장 높은 Top-N개의 다른 아이템을 추출하는 데 사용됩니다.
 먼저 N의 범위에 제약을 두지 않고 모든 아이템으로 가정하고 예측 평점을 구하는 로직을 작성한 뒤에 Top-N 아이템을 기반으로 협업 필터링을 수행하는 로직으로 변경하겠습니다.

In [29]:
def predict_rating(ratings_arr, item_sim_arr):
    """Predict a rating for every user/item pair using all items as neighbours.

    Each prediction is the similarity-weighted sum of the user's ratings,
    divided by the sum of the absolute similarities used as weights.

    Args:
        ratings_arr: 2-D array (users x items) of ratings.
        item_sim_arr: 2-D array (items x items) of item-to-item similarities.

    Returns:
        2-D float array with the same shape as ``ratings_arr``. Columns whose
        similarity weights sum to zero are left at 0 instead of NaN/inf.
    """
    # Column i of the product is sum_j ratings[u, j] * sim[j, i].
    weighted_sum = ratings_arr.dot(item_sim_arr)

    # Normalise with the same weights the product used: column i of the
    # similarity matrix (axis=0). For a symmetric matrix this equals the
    # row sums, so results are unchanged in that case.
    norm = np.abs(item_sim_arr).sum(axis=0)

    ratings_pred = np.zeros(weighted_sum.shape, dtype=float)
    np.divide(weighted_sum, norm, out=ratings_pred, where=norm != 0)
    return ratings_pred



In [30]:
# Predict ratings from the raw NumPy values of the rating and similarity tables.
ratings_pred = predict_rating(ratings_matrix.values, item_sim_df.values)
# Re-attach the userId index and title columns so predictions can be looked up by label.
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)


title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.070345,0.577855,0.321696,0.227055,0.206958,0.194615,0.249883,0.102542,0.157084,0.178197,...,0.113608,0.181738,0.133962,0.128574,0.006179,0.21207,0.192921,0.136024,0.292955,0.720347
2,0.01826,0.042744,0.018861,0.0,0.0,0.035995,0.013413,0.002314,0.032213,0.014863,...,0.01564,0.020855,0.020119,0.015745,0.049983,0.014876,0.021616,0.024528,0.017563,0.0
3,0.011884,0.030279,0.064437,0.003762,0.003749,0.002722,0.014625,0.002085,0.005666,0.006272,...,0.006923,0.011665,0.0118,0.012225,0.0,0.008194,0.007017,0.009229,0.01042,0.084501


 예측평점이 사용자별 영화의 실제 평점과 영화의 코사인 유사도를 내적한 값이기 때문에 기존 영화를 관람하지 않아 0에 해당했던 실제 영화 평점이 예측에서는 값이 부여되는 경우가 많이 발생합니다.
 예측 평점이 실제 평점보다 작을 수 있습니다. 이는 내적 결과를 코사인 유사도 벡터 합으로 나누었기 때문에 생기는 현상입니다.
 MSE 를 활용해 예측 평가를 하겠습니다.

In [31]:
from sklearn.metrics import mean_squared_error

# Evaluate prediction quality (MSE) only on the movies the user actually rated.

def get_mse(pred, actual):
    """Return the MSE between ``pred`` and ``actual`` at the nonzero cells of ``actual``."""
    # Positions where an actual rating exists (nonzero entries).
    rated_positions = actual.nonzero()
    rated_pred = pred[rated_positions].flatten()
    rated_actual = actual[rated_positions].flatten()
    return mean_squared_error(rated_pred, rated_actual)

# Report the MSE of the all-neighbours prediction against the actual ratings.
print('아이템 기반 모든 최근접 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))



아이템 기반 모든 최근접 이웃 MSE:  9.895354759094706


In [32]:
def predict_rating_topsim(ratings_arr, item_sim_arr, n=20):
    """Predict ratings using only each item's ``n`` most similar items.

    For every item, the ``n`` items with the largest similarity to it are
    selected, and each user's prediction is the similarity-weighted sum of
    that user's ratings for those items, divided by the sum of the absolute
    similarities.

    Args:
        ratings_arr: 2-D array (users x items) of ratings.
        item_sim_arr: 2-D array (items x items) of item-to-item similarities.
            Neighbours are both ranked and weighted from column ``col``; for a
            symmetric matrix this matches using row ``col``.
        n: Number of nearest-neighbour items to use per item.

    Returns:
        2-D float array with the same shape as ``ratings_arr``. A column whose
        selected similarities sum to zero is left at 0.
    """
    # Start from an all-zero prediction matrix shaped like the rating matrix.
    pred = np.zeros(ratings_arr.shape)

    for col in range(ratings_arr.shape[1]):
        sims = item_sim_arr[:, col]
        # Indices of the n largest similarities, most similar first. A plain
        # 1-D index array is used (not wrapped in a list) so indexing yields
        # 1-D results on every NumPy version.
        top_n_items = np.argsort(sims)[:-n - 1:-1]
        weights = sims[top_n_items]

        norm = np.abs(weights).sum()
        if norm == 0:
            # No usable neighbour weight: keep the zero prediction.
            continue

        # One matrix-vector product computes the whole column for all users.
        pred[:, col] = ratings_arr[:, top_n_items].dot(weights) / norm
    return pred



In [33]:
# Re-run the prediction restricted to each item's 20 nearest neighbours and report its MSE.
ratings_pred = predict_rating_topsim(ratings_matrix.values, item_sim_df.values, n=20)
print('아이템 기반 최근점 TOP-20 이웃 MSE: ', get_mse(ratings_pred, ratings_matrix.values))

# Rebuild the predicted ratings as a DataFrame labelled like ratings_matrix.
ratings_pred_matrix = pd.DataFrame(data=ratings_pred, index=ratings_matrix.index, columns=ratings_matrix.columns)



아이템 기반 최근점 TOP-20 이웃 MSE:  3.6949999176225483


In [34]:
# Now recommend movies for a specific user, starting with userId=9.
# First look at that user's ten highest existing ratings.
user_rating_id = ratings_matrix.loc[9,:]
user_rating_id[user_rating_id > 0].sort_values(ascending=False)[:10]



title
Adaptation (2002)                                                                 5.0
Austin Powers in Goldmember (2002)                                                5.0
Lord of the Rings: The Fellowship of the Ring, The (2001)                         5.0
Lord of the Rings: The Two Towers, The (2002)                                     5.0
Producers, The (1968)                                                             5.0
Citizen Kane (1941)                                                               5.0
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)    5.0
Back to the Future (1985)                                                         5.0
Glengarry Glen Ross (1992)                                                        4.0
Sunset Blvd. (a.k.a. Sunset Boulevard) (1950)                                     4.0
Name: 9, dtype: float64

In [42]:
def get_unseen_movies(ratings_matrix, userId):
    """Return the movies that ``userId`` has not rated yet.

    Args:
        ratings_matrix: DataFrame indexed by user id with one column per movie.
        userId: Index label of the user to look up.

    Returns:
        List of column labels, in column order, whose rating for this user is
        not greater than 0 (zero and NaN both count as unseen).
    """
    # Row of ratings for this user; the Series index is the movie labels.
    user_rating = ratings_matrix.loc[userId, :]

    # A rating above 0 means the movie was already seen. A set keeps the
    # membership test below O(1) instead of scanning a list per column.
    already_seen = set(user_rating[user_rating > 0].index.tolist())

    # Keep every movie that is not in the seen set, preserving column order.
    unseen_list = [
        movie for movie in ratings_matrix.columns.tolist()
        if movie not in already_seen
    ]

    return unseen_list


In [43]:
def recomm_movie_by_userid(pred_df, userId, unseen_list, top_n=10):
    """Return the ``top_n`` highest predicted ratings among ``unseen_list`` for a user.

    Looks up row ``userId`` of ``pred_df`` restricted to the ``unseen_list``
    columns and returns a Series sorted by predicted rating, highest first.
    """
    # Predicted scores of this user for the unseen movies only.
    unseen_scores = pred_df.loc[userId, unseen_list]
    # Highest predictions first, then keep the leading top_n entries.
    ranked_scores = unseen_scores.sort_values(ascending=False)
    return ranked_scores[:top_n]

# Collect the titles user 9 has not watched yet.
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Recommend movies with item-based nearest-neighbour collaborative filtering.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Wrap the predicted scores in a DataFrame with a single 'pred_score' column.
recomm_movies = pd.DataFrame(data = recomm_movies.values, index = recomm_movies.index, columns = ['pred_score'])

recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Shrek (2001),0.866202
Spider-Man (2002),0.857854
"Last Samurai, The (2003)",0.817473
Indiana Jones and the Temple of Doom (1984),0.816626
"Matrix Reloaded, The (2003)",0.80099
Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001),0.765159
Gladiator (2000),0.740956
"Matrix, The (1999)",0.732693
Pirates of the Caribbean: The Curse of the Black Pearl (2003),0.689591
"Lord of the Rings: The Return of the King, The (2003)",0.676711


## 행렬 분해를 이용한 잠재 요인 협업 필터링 실습

행렬 분해에는 SVD, NMF 등이 적용 가능하고, 주로 SVD가 활용됩니다. 하지만  
사용자 - 아이템 평점 행렬에는 사용자가 평점을 매기지 않은 null 데이터가 많기 때문에 주로 SGD나 ALS 기반의 행렬 분해를 이용합니다.  
여기서는 SGD 기반의 행렬 분해를 구현해보겠습니다.

In [54]:
from sklearn.metrics import mean_squared_error

def get_rmse(R, P, Q, non_zeros):
    """Return the RMSE between ``R`` and ``P @ Q.T`` at the observed positions.

    Args:
        R: 2-D array (users x items) of actual ratings.
        P: 2-D array (users x K) of user factors.
        Q: 2-D array (items x K) of item factors.
        non_zeros: Sequence of tuples whose first two elements are the row and
            column index of an observed rating in ``R``.

    Returns:
        Root mean squared error over the positions listed in ``non_zeros``.

    Raises:
        ValueError: If ``non_zeros`` is empty.
    """
    if len(non_zeros) == 0:
        raise ValueError("non_zeros must contain at least one observed rating")

    # Row/column indices of the observed ratings.
    x_non_zero_ind = [non_zero[0] for non_zero in non_zeros]
    y_non_zero_ind = [non_zero[1] for non_zero in non_zeros]
    R_non_zeros = R[x_non_zero_ind, y_non_zero_ind]

    # Predict only the observed cells: the row-wise dot product of the matching
    # factor rows equals (P @ Q.T)[x, y] without building the dense matrix.
    pred_non_zeros = (P[x_non_zero_ind] * Q[y_non_zero_ind]).sum(axis=1)

    mse = np.mean((R_non_zeros - pred_non_zeros) ** 2)
    return np.sqrt(mse)


In [55]:
def matrix_factorization(R, K, steps=200, learning_rate = 0.01, r_lambda = 0.01):
    """Factorize ``R`` into user and item factor matrices with SGD.

    Only entries of ``R`` greater than 0 are treated as observed ratings; zero
    and NaN entries are skipped. Progress (RMSE on the observed entries) is
    printed every 10th step.

    Args:
        R: 2-D array (users x items) of ratings.
        K: Number of latent factors.
        steps: Number of passes over the observed ratings.
        learning_rate: SGD step size.
        r_lambda: L2 regularization strength.

    Returns:
        Tuple ``(P, Q)`` where ``P`` is (users x K) and ``Q`` is (items x K).
    """
    num_users, num_items = R.shape
    # Fixed seed so the random normal initialisation of P and Q is reproducible.
    np.random.seed(1)
    P = np.random.normal(scale=1./K, size=(num_users, K))
    Q = np.random.normal(scale=1./K, size=(num_items, K))

    # Collect (row, col, value) for every R > 0 entry. np.nonzero walks the
    # mask in row-major order, the same order as a nested users/items loop.
    R_arr = np.asarray(R)
    with np.errstate(invalid='ignore'):
        observed_mask = R_arr > 0
    rows, cols = np.nonzero(observed_mask)
    non_zeros = list(zip(rows.tolist(), cols.tolist(), R_arr[rows, cols].tolist()))

    # Repeatedly update P and Q with SGD.
    for step in range(steps):
        for i, j, r in non_zeros:
            # Error between the actual rating and the current prediction.
            eij = r - np.dot(P[i, :], Q[j, :].T)

            # Regularized SGD updates. Q's update uses the P row that was just
            # updated on the previous line.
            P[i, :] = P[i, :] + learning_rate * (eij * Q[j, :] - r_lambda * P[i, :])
            Q[j, :] = Q[j, :] + learning_rate * (eij * P[i, :] - r_lambda * Q[j, :])

        # RMSE is only needed for the progress report, so compute it only then.
        if (step % 10) == 0:
            rmse = get_rmse(R, P, Q, non_zeros)
            print("### iteration step: ", step, "rmse: ", rmse)

    return P, Q

In [56]:
# Load the MovieLens (ml-latest-small) movies and ratings tables.
movies = pd.read_csv('./input/ml-latest-small/movies.csv')
ratings = pd.read_csv('./input/ml-latest-small/ratings.csv')
ratings = ratings[['userId', 'movieId', 'rating']]

# Join with movies to obtain the title column.
rating_movies = pd.merge(ratings, movies, on="movieId")
# Pivot to a userId x title rating matrix. The earlier movieId-keyed pivot was
# overwritten by this one before being used, so it is no longer computed.
# No fill value is given, so user/title pairs without a rating are NaN.
ratings_matrix = rating_movies.pivot_table('rating', index='userId', columns='title')


In [57]:
# Factorize the userId x title rating matrix into 50 latent factors with SGD.
P, Q = matrix_factorization(ratings_matrix.values, K=50, steps=200, learning_rate=0.01, r_lambda=0.01)
# Multiply the factors back together to get a predicted rating for every user/title pair.
pred_matrix = np.dot(P, Q.T)

### iteration step:  0 rmse:  2.9023619751336867
### iteration step:  10 rmse:  0.7335768591017927
### iteration step:  20 rmse:  0.5115539026853442
### iteration step:  30 rmse:  0.37261628282537446
### iteration step:  40 rmse:  0.2960818299181014
### iteration step:  50 rmse:  0.2520353192341642
### iteration step:  60 rmse:  0.22487503275269854
### iteration step:  70 rmse:  0.2068545530233154
### iteration step:  80 rmse:  0.19413418783028688
### iteration step:  90 rmse:  0.18470082002720406
### iteration step:  100 rmse:  0.17742927527209104
### iteration step:  110 rmse:  0.1716522696470749
### iteration step:  120 rmse:  0.16695181946871723
### iteration step:  130 rmse:  0.16305292191997542
### iteration step:  140 rmse:  0.15976691929679646
### iteration step:  150 rmse:  0.1569598699945732
### iteration step:  160 rmse:  0.1545339818671543
### iteration step:  170 rmse:  0.15241618551077643
### iteration step:  180 rmse:  0.1505508073962831
### iteration step:  190 rmse:  0

In [58]:
# Label the predicted matrix with the same userId index and title columns as ratings_matrix.
ratings_pred_matrix = pd.DataFrame(data=pred_matrix, index=ratings_matrix.index, columns=ratings_matrix.columns)
ratings_pred_matrix.head(3)

title,'71 (2014),'Hellboy': The Seeds of Creation (2004),'Round Midnight (1986),'Salem's Lot (2004),'Til There Was You (1997),'Tis the Season for Love (2015),"'burbs, The (1989)",'night Mother (1986),(500) Days of Summer (2009),*batteries not included (1987),...,Zulu (2013),[REC] (2007),[REC]² (2009),[REC]³ 3 Génesis (2012),anohana: The Flower We Saw That Day - The Movie (2013),eXistenZ (1999),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.055084,4.092018,3.56413,4.502167,3.981215,1.271694,3.603274,2.333266,5.091749,3.972454,...,1.402608,4.208382,3.705957,2.720514,2.787331,3.475076,3.253458,2.161087,4.010495,0.859474
2,3.170119,3.657992,3.308707,4.166521,4.31189,1.275469,4.237972,1.900366,3.392859,3.647421,...,0.973811,3.528264,3.361532,2.672535,2.404456,4.232789,2.911602,1.634576,4.135735,0.725684
3,2.307073,1.658853,1.443538,2.208859,2.229486,0.78076,1.997043,0.924908,2.9707,2.551446,...,0.520354,1.709494,2.281596,1.782833,1.635173,1.323276,2.88758,1.042618,2.29389,0.396941


In [59]:
# 사용자가 관람하지 않은 영화명 추출
unseen_list = get_unseen_movies(ratings_matrix, 9)

# Recommend movies with latent-factor collaborative filtering.
recomm_movies = recomm_movie_by_userid(ratings_pred_matrix, 9, unseen_list, top_n=10)

# Wrap the predicted scores in a DataFrame with a single 'pred_score' column.
recomm_movies = pd.DataFrame(data=recomm_movies.values, index=recomm_movies.index, columns=['pred_score'])
recomm_movies

Unnamed: 0_level_0,pred_score
title,Unnamed: 1_level_1
Rear Window (1954),5.704612
"South Park: Bigger, Longer and Uncut (1999)",5.4511
Rounders (1998),5.298393
Blade Runner (1982),5.244951
Roger & Me (1989),5.191962
Gattaca (1997),5.183179
Ben-Hur (1959),5.130463
Rosencrantz and Guildenstern Are Dead (1990),5.087375
"Big Lebowski, The (1998)",5.03869
Star Wars: Episode V - The Empire Strikes Back (1980),4.989601
