In [1]:
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

# Load the TMDB 5000 movies metadata into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; the notebook only runs
# where the CSV exists at exactly this location.
movies_csv_path = '/Users/taewon/Documents/GitHub/ML/datasets/tmdb_5000_movies.csv'
movies = pd.read_csv(movies_csv_path)

# Print the (rows, columns) shape, then show the first three rows as the cell output.
print(movies.shape)
movies.head(n=3)

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",http://disney.go.com/disneypictures/pirates/,285,"[{""id"": 270, ""name"": ""ocean""}, {""id"": 726, ""na...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.sonypictures.com/movies/spectre/,206647,"[{""id"": 470, ""name"": ""spy""}, {""id"": 818, ""name...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [3]:
# Preprocess the genre/keyword columns: each cell is parsed with literal_eval
# from its string form into real Python objects.
from ast import literal_eval


def _parse_literal(value):
    """Parse a string cell with literal_eval; return non-string values unchanged.

    The pass-through makes this cell safe to re-run: calling literal_eval on a
    value that has already been parsed into a list raises ValueError.
    """
    return literal_eval(value) if isinstance(value, str) else value


movies['genres'] = movies['genres'].apply(_parse_literal)
movies['keywords'] = movies['keywords'].apply(_parse_literal)

In [4]:
def _names_only(items):
    """Reduce a list of {'id': ..., 'name': ...} dicts to the list of their names.

    Entries that are not dicts (e.g. names extracted by a previous run of this
    cell) are kept as they are, so re-running the cell does not raise.
    """
    return [item['name'] if isinstance(item, dict) else item for item in items]


movies['genres'] = movies['genres'].apply(_names_only)
movies['keywords'] = movies['keywords'].apply(_names_only)

# Show the first row of both columns to check the result.
movies[['genres', 'keywords']][:1]

Unnamed: 0,genres,keywords
0,"[Action, Adventure, Fantasy, Science Fiction]","[culture clash, future, space war, space colon..."


In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer works on text, so turn each genre list into a single string
# whose words are separated by spaces.
movies['genres_literal'] = movies['genres'].apply(' '.join)

# min_df=1 keeps every term, exactly like the former min_df=0, but is a valid value:
# recent scikit-learn releases reject an integer min_df below 1 with
# InvalidParameterError. ngram_range=(1, 2) counts single words and word pairs.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity

# Cosine similarity of the genre count matrix against itself: entry [i, j] is the
# similarity between movie i and movie j.
genre_sim = cosine_similarity(X=genre_mat, Y=genre_mat)

# Print the matrix shape and its first two rows.
print(genre_sim.shape)
print(genre_sim[:2])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [12]:
# argsort orders each row ascending; reversing the last axis gives, per movie,
# the column indices ordered from most to least similar.
genre_sim_sorted_ind = genre_sim.argsort(axis=-1)[..., ::-1]

# Print the ranking for the first movie only.
print(genre_sim_sorted_ind[:1])

[[   0 3494  813 ... 3038 3037 2401]]


In [13]:
# Return the movies whose genres are most similar to a given movie.
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
    """Return the ``top_n`` rows of ``df`` ranked most similar to ``title_name``.

    Parameters
    ----------
    df : DataFrame with a 'title' column; row i must correspond to row i of
        ``sorted_ind``.
    sorted_ind : 2-D integer array; row i lists row positions of ``df`` ordered
        from most to least similar to row i.
    title_name : title to look up (exact match on the 'title' column).
    top_n : number of similar rows to take per matching title.

    Returns
    -------
    DataFrame of the selected rows in similarity order. It is empty when no row
    has the given title, and holds ``top_n`` rows per match when several rows
    share the title.
    """
    # Locate the matching rows by position rather than by index label:
    # sorted_ind and df.iloc are both positional, so using df.index values
    # is only correct when df happens to have a default RangeIndex.
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Take the first top_n entries of the similarity ranking for those rows.
    similar_indexes = sorted_ind[title_index, :top_n]

    # Print the extracted positions. They are 2-D (one row per matching title),
    # so flatten them to 1-D before using them to select rows.
    print(similar_indexes)
    similar_indexes = similar_indexes.reshape(-1)

    return df.iloc[similar_indexes]

In [14]:
# Ten entries from the genre-similarity ranking for 'The Godfather'.
similar_movies = find_sim_movie(
    df=movies,
    sorted_ind=genre_sim_sorted_ind,
    title_name='The Godfather',
    top_n=10,
)
similar_movies.loc[:, ['title', 'vote_average']]

[[2731 1243 3636 1946 2640 4065 1847 4217  883 3866]]


Unnamed: 0,title,vote_average
2731,The Godfather: Part II,8.3
1243,Mean Streets,7.2
3636,Light Sleeper,5.7
1946,The Bad Lieutenant: Port of Call - New Orleans,6.0
2640,Things to Do in Denver When You're Dead,6.7
4065,Mi America,0.0
1847,GoodFellas,8.2
4217,Kids,6.8
883,Catch Me If You Can,7.7
3866,City of God,8.1


In [15]:
## 문제
# 평가횟수가 현저히 적은 영화들이 추천되는 것도 있음
# low quality 추천 문제

# 우리가 전혀 모르는 영화를 추천받는 것은 엉뚱한 추천 결과를 낳을 수 있음
# -> 평가횟수를 반영한 추천 시스템이 필요

### 가중평점(평점&평가횟수) 반영한 영화 추천

#### @ 가중평점 (Weighted Rating) :
- (v / (v + m)) * R + (m / (v + m)) * C
- v : 영화별 평점을 투표한 횟수 (vote_count) -> 투표횟수가 많은 영화에 가중치 부여
- m : 평점을 부여하기 위한 최소 투표 횟수 -> 여기서는 vote_count의 60번째 백분위수(quantile 0.6) 값, 즉 투표 수 상위 40% 경계
- R : 개별 영화에 대한 평균 평점 (vote_average)
- C : 전체 영화에 대한 평균 평점 (movies['vote_average'].mean())
    - C, m은 고정값
    - v, R은 영화마다 변동값

In [17]:
# m: minimum vote count, set to the 0.6 quantile of vote_count
#    (60% of the movies have at most this many votes).
# C: mean of vote_average over all movies.
m = movies['vote_count'].quantile(q=0.6)
C = movies['vote_average'].mean()

In [19]:
# C: mean rating over all movies (about 6).
# m: minimum vote count used in the weighting (about 370, the 0.6 quantile of vote_count).
print(f"C: {round(C, 3)!s} m {round(m, 3)!s}")

C: 6.092 m 370.2


### 가중평점을 계산하는 함수

In [21]:
def Weighted_vote_average(record, min_votes=None, mean_vote=None):
    """Return the weighted rating ``(v/(v+m))*R + (m/(v+m))*C`` for one movie.

    Parameters
    ----------
    record : mapping or row with 'vote_count' (v) and 'vote_average' (R).
    min_votes : value used as m. When None, the notebook-level ``m`` is used.
    mean_vote : value used as C. When None, the notebook-level ``C`` is used.

    Returns
    -------
    The weighted rating: the more votes a movie has relative to m, the closer
    the result is to its own rating R; with few votes it is pulled toward C.
    """
    # Fall back to the notebook globals so existing calls such as
    # movies.apply(Weighted_vote_average, axis=1) keep working unchanged.
    if min_votes is None:
        min_votes = m
    if mean_vote is None:
        mean_vote = C

    v = record['vote_count']
    R = record['vote_average']
    total = v + min_votes
    return (v / total) * R + (min_votes / total) * mean_vote

In [22]:
# Add the weighted rating as a new 'weighted_vote' column, computed row by row.
movies['weighted_vote'] = movies.apply(func=Weighted_vote_average, axis='columns')

### 추천 ver2. 먼저 장르 유사성 높은 영화 20개 선정 후, 가중평점순 10개 선정

In [23]:
# First pick the 2 * top_n most genre-similar movies, then keep the top_n of
# those by weighted rating.
def find_sim_movie_ver2(df, sorted_ind, title_name, top_n=10):
    """Recommend ``top_n`` movies similar to ``title_name``, ranked by weighted rating.

    Parameters
    ----------
    df : DataFrame with 'title' and 'weighted_vote' columns; row i must
        correspond to row i of ``sorted_ind``.
    sorted_ind : 2-D integer array; row i lists row positions of ``df`` ordered
        from most to least similar to row i.
    title_name : title to look up (exact match on the 'title' column).
    top_n : number of recommendations to return.

    Returns
    -------
    DataFrame with at most ``top_n`` rows sorted by 'weighted_vote' descending.
    Rows carrying the queried title are never included; the result is empty
    when no row has that title.
    """
    # Positional lookup: sorted_ind and df.iloc are positional, so index labels
    # must not be used here (they only coincide for a default RangeIndex).
    title_index = np.flatnonzero((df['title'] == title_name).to_numpy())

    # Candidate pool: twice as many similar movies as will finally be returned.
    similar_indexes = sorted_ind[title_index, :(top_n * 2)].reshape(-1)

    # Drop the queried movie itself. np.isin also works when several rows share
    # the title, where an elementwise != between the two arrays would fail.
    similar_indexes = similar_indexes[~np.isin(similar_indexes, title_index)]

    # With several matching rows the pools can overlap; keep each candidate once,
    # preserving first-seen order.
    similar_indexes = pd.unique(similar_indexes)

    # Rank the candidates by weighted rating and keep the best top_n.
    return df.iloc[similar_indexes].sort_values('weighted_vote', ascending=False)[:top_n]

### 영화 Godfather에 대해 장르 유사성, 가중평점 반영한 추천 영화 10개를 뽑아보자

In [26]:
# Recommendations for 'The Godfather' from the genre-similar pool, ranked by weighted rating.
# Note: despite its name, this variable holds the DataFrame of recommended movies.
similar_indexes = find_sim_movie_ver2(
    df=movies,
    sorted_ind=genre_sim_sorted_ind,
    title_name='The Godfather',
    top_n=10,
)
similar_indexes.loc[:, ['title', 'vote_average', 'weighted_vote', 'genres', 'vote_count']]

Unnamed: 0,title,vote_average,weighted_vote,genres,vote_count
2731,The Godfather: Part II,8.3,8.079586,"[Drama, Crime]",3338
1847,GoodFellas,8.2,7.976937,"[Drama, Crime]",3128
3866,City of God,8.1,7.759693,"[Drama, Crime]",1814
1663,Once Upon a Time in America,8.2,7.657811,"[Drama, Crime]",1069
883,Catch Me If You Can,7.7,7.557097,"[Drama, Crime]",3795
281,American Gangster,7.4,7.141396,"[Drama, Crime]",1502
4041,This Is England,7.4,6.739664,"[Drama, Crime]",363
1149,American Hustle,6.8,6.717525,"[Drama, Crime]",2807
1243,Mean Streets,7.2,6.626569,"[Drama, Crime]",345
2839,Rounders,6.9,6.530427,"[Drama, Crime]",439


### 요약 : Godfather를 좋아하는 사람에게 영화 추천해주기
- Godfather 장르가 Drama, Crime이다.
- 우선 Drama, Crime 장르 기준으로 상위 20개 영화를 뽑아보고,
- 그 중 평가횟수를 반영한 가중평점 기준 상위 10개 영화를 뽑아서 추천해준다.

### 응용 : Spider-Man 3 좋아하는사람 기준으로 장르가 유사한 영화를 추천해주자

In [30]:
# Same recommendation procedure, applied to 'Spider-Man 3'.
similar_movies = find_sim_movie_ver2(
    df=movies,
    sorted_ind=genre_sim_sorted_ind,
    title_name='Spider-Man 3',
    top_n=10,
)
similar_movies.loc[:, ['title', 'vote_average', 'weighted_vote', 'genres', 'vote_count']]

Unnamed: 0,title,vote_average,weighted_vote,genres,vote_count
329,The Lord of the Rings: The Return of the King,8.1,8.011871,"[Adventure, Fantasy, Action]",8064
262,The Lord of the Rings: The Fellowship of the Ring,8.0,7.922175,"[Adventure, Fantasy, Action]",8705
330,The Lord of the Rings: The Two Towers,8.0,7.910111,"[Adventure, Fantasy, Action]",7487
19,The Hobbit: The Battle of the Five Armies,7.1,7.027274,"[Action, Adventure, Fantasy]",4760
98,The Hobbit: An Unexpected Journey,7.0,6.961224,"[Adventure, Fantasy, Action]",8297
126,Thor: The Dark World,6.8,6.748873,"[Action, Adventure, Fantasy]",4755
30,Spider-Man 2,6.7,6.652034,"[Action, Adventure, Fantasy]",4321
129,Thor,6.6,6.572735,"[Adventure, Fantasy, Action]",6525
20,The Amazing Spider-Man,6.5,6.478296,"[Action, Adventure, Fantasy]",6586
38,The Amazing Spider-Man 2,6.5,6.466812,"[Action, Adventure, Fantasy]",4179
