<a href="https://colab.research.google.com/github/hipster4020/RecommendationSystem/blob/master/ContentsBasedFiltering_Movies.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
# Library load
import pandas as pd
import numpy as np
import warnings; warnings.filterwarnings('ignore')

from ast import literal_eval
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# **CSV 파일을 판다스 DataFrame으로 반입**

In [3]:
# Read the TMDB movies CSV into a DataFrame, then show its shape and first row.
movies = pd.read_csv("/content/sample_data/tmdb_5000_movies.csv")
print(movies.shape)
movies.iloc[:1]

(4803, 20)


Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",http://www.avatarmovie.com/,19995,"[{""id"": 1463, ""name"": ""culture clash""}, {""id"":...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800


In [4]:
# Count the missing values in each column.
movies.isna().sum()

budget                     0
genres                     0
homepage                3091
id                         0
keywords                   0
original_language          0
original_title             0
overview                   3
popularity                 0
production_companies       0
production_countries       0
release_date               1
revenue                    0
runtime                    2
spoken_languages           0
status                     0
tagline                  844
title                      0
vote_average               0
vote_count                 0
dtype: int64

In [5]:
# Keep only the columns the recommender needs.
# The explicit .copy() makes movies_df an independent frame, so the column
# assignments performed on it in later cells do not write into a slice of
# `movies` (which would raise SettingWithCopyWarning).
movies_df = movies[['id', 'title', 'genres', 'vote_average', 'vote_count']].copy()
movies_df.head(5)

Unnamed: 0,id,title,genres,vote_average,vote_count
0,19995,Avatar,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",7.2,11800
1,285,Pirates of the Caribbean: At World's End,"[{""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""...",6.9,4500
2,206647,Spectre,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.3,4466
3,49026,The Dark Knight Rises,"[{""id"": 28, ""name"": ""Action""}, {""id"": 80, ""nam...",7.6,9106
4,49529,John Carter,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""nam...",6.1,2124


# **리스트 - 딕셔너리로 구성된 genres 컬럼 가공**

In [6]:
# Show up to 100 characters per column, then preview the first genres value.
pd.set_option('max_colwidth', 100)
movies_df[['genres']].head(1)

Unnamed: 0,genres
0,"[{""id"": 28, ""name"": ""Action""}, {""id"": 12, ""name"": ""Adventure""}, {""id"": 14, ""name"": ""Fantasy""}, {..."


In [7]:
# Run literal_eval over every genres entry to turn the string into the
# Python object it spells out.
movies_df['genres'] = movies_df['genres'].map(literal_eval)
movies_df.head(1)

Unnamed: 0,id,title,genres,vote_average,vote_count
0,19995,Avatar,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'name': 'Adventure'}, {'id': 14, 'name': 'Fantasy'}, {...",7.2,11800


In [8]:
# For each movie, collect the 'name' value of every genre dict into a plain list.
movies_df['genres'] = movies_df['genres'].map(
    lambda genre_dicts: [genre['name'] for genre in genre_dicts]
)
movies_df[['genres']].head(1)

Unnamed: 0,genres
0,"[Action, Adventure, Fantasy, Science Fiction]"


# **장르 콘텐츠 유사도 측정**

In [11]:
# Display the processed genres column as a one-column frame.
movies_df.loc[:, ['genres']]

Unnamed: 0,genres
0,"[Action, Adventure, Fantasy, Science Fiction]"
1,"[Adventure, Fantasy, Action]"
2,"[Action, Adventure, Crime]"
3,"[Action, Crime, Drama, Thriller]"
4,"[Action, Adventure, Science Fiction]"
...,...
4798,"[Action, Crime, Thriller]"
4799,"[Comedy, Romance]"
4800,"[Comedy, Drama, Romance, TV Movie]"
4801,[]


In [14]:
# CountVectorizer expects one string per document, so join each genre list
# into a single space-separated string.
movies_df['genres_literal'] = movies_df['genres'].apply(lambda names: ' '.join(names))

# min_df drops terms that appear in too few documents: a float is a proportion
# of documents (e.g. 0.01 ignores terms found in fewer than 1% of them), an
# integer is an absolute document count. min_df=1 keeps every term; the former
# integer 0 selected the same vocabulary but is outside the range that recent
# scikit-learn releases accept for an integer min_df (must be >= 1).
# ngram_range=(1, 2) counts both unigrams and bigrams.
count_vect = CountVectorizer(min_df=1, ngram_range=(1, 2))
genre_mat = count_vect.fit_transform(movies_df['genres_literal'])
print(genre_mat.shape)

(4803, 276)


In [17]:
# Pairwise cosine similarity between every pair of rows of the genre matrix
# (omitting the second argument compares the matrix with itself).
genre_sim = cosine_similarity(genre_mat)
print(genre_sim.shape)
print(genre_sim[0:1])

(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]]


In [28]:
# [:, ::-1] axis = 1 기준으로 2차원 numpy 배열 뒤집기
genre_sim_sorted_ind=genre_sim.argsort()[:, ::-1]
print(genre_sim_sorted_ind[:3])

[[2401 3037 3038 ...  813 3494    0]
 [2401 3067 3069 ...  129    1  262]
 [2401 2999 3000 ... 1542 1740    2]]


# **장르 콘텐츠 필터링을 이용한 영화 추천**

In [37]:
# Ten rows with the highest raw vote_average.
movies_df[['title', 'vote_average', 'vote_count']].sort_values(by='vote_average', ascending=False).head(10)

Unnamed: 0,title,vote_average,vote_count
3519,Stiff Upper Lips,10.0,1
4247,Me You and Five Bucks,10.0,2
4045,"Dancer, Texas Pop. 81",10.0,1
4662,Little Big Top,10.0,1
3992,Sardaarji,9.5,2
2386,One Man's Hero,9.3,2
2970,There Goes My Baby,8.5,2
1881,The Shawshank Redemption,8.5,8205
2796,The Prisoner of Zenda,8.4,11
3337,The Godfather,8.4,5893


In [42]:
percentile = 0.6
m = movies_df['vote_count'].quantile(percentile)  # vote-count threshold: the 60th percentile of vote_count
C = movies_df['vote_average'].mean()  # mean vote_average over all movies

def weighted_vote_average(record):
  """Blend a movie's own rating with the global mean rating.

  Computes (v / (v + m)) * R + (m / (m + v)) * C, where v is the vote count,
  R the movie's average rating, and m and C are the module-level threshold
  and global mean defined above. The more votes a movie has relative to m,
  the closer the result is to its own rating R.

  Args:
    record: anything indexable by 'vote_count' and 'vote_average' -- a single
      row (as passed by DataFrame.apply(axis=1)) or a whole DataFrame, since
      the arithmetic is element-wise.

  Returns:
    The weighted rating: a scalar for a single row, a Series for a DataFrame.
  """
  v = record['vote_count']  # number of votes the movie received
  R = record['vote_average']  # the movie's average rating

  return ( (v/(v+m)) * R ) + ( (m/(m+v)) * C )  # weighted rating formula

# Compute the score from movies_df itself (the frame it is stored in) rather
# than from `movies`, and in one vectorized call instead of a row-wise apply.
movies_df['weighted_vote'] = weighted_vote_average(movies_df)

In [44]:
# Ten rows with the highest weighted_vote.
movies_df[['title', 'weighted_vote', 'vote_count']].sort_values(by='weighted_vote', ascending=False).head(10)

Unnamed: 0,title,weighted_vote,vote_count
1881,The Shawshank Redemption,8.396052,8205
3337,The Godfather,8.263591,5893
662,Fight Club,8.216455,9413
3232,Pulp Fiction,8.207102,8428
65,The Dark Knight,8.13693,12002
1818,Schindler's List,8.126069,4329
3865,Whiplash,8.123248,4254
809,Forrest Gump,8.105954,7927
2294,Spirited Away,8.105867,3840
2731,The Godfather: Part II,8.079586,3338


In [54]:
def find_sim_movie(df, sorted_ind, title_name, top_n=10):
  """Return the top_n rows most similar to `title_name`, ranked by weighted_vote.

  Args:
    df: DataFrame with at least 'title' and 'weighted_vote' columns. Row i of
      `df` (by position) must correspond to row i of `sorted_ind`.
    sorted_ind: 2-D integer array; each row holds row positions of `df`,
      ordered from most to least similar to that row's movie.
    title_name: title of the reference movie.
    top_n: number of recommendations to return.

  Returns:
    Up to top_n rows of `df`, sorted by 'weighted_vote' descending. The
    reference movie (every row carrying `title_name`) is excluded. An empty
    frame is returned when the title is not present.
  """
  # Work with row positions rather than index labels, so the lookup stays
  # valid for frames whose index is not the default 0..n-1 range.
  title_pos = np.flatnonzero((df['title'] == title_name).to_numpy())

  # Take the 2 * top_n most similar candidates for each matching row and
  # flatten them into one 1-D array.
  candidates = sorted_ind[title_pos, :(top_n*2)].reshape(-1)

  # Drop the reference movie(s). np.isin also handles titles shared by several
  # rows, where a plain `!=` comparison against the position array fails.
  candidates = candidates[~np.isin(candidates, title_pos)]

  # A candidate may appear once per matching row; keep the first occurrence.
  candidates = pd.unique(candidates)

  # Among the candidates, keep the top_n with the highest weighted_vote.
  return df.iloc[candidates].sort_values('weighted_vote', ascending=False)[:top_n]

# Recommend 10 movies similar in genre to 'No Country for Old Men'.
similar_movies = find_sim_movie(
    movies_df,
    genre_sim_sorted_ind,
    'No Country for Old Men',
    top_n=10,
)
similar_movies.loc[:, ['title', 'vote_count', 'weighted_vote']]

Unnamed: 0,title,vote_count,weighted_vote
2294,Spirited Away,3840,8.105867
2285,Back to the Future,6079,7.890486
0,Avatar,11800,7.166301
2315,Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan,1579,6.422544
2299,Leap Year,465,6.263556
2312,Neighbors,2713,6.187053
2326,Diary of a Wimpy Kid: Rodrick Rules,238,6.173499
2293,The Crazies,633,6.160209
2316,Legally Blonde,882,6.097686
2281,A Cinderella Story,713,6.097325
