# 데이터 전처리

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from ast import literal_eval
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Load the raw movie metadata. low_memory=False makes pandas infer each
# column's dtype from the whole file in one pass, which avoids the
# mixed-type DtypeWarning this CSV otherwise triggers.
data = pd.read_csv('./movie_data/movies_metadata.csv', low_memory=False)

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [4]:
data.shape

(45466, 24)

In [5]:
# Narrow the frame to the seven columns kept for the rest of the notebook.
data = data.loc[:, [
    'id', 'genres', 'vote_average', 'vote_count',
    'popularity', 'title', 'overview',
]]

In [6]:
# Candidate vote-count cutoff: the 0.989 quantile of vote_count.
tmp_m = data['vote_count'].quantile(q=0.989)
tmp_m

2048.8530000000246

In [7]:
# Trial run: check how many movies meet the candidate cutoff.
tmp_data = data.copy().loc[data['vote_count'].ge(tmp_m)]
tmp_data.shape

(501, 7)

In [8]:
# The trial frame was only needed to inspect how many rows survive the cutoff.
del tmp_data

# m: minimum number of votes a movie needs to be kept (0.989 quantile).
m = data['vote_count'].quantile(0.989)
# .copy() yields an independent frame, so the column assignments made later
# (e.g. data['score'] = ...) do not operate on a slice of the original frame
# and do not raise pandas' SettingWithCopyWarning.
data = data.loc[data['vote_count'] >= m].copy()

In [9]:
data.head()

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,21.9469,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.0155,Jumanji,When siblings Judy and Peter discover an encha...
31,63,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",7.4,2470.0,12.2973,Twelve Monkeys,"In the year 2035, convict James Cole reluctant..."
46,807,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",8.1,5915.0,18.4574,Se7en,Two homicide detectives are on a desperate hun...
49,629,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",8.1,3334.0,16.3025,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin..."


In [10]:
# C: mean vote_average over the movies remaining in `data`.
C = data.loc[:, 'vote_average'].mean()

In [11]:
# Show the mean rating (C) and the vote-count cutoff (m), one per line.
print(C, m, sep='\n')

6.980439121756488
2048.8530000000246


In [12]:
def weighted_rating(x, m=m, C=C):
    """Blend an item's own rating with the overall mean rating.

    Args:
        x: Object indexable by ``'vote_count'`` and ``'vote_average'``
            (for example one DataFrame row).
        m: Vote-count weight given to the mean rating ``C``; defaults to the
            module-level ``m`` as it was when this function was defined.
        C: Mean rating to shrink towards; defaults to the module-level ``C``
            as it was when this function was defined.

    Returns:
        ``v / (v + m) * R + m / (v + m) * C`` where ``v`` is the vote count
        and ``R`` the vote average of ``x``.
    """
    votes = x['vote_count']
    rating = x['vote_average']
    total = votes + m

    # The more votes an item has relative to m, the more its own rating counts.
    return (votes / total * rating) + (m / total * C)

In [13]:
# weighted_rating only performs element-wise arithmetic on the 'vote_count'
# and 'vote_average' columns, so it can be evaluated once on the whole frame
# instead of once per row through DataFrame.apply(axis=1).
data['score'] = weighted_rating(data)

In [14]:
data.head(5)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,score
0,862,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",7.7,5415.0,21.9469,Toy Story,"Led by Woody, Andy's toys live happily in his ...",7.502478
1,8844,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",6.9,2413.0,17.0155,Jumanji,When siblings Judy and Peter discover an encha...,6.936937
31,63,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",7.4,2470.0,12.2973,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",7.209771
46,807,"[{'id': 80, 'name': 'Crime'}, {'id': 9648, 'na...",8.1,5915.0,18.4574,Se7en,Two homicide detectives are on a desperate hun...,7.811972
49,629,"[{'id': 18, 'name': 'Drama'}, {'id': 80, 'name...",8.1,3334.0,16.3025,The Usual Suspects,"Held in an L.A. interrogation room, Verbal Kin...",7.673866


In [15]:
data.shape

(501, 8)

In [16]:
# Turn each genres cell from its string representation into a Python object.
data['genres'] = data['genres'].map(literal_eval)

In [17]:
data[['genres']].head(2)

Unnamed: 0,genres
0,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '..."
1,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '..."


In [18]:
# Collapse each list of genre dicts into one space-separated string of names.
data['genres'] = data['genres'].apply(
    lambda entries: " ".join(entry['name'] for entry in entries)
)

In [19]:
data.head(2)

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,score
0,862,Animation Comedy Family,7.7,5415.0,21.9469,Toy Story,"Led by Woody, Andy's toys live happily in his ...",7.502478
1,8844,Adventure Fantasy Family,6.9,2413.0,17.0155,Jumanji,When siblings Judy and Peter discover an encha...,6.936937


In [20]:
# Persist the preprocessed frame without writing the index as a column.
data.to_csv(path_or_buf='./movie_data/pre_movies_metadata.csv', index=False)

# 콘텐츠 기반 필터링 추천 (Content-based filtering)

In [21]:
# Reload the preprocessed movies written by the preprocessing section.
data = pd.read_csv(filepath_or_buffer='./movie_data/pre_movies_metadata.csv')

In [22]:
data.genres.head(2)

0     Animation Comedy Family
1    Adventure Fantasy Family
Name: genres, dtype: object

In [23]:
# Token-count vectorizer covering unigrams, bigrams and trigrams.
count_vector = CountVectorizer(
    ngram_range=(1, 3),
)

In [24]:
# Learn the n-gram vocabulary from the genre strings and encode every movie.
c_vector_genres = count_vector.fit_transform(data.genres)

In [25]:
c_vector_genres.shape

(501, 375)

In [26]:
# Precompute the pairwise cosine similarity between all genre vectors and
# keep, for every row, the column indices ordered from most to least similar.
genre_c_sim = np.argsort(
    cosine_similarity(c_vector_genres, c_vector_genres)
)[:, ::-1]

In [27]:
genre_c_sim.shape

(501, 501)

In [28]:
def get_recommend_movie_list(df, movie_title, top=30, *, sim_matrix=None,
                             n_results=10):
    """Recommend movies whose genres are most similar to ``movie_title``.

    Args:
        df: Movie frame with at least ``title`` and ``score`` columns. Its
            row order must match the rows of ``sim_matrix``.
        movie_title: Title to look up; rows are matched by exact equality.
        top: Number of most-similar candidates taken per matching row
            before ranking by score.
        sim_matrix: 2-D integer array in which row ``i`` lists row positions
            of ``df`` ordered from most to least similar to row ``i``.
            Defaults to the module-level ``genre_c_sim``.
        n_results: Maximum number of recommendations returned.

    Returns:
        A DataFrame with at most ``n_results`` rows, sorted by ``score`` in
        descending order. It is empty when no row has the given title.
    """
    if sim_matrix is None:
        sim_matrix = genre_c_sim
    sim_matrix = np.asarray(sim_matrix)

    # Locate the target movie(s) by row *position*, not index label, so the
    # lookup into sim_matrix and df.iloc stays correct for any df index.
    target_pos = np.flatnonzero((df['title'] == movie_title).to_numpy())

    # Take the most similar candidates for every matching row.
    candidates = sim_matrix[target_pos, :top].reshape(-1)

    # Drop the target movie(s) themselves; np.isin also handles the case of
    # several rows sharing the same title.
    candidates = candidates[~np.isin(candidates, target_pos)]

    # A candidate can appear once per matching row; keep the first occurrence.
    candidates = pd.unique(candidates)

    # Rank the candidates by score and return the best n_results.
    ranked = df.iloc[candidates].sort_values('score', ascending=False)
    return ranked.iloc[:n_results]

In [43]:
# The lookup compares titles for exact equality; the empty result below
# indicates that no row is titled exactly 'Harry Potter'.
get_recommend_movie_list(data, movie_title='Harry Potter')

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,score


In [35]:
data[data['title'] == 'The Dark Knight Rises']

Unnamed: 0,id,genres,vote_average,vote_count,popularity,title,overview,score
284,49026,Action Crime Drama Thriller,7.6,9263.0,20.58258,The Dark Knight Rises,Following the death of District Attorney Harve...,7.487782
