필요 모듈 import

In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel


데이터 살펴보기

In [44]:
data = pd.read_csv('./movie_data/movies_metadata.csv', low_memory=False)
data.head(2)


Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


데이터 전처리 

In [4]:
n_sample = 20000
# overview 컬럼이 na라면 공백으로 바꿔줌
data['overview'] = data['overview'].fillna('') 
# row를 섞어줌
data = data.sample(frac=1).reset_index(drop=True)
# n_sample 개의 데이터만 사용
data = data.iloc[:n_sample, :] 
print(data.shape)

(20000, 24)


overview(줄거리)에 있는 단어로 tf-idf 행렬 계산

In [5]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(data['overview'])
print(tfidf_matrix.shape)

(20000, 50335)


각 영화 끼리의 코사인 유사도 구하기

In [32]:
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)
# 입력으로 넣은 title의 영화를 다시 추천하는 것을 방지하기 위해 대각값을 0으로 바꿈.
np.fill_diagonal(cosine_sim, 0, wrap=True)

영화 인덱스 사전 만들기

In [36]:
title_to_int = {title: i for i, title in enumerate(data['title'])}
int_to_title = {i: title  for i, title in enumerate(data['title'])}

# 어떤 영화들이 있는지 확인
print(list(title_to_int.keys())[:50])

['The Dawns Here Are Quiet', 'Maelström', 'For the Boys', 'Umberto D.', 'American Pie Presents: Band Camp', 'Rockabilly Vampire', 'Soul of the Game', 'Black Friday', 'The Gendarme Takes Off', 'Erik the Viking', 'The Peanuts Movie', 'Divergence', 'Going in Style', 'All About Love', 'Belle Starr', 'Seeking Asylum', '2 Days in New York', 'Regeneration', 'Zameen', 'Phenomenon', 'Ping Pong', "Dracula's Daughter", 'Dracula II: Ascension', 'The Secret Life of Girls', 'Faces of Death III', 'Ultramarines: A Warhammer 40,000 Movie', 'Return from Witch Mountain', 'Killer Condom', 'Buster', 'Gilda', 'Repentance', 'Wife! Be Like a Rose!', 'The Goonies', 'Doctor Who: The Husbands of River Song', 'The Iceman and the Psychiatrist', 'Mall', 'Vamp', 'Calling Dr. Death', 'Private Worlds', 'Gregoire Moulin vs. Humanity', 'Face to Face', 'Gung Ho', 'Classmates', 'Pina', 'A Gathering of Eagles', 'New Wave', 'Buying the Cow', 'Jekyll and Hyde ... Together Again', 'The Act in Question', 'Time and Tide']


가장 비슷한 영화 n개 추천

In [42]:
def recommend_n_movie(title, n ,cosine_sim, title_to_int, int_to_title):
  recommendtations = []
  movie_idx = title_to_int.get(title, False)

  # 검색하고자 하는 영화가 목록에 없다면.
  if movie_idx == False:
    # 목록에 없다는 메시지를 반환함.
    return f"{title} is not in our movie list"

  row = cosine_sim[movie_idx, :]
  top_n_movie_idx = list(row.argsort()[-n:][::-1])

  for idx in top_n_movie_idx:
    movie_title = int_to_title[idx]
    recommendtations.append(movie_title)
  
  return recommendtations

영화 추천 받기

In [49]:
movies = recommend_n_movie("Black Friday", 5, cosine_sim, title_to_int, int_to_title)
print(movies)

movies = recommend_n_movie("Titanic", 5, cosine_sim, title_to_int, int_to_title)
print(movies)

movies = recommend_n_movie("Scent of a Woman", 5, cosine_sim, title_to_int, int_to_title)
print(movies)

movies = recommend_n_movie("Reservoir Dogs", 5, cosine_sim, title_to_int, int_to_title)
print(movies)

["Please Don't Eat the Daisies", 'Salaam Bombay!', 'Four Mothers', 'Anuvahood', 'Trade Winds']
Titanic is not in our movie list
['Fausto 5.0', 'À la mode', 'Three Brothers', 'The Bastards', 'Forbidden Zone']
['Crows and Sparrows', 'Ten Little Indians', 'Pride and Prejudice', 'The Karate Kid, Part II', "Nativity 3: Dude, Where's My Donkey?!"]
