In [3]:
import pandas as pd
import numpy as np
import os
from ast import literal_eval

In [31]:
# Notebook display settings: show up to 20 columns and let a printed row be
# up to 1000 characters wide.
pd.set_option('display.width', 1000)
pd.set_option('display.max_columns', 20)


def _names_as_text(raw_cell):
    """Convert one raw CSV cell into a space-separated string of names.

    The cell is the text of a Python literal shaped like [{...}, {...}, ...].
    It is parsed with literal_eval, the 'name' value of every dict is
    collected, and the names are joined with single spaces:
    [ {}, {}, {} ] -> [name, name, name] -> "name name name".
    (An explicit nested for-loop over the rows would produce the same result.)
    """
    parsed_entries = literal_eval(raw_cell)
    return ' '.join([entry['name'] for entry in parsed_entries])


# Load the dataset and keep only the three columns used in this notebook.
movies = pd.read_csv('tmdb_5000_movies.csv').loc[:, ['title', 'genres', 'keywords']]

# Flatten both list-of-dict columns into plain text.
for column in ('genres', 'keywords'):
    movies[column] = movies[column].apply(_names_as_text)

print(movies)

                                         title                                    genres                                           keywords
0                                       Avatar  Action Adventure Fantasy Science Fiction  culture clash future space war space colony so...
1     Pirates of the Caribbean: At World's End                  Adventure Fantasy Action  ocean drug abuse exotic island east india trad...
2                                      Spectre                    Action Adventure Crime  spy based on novel secret agent sequel mi6 bri...
3                        The Dark Knight Rises               Action Crime Drama Thriller  dc comics crime fighter terrorist secret ident...
4                                  John Carter          Action Adventure Science Fiction  based on novel mars medallion space travel pri...
...                                        ...                                       ...                                                ...
4798                

In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer

# ngram_range=(1, 2): use single words and pairs of consecutive words as features.
tfidf_vec = TfidfVectorizer(ngram_range=(1, 2))

# Each movie's space-separated genre string is one document.
genre_documents = movies['genres']
tfidf_matrix = tfidf_vec.fit_transform(genre_documents)

# Learned vocabulary as (term, column index) pairs.
vocabulary_entries = tfidf_vec.vocabulary_.items()
print(vocabulary_entries)

# Author's note: the shape is (4803, 276) -- 4803 movies, 276 genre terms
# (unigrams and bigrams together), i.e. every movie is represented as a
# 276-column vector.
matrix_shape = tfidf_matrix.shape
print(matrix_shape)

dict_items([('action', 0), ('adventure', 16), ('fantasy', 124), ('science', 232), ('fiction', 138), ('action adventure', 1), ('adventure fantasy', 24), ('fantasy science', 135), ('science fiction', 233), ('fantasy action', 125), ('crime', 64), ('adventure crime', 20), ('drama', 90), ('thriller', 234), ('action crime', 4), ('crime drama', 68), ('drama thriller', 106), ('adventure science', 29), ('animation', 33), ('family', 109), ('animation family', 38), ('fantasy family', 130), ('action science', 12), ('adventure action', 17), ('action thriller', 13), ('thriller crime', 238), ('western', 265), ('adventure western', 32), ('adventure family', 23), ('family fantasy', 115), ('fiction action', 139), ('action fantasy', 7), ('comedy', 44), ('action comedy', 3), ('comedy science', 59), ('adventure drama', 22), ('drama action', 91), ('romance', 214), ('drama romance', 104), ('romance thriller', 228), ('thriller action', 235), ('fiction thriller', 150), ('adventure thriller', 30), ('fantasy adv

In [36]:
from sklearn.metrics.pairwise import cosine_similarity

# Similarity matrix, (4803, 4803) per the author's note:
#   row 1    -> similarity of movie 1 to movies 1..4803
#   row 2    -> similarity of movie 2 to movies 1..4803
#   ...
#   row 4803 -> similarity of movie 4803 to movies 1..4803
# Every movie is compared against every movie (itself included).
genres_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)
print(genres_similarity)

# To look up the titles of the most similar movies we need their indices,
# ordered from highest to lowest similarity. argsort sorts ascending, so the
# scores are negated first; sorting runs along each row.
descending_key = np.negative(genres_similarity)
similar_index = np.argsort(descending_key, axis=-1)
print(similar_index)

[[1.         0.49309367 0.29270708 ... 0.         0.         0.        ]
 [0.49309367 1.         0.17786505 ... 0.         0.         0.        ]
 [0.29270708 0.17786505 1.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 1.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         1.        ]]
[[   0  870 3494 ... 2355 2397 4802]
 [ 329  379  199 ... 2323 2325 4802]
 [1740    2 1542 ... 2359 2344 4802]
 ...
 [4800 3809 3285 ... 2085 2142 4802]
 [   0 3205 3204 ... 1596 1594 4802]
 [4802 4593 4583 ... 1597 1595 2401]]


In [37]:
# Recommend movies for a title typed by the user:
#   1. find the row position(s) of the movie the user entered,
#   2. read the most similar row positions recorded in similar_index,
#   3. use those positions to look up the movie rows (and thus titles).
TOP_N = 10  # how many neighbours to show per matching movie

input_movie = input()

# Try an exact comparison first so titles that matched before still match the
# same rows. Only when nothing matches, fall back to a comparison that ignores
# letter case and surrounding whitespace.
title_matches = movies['title'] == input_movie
if not title_matches.any():
    normalized_titles = movies['title'].str.strip().str.lower()
    title_matches = normalized_titles == input_movie.strip().lower()

# similar_index and .iloc are both addressed by row position, so take the
# positions of the matching rows rather than their index labels (the two only
# coincide while `movies` keeps its default 0..n-1 index).
movie_index = np.flatnonzero(title_matches.values)
print(movie_index)

# One row of neighbours per matching movie; more than one row is possible
# when several movies share the same title.
similar_movies = similar_index[movie_index, :TOP_N]
# Positional indexing below needs a flat, one-dimensional array.
similar_movies_index = similar_movies.reshape(-1)

if movie_index.size == 0:
    # Say why there is no result instead of printing an empty frame.
    print('No movie titled {!r} was found in the dataset.'.format(input_movie))
else:
    print(similar_movies)
    print(similar_movies_index)
    print(movies.iloc[similar_movies_index])

The Dark Knight
[65]
[[  65  613  830 2229 2074 1488 3591 2111  739  468]]
[  65  613  830 2229 2074 1488 3591 2111  739  468]
                      title                       genres                                           keywords
65          The Dark Knight  Drama Action Crime Thriller  dc comics crime fighter secret identity scarec...
613               The Score        Action Crime Thriller  quebec jewel scepter customs house jewelry hei...
830       Kill Bill: Vol. 2        Action Crime Thriller  brother brother relationship swordplay katana ...
2229          Machete Kills        Action Crime Thriller  mexico white house nuclear missile machete out...
2074        The Transporter        Action Crime Thriller  car journey transportation auto human trafficking
1488                   Safe        Action Crime Thriller                                     broken trachea
3591             Nighthawks        Action Crime Thriller                                                   
2111  The