In [60]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [61]:
movies_5000 = pd.read_csv("../Data/tmdb_5000_movies.csv")

In [62]:
import ast
def value_import(x):
    return [i['name'] for i in ast.literal_eval(x)]
movies_5000['genres'] = movies_5000['genres'].apply(value_import)
movies_5000['keywords'] = movies_5000['keywords'].apply(value_import)

In [63]:
movies_5000.head(3)

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...","[{""iso_3166_1"": ""US"", ""name"": ""United States o...",2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...","[{""iso_3166_1"": ""GB"", ""name"": ""United Kingdom""...",2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466


In [64]:
def join_words(x):
    return (' ').join(x) 

movies_5000['genres_literal'] = movies_5000['genres'].apply(join_words)

In [65]:
movies_5000['genres_literal']

0       Action Adventure Fantasy Science Fiction
1                       Adventure Fantasy Action
2                         Action Adventure Crime
3                    Action Crime Drama Thriller
4               Action Adventure Science Fiction
                          ...                   
4798                       Action Crime Thriller
4799                              Comedy Romance
4800               Comedy Drama Romance TV Movie
4801                                            
4802                                 Documentary
Name: genres_literal, Length: 4803, dtype: object

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# CountVectorizer를 적용하기 위해 공백문자로 word 단위가 구분되는 문자열로 변환
movies_5000['genres_literal'] = movies_5000['genres'].apply(lambda x : (' ').join(x))
count_vect = CountVectorizer(min_df=0.0, ngram_range = (1,2))
genre_mat = count_vect.fit_transform(movies_5000['genres_literal'])
print(genre_mat.shape)
# bigram으로 피처 수 276개로 증가

(4803, 276)


In [67]:
from sklearn.metrics.pairwise import cosine_similarity

# 영화 간 장르 유사도를 코사인 유사도로 계산
genre_sim = cosine_similarity(genre_mat, genre_mat)
print(genre_sim.shape)
print(genre_sim[:2])


(4803, 4803)
[[1.         0.59628479 0.4472136  ... 0.         0.         0.        ]
 [0.59628479 1.         0.4        ... 0.         0.         0.        ]]


In [68]:
# 첫번째 영화와 유사도가 높은 영화 순서
genre_sim_sorted_ind = genre_sim.argsort()[:, :-1]
print(genre_sim_sorted_ind[1:2])

[[2401 3067 3069 ...  199  129    1]]


class sklearn.neighbors.NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)

In [69]:
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(n_neighbors=20,algorithm='brute',metric='cosine', n_jobs= -1)
model_knn.fit(genre_mat)

In [72]:
distances, indices_result = model_knn.kneighbors(genre_mat[0],n_neighbors=20)

In [73]:
distances

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.09546597, 0.09546597, 0.1180829 , 0.1180829 ,
        0.19596975, 0.19596975, 0.19596975, 0.19596975, 0.19596975,
        0.22222222, 0.22222222, 0.22222222, 0.24407105, 0.24407105]])

In [74]:
indices_result

array([[3494,   46,    0,  813,   14,  870, 1296, 1652,  419,  420, 3208,
          72,  238, 1932, 1191,   10,  232,   61,  242, 2995]])

In [75]:
indices = pd.Series(movies_5000.index, index = movies_5000['original_title'])

In [76]:
indices

original_title
Avatar                                         0
Pirates of the Caribbean: At World's End       1
Spectre                                        2
The Dark Knight Rises                          3
John Carter                                    4
                                            ... 
El Mariachi                                 4798
Newlyweds                                   4799
Signed, Sealed, Delivered                   4800
Shanghai Calling                            4801
My Date with Drew                           4802
Length: 4803, dtype: int64

In [94]:
def get_recommandations(title, cosine_sim = genre_sim) :
    idx = indices['original_title']
    result = list(enumerate(genre_sim[idx]))
    result = sorted(result, key = lambda x:x[1], reverse = True)
    # print(result)
    result = result[1:11]
    print(result)
    movies_indices = [i[0] for i in result] # list를 뽑기 위해서 영화 list만 뽑아냈다. 
    movies_indices
    return movies_5000['original_title'].iloc[movies_indices]

In [95]:
result = get_recommandations('John Carter')
print(result)

KeyError: 'original_title'

In [101]:
C = movies_5000['vote_average'].mean()
# 투표횟수 중 60%이상의 횟수에 달하는 숫자
# 예를들어 총 투표횟수가 100과 1일때 m값은 매우 달라진다.
m = movies_5000['vote_count'].quantile(0.6)
def weighted_vote_average(record):
    v = record['vote_count']
    R = record['vote_average']
    
    return ( (v/(v+m)) * R) + ( (m/(m+v)) * C)
    
movies_5000['weighted_vote'] = movies_5000.apply(weighted_vote_average, axis=1)

In [100]:
movies_5000.head()

Unnamed: 0,budget,genres,homepage,id,keywords,original_language,original_title,overview,popularity,production_companies,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,vote_average,vote_count,genres_literal
0,237000000,"[Action, Adventure, Fantasy, Science Fiction]",http://www.avatarmovie.com/,19995,"[culture clash, future, space war, space colon...",en,Avatar,"In the 22nd century, a paraplegic Marine is di...",150.437577,"[{""name"": ""Ingenious Film Partners"", ""id"": 289...",...,2009-12-10,2787965087,162.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}, {""iso...",Released,Enter the World of Pandora.,Avatar,7.2,11800,Action Adventure Fantasy Science Fiction
1,300000000,"[Adventure, Fantasy, Action]",http://disney.go.com/disneypictures/pirates/,285,"[ocean, drug abuse, exotic island, east india ...",en,Pirates of the Caribbean: At World's End,"Captain Barbossa, long believed to be dead, ha...",139.082615,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}, {""...",...,2007-05-19,961000000,169.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"At the end of the world, the adventure begins.",Pirates of the Caribbean: At World's End,6.9,4500,Adventure Fantasy Action
2,245000000,"[Action, Adventure, Crime]",http://www.sonypictures.com/movies/spectre/,206647,"[spy, based on novel, secret agent, sequel, mi...",en,Spectre,A cryptic message from Bond’s past sends him o...,107.376788,"[{""name"": ""Columbia Pictures"", ""id"": 5}, {""nam...",...,2015-10-26,880674609,148.0,"[{""iso_639_1"": ""fr"", ""name"": ""Fran\u00e7ais""},...",Released,A Plan No One Escapes,Spectre,6.3,4466,Action Adventure Crime
3,250000000,"[Action, Crime, Drama, Thriller]",http://www.thedarkknightrises.com/,49026,"[dc comics, crime fighter, terrorist, secret i...",en,The Dark Knight Rises,Following the death of District Attorney Harve...,112.31295,"[{""name"": ""Legendary Pictures"", ""id"": 923}, {""...",...,2012-07-16,1084939099,165.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,The Legend Ends,The Dark Knight Rises,7.6,9106,Action Crime Drama Thriller
4,260000000,"[Action, Adventure, Science Fiction]",http://movies.disney.com/john-carter,49529,"[based on novel, mars, medallion, space travel...",en,John Carter,"John Carter is a war-weary, former military ca...",43.926995,"[{""name"": ""Walt Disney Pictures"", ""id"": 2}]",...,2012-03-07,284139100,132.0,"[{""iso_639_1"": ""en"", ""name"": ""English""}]",Released,"Lost in our world, found in another.",John Carter,6.1,2124,Action Adventure Science Fiction


#### movies['soup']을 활용한 콘텐츠 기반 추천 시스템

In [102]:
movies = pd.read_csv("../Data/tmdb.csv", index = 0)

TypeError: read_csv() got an unexpected keyword argument 'index'

In [103]:
movies['soup']

KeyError: 'soup'

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer(min_df=0.0,n_gram_range = (1,2))
soup_mat = count_vect.fit_transform(movies['soup'])
print(soup_mat.shape)

In [None]:
# class sklearn.neighbors.NearestNeighbors(*, n_neighbors=5, radius=1.0, algorithm='auto', leaf_size=30, metric='minkowski', p=2, metric_params=None, n_jobs=None)
from sklearn.neighbors import NearestNeighbors

model_knn = NearestNeighbors(n_neighbors=20 , algorithm='auto', metric='cosine' , n_jobs=-1)
model_knn.fit(soup_mat)

In [None]:
distances , indices_result = model_knn.kneighbors(soup_mat[0], n_neighbors=11)

In [None]:
distances

In [None]:
indices = pd.Series(movies.index , index=movies['original_title'])
idx = indices['John Carter']
idx

In [None]:
def get_recommandations(title, model = model_knn) :
    idx = indices['title']
    distances , indices_result = model_knn.kneighbors(soup_mat[idx], n_neighbors=11)
    result = indices_result[0][1:]
    similarity = list(1-np.array(distances))[0]
    return (movies['original_title'].iloc[result], similarity)

In [None]:
title_series, sim = get_recommandations('John Carter')
for i in range(len(title_series)):
    print(f'{i+1}번째 추천영화는 \n')
    print(f'영화 {title_series[i]}이고 유사도는 {sim[0][i]}')
    print(f'=============================\n')

Item Based Collaborative Filtering

In [104]:
rating_data = pd.read_csv("../Data/ratings_small.csv")
movie_data = pd.read_csv("../Data/movies_small.csv")

In [106]:
movie_data.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [109]:
rating_data.head(3)

Unnamed: 0,userId,movieId,rating
0,1,1,4.0
1,1,3,4.0
2,1,6,4.0


In [108]:
rating_data.drop('timestamp',axis=1, inplace = True)

In [111]:
user_movie_rating_df = pd.merge(rating_data, movie_data, on='movieId')

In [112]:
user_movie_rating_df.head(3)

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [114]:
# 피벗테이블 만들기
# 인덱스 : title, 컬럼 : userId, value = rating
movie_user_rating = user_movie_rating_df.pivot_table('rating',index = ['title'], columns=['userId'])
user_movie_rating = user_movie_rating_df.pivot_table('rating',index = ['serId'], columns=['title'])

KeyError: 'serId'

In [115]:
movie_user_rating.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9719 entries, '71 (2014) to À nous la liberté (Freedom for Us) (1931)
Columns: 610 entries, 1 to 610
dtypes: float64(610)
memory usage: 45.3+ MB


In [116]:
movie_user_rating.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),,,,,,,,,,,...,,,,,,,,,,4.0
'Hellboy': The Seeds of Creation (2004),,,,,,,,,,,...,,,,,,,,,,
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot (2004),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,


In [117]:
movie_user_rating.fillna(0,inplace = True)

In [118]:
movie_user_rating.head()

userId,1,2,3,4,5,6,7,8,9,10,...,601,602,603,604,605,606,607,608,609,610
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71 (2014),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0
'Hellboy': The Seeds of Creation (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Round Midnight (1986),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Salem's Lot (2004),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
'Til There Was You (1997),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


* 협업 필터링(collaborative filtering) 방식의 추천 시스템은 많은 장점을 가지고 있지만, 동시에 여러 난제도 존재합니다. 주요 난제들은 다음과 같습니다:

* 콜드 스타트 문제 (Cold Start Problem):
    - 사용자 콜드 스타트: 새로운 사용자가 시스템에 가입했을 때, 그 사용자의 과거 데이터가 없기 때문에 그에 대한 추천을 생성하기 어렵습니다.
    - 아이템 콜드 스타트: 새로운 아이템이 시스템에 추가되었을 때, 아이템에 대한 초기 평점이나 피드백이 없기 때문에 해당 아이템을 어떤 사용자에게 추천해야 할지 판단하기 어렵습니다.

In [119]:
from scipy.sparse import csr_matrix
movie_user_rating_matrix = csr_matrix(movie_user_rating.values)

In [None]:
from sklearn.neighbors import NearestNeighbors
