##영화 줄거리 overview 를 사용하여  여러 거리 함수를 통한 추천 시스템 만들기 

In [108]:
# install and import the libraries needed
# this script is heavily inspired by Wikidocs
!pip3 install scikit-learn
!pip3 install gdown
from typing import Tuple, List
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
import os
import gdown # for downloading the data stored in google drive



In [109]:
# --- constants --- #
MOVIES_METADATA_CSV_URL = "https://drive.google.com/u/0/uc?id=15I-izNgUJqybrPLK3qlZhLMCuuNcaJhJ&export=download"
MOVIES_METADATA_CSV = "./movies_metadata.csv"  # save the data in the current directory
DOC_SIZE = 2000  # choose the size of the data here (너무 크면 시간이 오래 걸릴 수도 있음!)
TOP_N = 30

In [110]:
# download the data
gdown.download(url=MOVIES_METADATA_CSV_URL, output=MOVIES_METADATA_CSV, quiet=False)

Downloading...
From: https://drive.google.com/u/0/uc?id=15I-izNgUJqybrPLK3qlZhLMCuuNcaJhJ&export=download
To: /content/movies_metadata.csv
34.4MB [00:00, 132MB/s]


'./movies_metadata.csv'

In [111]:
# have a look at the data
movies_df = pd.read_csv(MOVIES_METADATA_CSV, low_memory=False)
movies_df.head(3)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,popularity,poster_path,production_companies,production_countries,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",21.946943,/rhIRbceoE9lR4veEXuwCC2wARtG.jpg,"[{'name': 'Pixar Animation Studios', 'id': 3}]","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,17.015539,/vzmL6fP7aPKNKPRTFnZmiUfciyV.jpg,"[{'name': 'TriStar Pictures', 'id': 559}, {'na...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,11.7129,/6ksm1sjKMFLbO7UY2i6G1ju9SML.jpg,"[{'name': 'Warner Bros.', 'id': 6194}, {'name'...","[{'iso_3166_1': 'US', 'name': 'United States o...",1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0


In [112]:
# resize the dataframe
movies_df = movies_df.head(DOC_SIZE)
# we don't want any null values. So replace them with an empty string.
movies_df['overview'] = movies_df['overview'].fillna('')
display(movies_df.overview)

0       Led by Woody, Andy's toys live happily in his ...
1       When siblings Judy and Peter discover an encha...
2       A family wedding reignites the ancient feud be...
3       Cheated on, mistreated and stepped on, the wom...
4       Just when George Banks has recovered from his ...
                              ...                        
1995    Coming-of-age adventure about two teenage brot...
1996    As Kevin Flynn searches for proof that he inve...
1997    The story of a close-knit group of young kids ...
1998    Two decades after surviving a massacre on Octo...
1999    With the help of a talking freeway billboard, ...
Name: overview, Length: 2000, dtype: object

In [113]:
### TODO 1 #### 
# use CountVectorizer & TfidfVectorizer to construct dtm & dtm_tfidf
count_vec = CountVectorizer(binary = False) #bianry True 면 0 or 1 만 나옴
C = count_vec.fit_transform(movies_df.overview)

#dtm: np.ndarray = np.ndarray(m.toarray())
dtm = C.toarray()
print(dtm)
print('-------------------------------------------------------------------------------------')
#dtm_tfidf: np.ndarray = np.ndarray()  # from csr_matrix to numpy array.  # 이제는 이거 한줄로 끝내기
tf_idf_vec = TfidfVectorizer()
T = tf_idf_vec.fit_transform(movies_df.overview)
dtm_tfidf = T.toarray()

print(dtm_tfidf)
###############
 # should be csr sparse matrix
print(dtm.shape)  # (num_docs, num_terms) 당연히.. 대부분은 0 이겠지! - 다시한번 sparsity를 확인할 수 있다.
#print(dtm_tfidf)  # should be csr sparse matrix
print(dtm_tfidf.shape)  # (num_docs, num_terms) 당연히.. 대부분은 0 이겠지! - 다시한번 sparsity를 확인할 수 있다.

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
-------------------------------------------------------------------------------------
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(2000, 14195)
(2000, 14195)


In [114]:
# cosine distance -> this is vectorized...
sims_cosine = cosine_similarity(dtm)
sims_cosine_tfidf = cosine_similarity(dtm_tfidf)  # this may take a while... 
#print(sims_cosine.shape)  # (num_doc, num_doc)
#print(sims_cosine_tfidf.shape)  # (num_doc, num_doc)
print(sims_cosine) #dtm 으로 만든 cosine 유사도 
print(sims_cosine_tfidf)


[[1.         0.18553901 0.19482163 ... 0.24333213 0.11068177 0.20292693]
 [0.18553901 1.         0.24419315 ... 0.3158899  0.11560887 0.26413527]
 [0.19482163 0.24419315 1.         ... 0.32025631 0.14567141 0.23112508]
 ...
 [0.24333213 0.3158899  0.32025631 ... 1.         0.21659944 0.48112522]
 [0.11068177 0.11560887 0.14567141 ... 0.21659944 1.         0.23343366]
 [0.20292693 0.26413527 0.23112508 ... 0.48112522 0.23343366 1.        ]]
[[1.         0.03454759 0.01481731 ... 0.01949542 0.0097847  0.03464374]
 [0.03454759 1.         0.05823795 ... 0.03201838 0.01484567 0.04089162]
 [0.01481731 0.05823795 1.         ... 0.02758326 0.02423444 0.02401523]
 ...
 [0.01949542 0.03201838 0.02758326 ... 1.         0.02917474 0.05349839]
 [0.0097847  0.01484567 0.02423444 ... 0.02917474 1.         0.0244971 ]
 [0.03464374 0.04089162 0.02401523 ... 0.05349839 0.0244971  1.        ]]


In [115]:
# Note that these are distances, now
dists_manhattan_tfidf = manhattan_distances(dtm_tfidf)
print(dists_manhattan_tfidf)  # (num_doc, num_doc)

[[ 0.         10.97613614 11.03551801 ... 10.44997268 11.63298859
   9.61092773]
 [10.97613614  0.         12.05549179 ... 11.7412071  13.30529499
  10.90461447]
 [11.03551801 12.05549179  0.         ... 11.59379479 12.88278095
  10.85146956]
 ...
 [10.44997268 11.7412071  11.59379479 ...  0.         12.13311521
  10.00174599]
 [11.63298859 13.30529499 12.88278095 ... 12.13311521  0.
  11.5360165 ]
 [ 9.61092773 10.90461447 10.85146956 ... 10.00174599 11.5360165
   0.        ]]


In [116]:
dists_euclidean_tfidf = euclidean_distances(dtm_tfidf)  # this may take a while... not efficient at all? cosine - it didn't take that much of a time. but manhattan... well that took some time.
print(dists_euclidean_tfidf)  # (num_doc, num_doc)

[[0.         1.38957001 1.40369704 ... 1.40036037 1.40727773 1.38950082]
 [1.38957001 0.         1.37241543 ... 1.39138896 1.40367684 1.38499703]
 [1.40369704 1.37241543 0.         ... 1.39457287 1.39697213 1.39712903]
 ...
 [1.40036037 1.39138896 1.39457287 ... 0.         1.3934312  1.37586454]
 [1.40727773 1.40367684 1.39697213 ... 1.3934312  0.         1.39678409]
 [1.38950082 1.38499703 1.39712903 ... 1.37586454 1.39678409 0.        ]]


In [117]:
# this is to retrive the titles
indices = pd.Series(data=movies_df.index, index=movies_df['title']).drop_duplicates()

#display(indices)
#display(indices['Batman'])
indices['Batman']



585

In [118]:
movies_df['title']

0                         Toy Story
1                           Jumanji
2                  Grumpier Old Men
3                 Waiting to Exhale
4       Father of the Bride Part II
                   ...             
1995                            Tex
1996                           Tron
1997                     Swing Kids
1998                 Halloween: H20
1999                     L.A. Story
Name: title, Length: 2000, dtype: object

In [119]:
def get_recommendations(title: str,
                        # dists X
                        sims: np.ndarray,
                        indices: pd.Series,
                        data: pd.DataFrame) -> List[Tuple[str, float]]:
    global TOP_N
    ### TODO 2 ###
    # 루프는 사용 X
    # use: np.argsort()[::-1] to recommend titles with scores
    
    title_idx = indices[title] #585
    sims_to_title = sims[title_idx, :] # (2000,) 585번째 행의 모든 유사도 를 가져온다 
    #print(sims_to_title)
    sims_to_title_sorted: List[float] = np.sort(sims_to_title)[::-1] #값들을 정렬하여 큰거부터 
    indices_sorted = np.argsort(sims_to_title)[::-1] #인자 값을 통한 인덱스도 큰거부터 정렬
    titles_sorted: List[str] = data['title'][indices_sorted] #data  = movies_df [제목][index]
    #print(titles_sorted)
    
    '''
    titles_sorted: List[str] = indices[np.argsort(sims[585])[::-1]]
    sims_to_title_sorted: List[float] = data[indices[np.argsort(sims[585])[::-1]]]
    '''
    results: List[Tuple[str, float]] = [
        (title, score)
        for title, score in zip(titles_sorted, sims_to_title_sorted)
    ]
    ##############
    return results[:TOP_N]

In [120]:
[np.argsort(sims_cosine[585])[::-1]]

[array([ 585,  269, 1074, ...,   57,  641,  792])]

In [121]:
Index=indices.index[np.argsort(sims_cosine_tfidf[movies_df['title'] == 'Batman'])][::-1]

  """Entry point for launching an IPython kernel.


In [122]:
print(Index)

[['The Last Days of Disco' 'Nobody Loves Me' 'Under The Domim Tree' ...
  'Batman & Robin' 'Batman Returns' 'Batman']]


In [123]:
dists_euclidean_tfidf[movies_df['title'] == 'Batman']

array([[1.40295336, 1.40679845, 1.40245754, ..., 1.39289045, 1.40291716,
        1.38608626]])

In [124]:
dists_manhattan_tfidf[movies_df['title'] == 'Batman'].shape

(1, 2000)

In [125]:
# compare the results of dtm with tfidf
# 이유는 다들 알고 계십니다! ㅎㅎ
print("#### dtm + cosine sim ####")
for idx, (title, score) in enumerate(get_recommendations("Batman", sims_cosine, indices, movies_df)):
  print(idx, title, score)

print("#### tfidf + cosine sim ####")
for idx, (title, score) in enumerate(get_recommendations("Batman", sims_cosine_tfidf, indices, movies_df)):
  print(idx, title, score)


#### dtm + cosine sim ####
0 Batman 1.0000000000000002
1 Mary Shelley's Frankenstein 0.45268314226448536
2 The Leopard Son 0.4468418247369901
3 The Last Emperor 0.44633201988370275
4 The Confessional 0.4436069753671345
5 Sweet Nothing 0.43970404417590525
6 Conan the Barbarian 0.4375949744936837
7 The Children Are Watching Us 0.43193421279068006
8 The Search for One-eye Jimmy 0.43092875104518874
9 Stalingrad 0.42857142857142855
10 North Star 0.42839304295347314
11 I Can't Sleep 0.42766686606638943
12 The Rocketeer 0.42721034696687243
13 Days of Thunder 0.4269760205378603
14 The Remains of the Day 0.42525863589985735
15 My Life and Times With Antonin Artaud 0.4198925212700719
16 The Replacement Killers 0.4190262407031392
17 Ed Wood 0.4164633650362828
18 Commandments 0.41475753100312657
19 Return of the Jedi 0.4139469470433775
20 And the Band Played On 0.41226488477260886
21 Mad Dog Time 0.4114755998989118
22 Child's Play 2 0.41099746826339323
23 The Education of Little Tree 0.41030496993

In [126]:
### TODO 3 ###
# use different similarity measures and compare the results
# note that dists != sims
print("#### tfidf + cosine sim ####")
for idx, (title, score) in enumerate(get_recommendations("Batman", sims_cosine_tfidf, indices, movies_df)):
  print(idx, title, score)
print("\n#### tfidf + euclidean dists ####")
for idx, (title, score) in enumerate(get_recommendations("Batman", -dists_euclidean_tfidf, indices, movies_df)): #거리는 가까워야 더 좋기 떄문에
  print(idx, title, score)
print("\n#### tfidf + manhattan dists ####")
for idx, (title, score) in enumerate(get_recommendations("Batman", -dists_manhattan_tfidf, indices, movies_df)):
  print(idx, title, score) 
##############   

#### tfidf + cosine sim ####
0 Batman 1.0000000000000002
1 Batman Returns 0.19155692273625355
2 Batman & Robin 0.1758295995796183
3 Batman Forever 0.14581071757552017
4 Cry, the Beloved Country 0.11109312245823069
5 B. Monkey 0.10544235266596627
6 The Shadow 0.10514679108660141
7 Sweet Nothing 0.10035634826245518
8 A Month by the Lake 0.10023026852720505
9 Chairman of the Board 0.09920814012584121
10 The War at Home 0.0988387365764903
11 Dune 0.09875401691372578
12 Mediterraneo 0.09803176049698262
13 Return of the Jedi 0.09217825522595026
14 The Pompatus of Love 0.09085969169749389
15 Cyclo 0.08667634812405352
16 Commandments 0.08651635230910235
17 Apocalypse Now 0.08609774357728836
18 Friday the 13th: A New Beginning 0.08605652456139677
19 Dangerous Game 0.08604236366845823
20 Highlander 0.08602706777821761
21 Metropolis 0.08398384733774258
22 Somebody Is Waiting 0.08258337613352316
23 M 0.08193657697542574
24 Mrs. Miniver 0.0811774410090339
25 The Apostle 0.08031383520515435
26 Love 

# 다음의 문제에 답하세요.
> cosine >> euclidean & manhattan 인 이유는 무엇인가요?

> 일반적으로 고차원 벡터인 경우, manhattan이 euclidean보다 성능이 좋은 것으로 알려져 있습니다 (슬라이드). 하지만 이 문제에서는 euclidean > manhattan 입니다. 그 이유는 무엇인가요?

# 다음의 문제에 답하세요.
> cosine >> euclidean & manhattan 인 이유는 무엇인가요?

코사인 유사도 방법은 길이와 관련없이 벡터간의 각도만을 고려해서  (줄거리 문서 -> 단어의 빈도수가 제각기이기 때문에, 각도만을 고려하는 것이 적합).


> 일반적으로 고차원 벡터인 경우, manhattan이 euclidean보다 성능이 좋은 것으로 알려져 있습니다 (슬라이드). 하지만 이 문제에서는 euclidean > manhattan 입니다. 그 이유는 무엇인가요?
논문: "일반적" = 정규분포, 분포, 일반적으로 분포를 가정 (벡터 속의 성분). -> [100, 1 ,2]
힌트: 벡터를 얻은 방법은 규칙기반. tfidf. (벡터 속의 성분 -> keyword는 값이 높다). 
outlier (이상치). -> [100 <- 중요한 단어, 1 ,2]
유클리드 -> 가중치가 증폭 (제곱).
맨해튼 거리 -> 둔감하다.
차원의 저주 -> 고차원으로 갈수록 거리의 개념 옅어지는데 (모두가 다 커지니까).
일반적인 주장의 반례.
  