In [1]:
docs = [
    '먹고 싶은 사과',
    '먹고 싶은 바나나',
    '길고 노란 바나나, 바나나',
    '저는 과일이 좋아요']
docs

['먹고 싶은 사과', '먹고 싶은 바나나', '길고 노란 바나나, 바나나', '저는 과일이 좋아요']

In [2]:
from sklearn.feature_extraction.text import CountVectorizer
vect = CountVectorizer() # Count Vectorizer객체 생성

In [3]:
vect

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [4]:
# 문장을 Count Vectorizer 형태로 변형
countvect = vect.fit_transform(docs)
countvect

<4x9 sparse matrix of type '<class 'numpy.int64'>'
	with 12 stored elements in Compressed Sparse Row format>

In [5]:
# toarray()를 통해서 문장이 Vector 형태의 값을 얻을 수 있다.
# 하지만, 각 인덱스와 컬럼이 무엇을 의미하는지에 대해서는 알 수가 없다.
# sparse matrix -> numpy
countvect.toarray()

array([[0, 0, 0, 1, 0, 1, 1, 0, 0],
       [0, 0, 0, 1, 1, 0, 1, 0, 0],
       [0, 1, 1, 0, 2, 0, 0, 0, 0],
       [1, 0, 0, 0, 0, 0, 0, 1, 1]], dtype=int64)

In [6]:
# 딕셔너리 형태의 단어 : 인덱스
vect.vocabulary_

{'먹고': 3,
 '싶은': 6,
 '사과': 5,
 '바나나': 4,
 '길고': 1,
 '노란': 2,
 '저는': 7,
 '과일이': 0,
 '좋아요': 8}

In [7]:
# sorted라는 함수를 통해서 단어를 정렬 ( 인덱스 기준)
sorted(vect.vocabulary_)

['과일이', '길고', '노란', '먹고', '바나나', '사과', '싶은', '저는', '좋아요']

In [8]:
# counter vector를 dataframe으로 생성
import pandas as pd 
countvect_df = pd.DataFrame(countvect.toarray(), columns = sorted(vect.vocabulary_))
countvect_df.index =['문서1','문서2','문서3','문서4']
countvect_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0,0,0,1,0,1,1,0,0
문서2,0,0,0,1,1,0,1,0,0
문서3,0,1,1,0,2,0,0,0,0
문서4,1,0,0,0,0,0,0,1,1


In [9]:
# 위의 Data Frame 형태의 코사인 유사도를 계산
from sklearn.metrics.pairwise import cosine_similarity

cosine_similarity(countvect_df, countvect_df)

array([[1.        , 0.66666667, 0.        , 0.        ],
       [0.66666667, 1.        , 0.47140452, 0.        ],
       [0.        , 0.47140452, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

# TF-IDF 이용

In [10]:
# countVectorizer -> TF-IDF vectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
vect = TfidfVectorizer()
tfvect = vect.fit(docs)
tfvect

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True, stop_words=None, strip_accents=None,
                sublinear_tf=False, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, use_idf=True, vocabulary=None)

In [11]:
tfidv_df = pd.DataFrame(tfvect.transform(docs).toarray(), columns= sorted(vect.vocabulary_))
tfidv_df.index =['문서1','문서2','문서3','문서4']
tfidv_df

Unnamed: 0,과일이,길고,노란,먹고,바나나,사과,싶은,저는,좋아요
문서1,0.0,0.0,0.0,0.526405,0.0,0.667679,0.526405,0.0,0.0
문서2,0.0,0.0,0.0,0.57735,0.57735,0.0,0.57735,0.0,0.0
문서3,0.0,0.47212,0.47212,0.0,0.74445,0.0,0.0,0.0,0.0
문서4,0.57735,0.0,0.0,0.0,0.0,0.0,0.0,0.57735,0.57735


In [12]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_similarity(tfidv_df,tfidv_df)

array([[1.        , 0.60784064, 0.        , 0.        ],
       [0.60784064, 1.        , 0.42980824, 0.        ],
       [0.        , 0.42980824, 1.        , 0.        ],
       [0.        , 0.        , 0.        , 1.        ]])

---
# 영화 분석


In [13]:
import pandas as pd 
data = pd.read_csv('./data/movies_metadata.csv',low_memory=False)
data.head(2)

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0


In [14]:
# 각 영화의 줄거리(소개) 확인
data['overview'].head()

0    Led by Woody, Andy's toys live happily in his ...
1    When siblings Judy and Peter discover an encha...
2    A family wedding reignites the ancient feud be...
3    Cheated on, mistreated and stepped on, the wom...
4    Just when George Banks has recovered from his ...
Name: overview, dtype: object

In [15]:
# overview 항목 추출
data.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [16]:
# 결측치 확인 (True = 결측치 X , False = 결측치) 
data['overview'].notnull()

0        True
1        True
2        True
3        True
4        True
         ... 
45461    True
45462    True
45463    True
45464    True
45465    True
Name: overview, Length: 45466, dtype: bool

In [17]:
# 전처리
# overview의 결측치가 있는 항목은 모두 제거
data = data[data['overview'].notnull()].reset_index(drop = True)
data.shape

data= data.loc[0:20000].reset_index(drop=True)
data

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 119050, 'name': 'Grumpy Old Men Collect...",0,"[{'id': 10749, 'name': 'Romance'}, {'id': 35, ...",,15602,tt0113228,en,Grumpier Old Men,A family wedding reignites the ancient feud be...,...,1995-12-22,0.0,101.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Still Yelling. Still Fighting. Still Ready for...,Grumpier Old Men,False,6.5,92.0
3,False,,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",,31357,tt0114885,en,Waiting to Exhale,"Cheated on, mistreated and stepped on, the wom...",...,1995-12-22,81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Friends are the people who let you be yourself...,Waiting to Exhale,False,6.1,34.0
4,False,"{'id': 96871, 'name': 'Father of the Bride Col...",0,"[{'id': 35, 'name': 'Comedy'}]",,11862,tt0113041,en,Father of the Bride Part II,Just when George Banks has recovered from his ...,...,1995-02-10,76578911.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Just When His World Is Back To Normal... He's ...,Father of the Bride Part II,False,5.7,173.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19996,False,,0,"[{'id': 99, 'name': 'Documentary'}]",http://www.pbs.org/kenburns/centralparkfive/,124067,tt2380247,en,The Central Park Five,"In 1989, five black and Latino teenagers from ...",...,2012-05-24,273747.0,119.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,The Central Park Five,False,6.8,33.0
19997,False,"{'id': 165369, 'name': 'The Collector Collecti...",0,"[{'id': 27, 'name': 'Horror'}, {'id': 53, 'nam...",,134597,tt1748227,en,The Collection,Arkin escapes with his life from the vicious g...,...,2012-09-21,6842058.0,94.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,Every great collector has a vision.,The Collection,False,5.9,294.0
19998,False,,0,[],http://www.sakuranosono-movie.jp/,125136,tt1313145,ja,櫻の園 -さくらのその-,"Remake of a hit film from 1990, ""The Cherry Or...",...,2008-11-08,0.0,102.0,"[{'iso_639_1': 'ja', 'name': '日本語'}]",Released,,The Cherry Orchard: Blossoming,False,0.0,0.0
19999,False,,0,"[{'id': 35, 'name': 'Comedy'}]",,99223,tt1811315,en,FDR: American Badass!,Chronicles the adventures of Franklin Delano R...,...,2012-09-24,0.0,93.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,FDR: American Badass!,False,4.9,17.0


In [18]:
# 불용어 : 유의미하지 않은 단어 토큰을 제거하기.
tfidf = TfidfVectorizer(stop_words='english')

# overview에 대해서 tf-idf 수행
tfidf_matrix = tfidf.fit_transform(data['overview'])
print(tfidf_matrix.shape)

(20001, 47665)


In [19]:
from sklearn.metrics.pairwise import cosine_similarity
cosine_matrix = cosine_similarity(tfidf_matrix,tfidf_matrix)

In [25]:
cosine_matrix.shape

(20001, 20001)

In [24]:
np.round(cosine_matrix,4)

array([[1.    , 0.0158, 0.    , ..., 0.0083, 0.0172, 0.    ],
       [0.0158, 1.    , 0.0492, ..., 0.0057, 0.008 , 0.    ],
       [0.    , 0.0492, 1.    , ..., 0.    , 0.    , 0.    ],
       ...,
       [0.0083, 0.0057, 0.    , ..., 1.    , 0.0144, 0.    ],
       [0.0172, 0.008 , 0.    , ..., 0.0144, 1.    , 0.0183],
       [0.    , 0.    , 0.    , ..., 0.    , 0.0183, 1.    ]])

In [28]:
# id와 movie title를  매핑할 딕셔너리 : { id : 영화제목}
movie2id = {}
for i,c in enumerate(data['title']):movie2id[i] = c
# movie2id     

# movie title와 id를 매핑할 딕셔너리 : { 영화제목 : id }
id2movie = {}
for i,c in movie2id.items() : id2movie[c]= i
# id2movie

In [32]:
# 토이스토리의 index찾기
idx = id2movie['Toy Story']
idx

0

In [68]:
# 위에서 찾은 토이스토리의 인덱스를 통해 다른 영화와의 코사인 유사도 확인
cosine_matrix[idx]

array([1.        , 0.015775  , 0.        , ..., 0.00826973, 0.01721372,
       0.        ])

In [69]:
# 자기 자신을 제외한 영화들의 유사도 및 인덱스를 추출
sim_scores = [(i,c) for i, c in enumerate(cosine_matrix[idx]) if i!= idx] 
sim_scores

[(1, 0.01577499623706559),
 (2, 0.0),
 (3, 0.0),
 (4, 0.0),
 (5, 0.0),
 (6, 0.0),
 (7, 0.0),
 (8, 0.0),
 (9, 0.0),
 (10, 0.0),
 (11, 0.0),
 (12, 0.0),
 (13, 0.0),
 (14, 0.0),
 (15, 0.0),
 (16, 0.0),
 (17, 0.041138683296865486),
 (18, 0.0),
 (19, 0.0),
 (20, 0.0099121496903153),
 (21, 0.0),
 (22, 0.0),
 (23, 0.0),
 (24, 0.0),
 (25, 0.0),
 (26, 0.0),
 (27, 0.0),
 (28, 0.0),
 (29, 0.0),
 (30, 0.0),
 (31, 0.0),
 (32, 0.01978034381431984),
 (33, 0.0),
 (34, 0.0),
 (35, 0.0),
 (36, 0.0),
 (37, 0.0),
 (38, 0.0),
 (39, 0.0),
 (40, 0.0),
 (41, 0.006321775635368981),
 (42, 0.0),
 (43, 0.0),
 (44, 0.009292791126667362),
 (45, 0.0),
 (46, 0.0),
 (47, 0.0),
 (48, 0.013838678611953216),
 (49, 0.009852367947354567),
 (50, 0.010928162091485132),
 (51, 0.0),
 (52, 0.0),
 (53, 0.02000467244181858),
 (54, 0.0),
 (55, 0.025263801435198463),
 (56, 0.02072192444202655),
 (57, 0.0),
 (58, 0.03420184247473588),
 (59, 0.0),
 (60, 0.0),
 (61, 0.00860353886947865),
 (62, 0.0),
 (63, 0.01019819462957017),
 (64, 0

In [70]:
# 유사도 높은 순서대로 추출
sim_scores = sorted(sim_scores, key = lambda x: x[1], reverse = True)
sim_scores[0:10]

[(15282, 0.5262275451171008),
 (2979, 0.463276799830381),
 (10271, 0.2797390476075632),
 (8303, 0.20078538664316947),
 (1058, 0.18287334034120212),
 (11367, 0.15712074193481165),
 (1916, 0.15288512626542436),
 (3039, 0.1433450408051554),
 (483, 0.13765225108436677),
 (11573, 0.1337032693869044)]

In [71]:
# 인덱스에 movie title 넣어주도록 설정
sim_scores = [(movie2id[i],score) for i, score in sim_scores[0:10]]
sim_scores  

[('Toy Story 3', 0.5262275451171008),
 ('Toy Story 2', 0.463276799830381),
 ('The 40 Year Old Virgin', 0.2797390476075632),
 ('The Champ', 0.20078538664316947),
 ('Rebel Without a Cause', 0.18287334034120212),
 ('For Your Consideration', 0.15712074193481165),
 ('Condorman', 0.15288512626542436),
 ('Man on the Moon', 0.1433450408051554),
 ('Malice', 0.13765225108436677),
 ('Factory Girl', 0.1337032693869044)]

# 정리
- 영화 meta data를 통해 영화 overview(줄거리, 소개)컬럼을 통해 유사한 영화 찾기
- overview에 대해서 결측치 제거 및 데이터 개수 제한(램 용량으로 인해)
- 백터라이저를 만들때 stop_words를 통해 불용어 제거
- fit_transformation을 통해 tf-idf matrix 생성
- tf_matrix로 코사인유사도 적용
- 영화의 index와 제목을 딕셔너리에 담기 
- 해당 딕셔너리의 유사도 부분만 추출 후 높은 순서대로 정렬한다.
- 그리고 인덱스를 무비타이틀로 바꾸어 연관성이 높은 영화 순서대로 추출한다.
