# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 0. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [4]:
import pandas as pd

# Path to the pre-tokenized ("filtered") 2016 Korean movie-review CSV.
data_filename = './data/Korean_movie_reviews_2016_filtered.csv'

# Load the reviews into a DataFrame and preview the first rows.
review_df = pd.read_csv(filepath_or_buffer=data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [5]:
# Build the training corpus: one list of whitespace-separated tokens per review.
# The review column is object-typed, so values are converted to str before split().
# Missing reviews (NaN) are replaced with '' first; a bare str() would turn them
# into the literal token 'nan'. They now yield an empty token list instead, and
# review_list / corpus keep the same length and row order as review_df.
review_list = review_df.review.fillna('').astype(str).tolist()
corpus = [review.split() for review in review_list]
corpus[:5]

[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [6]:
from gensim.models import Word2Vec

# Create and train a skip-gram (sg=1) Word2Vec model on the corpus:
# 100-dimensional vectors, context window 3, min_count 3, 10 negative samples.
model_sg_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=1,
    negative=10,
)

In [7]:
# Inspect the embedding vector learned for the word '이정재'.
model_sg_n10.wv['이정재']

array([-0.21649578,  0.15433274,  0.04426147, -0.76282865,  0.05368855,
       -0.1056255 ,  0.56533754,  0.06568599,  0.7531187 ,  0.05207073,
       -0.4784948 , -0.40010217,  0.19476135,  0.42890623, -0.32590827,
        0.19714326, -0.57630444, -0.36840293,  0.45004693, -0.07719022,
       -0.16383074, -0.14299394,  0.1795542 ,  0.09649319, -0.18122156,
       -0.2693328 ,  0.16802205, -0.01961172,  0.3065859 ,  0.3572807 ,
       -0.07749548, -0.02140605,  0.2583765 ,  0.0832959 , -0.10806311,
       -0.7776684 , -0.13289388, -0.10889033, -0.5776583 ,  0.07024786,
       -0.6760659 ,  0.47957024, -0.07248031, -0.22444876,  0.19515274,
        0.297789  , -0.30581608, -0.14171237,  0.07449647,  0.20299226,
       -0.09577254, -0.3170468 , -0.24236834,  0.05552538,  0.06584989,
        0.01330423, -0.03138121, -0.3860317 ,  0.07314497,  0.26346105,
       -0.45267504, -0.24333915,  0.25619894, -0.03043336, -0.6464386 ,
       -0.22593641,  0.20498742, -0.3251728 ,  0.07309982, -0.33

In [8]:
# Check the embedding dimensionality via the length of the vector for '이정재'.
len(model_sg_n10.wv['이정재'])

100

In [9]:
# Similarity score between the embeddings of the two words '이정재' and '정우성'.
model_sg_n10.wv.similarity('이정재','정우성')

0.7428684

In [10]:
# Retrieve the 20 words most similar to '이정재', with their similarity scores.
model_sg_n10.wv.most_similar('이정재', topn=20)

[('송강호', 0.8011184930801392),
 ('이범수', 0.7993087768554688),
 ('공유', 0.7887062430381775),
 ('김범수', 0.7821043133735657),
 ('정우성', 0.7428683638572693),
 ('박해일', 0.7411994934082031),
 ('조재현', 0.7399726510047913),
 ('김윤석', 0.7348418831825256),
 ('김남길', 0.7328280806541443),
 ('이성민', 0.7299358248710632),
 ('이병헌', 0.7260499596595764),
 ('마동석', 0.7257714867591858),
 ('김희원', 0.7238935232162476),
 ('정재형', 0.7198199033737183),
 ('이진욱', 0.7190083265304565),
 ('리암', 0.7172463536262512),
 ('윌스미스', 0.7153100967407227),
 ('조정석', 0.7134910225868225),
 ('김성균', 0.7073071002960205),
 ('정진영', 0.7036947011947632)]

In [39]:
# Retrieve the 20 words most similar to '재밌', with their similarity scores.
model_sg_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.8852418065071106),
 ('재밌네', 0.827724039554596),
 ('재밌었', 0.8272783756256104),
 ('잼남', 0.8186686635017395),
 ('재밋음', 0.81390380859375),
 ('재밌어', 0.8103576898574829),
 ('재밋었음', 0.7908328175544739),
 ('재밋어용', 0.7782317399978638),
 ('재밌아', 0.7778360843658447),
 ('재밋엇음', 0.7772762179374695),
 ('재밋엇어용', 0.7748783230781555),
 ('재밋었습니', 0.7695954442024231),
 ('잼슴', 0.7677687406539917),
 ('재밋네', 0.7673613429069519),
 ('재미있었', 0.7671087384223938),
 ('재밌슴', 0.7658709287643433),
 ('잼난', 0.7607936859130859),
 ('재밋엇', 0.7585732340812683),
 ('엇', 0.7582724094390869),
 ('쟈밋', 0.7566028833389282)]

### Skipgram, negative=5 인 경우

In [46]:
# Create and train a skip-gram (sg=1) Word2Vec model on the corpus,
# this time with 5 negative samples.
model_sg_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=1,
    negative=5,
)

In [52]:
# Inspect the embedding vector for '이정재' in the skip-gram, negative=5 model.
# (This indexes the vector itself; it does not list similar words.)
model_sg_n5.wv['이정재']

array([-0.02195257, -0.06435785, -0.3206985 , -0.28789276, -0.10208076,
        0.25240868,  0.4424389 , -0.07581257,  0.9080127 , -0.17147991,
       -0.48685217, -0.18013243,  0.27144673,  0.47259003, -0.32417458,
       -0.14575255, -0.38587043, -0.24963373,  0.27155092, -0.5119119 ,
       -0.15091318, -0.05951878,  0.6660914 ,  0.0332968 , -0.13342531,
        0.13041082,  0.2845127 ,  0.28398934,  0.37693816,  0.01734416,
       -0.13642691,  0.11109585,  0.05263711, -0.46021292, -0.089947  ,
       -0.57192534, -0.55584306,  0.00779176, -0.6268388 , -0.31543738,
       -0.19696434,  0.27924508, -0.31257722,  0.21334696, -0.11147975,
        0.5229138 , -0.06121739, -0.07760566, -0.1350612 ,  0.24782005,
        0.24510363, -0.50833255,  0.17965926, -0.037125  ,  0.04753613,
        0.00490494, -0.17908691, -0.5627272 , -0.1352312 ,  0.18098052,
       -0.37666956, -0.48729008,  0.5180221 , -0.51904505, -0.36007234,
       -0.38967833,  0.22811113, -0.43452767,  0.09482293,  0.00

In [53]:
# Retrieve the 20 words most similar to '재밌' from the skip-gram, negative=5 model.
model_sg_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.9069398045539856),
 ('재밋음', 0.8329796195030212),
 ('재밌었', 0.8237584829330444),
 ('재밌네', 0.8227387070655823),
 ('재밌어', 0.8188971877098083),
 ('잼남', 0.8172439932823181),
 ('재밋네', 0.7755934000015259),
 ('재밋었음', 0.7674241065979004),
 ('쟈밋', 0.7635376453399658),
 ('재밋어용', 0.7631458044052124),
 ('재밌아', 0.7603585124015808),
 ('재밋었어', 0.7598894238471985),
 ('재밌슴', 0.7579833269119263),
 ('재밋어', 0.7573654651641846),
 ('재미있었', 0.7567914128303528),
 ('재밋엇어용', 0.7562897205352783),
 ('엇', 0.7557668089866638),
 ('재밋습니', 0.7554990649223328),
 ('잼슴', 0.7534531950950623),
 ('재밋엇', 0.7522073984146118)]

### CBOW, negative=10 인 경우

In [51]:
# Create and train a CBOW (sg=0) Word2Vec model on the corpus
# with 10 negative samples.
model_cbow_n10 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=0,
    negative=10,
)

In [54]:
# Inspect the embedding vector for '이정재' in the CBOW, negative=10 model.
model_cbow_n10.wv['이정재']

array([-0.13658555, -0.85582584, -0.5965061 , -0.15984029, -0.5376444 ,
        1.1904149 , -0.43819183,  0.55138403,  1.0578737 , -1.6624207 ,
       -1.6741538 , -2.3241816 ,  0.078515  , -0.42542446, -0.4449014 ,
        0.479749  , -0.37869847, -0.35108104,  1.3945607 ,  0.14340247,
       -1.5453271 ,  0.44665772, -0.13796876,  0.7425632 , -0.13475013,
       -0.2788885 , -0.30516037,  0.269031  ,  0.9325435 ,  1.1733589 ,
       -0.5521836 ,  0.47613505,  0.88269275, -0.8208811 , -1.0410289 ,
       -1.8259056 ,  0.3728462 ,  1.1099322 ,  1.5157921 , -0.09162243,
       -0.3477921 ,  1.1884626 , -0.60859454, -0.6445083 , -1.3913327 ,
        0.6882294 ,  2.3642497 , -0.3693313 , -0.9684044 , -0.95009476,
       -0.02927688, -1.7829193 ,  0.5888794 , -2.2205122 , -0.13634154,
       -0.8670218 ,  0.6673292 , -0.25383556,  1.0391845 ,  0.92548823,
       -0.22671445, -1.1556056 ,  0.47062522, -1.1969223 , -0.42312518,
       -1.2452385 ,  0.19922261, -0.15589915,  1.1085116 , -0.65

In [55]:
# Retrieve the 20 words most similar to '재밌' from the CBOW, negative=10 model.
model_cbow_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.8959496021270752),
 ('재밌네', 0.8175869584083557),
 ('재밌어', 0.8084039688110352),
 ('재밌었', 0.7916589379310608),
 ('재밋음', 0.7852320671081543),
 ('재밋어', 0.7311105132102966),
 ('재밌는', 0.7264140844345093),
 ('재미있었', 0.7164555191993713),
 ('재미있네', 0.7113712430000305),
 ('재밌더', 0.7065057754516602),
 ('잼남', 0.6992375254631042),
 ('재밌던', 0.687153160572052),
 ('재밋네', 0.680222749710083),
 ('재밋엇어', 0.6794935464859009),
 ('재미있어', 0.6750192642211914),
 ('재밋', 0.6584526896476746),
 ('재밋었', 0.6555356383323669),
 ('재밋엇음', 0.6458634734153748),
 ('꿀잼', 0.6422904133796692),
 ('재밌다', 0.6421495676040649)]

### CBOW, negative=5 인 경우

In [56]:
# Create and train a CBOW (sg=0) Word2Vec model on the corpus
# with 5 negative samples.
model_cbow_n5 = Word2Vec(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    sg=0,
    negative=5,
)

In [57]:
# Inspect the embedding vector for '이정재' in the CBOW, negative=5 model.
model_cbow_n5.wv['이정재']

array([-0.8418818 , -0.51941985,  0.11489245,  0.16911522, -0.6814595 ,
        1.3924159 , -0.48024455, -0.09062111,  1.1681973 , -0.7132346 ,
       -0.87944555, -2.1279182 ,  2.0166895 , -0.23045477,  0.84562427,
       -0.32974744, -0.1724508 , -0.5594249 ,  0.7661816 ,  0.51398486,
        0.52946234,  0.51859474,  0.77160347, -0.6777657 ,  0.13769627,
        0.5365478 ,  0.32565662, -0.38545474,  1.6463456 ,  0.842902  ,
       -0.27630922, -0.73703766,  0.8547903 , -0.71127415, -1.1440284 ,
        0.73990667, -0.1974746 , -1.0063187 ,  0.72589755,  0.05272076,
       -0.27998587,  1.0836728 , -0.74335164, -0.3560372 , -0.5867807 ,
        1.1015412 ,  1.21314   , -0.49041957, -0.79719275, -0.40819168,
        0.40998304, -0.7374517 ,  0.9592455 , -0.6559557 , -0.15135121,
       -0.36264223,  0.8737517 ,  0.03595906,  0.2760178 ,  1.0403224 ,
       -0.8751495 , -1.416863  , -0.98904526,  0.26394805, -1.1883756 ,
       -1.2733369 , -0.6826227 , -0.01604543, -0.49909657, -1.38

In [58]:
# Retrieve the 20 words most similar to '재밌' from the CBOW, negative=5 model.
model_cbow_n5.wv.most_similar('재밌', topn=20)

[('재미있', 0.90916907787323),
 ('재밌네', 0.8171957731246948),
 ('재밌어', 0.8129955530166626),
 ('재밋음', 0.8020539283752441),
 ('재밌었', 0.7985680103302002),
 ('재미있었', 0.7346876263618469),
 ('재밌는', 0.7307628393173218),
 ('재밌던', 0.7164293527603149),
 ('재밋어', 0.7149795293807983),
 ('잼남', 0.7144502401351929),
 ('재미있네', 0.7103576064109802),
 ('재밌더', 0.6909223198890686),
 ('재밋엇어', 0.6866120100021362),
 ('재미있어', 0.6863977313041687),
 ('꿀잼', 0.6698381900787354),
 ('재밌다', 0.6680341362953186),
 ('재밋네', 0.6665441393852234),
 ('재미있던', 0.6633694767951965),
 ('재밋', 0.6523979306221008),
 ('재밋었어', 0.6497507095336914)]

### OOV(Out of Vocabulary) 문제

In [25]:
# Check a word absent from the corpus: is '우주평화' among the keys of the
# Word2Vec model's vocabulary (key_to_index)?
'우주평화' in model_sg_n10.wv.key_to_index

False

In [50]:
# NOTE(review): the original comment described this as inspecting the vector of
# a word missing from the corpus, but the lookup below uses '이정재', which is in
# the vocabulary, so it simply returns that word's vector.
# Looking up the out-of-vocabulary word '우주평화' on this Word2Vec model would
# presumably raise KeyError instead — confirm against the gensim KeyedVectors docs.
model_sg_n10.wv['이정재']

array([-0.21649578,  0.15433274,  0.04426147, -0.76282865,  0.05368855,
       -0.1056255 ,  0.56533754,  0.06568599,  0.7531187 ,  0.05207073,
       -0.4784948 , -0.40010217,  0.19476135,  0.42890623, -0.32590827,
        0.19714326, -0.57630444, -0.36840293,  0.45004693, -0.07719022,
       -0.16383074, -0.14299394,  0.1795542 ,  0.09649319, -0.18122156,
       -0.2693328 ,  0.16802205, -0.01961172,  0.3065859 ,  0.3572807 ,
       -0.07749548, -0.02140605,  0.2583765 ,  0.0832959 , -0.10806311,
       -0.7776684 , -0.13289388, -0.10889033, -0.5776583 ,  0.07024786,
       -0.6760659 ,  0.47957024, -0.07248031, -0.22444876,  0.19515274,
        0.297789  , -0.30581608, -0.14171237,  0.07449647,  0.20299226,
       -0.09577254, -0.3170468 , -0.24236834,  0.05552538,  0.06584989,
        0.01330423, -0.03138121, -0.3860317 ,  0.07314497,  0.26346105,
       -0.45267504, -0.24333915,  0.25619894, -0.03043336, -0.6464386 ,
       -0.22593641,  0.20498742, -0.3251728 ,  0.07309982, -0.33

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [30]:
from gensim.models import FastText

# Create and train a skip-gram (sg=1) FastText model on the corpus:
# 100-dimensional vectors, context window 3, min_count 3, 10 negative samples.
# min_n=2 and max_n=2 set both the minimum and maximum character n-gram length to 2.
ft_model = FastText(
    sentences=corpus,
    vector_size=100,
    window=3,
    min_count=3,
    min_n=2,
    max_n=2,
    sg=1,
    negative=10,
)

In [40]:
# Inspect the FastText embedding vector for '이정재'.
# (This indexes the vector itself; it does not list similar words.)
ft_model.wv['이정재']

array([ 0.05583531,  0.28827134, -0.1779248 ,  0.04833512,  0.21326427,
       -0.22701411, -0.38048577,  0.40398088,  0.4968421 ,  0.09576116,
       -0.2436662 ,  0.31466022, -0.24345195, -0.0954785 , -0.06140641,
        0.03497512, -0.01329535, -0.59748936,  0.12440193, -0.27127838,
        0.13639708, -0.22356606,  0.06923499,  0.03877357,  0.07950471,
       -0.18604226, -0.18032762, -0.1603231 , -0.554193  , -0.36610055,
       -0.0263896 , -0.507544  ,  0.05250511, -0.17767575, -0.3128543 ,
       -0.14844684,  0.22697124, -0.31353188, -0.38742408, -0.10092349,
       -0.40799913,  0.08637041, -0.4366885 , -0.11318976, -0.68740594,
       -0.18683758, -0.70518875, -0.4101662 , -0.07487129, -0.29181352,
        0.10337768,  0.16632244,  0.11849848,  0.1600953 , -0.14542507,
        0.11874249,  0.3279655 ,  0.24604575, -0.35171157,  0.37023494,
       -0.03324571, -0.18169408, -0.3991275 ,  0.42720824, -0.49163228,
       -0.2637424 ,  0.48635942, -0.09506752,  0.09759332, -0.27

In [41]:
# Check a word absent from the corpus: is '우주평화' among the keys of the
# FastText model's vocabulary (key_to_index)?
'우주평화' in ft_model.wv.key_to_index

False

In [42]:
# Look up the embedding vector of '우주평화', a word that is not in key_to_index;
# unlike a plain vocabulary lookup, the FastText model still returns a vector for it.
ft_model.wv['우주평화']

array([ 3.38434041e-01,  1.60113927e-02,  4.02661450e-02,  2.50365674e-01,
       -1.67652532e-01, -1.06458351e-01, -1.68802112e-01,  6.33340836e-01,
        3.01824480e-01,  5.94024062e-01,  7.88126364e-02,  2.27993757e-01,
       -3.41539830e-02,  6.04831457e-01, -1.48296162e-01, -4.02958304e-01,
        6.72118664e-02, -2.28912547e-01,  1.16571309e-02,  6.32627606e-02,
        9.06493217e-02, -6.40475750e-02, -8.39839429e-02, -1.76049359e-02,
       -1.18838608e-01, -1.04374532e-02, -4.49016571e-01, -2.63667643e-01,
       -3.25676978e-01, -3.46074909e-01,  9.94044542e-02, -2.46001810e-01,
       -8.55218396e-02, -1.44546673e-01, -7.30191618e-02,  1.37333542e-01,
        2.54362941e-01,  2.42436409e-01, -2.78951973e-01, -8.09000209e-02,
       -6.64390326e-02,  3.49806882e-02, -1.48850158e-01,  6.11392632e-02,
       -2.33057886e-01,  1.92399830e-01,  1.08762696e-01,  3.03367943e-01,
       -3.55115049e-02, -1.17713764e-01,  1.87043950e-01,  2.28888839e-01,
       -1.06123172e-01, -

In [45]:
# Retrieve the words most similar to the out-of-corpus word '우주평화'
# (topn is left at its default).
ft_model.wv.most_similar('우주평화')

[('우주', 0.8228523135185242),
 ('우주비행사', 0.8043842315673828),
 ('평화', 0.8034636974334717),
 ('우방', 0.7990559339523315),
 ('우장', 0.7956016063690186),
 ('우주인', 0.7834931015968323),
 ('쉘', 0.7806243300437927),
 ('씰', 0.7746517658233643),
 ('쑥대밭', 0.7733591198921204),
 ('산화', 0.7732564210891724)]