# 영화 리뷰 워드 임베딩 (Word2Vec, FastText)
- gensim 라이브러리 사용 : pip install gensim
    - Word2Vec : models.Word2Vec
    - FastText : models.FastText

## 1. 데이터 준비
* 토큰화가 잘 되어 있는 filtered 데이터 사용

In [2]:
import platform
print(platform.python_version())
print(platform.architecture())

3.9.21
('64bit', 'WindowsPE')


In [1]:
data_filename = './Korean_movie_reviews_2016_filtered.csv'
import pandas as pd
review_df = pd.read_csv(data_filename)
review_df.head()

Unnamed: 0,review,rate
0,아니 딴 그렇 비 비탄 총 대체 왜 들 온겨,7
1,진심 쓰레기 영화 만들 무서 알 쫄아 틀었 이건 뭐 웃 거리 없는 쓰레기 영화 임,1
2,역대 좀비 영화 가장 최고다 원작 만화 읽어 보려 영화 보고 결정 하려 감독 간츠 ...,10
3,온종일 불편한 피 범벅 일,6
4,답답함 극치 움직일 잇으 좀 움직여 어지간히 좀비 봣으 얼 타고 때려 잡 때 되 않냐,1


In [2]:
# review만 모아서 review별 토큰 리스트로 변환 : review가 Object 타입이므로 str로 변환 후 split
review_list = list(map(str, review_df.review))
corpus = [review.split() for review in review_list] # review_list 안에(in) review를 split()
corpus[:5]

[['아니', '딴', '그렇', '비', '비탄', '총', '대체', '왜', '들', '온겨'],
 ['진심',
  '쓰레기',
  '영화',
  '만들',
  '무서',
  '알',
  '쫄아',
  '틀었',
  '이건',
  '뭐',
  '웃',
  '거리',
  '없는',
  '쓰레기',
  '영화',
  '임'],
 ['역대',
  '좀비',
  '영화',
  '가장',
  '최고다',
  '원작',
  '만화',
  '읽어',
  '보려',
  '영화',
  '보고',
  '결정',
  '하려',
  '감독',
  '간츠',
  '실사',
  '했',
  '사람',
  '거르려',
  '그냥',
  '봤',
  '정말',
  '흠잡',
  '없는',
  '최고',
  '좀비',
  '영화',
  '잔인',
  '거',
  '싫어하지',
  '참고',
  '볼',
  '만하',
  '로미',
  '인물',
  '왜',
  '그런',
  '모르'],
 ['온종일', '불편한', '피', '범벅', '일'],
 ['답답함',
  '극치',
  '움직일',
  '잇으',
  '좀',
  '움직여',
  '어지간히',
  '좀비',
  '봣으',
  '얼',
  '타고',
  '때려',
  '잡',
  '때',
  '되',
  '않냐']]

## 1. Word2Vec 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/word2vec.html

### Skipgram, negative=10 인 경우

In [3]:
# Word2Vec 모델 생성 및 학습 : window=3, min_count=3
from gensim.models import Word2Vec
model_sg_n10 = Word2Vec(corpus, window=3, min_count=3, vector_size = 100, sg=1, negative=10)

In [4]:
# 단어의 임베딩 벡터 확인
model_sg_n10.wv['이정재']

array([ 0.01022283,  0.26104134,  0.01318148, -0.36524236, -0.03511963,
       -0.21666387,  0.13474588,  0.324321  ,  0.8106426 ,  0.25123456,
       -0.43735376, -0.1069392 , -0.14986683,  0.31885707, -0.0467862 ,
       -0.21105322, -0.19038843, -0.26295218,  0.11889236, -0.35793293,
        0.18167378,  0.11188509,  0.43537372,  0.10225461, -0.31056252,
       -0.03684061,  0.4087804 , -0.23840295,  0.2606549 ,  0.21668485,
       -0.13754456, -0.11784765,  0.0463698 , -0.27771863,  0.35192356,
       -0.6055509 ,  0.16093022,  0.16630937, -0.43003544, -0.19077133,
       -0.65036476,  0.02159465, -0.06049187, -0.02457599, -0.20437911,
        0.35235628, -0.01652365,  0.3629236 , -0.22730571,  0.12804551,
       -0.1738072 , -0.42176855,  0.0617245 , -0.18094504,  0.15688728,
        0.2601662 ,  0.11069791, -0.6139172 ,  0.27907854,  0.34618932,
       -0.9112531 , -0.14660493, -0.00241152, -0.35481805, -0.7340159 ,
       -0.214628  ,  0.03835244, -0.3737632 ,  0.05557569, -0.01

In [5]:
# 단어의 임베딩 벡터 차원 확인
len(model_sg_n10.wv['이정재'])

100

In [6]:
# 두 단어 간 유사도 확인
model_sg_n10.wv.similarity('이정재', '정우성')

0.7455776

In [8]:
# 특정 단어와 유사한 단어 추출
model_sg_n10.wv.most_similar('이정재', topn=20)

[('송강호', 0.8215921521186829),
 ('이범수', 0.8139531016349792),
 ('공유', 0.7994104623794556),
 ('김범수', 0.7600052952766418),
 ('조재현', 0.7535810470581055),
 ('이성민', 0.7528467774391174),
 ('정우성', 0.7455776333808899),
 ('김남길', 0.7428144216537476),
 ('곽도원', 0.7341226935386658),
 ('황정민', 0.7322577834129333),
 ('박해일', 0.7283472418785095),
 ('김윤석', 0.7282106280326843),
 ('이병헌', 0.7266353368759155),
 ('마동석', 0.7230489253997803),
 ('주지훈', 0.7201620936393738),
 ('윌스미스', 0.7195296883583069),
 ('리암', 0.7179137468338013),
 ('유준상', 0.7136050462722778),
 ('박철민', 0.7119119167327881),
 ('송광호', 0.7079582810401917)]

In [9]:
model_sg_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.9022719264030457),
 ('재밌네', 0.8356807231903076),
 ('재밌었', 0.8169750571250916),
 ('재밋음', 0.814251184463501),
 ('잼슴', 0.8084348440170288),
 ('잼남', 0.8056206703186035),
 ('재밌어', 0.7960332632064819),
 ('쟈밋', 0.7806090116500854),
 ('재밌아', 0.7803431153297424),
 ('재밋엇음', 0.7767897248268127),
 ('재밋어용', 0.7716252207756042),
 ('재미있었', 0.767682671546936),
 ('재밋었음', 0.7651959657669067),
 ('엇', 0.7637551426887512),
 ('재밌슴', 0.7612617015838623),
 ('재밋는듯', 0.7585414052009583),
 ('재밋습니', 0.7580915689468384),
 ('재밋었습니', 0.7542613744735718),
 ('재밋네용', 0.7535898685455322),
 ('재밋네', 0.7535285353660583)]

### Skipgram, negative=5 인 경우

In [10]:
# 모델 생성
from gensim.models import Word2Vec
model_sg_n10 = Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=0, negative=5)

In [11]:
# 특어 단어와 유사한 단어 추출 : 이정재
model_sg_n10.wv.most_similar('이정재', topn=20)

[('이범수', 0.7904582619667053),
 ('공유', 0.7561853528022766),
 ('송강호', 0.7505705952644348),
 ('김윤석', 0.738364040851593),
 ('이성민', 0.7088558077812195),
 ('김범수', 0.6906686425209045),
 ('주지훈', 0.6826448440551758),
 ('김남길', 0.6823396682739258),
 ('조재현', 0.6817394495010376),
 ('송광호', 0.6713417172431946),
 ('마동석', 0.6645645499229431),
 ('황정민', 0.6528230905532837),
 ('이진욱', 0.6511832475662231),
 ('요한', 0.6501532196998596),
 ('김명민', 0.6479749083518982),
 ('정우성', 0.6473250389099121),
 ('박해일', 0.6467392444610596),
 ('이병헌', 0.6463501453399658),
 ('곽도원', 0.6439253687858582),
 ('차승원', 0.637101411819458)]

In [12]:
# 특어 단어와 유사한 단어 추출 : 재밌
model_sg_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.9101809859275818),
 ('재밌네', 0.8362551927566528),
 ('재밌어', 0.8099352717399597),
 ('재밌었', 0.8097404837608337),
 ('재밋음', 0.8010364770889282),
 ('재밋어', 0.7385686635971069),
 ('재미있네', 0.7343350052833557),
 ('재미있었', 0.7308254837989807),
 ('재밌는', 0.7284349799156189),
 ('재미있어', 0.7132488489151001),
 ('잼남', 0.7101936936378479),
 ('재밌더', 0.7075064182281494),
 ('재밋엇어', 0.7025944590568542),
 ('재밋네', 0.6909552216529846),
 ('재밌던', 0.6903647780418396),
 ('재밋', 0.6872291564941406),
 ('꿀잼', 0.6750771403312683),
 ('재밌다', 0.6644400358200073),
 ('재밌고', 0.6599054336547852),
 ('재미있던', 0.6531773805618286)]

### CBOW, negative=10 인 경우

In [15]:
model_cb_n10 = Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=1, negative=10)

In [16]:
model_cb_n10.wv.most_similar('이정재', topn=20)

[('공유', 0.8136433362960815),
 ('이범수', 0.8019043803215027),
 ('송강호', 0.7993380427360535),
 ('김범수', 0.7505640387535095),
 ('정우성', 0.7450835704803467),
 ('조재현', 0.7334835529327393),
 ('박해일', 0.7325501441955566),
 ('이병헌', 0.7303497195243835),
 ('김윤석', 0.7300711870193481),
 ('이성민', 0.7271612882614136),
 ('리암', 0.7239680886268616),
 ('김남길', 0.7197943925857544),
 ('김성균', 0.7035316228866577),
 ('이진욱', 0.7031553387641907),
 ('곽도원', 0.700999915599823),
 ('송광호', 0.6984753012657166),
 ('정진영', 0.6975743770599365),
 ('슨', 0.6973852515220642),
 ('마동석', 0.695650577545166),
 ('오종혁', 0.6950219869613647)]

In [None]:
model_cb_n10.wv.most_similar('재밌', topn=20)

### CBOW, negative=5 인 경우

In [17]:
model_cb_n10 = Word2Vec(corpus, window=3, min_count=3, vector_size=100, sg=1, negative=5)

In [19]:
model_cb_n10.wv.most_similar('이정재', topn=20)

[('이범수', 0.8296025395393372),
 ('송강호', 0.8211592435836792),
 ('공유', 0.7978448867797852),
 ('이성민', 0.7486298084259033),
 ('김남길', 0.7446230053901672),
 ('김범수', 0.7437304258346558),
 ('이병헌', 0.739829957485199),
 ('김윤석', 0.7328487038612366),
 ('마동석', 0.7237625122070312),
 ('황정민', 0.7199950814247131),
 ('조재현', 0.7185785174369812),
 ('정우성', 0.7171173095703125),
 ('김명민', 0.7152525186538696),
 ('박해일', 0.7130917906761169),
 ('송광호', 0.7091579437255859),
 ('주지훈', 0.7088465690612793),
 ('곽도원', 0.6990246176719666),
 ('박철민', 0.6958315372467041),
 ('요한', 0.6951549053192139),
 ('차승원', 0.6906208395957947)]

In [20]:
model_cb_n10.wv.most_similar('재밌', topn=20)

[('재미있', 0.897014319896698),
 ('재밋음', 0.8319304585456848),
 ('재밌네', 0.830389142036438),
 ('재밌었', 0.8180097341537476),
 ('잼남', 0.814484179019928),
 ('재밌어', 0.8135380744934082),
 ('재밋어용', 0.7971233129501343),
 ('잼슴', 0.7861104011535645),
 ('재밋엇음', 0.7845796346664429),
 ('재밌아', 0.7787049412727356),
 ('재밋었어', 0.7744437456130981),
 ('재밋구', 0.7737035155296326),
 ('재밋었음', 0.7719170451164246),
 ('재밋습니', 0.7707273960113525),
 ('재밋엇어용', 0.7675083875656128),
 ('재밋어', 0.7585514783859253),
 ('재밋엇', 0.7581174373626709),
 ('재밋네용', 0.7572782039642334),
 ('쟈밋', 0.7497585415840149),
 ('재밋엇어', 0.7481048703193665)]

### OOV(Out of Vocabulary) 문제

In [21]:
# corpus에 없는 단어 확인 : 우주평화 
'우주평화' in model_sg_n10.wv.key_to_index

False

In [22]:
# corpus에 없는 단어의 임베딩 벡터 확인 
model_sg_n10.wv['우주평화']

KeyError: "Key '우주평화' not present"

## 2. FastText 활용 영화 리뷰 워드 임베딩
* https://radimrehurek.com/gensim/models/fasttext.html

In [29]:
# FastText 모델 생성 및 학습
# window=3, min_count=3, min_n=2, max_n=2
from gensim.models import FastText

ft_model = FastText(corpus, window=3, min_count=3, min_n=2, max_n=2, vector_size=100, negative=10, sg=1)

In [30]:
# 특정 단어와 유사한 단어 추출 : 이정재
ft_model.wv['이정재']

array([ 0.05495796,  0.30086234, -0.20551567, -0.37259862,  0.05008907,
       -0.22169146, -0.3239295 ,  0.4676009 ,  0.42100468, -0.03539502,
        0.00141502,  0.10858686, -0.40184745, -0.27423128, -0.0405279 ,
       -0.16759375, -0.1616519 , -0.48824912,  0.3306606 , -0.47718048,
        0.06288347, -0.38693112, -0.06490654, -0.11347349, -0.07016869,
       -0.23530245, -0.03755207, -0.3650286 , -0.53182167, -0.35977742,
       -0.0509521 , -0.53698194, -0.03023138, -0.15469074, -0.17728046,
       -0.12777595,  0.23052475, -0.01166559, -0.40750784,  0.03676789,
       -0.4141593 ,  0.12610334, -0.34435958, -0.14059243, -0.5596973 ,
        0.15597852, -0.3711471 , -0.50886893,  0.25279492, -0.03119456,
        0.03227868,  0.12396779,  0.36315697,  0.19359976, -0.10277767,
        0.25976348,  0.2664705 ,  0.19114162, -0.19478738, -0.06711166,
        0.2712919 , -0.34504265, -0.1300928 ,  0.42535537, -0.30009767,
       -0.53745925,  0.33721304,  0.00399266, -0.02284039,  0.11

In [31]:
# corpus에 없는 단어 확인 : 우주평화 
'우주평화' in ft_model.wv.key_to_index

False

In [32]:
# corpus에 없는 단어의 임베딩 벡터 확인 
ft_model.wv['우주평화']

array([ 3.90191376e-01,  2.67097384e-01,  1.23485468e-01,  7.55360276e-02,
       -2.13705301e-01,  1.86671619e-03, -1.09035112e-01,  6.66684628e-01,
        3.32995951e-01,  4.72357839e-01, -1.02076732e-01,  3.05256367e-01,
        1.27685681e-01,  3.01327288e-01, -2.28782728e-01, -2.98519582e-01,
        5.99298365e-02, -1.09253988e-01,  3.18587311e-02, -5.53175099e-02,
        1.77309126e-01,  6.24839440e-02,  1.59414764e-02, -1.55700386e-01,
        2.89834235e-02, -8.05145479e-04, -3.91776234e-01, -3.79867256e-01,
       -4.06154454e-01, -3.54035676e-01,  1.28006309e-01, -2.45981172e-01,
        4.54333723e-02, -2.81468540e-01, -2.86287814e-01,  6.56624287e-02,
        3.02144557e-01,  4.42125708e-01, -3.10764879e-01,  1.47598177e-01,
       -3.54213655e-01,  1.12020671e-01, -2.90666461e-01,  1.49042845e-01,
       -4.09696192e-01,  6.06012829e-02, -1.23640969e-01, -1.37411624e-01,
       -1.62551314e-01,  1.67607293e-01,  2.14306474e-01,  2.14006096e-01,
       -7.10231885e-02, -

In [33]:
# corpus에 없는 단어와 유사한 단어추출
ft_model.wv.most_similar('우주평화')

[('우주', 0.8293965458869934),
 ('평화', 0.8106266260147095),
 ('우주비행사', 0.8075344562530518),
 ('우장', 0.7874744534492493),
 ('우주선', 0.7871056199073792),
 ('아비규환', 0.7827560305595398),
 ('지구대', 0.7820226550102234),
 ('우방', 0.780914843082428),
 ('지옥도', 0.7803426384925842),
 ('지구촌', 0.7794464230537415)]