# TF2, Gensim을 사용해서 임베딩 공간 탐색

In [1]:
import os
import sys
import tensorflow as tf
from tensorflow.keras import datasets, layers, models, optimizers
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import clear_output
from six.moves import urllib
from tensorflow import feature_column as fc
import tensorflow_datasets as tfds
plt.rcParams["font.family"] = 'NanumBarunGothic'
TENSORBOARD_BINARY = '/home/hoondori/anaconda3/envs/ai/bin/tensorboard'
os.environ['TENSORBOARD_BINARY'] =  TENSORBOARD_BINARY
%load_ext tensorboard

def _use_first_gpu_only(devices):
    """Restrict TensorFlow to the first GPU in *devices* and enable memory growth.

    Does nothing when *devices* is empty. A ``RuntimeError`` raised by
    TensorFlow is printed rather than propagated.
    """
    if not devices:
        return
    primary = devices[0]
    try:
        # Limit TensorFlow to the first GPU only.
        tf.config.experimental.set_visible_devices(primary, 'GPU')
        # Memory growth has to be configured at program start.
        tf.config.experimental.set_memory_growth(primary, True)
    except RuntimeError as err:
        # Visible devices have to be configured at program start.
        print(err)


gpus = tf.config.experimental.list_physical_devices('GPU')
_use_first_gpu_only(gpus)

# Gensim 데이터를 통해서 word2vec 모델 생성 

In [7]:
from gensim.models import KeyedVectors
import gensim.downloader as api
from gensim.models import Word2Vec

# Load the "text8" corpus through the gensim downloader, train a Word2Vec
# model on it with the default settings, and save the trained model to disk.
dataset = api.load("text8")
model = Word2Vec(sentences=dataset)
model.save('text8-word2vec.bin')

In [8]:
from gensim.models import KeyedVectors
from gensim.models import Word2Vec

# Read the saved model back in. 'text8-word2vec.bin' is written by
# `Word2Vec.save`, so it holds a full Word2Vec model and should be restored
# with `Word2Vec.load`; `KeyedVectors.load` is meant for files written by
# `KeyedVectors.save` and only happened to work through the shared loader.
model = Word2Vec.load('text8-word2vec.bin')
# Handle to the trained word vectors, used by the queries in later cells.
word_vectors = model.wv

In [19]:
# Inspect the vocabulary: print the first 10 words and the vocabulary size.
# gensim >= 4.0 exposes the word -> index mapping as `key_to_index` and no
# longer provides `vocab`; gensim 3.x only has the `vocab` dict, so fall
# back to it when `key_to_index` is missing.
try:
    vocab_words = word_vectors.key_to_index
except AttributeError:
    vocab_words = word_vectors.vocab
# Iterating a dict yields its keys, so no explicit .keys()/enumerate is needed.
print(list(vocab_words)[:10])
print(f'voca size: {len(vocab_words)}')

['anarchism', 'originated', 'as', 'a', 'term', 'of', 'abuse', 'first', 'used', 'against']
voca size: 71290


In [21]:
# Find the words most similar to a given word.
#  Same as word_vectors.similar_by_word("king", topn=5)
# Ask for exactly 5 neighbours via `topn` instead of slicing the default top 10.
word_vectors.most_similar("king", topn=5)

[('queen', 0.7446538209915161),
 ('prince', 0.7367112636566162),
 ('throne', 0.6983506679534912),
 ('emperor', 0.6961647272109985),
 ('kings', 0.6851218938827515)]

In [25]:
# Analogy query: search for similar words using vector arithmetic.
#  paris:france :: berlin:germany  => france - paris + berlin = germany
#  king:man :: queen:woman         => man - king + queen = woman
# Ask for exactly 5 results via `topn` instead of slicing the default top 10.
word_vectors.most_similar(positive=["france", "berlin"], negative=["paris"], topn=5)

[('germany', 0.7759740948677063),
 ('russia', 0.7300741076469421),
 ('austria', 0.6964518427848816),
 ('finland', 0.6880278587341309),
 ('hungary', 0.6857718229293823)]

In [26]:
# Same analogy query, ranked with the cosmul similarity measure.
# Ask for exactly 5 results via `topn` instead of slicing the default top 10.
word_vectors.most_similar_cosmul(positive=["france", "berlin"], negative=["paris"], topn=5)

[('germany', 0.9602494239807129),
 ('russia', 0.9475181102752686),
 ('finland', 0.9272105693817139),
 ('lithuania', 0.91187983751297),
 ('austria', 0.9082432985305786)]

In [29]:
# cosmul analogy query: man - king + queen.
# Ask for exactly 5 results via `topn` instead of slicing the default top 10.
word_vectors.most_similar_cosmul(positive=["man", "queen"], negative=["king"], topn=5)

[('woman', 1.0139929056167603),
 ('girl', 0.9544166326522827),
 ('lady', 0.8891240954399109),
 ('baby', 0.8838143944740295),
 ('naked', 0.8804119229316711)]

In [35]:
# Detect the odd one out: the word that does not fit with the rest of the list.
word_vectors.doesnt_match(
    words=['hindus', 'parsis', 'singapore', 'christian'],
)

'singapore'

In [40]:
# Compute the similarity score between two words and print it.
# The scores are interpolated through f-strings; keep that form, since the
# printed digits depend on how the returned value is formatted there.
print(f'man to tree : {word_vectors.similarity("man", "tree")}')
print(f'man to woman : {word_vectors.similarity("man", "woman")}')

man to tree : 0.2851971685886383
man to woman : 0.7528449892997742


In [43]:
# Fetch the raw embedding of one word (a NumPy vector) and display its shape.
wv = word_vectors.get_vector('computer')
wv.shape

(100,)

In [53]:
# Distance between two sentences: lower-case each sentence, split it into
# tokens on whitespace, then compare the token lists with wmdistance.
sen1 = 'The president greets the press in Chicago'.lower().split()
sen2 = 'The president greets the media in Washington'.lower().split()
word_vectors.wmdistance(document1=sen1, document2=sen2)

5.343210180655011

# Gensim 모델 평가

In [56]:
# Evaluate the model against human word-similarity judgements using the
# 'wordsim353.tsv' file resolved through gensim's test-data helper.
from gensim.test.utils import datapath

similarities = model.wv.evaluate_word_pairs(
    pairs=datapath('wordsim353.tsv'),
)
similarities

((0.6121774038260248, 1.7836714753797042e-37),
 SpearmanrResult(correlation=0.6269028863883931, pvalue=9.923148099037103e-40),
 0.56657223796034)

In [None]:
# Evaluate the model on the 'questions-words.txt' analogy set; the result is
# unpacked into an overall score and the remaining details, and the score is
# displayed.
analogy_scores, details = model.wv.evaluate_word_analogies(
    analogies=datapath('questions-words.txt'),
)
analogy_scores