# Word2Vec 연습

PTB 데이터셋을 이용하여 word2vec(skip-gram) 알고리즘을 구현해보는 연습

In [1]:
import sys
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt

device = torch.device('cuda')

다음 PTB 데이터셋은 [여기](https://github.com/tomsercu/lstm)에서 다운로드

In [2]:
from pathlib import Path

DATA_PATH =  Path('data')
File_name = 'ptb.train.txt'
print(DATA_PATH / File_name) # Path 설정 (운영체제 별로 / \ 혼용되기 때문)

data\ptb.train.txt


In [3]:
corpus = open(DATA_PATH / File_name).read().replace('\n','<eos>').strip().split()

In [5]:
corpus[:20]

['aer',
 'banknote',
 'berlitz',
 'calloway',
 'centrust',
 'cluett',
 'fromstein',
 'gitano',
 'guterman',
 'hydro-quebec',
 'ipo',
 'kia',
 'memotec',
 'mlx',
 'nahb',
 'punts',
 'rake',
 'regatta',
 'rubens',
 'sim']

In [6]:
corpus[929580:]

['has',
 'been',
 'less',
 'prominent',
 'according',
 'to',
 'mr.',
 '<unk>',
 '<eos>']

In [7]:
len(corpus)

929589

이제 Look-up 테이블과 단어장을 만들자

In [8]:
word_to_id = {}
id_to_word = {}

In [9]:
for word in corpus:
    if word not in word_to_id:
        tmp_id = len(word_to_id)
        word_to_id[word]= tmp_id
        id_to_word[tmp_id]=word

In [10]:
len(word_to_id)

10000

In [11]:
vocab = set(word_to_id.keys())

In [12]:
len(vocab)

10000

하이퍼 파라미터 설정

In [13]:
HIDDEN_UNIT = 100
WINDOW_SIZE = 5
bs = 64

corpus를 n-그램으로 쪼개자

In [14]:
[corpus[0]]+[corpus[1]]

['aer', 'banknote']

In [15]:
context = []
center_words = []
for i in range(WINDOW_SIZE, len(corpus)-WINDOW_SIZE):
    context.append([corpus[i-j-1] for j in range(WINDOW_SIZE)]+
                             [corpus[i+j+1] for j in range(WINDOW_SIZE) ])
    center_words.append(corpus[i])

In [18]:
center_words[0],context[0]

('cluett',
 ['centrust',
  'calloway',
  'berlitz',
  'banknote',
  'aer',
  'fromstein',
  'gitano',
  'guterman',
  'hydro-quebec',
  'ipo'])

이제 Center words와 Context words를 벡터화하자.

In [19]:
def word_to_id_f(word):
    return word_to_id[word]

In [20]:
cent_vec = []
for cen in center_words:
    cent_vec.append(word_to_id[cen])
cent_vec = torch.tensor(cent_vec)

In [21]:
cont_vec = []
for con in context:
    mp = map(word_to_id_f, con)
    vec = [i for i in mp]
    cont_vec.append(vec)
cont_vec = torch.tensor(cont_vec)    

In [22]:
cont_vec[0], cent_vec[0], len(cent_vec)

(tensor([ 4,  3,  2,  1,  0,  6,  7,  8,  9, 10]), tensor(5), 929579)

Continuous Bag of Words 모델 생성

In [24]:
# CBOW model
class Word2vec(nn.Module):
    
    def __init__(self, vocab=vocab, hidden_unit = HIDDEN_UNIT, window_size=WINDOW_SIZE):
        super(Word2vec, self).__init__()

        self.size = len(vocab)
        self.hidden_unit = hidden_unit
        self.window_size = window_size
        self.W1 = nn.Parameter(torch.randn(self.size, self.hidden_unit))
        self.W2 = nn.Parameter(torch.randn(self.hidden_unit, self.size))

        
    def forward(self, x):
        if x.dim() == 1:
            vec = torch.zeros((1,self.size))
            for j in range(2*self.window_size):
                vec[0,x[j]]=1
                
        else:
            vec = torch.zeros((len(x),self.size))
            for i in range(len(x)):
                for j in range(2*self.window_size):
                    vec[i,x[i,j]]=1
                
        sample = (vec@self.W1)/(2*self.window_size)
        pred = sample@self.W2
        
        return pred


In [25]:
word2vec = Word2vec()

In [26]:
word2vec(cont_vec[0]).size()

torch.Size([1, 10000])

In [28]:
for i, param in enumerate(word2vec.named_parameters()):
    print(param)

('W1', Parameter containing:
tensor([[-0.9751,  1.0305, -1.9175,  ...,  0.0443, -1.9616, -0.1491],
        [-1.8496,  0.2325,  1.1690,  ...,  0.8303,  0.3290,  0.4017],
        [-1.7079, -0.4436,  0.1540,  ..., -2.0649, -1.2608,  0.0861],
        ...,
        [-0.1758, -0.1990,  0.5746,  ..., -0.7732,  0.8972,  0.3080],
        [-1.0515,  1.3140, -0.2354,  ...,  0.8983, -0.8010, -1.0314],
        [-0.1126, -0.1586, -0.4540,  ..., -0.9489, -0.7039, -1.1548]],
       requires_grad=True))
('W2', Parameter containing:
tensor([[ 0.0434, -0.1767, -0.2205,  ...,  1.3001,  0.3414,  2.4879],
        [ 0.6529, -0.8231, -0.7129,  ...,  0.2034,  0.4562,  1.8148],
        [-0.3662,  1.5896, -1.4049,  ..., -1.7577, -1.6136,  1.0697],
        ...,
        [ 0.6154,  0.2480,  1.7762,  ..., -0.0784, -1.0135, -0.4529],
        [-0.4520,  1.3719,  1.1661,  ...,  0.4912,  0.0528,  1.1597],
        [ 1.2228,  0.4209, -0.1258,  ...,  1.1939,  1.0815, -0.1474]],
       requires_grad=True))


손실함수는 크로스 엔트로피, 최적화 알고리즘은 SGD 사용

In [29]:
loss_func = F.cross_entropy
optim = torch.optim.SGD(word2vec.parameters(), lr=0.01)

In [30]:
x = word2vec(cont_vec[0])
y = cent_vec[0]

In [31]:
x.size()

torch.Size([1, 10000])

In [32]:
y

tensor(5)

In [33]:
loss_func(word2vec(cont_vec[0]),  cent_vec[0].view(1))

tensor(14.2806, grad_fn=<NllLossBackward>)

이제 모델을 훈련하자

In [35]:
import time

In [58]:
start_t = time.time()
epochs = 5

for epoch in range(epochs):
    loss_s = 0
    for i in range((len(cont_vec) - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        
        xb = cont_vec[start_i:end_i]
        yb = cent_vec[start_i:end_i]
        
        pred = word2vec(xb)
        
        loss = loss_func(pred, yb)
        loss_s += loss
        
        loss.backward()
        optim.step()
        optim.zero_grad()
        
        if i % 1000 == 999 :
            print(epoch+1, i+1, loss_s/1000)
            loss_s=0

tt = round(time.time()-start_t)            
print("Training time :" + str(tt//60) + "m " + str(tt%60) + "s ")     

1 1000 tensor(13.2573, grad_fn=<DivBackward0>)
1 2000 tensor(13.0176, grad_fn=<DivBackward0>)
1 3000 tensor(12.8929, grad_fn=<DivBackward0>)
1 4000 tensor(12.6945, grad_fn=<DivBackward0>)
1 5000 tensor(12.5660, grad_fn=<DivBackward0>)
1 6000 tensor(12.3469, grad_fn=<DivBackward0>)
1 7000 tensor(12.2688, grad_fn=<DivBackward0>)
1 8000 tensor(12.0607, grad_fn=<DivBackward0>)
1 9000 tensor(12.0058, grad_fn=<DivBackward0>)
1 10000 tensor(11.8761, grad_fn=<DivBackward0>)
1 11000 tensor(11.9002, grad_fn=<DivBackward0>)
1 12000 tensor(11.7814, grad_fn=<DivBackward0>)
1 13000 tensor(11.6586, grad_fn=<DivBackward0>)
1 14000 tensor(11.5322, grad_fn=<DivBackward0>)
2 1000 tensor(11.5925, grad_fn=<DivBackward0>)
2 2000 tensor(11.4488, grad_fn=<DivBackward0>)
2 3000 tensor(11.5197, grad_fn=<DivBackward0>)
2 4000 tensor(11.4238, grad_fn=<DivBackward0>)
2 5000 tensor(11.4039, grad_fn=<DivBackward0>)
2 6000 tensor(11.2644, grad_fn=<DivBackward0>)
2 7000 tensor(11.2712, grad_fn=<DivBackward0>)
2 8000 t

훈련 결과를 일단 저장

In [59]:
torch.save(word2vec.state_dict(), 'cbow.pt')

In [64]:
import copy

훈련된 단어 표현 불러오기

In [68]:
for data in word2vec.parameters():
    W = copy.deepcopy(data)
    break

In [69]:
W.size()

torch.Size([10000, 100])

단어에 매핑 되는 벡터 불러오기

In [90]:
def word_to_vec(word):
    return W[word_to_id[word]].view(1,-1)

In [91]:
word_to_vec('king')

tensor([[-1.3845e-01,  5.1904e-01,  1.3160e+00,  3.6017e-01,  8.1579e-02,
          1.0051e+00,  8.5556e-02, -1.0812e+00,  5.5394e-01,  1.0792e+00,
         -1.1921e+00,  1.6524e+00,  4.1688e-01,  6.4928e-01, -3.0679e-01,
         -1.2483e+00, -1.1141e+00, -8.8036e-01, -6.0241e-01, -2.1682e-01,
          4.4941e-01, -1.5590e+00, -4.0219e-01,  7.6776e-01,  1.1388e+00,
          1.1669e+00,  9.2264e-01,  1.4978e+00,  2.5769e-01,  1.0776e-01,
          1.2982e-03, -8.8100e-01,  2.4780e-01, -7.0344e-01,  9.7442e-01,
         -3.6403e-01,  1.6898e+00,  3.4519e-01, -8.1621e-02,  5.4540e-01,
         -6.8370e-01, -2.5154e-01, -2.4793e-01,  3.0588e-02, -3.6264e-01,
          4.4998e-01,  5.2382e-02, -8.4516e-01,  2.3926e-01,  2.6648e-01,
         -7.6238e-01, -5.1847e-01,  1.9922e+00, -6.5274e-01, -3.4827e-01,
         -9.9975e-01, -2.9840e-01,  4.0354e-01,  5.3319e-01,  1.7421e+00,
         -4.6960e-01, -8.2118e-01,  4.4036e-01,  5.7897e-01,  2.4332e-01,
         -3.4287e-01,  5.0929e-01,  3.

In [100]:
k, m, q,w  = word_to_vec('king'), word_to_vec('man'), word_to_vec('queen'), word_to_vec('woman')
ans = k -m + w

In [121]:
def cos_sim(x,y):
    return (torch.sum(x*y) / torch.sqrt((torch.sum(x*x))*(torch.sum(y*y)))).item()

In [117]:
ans_m = torch.zeros_like(W)
ans_m.size()

torch.Size([10000, 100])

모든 단어에 대한 cosine similarity 계산

In [122]:
ans_m = [cos_sim(ans, W[i]) for i in range(len(vocab))]

In [123]:
ans_m

[0.031105658039450645,
 -0.06424856185913086,
 0.009941181167960167,
 -0.005522926338016987,
 0.11785838007926941,
 -0.017450157552957535,
 -0.009432808496057987,
 0.11596625298261642,
 -0.05872269347310066,
 0.03671529144048691,
 -0.006257044617086649,
 -0.17627476155757904,
 -0.14356066286563873,
 0.04618173837661743,
 -0.06440877914428711,
 -0.10119396448135376,
 -0.26191210746765137,
 0.06462448835372925,
 0.012532428838312626,
 -0.011429478414356709,
 0.024043846875429153,
 -0.09419605880975723,
 -0.08623280376195908,
 0.06988555192947388,
 -0.14107777178287506,
 0.1118343397974968,
 -0.1431577503681183,
 -0.02995377592742443,
 0.09832093119621277,
 0.034563545137643814,
 0.12666448950767517,
 -0.08430101722478867,
 -0.057016659528017044,
 0.05489868298172951,
 0.12376685440540314,
 -0.1346730887889862,
 0.07886438071727753,
 0.11971766501665115,
 -0.11823081225156784,
 -0.06040956825017929,
 0.02661706507205963,
 0.020304124802350998,
 -0.0055317506194114685,
 0.09228567034006119

In [135]:
for i in np.array(ans_m).argsort()[-5:][::-1]:
    print(id_to_word[i])
    

woman
king
appearances
construct
costing


망했다!

이를 일반화해보자.

$$ \text{word}1 : \text{word}2 \simeq \text{word}3 : \text{word}4 $$

라는 관계를 얻을 수 있는 지 유추하기 위해, $\text{word}1, \text{word}2 , \text{word}4 $ 를 입력하면 $\text{word}3$와 가장 유사한 상위 5개의 단어를 얻는 함수를 만들자.

In [145]:
def tri_analogy(w1, w2, w4):
    ans = word_to_vec(w1)-word_to_vec(w2)+word_to_vec(w4)
    
    ans_m = [cos_sim(ans, W[i]) for i in range(len(vocab))]
    for i in np.array(ans_m).argsort()[-5:][::-1]:
        print(id_to_word[i])


In [146]:
tri_analogy('king','man','woman')

woman
king
appearances
construct
costing


In [147]:
tri_analogy('korea','seoul','london')

korea
london
garage
suspected
revise


아무래도 거지같으니까 훈련을 더 해보자. 5에폭을 더 해서 총 10에폭을 하자.

In [148]:
start_t = time.time()
epochs = 5

for epoch in range(epochs):
    loss_s = 0
    for i in range((len(cont_vec) - 1) // bs + 1):
        start_i = i * bs
        end_i = start_i + bs
        
        xb = cont_vec[start_i:end_i]
        yb = cent_vec[start_i:end_i]
        
        pred = word2vec(xb)
        
        loss = loss_func(pred, yb)
        loss_s += loss
        
        loss.backward()
        optim.step()
        optim.zero_grad()
        
        if i % 1000 == 999 :
            print(epoch+1, i+1, loss_s/1000)
            loss_s=0

tt = round(time.time()-start_t)            
print("Training time :" + str(tt//60) + "m " + str(tt%60) + "s ")     

1 1000 tensor(9.9863, grad_fn=<DivBackward0>)
1 2000 tensor(9.8478, grad_fn=<DivBackward0>)
1 3000 tensor(9.9594, grad_fn=<DivBackward0>)
1 4000 tensor(9.9103, grad_fn=<DivBackward0>)
1 5000 tensor(9.9202, grad_fn=<DivBackward0>)
1 6000 tensor(9.8065, grad_fn=<DivBackward0>)
1 7000 tensor(9.8546, grad_fn=<DivBackward0>)
1 8000 tensor(9.7052, grad_fn=<DivBackward0>)
1 9000 tensor(9.7065, grad_fn=<DivBackward0>)
1 10000 tensor(9.7433, grad_fn=<DivBackward0>)
1 11000 tensor(9.8171, grad_fn=<DivBackward0>)
1 12000 tensor(9.7568, grad_fn=<DivBackward0>)
1 13000 tensor(9.6856, grad_fn=<DivBackward0>)
1 14000 tensor(9.5978, grad_fn=<DivBackward0>)
2 1000 tensor(9.7777, grad_fn=<DivBackward0>)
2 2000 tensor(9.6405, grad_fn=<DivBackward0>)
2 3000 tensor(9.7510, grad_fn=<DivBackward0>)
2 4000 tensor(9.7103, grad_fn=<DivBackward0>)
2 5000 tensor(9.7186, grad_fn=<DivBackward0>)
2 6000 tensor(9.6076, grad_fn=<DivBackward0>)
2 7000 tensor(9.6610, grad_fn=<DivBackward0>)
2 8000 tensor(9.5140, grad_fn

In [149]:
torch.save(word2vec.state_dict(), 'cbow_10epochs.pt')

In [150]:
for data in word2vec.parameters():
    W = copy.deepcopy(data)
    break

In [151]:
tri_analogy('king','man','woman')

woman
king
appearances
construct
costing


In [152]:
tri_analogy('korea','seoul','london')

korea
london
garage
suspected
revise


개망