# 4. word2vec 속도 개선 

In [1]:
import numpy as np

## 4.1.2 Embedding 계층 구현

In [2]:
# W_in이라 가정
W = np.arange(21).reshape(7,3)
W

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14],
       [15, 16, 17],
       [18, 19, 20]])

In [3]:
i=2
j=3
print(f"{i}번째 단어의 분산 표현:  {W[i]}")
print(f"{j}번째 단어의 분산 표현:  {W[j]}")

2번째 단어의 분산 표현:  [6 7 8]
3번째 단어의 분산 표현:  [ 9 10 11]


In [4]:
# 여러 분산 표현 가져오기
idx = np.array([1,0,6,0,5])
W[idx]

array([[ 3,  4,  5],
       [ 0,  1,  2],
       [18, 19, 20],
       [ 0,  1,  2],
       [15, 16, 17]])

In [5]:
# Embedding 계층 구현
class Embedding :
    def __init__(self, W) :
        self.params = [W]
        self.grads = [np.zeros_like(W)]
        self.idx = None

    def forward(self, idx) :
        W, = self.params
        self.idx = idx
        out = W[idx]
        return out

    def backward(self, dout) :
        dW, = self.grads
        dW[...] = 0
        np.add.at(dW, self.idx, dout)

        return None

In [6]:
# Embeding층 값 예시
dW = np.zeros((3,3))
print("dW:")
print(dW)
dout = np.arange(12).reshape(4,3)
print("dout:")
print(dout)
idx = [0,2,1,2]
print("idx:")
print(idx)
print("embedding층 이후")
np.add.at(dW,idx,dout)
print(dW)

dW:
[[0. 0. 0.]
 [0. 0. 0.]
 [0. 0. 0.]]
dout:
[[ 0  1  2]
 [ 3  4  5]
 [ 6  7  8]
 [ 9 10 11]]
idx:
[0, 2, 1, 2]
embedding층 이후
[[ 0.  1.  2.]
 [ 6.  7.  8.]
 [12. 14. 16.]]


## 4.2.3 시그모이드 함수와 교차 엔트로피 차차

In [7]:
class SigmoidWithLoss :
    def __init__(self) :
        self.params, self.grads = [], []
        self.loss = []
        self.y = None 
        self.t = None

    def forward(self, x, t) :
        self.t = t
        self.y = 1 / (1 + np.exp(-x))

        self.loss = cross_entropy_error(np.c_[1-self.y, self.y], self.t)

        return self.loss

    def backward(self, dout = 1) :
        batch_size = self.t.shape[0]

        

## 4.2.4 다중분류에서 이진분류로(구현) 

In [8]:
class EmbeddingDot :
    def __init__(self, W) :
        self.embed = Embedding(W)
        self.params = self.embed.params
        self.grads = self.embed.grads
        self.cache = None

    def forward(self, h, idx) :
        target_W = self.embed.forward(idx)
        out = np.sum(target_W * h, axis=1)

        self.cache = (h, target_W)
        return out
    
    def backward(self, dout) :
        h, target_W = self.cache
        dout = dout.reshape(dout.shape[0], 1)

        dtarget_W = dout * h
        self.embed.backward(dtarget_W)
        dh = dout * target_W
        return dh

## 4.2.6 네거티브 샘플링의  샘플링 기법

In [9]:
# 넘파이에서 샘플링하는 예

words = ['you', 'say', 'goodbye', 'I', 'hello', '.']
np.random.choice(words)

np.random.choice(words, size = 5)

np.random.choice(words, size = 5, replace = False)

p = [0.5,0.1,0.05,0.2,0.05,0.1]
np.random.choice(words, p=p)


'you'

In [10]:
# 원래 확률이 낮은 단어의 확률을 높이기 위해 0.75 작업해주기
p = [0.7, 0.29, 0.01]
new_p = np.power(p, 0.75)
new_p /= np.sum(new_p)
print(new_p)

[0.64196878 0.33150408 0.02652714]


In [11]:
import sys
sys.path.append('..')
from common.np import *
import collections 
# 말뭉치에서 단어의 확률분포를 만들고, 다시 0.75를 제곱한 다음 부정적 예를 샘플링 하는 클래스
class UnigramSampler :
    def __init__(self, corpus, power, sample_size) :
        self.sample_size = sample_size
        self.vocab_size = None
        self.word_p = None

        counts = collections.Counter()
        for word_id in corpus :
            counts[word_id] += 1

        vocab_size = len(counts)
        self.vocab_size = vocab_size
        
        self.word_p = np.zeros(vocab_size)
        for i in range(vocab_size) :
            self.word_p[i] = counts[i]

        self.word_p = np.power(self.word_p, power)
        self.word_p /= np.sum(self.word_p)

    def get_negative_sample(self, target) :
        batch_size = target.shape[0]
        negative_sample = np.random.choice(self.vocab_size, size = (batch_size, self.sample_size), replace=True, p=self.word_p)

        return negative_sample
    
'''
        if not GPU :
            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)

            for i in range(batch_size) :
                p = self.word_p.copy()
                target_idx = target[i]
                p[target_idx] = 0
                p /= p.sum()
                negative_sample[i, :] = np.random.choice(self.vocab_size, size = self.sample_size)
                
        else :
'''
        

'\n        if not GPU :\n            negative_sample = np.zeros((batch_size, self.sample_size), dtype=np.int32)\n\n            for i in range(batch_size) :\n                p = self.word_p.copy()\n                target_idx = target[i]\n                p[target_idx] = 0\n                p /= p.sum()\n                negative_sample[i, :] = np.random.choice(self.vocab_size, size = self.sample_size)\n                \n        else :\n'

In [12]:
# UnigramSampler 사용 예시
corpus = np.array([0,1,2,3,4,1,2,3])
power = 0.75
sample_size = 2

sampler = UnigramSampler(corpus, power, sample_size)
target = np.array([1,3,0])
negative_sample=sampler.get_negative_sample(target)
print(negative_sample)


[[1 1]
 [4 3]
 [1 1]]


In [13]:
# 네거티브 샘플링 구현
class NegativeSamplingLoss :
    def __init__(self, W, corpus, power=0.75, sample_size = 5) :
        self.sample_size = sample_size
        self.sampler = UnigramSampler(corpus, power, sample_size)
        self.loss_layers = [SigmoidWithLoss() for _ in range(sample_size + 1)]
        self.embed_dot_layers = [EmbeddingDot(W) for _ in range(sample_size + 1)]

        self.params, self.grads = [], []
        for layer in self.embed_dot_layers :
            self.params += layer.params
            self.grads += layer.grads

    def forward(self, h, target) :
        batch_size = target.shape[0]
        negative_sample = self.sampler.get_negative_sample(target)

        # 긍정적인 예 순전파
        score = self.embed_dot_layers[0].forward(h, target)
        correct_label = np.ones(batch_size, dtype=np.int32)
        loss = self.loss_layers[0].forward(score, correct_label)

        # 부정적인 예 순전파
        negative_label = np.zeros(batch_size, dtype=np.int32)
        for i in range(self.sample_size) :
            negative_target = negative_sample[:,i]
            score = self.embed_dot_layers[1+i].forward(h, negative_target)
            loss += self.loss_layers[1+i].forward(score, negative_label)
            
        return loss
    
    def backward(self, dout = 1) :
        dh = 0
        for l0, l1 in zip(self.loss_layers, self.embed_dot_layers) :
            dscore = l0.backward(dout)
            dh += l1.backward(dscore)
            
        return dh

## 4.3.1 CBOW 모델 구현

In [14]:
# 개선된 CBOW 모델의 구현

class CBOW :
    def __init__(self, vocab_size, hidden_size, window_size, corpus) :
        V, H = vocab_size, hidden_size

        #가중치 초기화
        W_in = 0.01 * np.random.randn(V, H).astype('f')
        W_out = 0.01 * np.random.randn(V, H).astype('f')

        # 계층 생성
        self.in_layers = []
        for i in range(2 * window_size) :
            layer = Embedding(W_in)
            self.in_layers.append(layer)
        self.ns_loss = NegativeSamplingLoss(W_out, corpus, power = 0.75, sample_size=5)

        # 모든 가중치와 기울기를 배열에 모은다.
        layers = self.in_layers + [self.ns_loss]
        self.params, self.grads = [], []
        for layer in layers :
            self.params += layer.params
            self.grads += layer.grads

        # 인스턴스 변수에 단어의 분산 표현을 저장한다.
        self.word_vecs = W_in

    def forward(self, contexts, target) :
        h = 0
        for i, layer in enumerate(self.in_layers) :
            h += layer.forward(contexts[:,i])
        h *= 1 / len(self.in_layers)
        loss = self.ns_loss.forward(h, target)
        return loss

    def backward(self, dout = 1) :
        dout = self.ns_loss.backward(dout) 
        dout *= 1 / len(self.in_layers)
        for layer in self.in_layers :
            layer.backward(dout)
        return None

## 4.3.2 CBOW 모델 학습 코드

In [15]:
import sys
sys.path.append('..')
from common import config
#config.GPU = True
import pickle
from common.functions import cross_entropy_error
from common.trainer import Trainer
from common.optimizer import Adam
from common.util import create_contexts_target, to_cpu, to_gpu
from dataset import ptb

# 하이퍼파라미터 설정
window_size = 5
hidden_size = 100
batch_size = 100
max_epoch = 10

# 데이터 읽기
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)

contexts, target = create_contexts_target(corpus, window_size)


In [16]:
# 전처리 확인기기
print(corpus[:160])
print(id_to_word[42])
print(contexts[:10])
print(target[:10])
print("-"*40)
print("id_to_word[target[100]] :")
print(id_to_word[int(target[100])])
print("100번째 문맥에서의 단어들")
for word_ID in map(int, contexts[100]) :
    print(id_to_word[word_ID])

[  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  27  24  39  26  40  41  42  26  43  32  44  45  46  24  47
  26  27  28  29  48  49  41  42  50  51  52  53  54  55  35  36  37  42
  56  57  58  59  24  35  60  42  61  62  63  64  65  66  67  68  69  70
  35  71  72  42  73  74  75  35  46  42  76  77  64  78  79  80  27  28
  81  82  83  24  32  61  84  26  40  85  26  62  78  86  32  26  87  88
  89  90  64  78  91  92  93  94  95  96  97  82  98  24  26  99  32 100
  42 101 102  26 103  93 104  66 105 106 107  26 108 109  26  67]
of
[[ 0  1  2  3  4  6  7  8  9 10]
 [ 1  2  3  4  5  7  8  9 10 11]
 [ 2  3  4  5  6  8  9 10 11 12]
 [ 3  4  5  6  7  9 10 11 12 13]
 [ 4  5  6  7  8 10 11 12 13 14]
 [ 5  6  7  8  9 11 12 13 14 15]
 [ 6  7  8  9 10 12 13 14 15 16]
 [ 7  8  9 10 11 13 14 15 16 17]
 [ 8  9 10 11 12 14 15 16 17 18]
 [ 9 10 11 12 13 15 16 17 18 19]]
[ 5  6  7  8  9 

In [17]:
if config.GPU :
    contexts, target = to_gpu(contexts), to_gpu(target)

# 모델 등 생성
model = CBOW(vocab_size, hidden_size, window_size, corpus)
optimizer = Adam()
trainer = Trainer(model, optimizer)

# 학습 시작
trainer.fit(contexts, target, max_epoch, batch_size)
trainer.plot()


# 나중에 사용할 수 있도록 필요한 데이터 저장
word_vecs = model.word_vecs
if config.GPU :
    word_vecs = to_cpu(word_vecs)

params = {}
params['word_vecs'] = word_vecs.astype(np.float16)
params['word_to_id'] = word_to_id
params['id_to_word'] = id_to_word
pkl_file = 'cbow_params.pkl'
with open(pkl_file, 'wb') as f :
    pickle.dump(params, f, -1)

AttributeError: 'NoneType' object has no attribute 'reshape'

## 4.3.3 CBOW 모델 평가

In [18]:
import sys
sys.path.append('..')
from common.util import most_similar

In [19]:
pkl_file = 'cbow_params.pkl'

with open(pkl_file, 'rb') as f :
    params = pickle.load(f)
    word_vecs = params['word_vecs']
    word_to_id = params['word_to_id']
    id_to_word = params['id_to_word']

querys = ['money', 'year', 'car', 'mom']
for query in querys :
    most_similar(query, word_to_id, id_to_word, word_vecs, top = 5)



[query] money
 topics: 0.458740234375
 clients: 0.452880859375
 requirements: 0.423095703125
 title: 0.4052734375
 payouts: 0.39501953125

[query] year
 month: 0.71875
 week: 0.65234375
 spring: 0.62744140625
 summer: 0.6259765625
 decade: 0.603515625

[query] car
 luxury: 0.497314453125
 arabia: 0.47802734375
 auto: 0.47119140625
 disk-drive: 0.450927734375
 travel: 0.4091796875

[query] mom
 tricky: 0.455322265625
 everything: 0.438232421875
 hud: 0.435302734375
 practical: 0.4287109375
 retailers: 0.427978515625


## man - king + queen = women

### 단어벡터의 덧셈, 뺄셈으로 유추문제를 해결할 수 있다.
### 단어의 의미뿐만이 아니라 문법적인 패턴도 알 수 있다.

In [23]:


def analogy(a, b, c, word_to_id, id_to_word, word_matrix, top=5, answer=None):
    for word in (a, b, c):
        if word not in word_to_id:
            print('%s(을)를 찾을 수 없습니다.' % word)
            return

    print('\n[analogy] ' + a + ':' + b + ' = ' + c + ':?')
    a_vec, b_vec, c_vec = word_matrix[word_to_id[a]], word_matrix[word_to_id[b]], word_matrix[word_to_id[c]]
    query_vec = b_vec - a_vec + c_vec
    query_vec = normalize(query_vec)

    similarity = np.dot(word_matrix, query_vec)

    if answer is not None:
        print("==>" + answer + ":" + str(np.dot(word_matrix[word_to_id[answer]], query_vec)))

    count = 0
    for i in (-1 * similarity).argsort():
        if np.isnan(similarity[i]):
            continue
        if id_to_word[i] in (a, b, c):
            continue
        print(' {0}: {1}'.format(id_to_word[i], similarity[i]))

        count += 1
        if count >= top:
            return


def normalize(x):
    if x.ndim == 2:
        s = np.sqrt((x * x).sum(1))
        x /= s.reshape((s.shape[0], 1))
    elif x.ndim == 1:
        s = np.sqrt((x * x).sum())
        x /= s
    return x


In [25]:
analogy('king', 'man', 'queen', word_to_id, id_to_word, word_vecs)


[analogy] king:man = queen:?
 woman: 5.16015625
 veto: 4.9296875
 ounce: 4.69140625
 earthquake: 4.6328125
 successor: 4.609375


In [26]:
print(analogy('take', 'took', 'buy', word_to_id, id_to_word, word_vecs))


[analogy] take:took = buy:?
 shares: 5.5859375
 bought: 4.98828125
 owns: 4.6484375
 cents: 4.5703125
 chairman: 4.2265625
None


In [27]:
print(analogy('take', 'took', 'eat', word_to_id, id_to_word, word_vecs))


[analogy] take:took = eat:?
 could: 4.4453125
 might: 4.23046875
 street: 4.03125
 owns: 4.03125
 succeeds: 3.96875
None


In [28]:
print(analogy('car', 'cars', 'person', word_to_id, id_to_word, word_vecs))


[analogy] car:cars = person:?
 there: 4.93359375
 average: 4.78125
 you: 4.5234375
 yield: 4.51953125
 people: 4.45703125
None


In [31]:
print(analogy('people', 'person', 'year', word_to_id, id_to_word, word_vecs))


[analogy] people:person = year:?
 month: 9.2890625
 week: 8.0859375
 summer: 6.38671875
 article: 5.89453125
 quarter: 5.2421875
None


In [32]:
print(analogy('body', 'shoulder', 'car', word_to_id, id_to_word, word_vecs))


[analogy] body:shoulder = car:?
 share: 5.26171875
 fell: 5.15234375
 percentage: 4.1640625
 auto: 4.00390625
 hour: 3.982421875
None


In [33]:
print(analogy('take', 'took', 'have', word_to_id, id_to_word, word_vecs))


[analogy] take:took = have:?
 has: 9.3828125
 had: 8.2890625
 've: 6.99609375
 showed: 4.08984375
 owns: 3.97265625
None
