## 통계 기반 기법 개선하기

###  상호정보량

In [None]:
def ppmi(C, verbose=False, eps=1e-8):
    """Convert a co-occurrence matrix into a positive PMI (PPMI) matrix.

    Each cell is computed as
    ``max(0, log2(C[i, j] * N / (S[i] * S[j]) + eps))`` where ``N`` is the
    sum of every entry of ``C`` and ``S`` holds the column sums of ``C``.

    Args:
        C: 2-D NumPy array of co-occurrence counts. The column sums are
            indexed by both the row and the column number, so ``C`` is
            expected to be square (presumably symmetric -- confirm against
            the code that builds it).
        verbose: When True, print the total count ``N`` and a progress line
            roughly every 1% of the cells.
        eps: Small constant added inside the logarithm so a zero count does
            not evaluate ``log2(0)``.

    Returns:
        A float32 array with the same shape as ``C`` holding the PPMI values.
    """
    M = np.zeros_like(C, dtype=np.float32)
    N = np.sum(C)
    S = np.sum(C, axis=0)
    total = C.shape[0] * C.shape[1]
    # Progress interval in cells. Clamp to 1 so that matrices with fewer than
    # 100 cells do not end up computing ``cnt % 0``.
    step = max(1, total // 100)
    cnt = 0

    if verbose:
        print("N=", N)

    for i in range(C.shape[0]):
        for j in range(C.shape[1]):
            denom = S[j] * S[i]
            # A zero column sum would make the ratio undefined; leave the
            # cell at its initial 0 instead of dividing by zero.
            if denom != 0:
                pmi = np.log2(C[i, j] * N / denom + eps)
                M[i, j] = max(0, pmi)

            if verbose:
                cnt += 1
                if cnt % step == 0:
                    print('%.1f%% 완료' % (100*cnt/total))
    return M

In [None]:
import sys
sys.path.append('..')
import numpy as np
from common.util import preprocess, create_co_matrix, cos_similarity


# Build the co-occurrence matrix for a one-sentence corpus and derive its
# PPMI matrix.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(word_to_id)
C = create_co_matrix(corpus, vocab_size)
print(C)
W = ppmi(C)

# Show floats with three digits of precision from here on.
np.set_printoptions(precision=3)
# Report both matrices, separated by a 50-character rule.
print('동시발생 행렬', C, '-' * 50, 'PPMI', W, sep='\n')

###  SVD에 의한 차원의 감소

In [None]:
import sys
sys.path.append('..')
import numpy as np
import matplotlib.pyplot as plt
from common.util import preprocess, create_co_matrix, ppmi


# Toy corpus -> co-occurrence counts (window_size=1) -> PPMI matrix.
text = 'You say goodbye and I say hello.'
corpus, word_to_id, id_to_word = preprocess(text)
vocab_size = len(id_to_word)
C = create_co_matrix(corpus, vocab_size, window_size=1)
print(C)
W = ppmi(C)
print(W, W.shape, sep='\n')

# Factor the PPMI matrix with a full SVD and report the shape of each factor.
U, S, V = np.linalg.svd(W)
print(U.shape, S.shape, V.shape, sep='\n')

# Arrays printed after this point use three digits of precision.
np.set_printoptions(precision=3)

In [None]:
# Row 0 of the co-occurrence matrix C.
print(C[0, :])

In [None]:
# Row 0 of the PPMI matrix W.
print(W[0, :])

In [None]:
# Row 0 of U, the first factor returned by np.linalg.svd(W).
print(U[0, :])

In [None]:
# Label each word at the point given by the first two columns of its row in U,
# then draw all rows of U as a semi-transparent scatter plot.
for word, word_id in word_to_id.items():
    plt.annotate(word, xy=(U[word_id, 0], U[word_id, 1]))
plt.scatter(x=U[:, 0], y=U[:, 1], alpha=0.5)
plt.show()

### PTB 데이터셋

In [21]:
import sys
sys.path.append('..')
from dataset import ptb

# Load the PTB training split and print a few facts about it.
corpus, word_to_id, id_to_word = ptb.load_data('train')

print('말뭉치 크기:', len(corpus))
print('corpus[:30]:', corpus[:30])
print(len(id_to_word))

# Words stored under the first three ids.
for idx, label in enumerate(('id_to_word[0]:',
                             'id_to_word[1]:',
                             'id_to_word[2]:')):
    print(label, id_to_word[idx])
print()

# Ids assigned to a few sample words.
for key, label in (('car', "word_to_id['car']:"),
                   ('happy', "word_to_id['happy']:"),
                   ('lexus', "word_to_id['lexus']:")):
    print(label, word_to_id[key])

말뭉치 크기: 929589
corpus[:30]: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20 21 22 23
 24 25 26 27 28 29]
10000
id_to_word[0]: aer
id_to_word[1]: banknote
id_to_word[2]: berlitz

word_to_id['car']: 3856
word_to_id['happy']: 4428
word_to_id['lexus']: 7426


### PTB 데이터셋 평가

In [24]:
import sys
sys.path.append('..')
import numpy as np
from common.util import most_similar, create_co_matrix, ppmi
from dataset import ptb


window_size = 2
wordvec_size = 100

# Build the PTB co-occurrence and PPMI matrices inside this cell. Relying on
# a W left over in the kernel is fragile: an earlier cell binds W to the PPMI
# matrix of a one-sentence toy corpus, which does not match the PTB
# vocabulary used for the similarity queries below.
# NOTE: the PPMI step loops over every cell of the matrix and can take a
# while on the full PTB vocabulary.
corpus, word_to_id, id_to_word = ptb.load_data('train')
vocab_size = len(word_to_id)
print('동시발생 수 계산 ...')
C = create_co_matrix(corpus, vocab_size, window_size)
print('PPMI 계산 ...')
W = ppmi(C, verbose=True)

print('calculating SVD ...')
# Only the optional import is guarded, so an ImportError raised while the
# decomposition itself runs is not mistaken for scikit-learn being absent.
try:
    from sklearn.utils.extmath import randomized_svd
except ImportError:
    # Full SVD (slow) when scikit-learn is unavailable.
    U, S, V = np.linalg.svd(W)
else:
    # Truncated SVD (fast).
    U, S, V = randomized_svd(W, n_components=wordvec_size, n_iter=5,
                             random_state=None)

# Keep the first wordvec_size columns of U as the word vectors.
word_vecs = U[:, :wordvec_size]

# Print the five most similar words for each query.
querys = ['you', 'year', 'car', 'toyota']
for query in querys:
    most_similar(query, word_to_id, id_to_word, word_vecs, top=5)

calculating SVD ...

[query] you
 i: 0.6204785108566284
 we: 0.567757248878479
 anybody: 0.520540714263916
 somebody: 0.5114119648933411
 do: 0.5089985728263855

[query] year
 month: 0.6857327222824097
 earlier: 0.6832805871963501
 quarter: 0.6124237775802612
 next: 0.5931079983711243
 last: 0.5701931118965149

[query] car
 auto: 0.6566149592399597
 luxury: 0.5495672821998596
 truck: 0.546615719795227
 domestic: 0.5161890387535095
 cars: 0.47540727257728577

[query] toyota
 motor: 0.6442049145698547
 motors: 0.6331045627593994
 nissan: 0.6275714039802551
 lexus: 0.6140222549438477
 honda: 0.5720369815826416
