<a href="https://colab.research.google.com/github/2020-nlp-c/nlp-deeplearning/blob/master/jisang/14_Word2Vec_Practice.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Word2Vec 실습**

In [24]:
doc = 'you will never know until you try'

In [25]:
import nltk
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## **1. 데이터 전처리**

In [33]:
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

wl = WordNetLemmatizer()

# 문장 전처리
def tokenize(x):
    return x.split()
words = tokenize(doc)

tmp_docs = []
# Lemmatize
for word in words:
    tmp_docs.append(wl.lemmatize(word.lower(), pos = 'v' or 'n'))
# Pos Tagging
pos_docs = pos_tag(tmp_docs)

# 불용어 처리(stopWord)
stopPos = ['CC']
stopWord = [',']

docs_tokens = []
tokens = []

for pos_doc in pos_docs:
    # 불용 품사 지정
    if pos_doc[1] not in stopPos:
        # 불용어 지정
        if pos_doc[0] not in stopWord:
            # 문서 사용 단어
            docs_tokens.append(pos_doc[0])

# 전체 사용 단어
tokens = list(set(docs_tokens))

docs_tokens, tokens

(['you', 'will', 'never', 'know', 'until', 'you', 'try'],
 ['will', 'know', 'try', 'you', 'until', 'never'])

In [121]:
from numpy import argmax
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder

# 문자열 라벨링
label_enc = LabelEncoder()
label_docs = label_enc.fit_transform(docs_tokens)
# 바이너리 인코딩
onehot_enc = OneHotEncoder(sparse=False)
docs_label = label_docs.reshape(len(label_docs), 1) # n:1 matrix로 변환
onehot_docs = onehot_enc.fit_transform(docs_label)
    
label_enc.inverse_transform([5]), label_enc.transform(['you'])

onehot_docs

array([[0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 1., 0.],
       [0., 1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 1., 0., 0., 0.]])

## **2. Window 생성**

In [172]:
window_size = 1
x = []
y = []

for i in range(len(onehot_docs)):
    tmp = []
    for j in range(i-window_size, i+window_size + 1):
        if j < 0:
            pass
        elif j > len(onehot_docs):
            pass
        elif 0 <= j < len(onehot_docs):
            if i != j:
                tmp.append(onehot_docs[j])
    x.append(tmp)
    y.append(onehot_docs[i])

x, y

([[array([0., 0., 0., 0., 1., 0.])],
  [array([0., 0., 0., 0., 0., 1.]), array([0., 1., 0., 0., 0., 0.])],
  [array([0., 0., 0., 0., 1., 0.]), array([1., 0., 0., 0., 0., 0.])],
  [array([0., 1., 0., 0., 0., 0.]), array([0., 0., 0., 1., 0., 0.])],
  [array([1., 0., 0., 0., 0., 0.]), array([0., 0., 0., 0., 0., 1.])],
  [array([0., 0., 0., 1., 0., 0.]), array([0., 0., 1., 0., 0., 0.])],
  [array([0., 0., 0., 0., 0., 1.])]],
 [array([0., 0., 0., 0., 0., 1.]),
  array([0., 0., 0., 0., 1., 0.]),
  array([0., 1., 0., 0., 0., 0.]),
  array([1., 0., 0., 0., 0., 0.]),
  array([0., 0., 0., 1., 0., 0.]),
  array([0., 0., 0., 0., 0., 1.]),
  array([0., 0., 1., 0., 0., 0.])])

## **3. Feed Foward**

In [176]:
k = 4

# X to Hidden Layer Weight
x2h = np.random.rand(len(tokens), k)
# Hidden Layer to Y Weight
h2y = np.random.rand(k, len(tokens))

hidden = []
for words in x:
    tmp = []
    for word in words:
        tmp.append(np.dot(x2h.T, word.T))
    hidden.append(tmp)

def softmax(a) :
    exp_a = np.exp(a)
    sum_exp_a = np.sum(exp_a)
    y = exp_a / sum_exp_a
    
    return y

y_predict = []
for words in hidden:
    tmp = []
    for word in words:
        tmp.append(softmax(np.dot(h2y.T, word)))
    y_predict.append(tmp)

hidden, y_predict

([[array([0.06754982, 0.0545118 , 0.22466635, 0.54402886])],
  [array([0.37476888, 0.39952094, 0.25316527, 0.13961748]),
   array([0.97403416, 0.71751094, 0.87964693, 0.28081678])],
  [array([0.06754982, 0.0545118 , 0.22466635, 0.54402886]),
   array([0.19125611, 0.1286674 , 0.50945568, 0.81840018])],
  [array([0.97403416, 0.71751094, 0.87964693, 0.28081678]),
   array([0.85877328, 0.32173556, 0.64433476, 0.89671025])],
  [array([0.19125611, 0.1286674 , 0.50945568, 0.81840018]),
   array([0.37476888, 0.39952094, 0.25316527, 0.13961748])],
  [array([0.85877328, 0.32173556, 0.64433476, 0.89671025]),
   array([0.25662537, 0.19846585, 0.20178147, 0.47612152])],
  [array([0.37476888, 0.39952094, 0.25316527, 0.13961748])]],
 [[array([0.16948908, 0.18200717, 0.16688422, 0.18484098, 0.14650353,
          0.15027501])],
  [array([0.18157823, 0.20034417, 0.13626636, 0.22973571, 0.14589326,
          0.10618226]),
   array([0.16946556, 0.23748662, 0.09621911, 0.34458805, 0.09551736,
          0.0

## **4. Loss 계산**

In [174]:
loss = []
for predict in y_predict:
    tmp = []
    for j in range(len(predict)):
        tmp.append(np.log(predict[j]) * y[j])
    loss.append(np.sum(tmp))
loss = -np.sum(loss)
loss

22.02069231548569

## **5. Back Propagation**

In [205]:
alpha = 0.01
diff = []
h_layer = []
x_layer = []

for i in range(len(y)):
    for j in range(len(y_predict[i])):
        diff.append(y_predict[i][j] - y[i])
        h_layer.append(hidden[i][j])
        x_layer.append(x[i][j])

h2y = h2y - alpha*np.dot(np.matrix(h_layer).T, np.matrix(diff))
x2h = x2h - np.dot(np.matrix(x_layer).T, np.dot(h2y, np.matrix(diff).T).T)

h2y, x2h

(matrix([[0.94253575, 0.95607237, 0.54788113, 0.91982228, 0.52564631,
          0.03417574],
         [0.72913612, 0.65420656, 0.23575613, 0.86925077, 0.71368975,
          0.07544881],
         [0.24869692, 0.52461004, 0.33515029, 0.84645276, 0.12458523,
          0.59374935],
         [0.41530776, 0.40663807, 0.39129635, 0.25839081, 0.23353449,
          0.27442211]]),
 matrix([[ 1.14314743,  0.87548047,  1.35643243,  0.814836  ],
         [ 0.65323489,  0.89434217, -0.53389528,  0.25139187],
         [-1.09725366, -0.82141313,  0.46088537,  0.34299834],
         [-0.33057984, -0.62806556,  0.30645926,  0.90661075],
         [-0.65989954, -0.71699256,  0.62958885,  0.56033126],
         [-0.01053061,  0.38955723,  0.04006817, -0.0777082 ]]))