<a href="https://colab.research.google.com/github/hyunicecream/Natural-Language-Processing-NLP-/blob/main/7_7_%EC%88%98(Negative_Sampling).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random

import nltk
import numpy as np
import tensorflow.keras.backend as K
from nltk.stem import LancasterStemmer
from sklearn.metrics.pairwise import cosine_similarity
from tensorflow.keras.layers import Input, Embedding, Dense, Dot, Activation, Flatten
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources this cell relies on: the Punkt sentence tokenizer
# data and the Gutenberg corpus.
for resource in ('punkt', 'gutenberg'):
    nltk.download(resource)

# Load the first n texts of the Gutenberg corpus.
n = 10
stemmer = LancasterStemmer()
sent_stem = []
book_ids = nltk.corpus.gutenberg.fileids()[:n]
for book_no, text_id in enumerate(book_ids, start=1):
    raw_text = nltk.corpus.gutenberg.raw(text_id)

    # Split into sentences, tokenize each into words, and apply the Lancaster
    # stemmer to every token; each sentence becomes one list of stems.
    for sentence in nltk.sent_tokenize(raw_text):
        sent_stem.append(
            [stemmer.stem(token) for token in nltk.word_tokenize(sentence)]
        )
    print('{}: {} ----- processed.'.format(book_no, text_id))

print("총 문장 개수 =", len(sent_stem))
print(sent_stem[0])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package gutenberg to /root/nltk_data...
[nltk_data]   Unzipping corpora/gutenberg.zip.
1: austen-emma.txt ----- processed.
2: austen-persuasion.txt ----- processed.
3: austen-sense.txt ----- processed.
4: bible-kjv.txt ----- processed.
5: blake-poems.txt ----- processed.
6: bryant-stories.txt ----- processed.
7: burgess-busterbrown.txt ----- processed.
8: carroll-alice.txt ----- processed.
9: chesterton-ball.txt ----- processed.
10: chesterton-brown.txt ----- processed.
총 문장 개수 = 59824
['[', 'emm', 'by', 'jan', 'aust', '1816', ']', 'volum', 'i', 'chapt', 'i', 'emm', 'woodh', ',', 'handsom', ',', 'clev', ',', 'and', 'rich', ',', 'with', 'a', 'comfort', 'hom', 'and', 'happy', 'disposit', ',', 'seem', 'to', 'unit', 'som', 'of', 'the', 'best', 'bless', 'of', 'ex', ';', 'and', 'had', 'liv', 'near', 'twenty-one', 'year', 'in', 'the', 'world', 'with', 'very', 'litt

In [None]:
# Fit a Keras Tokenizer on the stemmed sentences (each sentence is already a
# list of tokens).
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sent_stem)

In [None]:
# Build the two lookup tables: word -> index (from the fitted tokenizer) and
# its inverse, index -> word.
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))
print("사전 크기 =", len(word2idx))

사전 크기 = 21181


In [None]:
# Encode every stemmed sentence as a list of word indices.
sent_idx = tokenizer.texts_to_sequences(sent_stem)

In [None]:
# Inspect the index sequence of the second sentence.
np.array(sent_idx[1])

array([  38,   17,    2, 2540,    4,    2,  134,  251,    4,    8,  233,
        570,    1, 1861,   98,   12,    3,   37,    1,    7, 1170,    4,
         26,  305,   54,  374,    1,   96, 2173,    4,   15,   86,   44,
          8,   75,  133, 2047,    5])

In [None]:
# Largest word index in the vocabulary. Printed explicitly: a bare expression
# that is not the last line of a cell is evaluated but never shown.
print(max(idx2word))

# Preview a handful of (index, word) entries instead of echoing the whole
# mapping, which would flood the cell output.
list(idx2word.items())[:10]

21181

In [None]:
# Build training pairs from trigrams (a, b, c): b is the center word, a and c
# are its true neighbours (label 1), and NEG_PER_TRIGRAM uniformly drawn word
# indices act as negative samples (label 0).
NEG_PER_TRIGRAM = 4   # negative samples drawn for every trigram
POS_PER_TRIGRAM = 2   # the two true neighbours, a and c
RANDOM_SEED = 42      # fixed so a fresh Restart & Run All draws the same negatives

# Dedicated generator: reproducible without touching the global `random` state.
rng = random.Random(RANDOM_SEED)
vocab_size = len(idx2word)   # loop-invariant, so computed once

x_1 = []   # center word index, repeated once per pair
x_2 = []   # paired word index: negative samples first, then a and c
y = []     # 0 for a negative sample, 1 for a true neighbour
for sent in sent_idx:
    if len(sent) < 3:
        continue   # too short to contain a trigram

    for a, b, c in nltk.trigrams(sent):
        # One center entry for every pair generated from this trigram.
        x_1.extend([b] * (NEG_PER_TRIGRAM + POS_PER_TRIGRAM))

        # Negative samples: random indices in [1, vocab_size].
        x_2.extend(rng.randint(1, vocab_size) for _ in range(NEG_PER_TRIGRAM))
        y.extend([0] * NEG_PER_TRIGRAM)

        # Positive samples: the left and right neighbours of b.
        x_2.extend((a, c))
        y.extend((1, 1))

In [None]:
# Peek at the first 100 entries of the center, paired-word and label lists.
for preview in (x_1, x_2, y):
    print(preview[:100])

[230, 230, 230, 230, 230, 230, 50, 50, 50, 50, 50, 50, 544, 544, 544, 544, 544, 544, 4373, 4373, 4373, 4373, 4373, 4373, 12823, 12823, 12823, 12823, 12823, 12823, 2448, 2448, 2448, 2448, 2448, 2448, 3913, 3913, 3913, 3913, 3913, 3913, 11, 11, 11, 11, 11, 11, 962, 962, 962, 962, 962, 962, 11, 11, 11, 11, 11, 11, 230, 230, 230, 230, 230, 230, 529, 529, 529, 529, 529, 529, 1, 1, 1, 1, 1, 1, 1155, 1155, 1155, 1155, 1155, 1155, 1, 1, 1, 1, 1, 1, 1700, 1700, 1700, 1700, 1700, 1700, 1, 1, 1, 1]
[7653, 12907, 7549, 3913, 2447, 50, 16353, 15535, 19296, 7233, 230, 544, 2440, 13609, 6648, 4561, 50, 4373, 7379, 3582, 9825, 18720, 544, 12823, 8409, 12198, 18437, 13586, 4373, 2448, 19415, 5577, 20173, 9080, 12823, 3913, 9181, 20701, 14806, 5890, 2448, 11, 11553, 484, 8030, 17540, 3913, 962, 4489, 19396, 9495, 18285, 11, 11, 15184, 11100, 10080, 9072, 962, 230, 13741, 15422, 17335, 18049, 11, 529, 1111, 2929, 11331, 2070, 230, 1, 3128, 7133, 411, 7057, 529, 1155, 3599, 19464, 6292, 20683, 1, 1, 13997

In [None]:
# Convert each list into a NumPy column vector of shape (N, 1).
x_1, x_2, y = (np.array(seq).reshape(-1, 1) for seq in (x_1, x_2, y))

In [None]:
def my_activation(x):
    """Map the raw dot-product score to the (0, 1) range with a sigmoid."""
    return K.sigmoid(x)


# The Tokenizer never assigns index 0, so the embedding tables need one more
# row than there are words.
VOC_SIZE = 1 + len(word2idx)
EMB_SIZE = 32

# Branch 1: embedding of the center word.
x_input_1 = Input(batch_shape=(None, 1))
center_table = Embedding(VOC_SIZE, EMB_SIZE)
x_emb_1 = Flatten()(center_table(x_input_1))

# Branch 2: a separate embedding table for the paired word (true neighbour or
# negative sample).
x_input_2 = Input(batch_shape=(None, 1))
paired_table = Embedding(VOC_SIZE, EMB_SIZE)
x_emb_2 = Flatten()(paired_table(x_input_2))

# Score each pair by the sigmoid of the dot product of the two embeddings.
pair_score = Dot(axes=1)([x_emb_1, x_emb_2])
y_output = Activation(my_activation)(pair_score)

model = Model([x_input_1, x_input_2], y_output)
model.compile(loss = 'binary_crossentropy', optimizer='adam')
model.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 32)        677824      input_3[0][0]                    
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 32)        677824      input_4[0][0]                    
____________________________________________________________________________________________

In [None]:
# Encoder sub-models mapping a word index to its EMB_SIZE-dimensional vector.
# They are built from the same tensors as `model`, so they use the weights
# learned by fit() below.
#
# The center-word encoder gets its own name; assigning both encoders to
# `model_vec` would silently discard it.
model_vec_center = Model(x_input_1, x_emb_1)

# `model_vec` is the encoder of the second branch (paired/context word). This
# is the binding the rest of the notebook uses for word-vector lookups.
model_vec = Model(x_input_2, x_emb_2)

hist = model.fit([x_1, x_2], y, batch_size=10240, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

def get_word2vec(word):
    """Return the embedding vector of `word` as predicted by `model_vec`.

    The word is Lancaster-stemmed before the vocabulary lookup. When the stem
    is not in `word2idx`, a message is printed and None is returned.
    """
    word_index = word2idx.get(stemmer.stem(word))
    if word_index is None:
        print('{}가 없습니다.'.format(word))
        return None

    # model_vec expects a batch of single indices, i.e. an array of shape (1, 1).
    return model_vec.predict(np.array([[word_index]]))

father = get_word2vec('father')
mother = get_word2vec('mother')
doctor = get_word2vec('doctor')

# A notebook cell renders only its last bare expression, so each similarity is
# printed explicitly. The trailing comments are values recorded from an
# earlier run; expect different numbers after retraining.
print('father ~ mother:', cosine_similarity(father, mother))  # array([[0.64571965]], dtype=float32)
print('father ~ doctor:', cosine_similarity(father, doctor))  # array([[0.16309245]], dtype=float32)
print('mother ~ doctor:', cosine_similarity(mother, doctor))  # array([[0.4438712]], dtype=float32)

array([[0.4438712]], dtype=float32)