<h2>Word2Vec with Keras</h2>

<b>데이터 전처리</b>

In [1]:
from sklearn.datasets import fetch_20newsgroups

In [2]:
dataset = fetch_20newsgroups(shuffle = True, random_state = 1,
                            remove = ('headers', 'footers', 'quotes'))

documents = dataset.data

print(len(documents))
documents[1]

11314


"\n\n\n\n\n\n\nYeah, do you expect people to read the FAQ, etc. and actually accept hard\natheism?  No, you need a little leap of faith, Jimmy.  Your logic runs out\nof steam!\n\n\n\n\n\n\n\nJim,\n\nSorry I can't pity you, Jim.  And I'm sorry that you have these feelings of\ndenial about the faith you need to get by.  Oh well, just pretend that it will\nall end happily ever after anyway.  Maybe if you start a new newsgroup,\nalt.atheist.hard, you won't be bummin' so much?\n\n\n\n\n\n\nBye-Bye, Big Jim.  Don't forget your Flintstone's Chewables!  :) \n--\nBake Timmons, III"

In [3]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [4]:
def clean_text(d):
    pattern = r'[^a-zA-Z\s]'
    text = re.sub(pattern, '', d)
    return text
def clean_stopword(d):
    stop_words = stopwords.words('english')
    return ' '.join([w.lower() for w in d.split() if w not in stop_words and len(w) > 3])
def tokenize(d):
    return word_tokenize(d)

In [5]:
import pandas as pd

In [6]:
news_df = pd.DataFrame({'article':documents})
len(news_df)

11314

In [7]:
news_df.replace("", float("NaN"), inplace = True)
news_df.dropna(inplace = True)
len(news_df)

11096

In [8]:
news_df.head(2)

Unnamed: 0,article
0,Well i'm not sure about the story nad it did s...
1,"\n\n\n\n\n\n\nYeah, do you expect people to re..."


In [9]:
news_df['article'] = news_df['article'].apply(clean_text)
news_df['article']

0        Well im not sure about the story nad it did se...
1        \n\n\n\n\n\n\nYeah do you expect people to rea...
2        Although I realize that principle is not one o...
3        Notwithstanding all the legitimate fuss about ...
4        Well I will have to change the scoring on my p...
                               ...                        
11309    Danny Rubenstein an Israeli journalist will be...
11310                                                   \n
11311    \nI agree  Home runs off Clemens are always me...
11312    I used HP DeskJet with Orange Micros Grappler ...
11313                                          \nNo arg...
Name: article, Length: 11096, dtype: object

In [10]:
news_df['article'] = news_df['article'].apply(clean_stopword)
news_df['article']

0        well sure story seem biased what disagree stat...
1        yeah expect people read actually accept hard a...
2        although realize principle strongest points wo...
3        notwithstanding legitimate fuss proposal much ...
4        well change scoring playoff pool unfortunately...
                               ...                        
11309    danny rubenstein israeli journalist speaking t...
11310                                                     
11311    agree home runs clemens always memorable kinda...
11312    used deskjet orange micros grappler system upd...
11313    argument murphy scared hell came last year han...
Name: article, Length: 11096, dtype: object

In [11]:
tokenized_news = news_df['article'].apply(tokenize)
tokenized_news = tokenized_news.to_list()

In [12]:
import numpy as np

In [13]:
drop_news = [index for index, sentence in enumerate(tokenized_news) if len(sentence) <= 1]
news_texts = np.delete(tokenized_news, drop_news, axis = 0)
print(len(news_texts))

10945


  return array(a, dtype, copy=False, order=order)


In [14]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [15]:
news_2000 = news_texts[:2000]

tokenizer = Tokenizer()
tokenizer.fit_on_texts(news_2000)

idx2word = {value:key for key, value in tokenizer.word_index.items()}
sequences = tokenizer.texts_to_sequences(news_2000)

In [16]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

29769


In [17]:
print(sequences[1])

[1263, 457, 2, 60, 119, 419, 61, 1374, 22, 69, 3498, 397, 6874, 412, 1173, 373, 2256, 458, 59, 12478, 458, 1900, 3850, 397, 22, 10, 4325, 8749, 177, 303, 136, 154, 664, 12479, 316, 12480, 15, 12481, 4, 790, 12482, 12483, 4917, 8750]


<h2>Skipgram 전처리</h2>
<b>네거티브 샘플링(Nagative Sampling)</b><br>
<b>Word2Vec은 출력 값에 소프트맥스 함수를 적용해 확률로 변환 후 정답과 비교해 역전파(backpropagation)</b><br>
<b>소프트맥스 적용시 분모에 중심단어와 나머지 모든 단어의 내적 후 다시 exp 계산하는데 단어가 많을 경우 계산량이 많아짐</b><br>
<b>네거티브 샘플링은 일부 단어만 뽑아서 계산 진행</b><br>
<b>등장하지 않는 단어(Nagative Sample) 5~20개 뽑아 정답 단어와 합쳐 전체 단어처럼 소프트맥스 계산하여 파라미터 업데이트</b><br>

In [18]:
from tensorflow.keras.preprocessing.sequence import skipgrams

In [19]:
skip_grams = [skipgrams(sample, vocabulary_size = vocab_size, window_size = 10) for sample in sequences[:10]]

In [20]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]))

media(499), devalued(25029) -> 0
seem(173), berate(23996) -> 0
media(499), second(107) -> 0
think(7), israels(3496) -> 1
lived(1011), europe(1638) -> 1


In [21]:
print(len(skip_grams))
print(len(pairs))
print(len(labels))

10
2420
2420


In [23]:
skip_grams = [skipgrams(seq, vocabulary_size = vocab_size, window_size = 10) for seq in sequences]

<b>Skipgram 모델 구성</b>

In [25]:
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model

In [26]:
embed_size = 50

In [32]:
def word2vec():
    target_inputs = Input(shape = (1, ), dtype = 'int32')
    target_embedding = Embedding(vocab_size, embed_size)(target_inputs)
    
    context_inputs = Input(shape = (1, ), dtype = 'int32')
    context_embedding = Embedding(vocab_size, embed_size)(context_inputs)
    
    dot_product = Dot(axes = 2)([target_embedding, context_embedding])
    drop_product = Reshape((1, ), input_shape = (1, 1))(dot_product)
    output = Activation('sigmoid')(dot_product)
    
    model = Model(inputs = [target_inputs, context_inputs], outputs = output)
    model.compile(loss = 'binary_crossentropy', optimizer = 'adam')
    
    return model

In [33]:
model = word2vec()
model.summary()
plot_model(model, show_shapes = True, show_layer_names = True)

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_3 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 1, 50)        1488450     input_3[0][0]                    
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 50)        1488450     input_4[0][0]                    
______________________________________________________________________________________________

In [35]:
for epoch in range(1, 11):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype = 'int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype = 'int32')
        labels = np.array(elem[1], dtype = 'int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
        
    print('Epoch:', epoch, 'Loss: ', loss)

Epoch: 1 Loss:  1226.995453298092
Epoch: 2 Loss:  924.7855430990458
Epoch: 3 Loss:  827.3800959289074
Epoch: 4 Loss:  782.2293037772179
Epoch: 5 Loss:  756.1711380109191
Epoch: 6 Loss:  736.5765151157975
Epoch: 7 Loss:  717.0890983901918
Epoch: 8 Loss:  694.1040043272078
Epoch: 9 Loss:  666.0650393981487
Epoch: 10 Loss:  632.6677303109318


In [None]:
import gensim`

In [None]:
f = open('skipgram.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgram = gensim.modelsKeyedVectors.load_word2vec_format('skipgram.txt', binary = False)

In [None]:
skipgram.most_similar(positive = ['soldiers'])

In [None]:
skipgram.most_similar(positive = ['world'])

<h2>CBOW 전처리</h2>

In [36]:
def skipgram2cbow(skipgrams):
    cbows = []
    flag = 0
    for n in skip_grams:
        temp1 = []
        for t in n:
            if flag == 1:
                flag = 0
                temp1.append(t)
            else:
                flag = 1
                temp2 = []
                for x in t:
                    temp2.append(x[1], x[0])
                temp1.append(temp2)
        cbows.append(temp1)
    
    return cbows

In [None]:
cbows = skipgram2cbow(skip_grams)

In [None]:
pairs, labels = skip_grams[0][0], skip_grams[0][1]
for i in range(5):
    print("{:s}({:d}), {:s}({:d}) -> {:d}".format(
        idx2word[pairs[i][0]], pairs[i][0],
        idx2word[pairs[i][1]], pairs[i][1],
        labels[i]))

In [None]:
print(len(cbows))
print(len(pairs))
print(len(labels))

<b>cbow 모델 구성</b>

In [None]:
model = word2vec()
model.summary()
plot_model(model, show_shapes = True, show_layer_names = True)

In [None]:
for epoch in range(1, 101):
    loss = 0
    for _, elem in enumerate(skip_grams):
        first_elem = np.array(list(zip(*elem[0]))[0], dtype = 'int32')
        second_elem = np.array(list(zip(*elem[0]))[1], dtype = 'int32')
        labels = np.array(elem[1], dtype = 'int32')
        X = [first_elem, second_elem]
        Y = labels
        loss += model.train_on_batch(X, Y)
        
    print('Epoch:', epoch, 'Loss: ', loss)

In [None]:
f = open('cbow.txt', 'w')
f.write('{} {}\n'.format(vocab_size-1, embed_size))
vectors = model.get_weights()[0]
for word, i in tokenizer.word_index.items():
    f.write('{} {}\n'.format(word, ' '.join(map(str, list(vectors[i, :])))))
f.close()

skipgram = gensim.modelsKeyedVectors.load_word2vec_format('cbow.txt', binary = False)

In [None]:
skipgram.most_similar(positive = ['soldiers'])

In [None]:
skipgram.most_similar(positive = ['world'])