# Skip-Gram with Negative Sampling을 사용한 전처리

In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.datasets import fetch_20newsgroups
from tensorflow.keras.preprocessing.text import Tokenizer

In [16]:
# Load the 20 Newsgroups corpus with headers, footers and quoted replies
# stripped, so that only the message bodies remain.
# Skip-gram needs at least two words per document (a center word plus a
# context word); documents that are too short are filtered out further below.
dataset = fetch_20newsgroups(
    shuffle=True,
    random_state=1,
    remove=("headers", "footers", "quotes"),
)
news = dataset.data

len(news)

11314

In [17]:
# Build the cleaned text column in one chain:
#   1. replace every non-alphabetic character with a space,
#   2. drop words that are 3 characters long or shorter,
#   3. lowercase the result.
news_df = pd.DataFrame({"doc": news})
news_df["clean_doc"] = (
    news_df["doc"]
    .str.replace("[^a-zA-Z]", " ", regex=True)
    .apply(lambda doc: " ".join(word for word in doc.split() if len(word) > 3))
    .str.lower()
)

news_df["clean_doc"].head()

0    well sure about story seem biased what disagre...
1    yeah expect people read actually accept hard a...
2    although realize that principle your strongest...
3    notwithstanding legitimate fuss about this pro...
4    well will have change scoring playoff pool unf...
Name: clean_doc, dtype: object

In [18]:
# Check each column for missing values.
news_df.isna().any()

doc          False
clean_doc    False
dtype: bool

In [19]:
# Empty strings are not reported by the missing-value check, so convert ""
# to NaN first and then run the check again.
news_df = news_df.replace(to_replace="", value=np.nan)
news_df.isna().any()

doc          True
clean_doc    True
dtype: bool

In [20]:
# Drop every row that has a missing value in any column.
news_df = news_df.dropna()

len(news_df)

10995

In [21]:
# Split each cleaned document on whitespace and remove English stop words.
# (Needs the NLTK "stopwords" corpus to be available locally.)
stop_words = stopwords.words("english")
# Membership tests against a list are linear in its length; a frozenset gives
# constant-time lookups for the per-token filter below. `stop_words` itself
# stays a list so any other code that uses it is unaffected.
stop_word_set = frozenset(stop_words)
tokenized_doc = news_df["clean_doc"].apply(
    lambda doc: [word for word in doc.split() if word not in stop_word_set]
)

In [22]:
# Drop documents with one word or fewer: skip-gram needs at least a center
# word and one context word.
# The rows are selected with a boolean length mask instead of overwriting short
# documents with a sentinel string and then comparing lists against that string.
mask = tokenized_doc.map(len) <= 1   # True for documents that are too short
idx = tokenized_doc[mask].index      # index labels of the rows to remove
tokenized_doc = tokenized_doc.drop(idx)

# Downstream cells expect a plain list of token lists.
tokenized_doc = tokenized_doc.to_list()

In [23]:
# Integer-encode the tokenized corpus with the Keras tokenizer.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(tokenized_doc)

# Forward (word -> index) mapping and its inverse (index -> word).
word2idx = tokenizer.word_index
idx2word = dict(zip(word2idx.values(), word2idx.keys()))

# One slot more than the number of distinct words; presumably because Keras
# word indices start at 1 and index 0 is left unused -- confirm against the
# Tokenizer documentation.
vocab_size = 1 + len(word2idx)
encoded = tokenizer.texts_to_sequences(tokenized_doc)

print(encoded[0])

[9, 59, 603, 207, 3278, 1495, 474, 702, 9470, 13686, 5533, 15227, 702, 442, 702, 70, 1148, 1095, 1036, 20294, 984, 705, 4294, 702, 217, 207, 1979, 15228, 13686, 4865, 4520, 87, 1530, 6, 52, 149, 581, 661, 4406, 4988, 4866, 1920, 755, 10668, 1102, 7837, 442, 957, 10669, 634, 51, 228, 2669, 4989, 178, 66, 222, 4521, 6066, 68, 4295]


In [24]:
# Build the SGNS training data. Only the first 10 encoded documents are used.
# Each entry is indexed below as [0] -> word pairs and [1] -> labels.
from tensorflow.keras.preprocessing.sequence import skipgrams

skip_grams = list(
    skipgrams(sequence, vocabulary_size=vocab_size, window_size=10)
    for sequence in encoded[:10]
)

In [25]:
# Show the first five word pairs of the first document together with their
# labels (presumably 1 = real context pair, 0 = negative sample).
first_sample = skip_grams[0]
pairs = first_sample[0]
labels = first_sample[1]
for i in range(5):
    left_id, right_id = pairs[i]
    print(f"{idx2word[left_id]} ({left_id}), {idx2word[right_id]} ({right_id}) -> {labels[i]}")

acts (1102), beverly (11853) -> 0
makes (228), underwhelming (45547) -> 0
reason (149), commited (7837) -> 1
shame (4988), lqavd (56298) -> 0
blessing (10669), austria (4866) -> 1


In [26]:
# Number of word pairs and number of labels for the first document.
(len(pairs), len(labels))

(2220, 2220)

In [27]:
# SGNS model: two embedding tables (one per input word) whose looked-up vectors
# are combined with a dot product and squashed by a sigmoid into a single score.
# NOTE: `Sequential` and `SVG` are imported here but not used in this cell.
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Embedding, Reshape, Activation, Input, Dot
from tensorflow.keras.utils import plot_model
from IPython.display import SVG

# Size of each word vector.
embedding_dim = 100

# Embedding table for the center word.
# Each sample is a single word index; the recorded summary shows the embedding
# output as (None, 1, 100).
w_inputs = Input(shape=(1, ), dtype=np.int32)
word_embedding = Embedding(vocab_size, embedding_dim)(w_inputs)

# Embedding table for the context word (separate weights from the table above).
c_inputs = Input(shape=(1, ), dtype=np.int32)
context_embedding = Embedding(vocab_size, embedding_dim)(c_inputs)

# Dot product over the embedding axis (axis 2), then reshape to one value per
# sample and apply a sigmoid so the output can be trained against 0/1 labels.
dot_product = Dot(axes=2)([word_embedding, context_embedding])
dot_product = Reshape((1, ), input_shape=(1, 1))(dot_product)
output = Activation("sigmoid")(dot_product)

# The model takes the two word-index inputs and returns the sigmoid score.
model = Model(inputs=[w_inputs, c_inputs], outputs=output)
model.summary()
model.compile(optimizer="adam", loss="binary_crossentropy")
# Writes a diagram of the model to "20newsgroups.png".
plot_model(model, to_file="20newsgroups.png", show_shapes=True, show_layer_names=True, rankdir="TB")

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_3 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 input_4 (InputLayer)           [(None, 1)]          0           []                               
                                                                                                  
 embedding_2 (Embedding)        (None, 1, 100)       6427700     ['input_3[0][0]']                
                                                                                                  
 embedding_3 (Embedding)        (None, 1, 100)       6427700     ['input_4[0][0]']                
                                                                                            

In [28]:
# Train for 100 epochs; each document's skip-gram pairs form one batch.
for epoch in range(100):
    loss = 0  # sum of the per-batch losses within this epoch
    for elem in skip_grams:
        # elem[0] holds the word pairs and elem[1] the matching labels.
        # Transpose the pairs into two parallel tuples once, instead of
        # rebuilding the transposed list separately for each column.
        first_words, second_words = zip(*elem[0])
        first_elem = np.array(first_words, dtype=np.int32)
        second_elem = np.array(second_words, dtype=np.int32)
        # NOTE: this rebinds the notebook-level name `labels` on every batch.
        labels = np.array(elem[1], dtype=np.int32)
        X = [first_elem, second_elem]
        y = labels
        loss += model.train_on_batch(X, y)
    print(f"Epoch: {epoch}, Loss: {loss}")

2023-07-28 19:13:51.345281: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.
2023-07-28 19:13:51.616634: I tensorflow/core/grappler/optimizers/custom_graph_optimizer_registry.cc:113] Plugin optimizer for device_type GPU is enabled.


Epoch: 0, Loss: 6.930021107196808
Epoch: 1, Loss: 6.904720962047577
Epoch: 2, Loss: 6.876796245574951
Epoch: 3, Loss: 6.842806041240692
Epoch: 4, Loss: 6.800464391708374
Epoch: 5, Loss: 6.747705817222595
Epoch: 6, Loss: 6.6827380657196045
Epoch: 7, Loss: 6.604081630706787
Epoch: 8, Loss: 6.5106083154678345
Epoch: 9, Loss: 6.401570498943329
Epoch: 10, Loss: 6.276617050170898
Epoch: 11, Loss: 6.135801255702972
Epoch: 12, Loss: 5.9795748591423035
Epoch: 13, Loss: 5.808772087097168
Epoch: 14, Loss: 5.624581754207611
Epoch: 15, Loss: 5.428508639335632
Epoch: 16, Loss: 5.222320914268494
Epoch: 17, Loss: 5.007989436388016
Epoch: 18, Loss: 4.787619888782501
Epoch: 19, Loss: 4.5633814334869385
Epoch: 20, Loss: 4.337436705827713
Epoch: 21, Loss: 4.111874878406525
Epoch: 22, Loss: 3.888652265071869
Epoch: 23, Loss: 3.6695437133312225
Epoch: 24, Loss: 3.456105053424835
Epoch: 25, Loss: 3.2496496737003326
Epoch: 26, Loss: 3.051237612962723
Epoch: 27, Loss: 2.861675977706909
Epoch: 28, Loss: 2.68153

In [31]:
# 결과 확인
import gensim

with open("vectors.txt", "w") as f:
    f.write(f"{vocab_size - 1} {embedding_dim}\n")
    vectors = model.get_weights()[0]
    for word, i in tokenizer.word_index.items():
        f.write(f"{word}, {' '.join(map(str, list(vectors[i, :])))}\n")

w2v = gensim.models.KeyedVectors.load_word2vec_format("./vectors.txt", binary=False)

In [34]:
# Query the nearest neighbours of "people" in the loaded embeddings.
# NOTE(review): the recorded run of this cell raised KeyError. The lookup can
# only succeed if the keys stored in vectors.txt are the bare words, so check
# how that file is written if the error appears.
w2v.most_similar(positive=["people"])

KeyError: "Key 'people' not present in vocabulary"