### 사전학습된 모델
- p.295: 모델 학습
- p.333 - p.335
- word2vec 모델로 임베딩 계층 초기화


#### word2vec 모델 학습 w/ 네이버 영화리뷰 데이터

In [3]:
import torch.nn as nn

class VanillaSkipgram(nn.Module):
    """Skip-gram network: an embedding lookup followed by a linear projection.

    Args:
        vocab_size: Number of rows in the embedding table, and the number
            of output scores produced for every input id.
        embedding_dim: Width of each embedding vector.
    """

    def __init__(self, vocab_size, embedding_dim):
        super().__init__()
        # The embedding is created before the linear layer, so parameter
        # registration order stays the same as attribute order.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.linear = nn.Linear(embedding_dim, vocab_size)

    def forward(self, input_ids):
        """Return one score per vocabulary entry for each id in ``input_ids``."""
        return self.linear(self.embedding(input_ids))
	

In [4]:
import pandas as pd
from Korpora import Korpora
from konlpy.tag import Okt

In [5]:
# Load the "nsmc" corpus through Korpora and keep only its `test` split,
# wrapped in a DataFrame so the review text is reachable as `corpus.text`.
corpus = pd.DataFrame(Korpora.load("nsmc").test)

# Tokenize every review with Okt's `morphs`; `tokens` is a list holding one
# token list per review.
tokenizer = Okt()
tokens = list(map(tokenizer.morphs, corpus.text))



    Korpora 는 다른 분들이 연구 목적으로 공유해주신 말뭉치들을
    손쉽게 다운로드, 사용할 수 있는 기능만을 제공합니다.

    말뭉치들을 공유해 주신 분들에게 감사드리며, 각 말뭉치 별 설명과 라이센스를 공유 드립니다.
    해당 말뭉치에 대해 자세히 알고 싶으신 분은 아래의 description 을 참고,
    해당 말뭉치를 연구/상용의 목적으로 이용하실 때에는 아래의 라이센스를 참고해 주시기 바랍니다.

    # Description
    Author : e9t@github
    Repository : https://github.com/e9t/nsmc
    References : www.lucypark.kr/docs/2015-pyconkr/#39

    Naver sentiment movie corpus v1.0
    This is a movie review dataset in the Korean language.
    Reviews were scraped from Naver Movies.

    The dataset construction is based on the method noted in
    [Large movie review dataset][^1] from Maas et al., 2011.

    [^1]: http://ai.stanford.edu/~amaas/data/sentiment/

    # License
    CC0 1.0 Universal (CC0 1.0) Public Domain Dedication
    Details in https://creativecommons.org/publicdomain/zero/1.0/

[Korpora] Corpus `nsmc` is already installed at C:\Users\KDP-43\Korpora\nsmc\ratings_train.txt
[Korpora] Corpus `nsmc` is already installed at C:\Users\KD

In [None]:
# Sanity check: show the token lists of the first three reviews.
print(tokens[:3])

In [None]:
#단어 사전 구축
from collections import Counter

def build_vocab(corpus, n_vocab, special_tokens):
    """Build a vocabulary: special tokens first, then the most frequent tokens.

    Args:
        corpus: Iterable of token sequences (one sequence per document).
        n_vocab: Maximum number of corpus tokens to keep, chosen by
            descending frequency.
        special_tokens: Tokens placed at the front of the vocabulary.
            The passed list is copied and never modified.

    Returns:
        A new list containing ``special_tokens`` followed by up to
        ``n_vocab`` of the most common tokens in ``corpus``.
    """
    counter = Counter()
    for tokens in corpus:
        counter.update(tokens)

    # Copy instead of aliasing: extending the caller's list in place would
    # silently grow it (and keep growing it if the same list is reused).
    vocab = list(special_tokens)
    vocab.extend(token for token, _ in counter.most_common(n_vocab))
    return vocab

# Vocabulary: "<unk>" followed by up to 5000 of the most frequent tokens.
vocab = build_vocab(corpus=tokens, n_vocab=5000, special_tokens=["<unk>"])

# Token <-> index lookup tables. The value must be the enumerate index
# `idx`; using the builtin `id` here would map every token to a function
# object instead of an integer.
token_to_id = {token: idx for idx, token in enumerate(vocab)}
id_to_token = {idx: token for idx, token in enumerate(vocab)}

print(vocab[:10])
print(len(vocab))

In [None]:
# Skip-gram의 단어 쌍 추출
def get_word_pairs(tokens, window_size):
    """Collect skip-gram ``[center, context]`` pairs from tokenized sentences.

    Args:
        tokens: Iterable of sentences, each a sliceable sequence of tokens.
        window_size: How many neighbours on each side of the center word
            are used as context.

    Returns:
        A list of two-element lists ``[center_word, context_word]``, in
        sentence order and, within a sentence, left-to-right.
    """
    pairs = []
    for sentence in tokens:
        last = len(sentence)
        for position, center in enumerate(sentence):
            # Clamp the window so it never leaves the sentence.
            left = max(position - window_size, 0)
            right = min(position + window_size + 1, last)
            neighbours = sentence[left:position] + sentence[position + 1:right]
            pairs.extend([center, neighbour] for neighbour in neighbours)
    return pairs

# window_size => number of neighbouring words considered on each side
word_pairs = get_word_pairs(tokens, window_size=2)
# Print only a sample: the full list has one entry per (center, context)
# occurrence in the whole corpus and would flood the notebook output.
print(word_pairs[:5])

In [None]:
# word_pairs => 토큰 인덱스 쌍 변환
def get_index_pairs(word_pairs, token_to_id):
    """Translate ``[center, context]`` word pairs into index pairs.

    Args:
        word_pairs: Iterable of two-element ``[center_word, context_word]``
            items.
        token_to_id: Mapping from token to integer id; must contain
            ``"<unk>"``.

    Returns:
        A list of ``[center_index, context_index]`` lists. Words missing
        from ``token_to_id`` are replaced by the ``"<unk>"`` index.
    """
    unk_index = token_to_id["<unk>"]
    lookup = token_to_id.get
    return [
        [lookup(center, unk_index), lookup(context, unk_index)]
        for center, context in word_pairs
    ]

# Convert every (center, context) word pair to its pair of vocabulary
# indices and show the first five as a sanity check.
index_pairs = get_index_pairs(word_pairs, token_to_id)
print(index_pairs[:5])

In [6]:
# 데이터 로더 적용
import torch
from torch.utils.data import TensorDataset, DataLoader

In [None]:
# Turn the list of [center, context] index pairs into a single tensor and
# split its two columns: column 0 is the model input (feature), column 1 is
# the word to predict (target).
index_pairs = torch.tensor(index_pairs)
center_indexs, context_indexs = index_pairs[:, 0], index_pairs[:, 1]

# Serve the pairs in shuffled mini-batches of 32.
dataset = TensorDataset(center_indexs, context_indexs)
dataloader = DataLoader(dataset=dataset, batch_size=32, shuffle=True)

In [None]:
# skip-gram 모델 준비 작업
from torch import optim

# Run on the GPU when CUDA is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# One embedding row and one output score per vocabulary entry; 128-dim vectors.
word2vec = VanillaSkipgram(vocab_size=len(token_to_id), embedding_dim=128)
word2vec = word2vec.to(device)

# Cross-entropy over the vocabulary scores, optimized with plain SGD.
criterion = nn.CrossEntropyLoss().to(device)
optimizer = optim.SGD(params=word2vec.parameters(), lr=0.1)

In [None]:
# Train the skip-gram model for 10 epochs.
for epoch in range(10):
    cost = 0.0

    for input_ids, target_ids in dataloader:
        input_ids = input_ids.to(device)
        target_ids = target_ids.to(device)

        logits = word2vec(input_ids)
        loss = criterion(logits, target_ids)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Accumulate a plain Python float. Adding the loss tensor itself
        # would turn `cost` into a tensor that carries autograd history
        # from every batch of the epoch.
        cost += loss.item()

    # Mean loss per batch for this epoch.
    cost = cost / len(dataloader)

    print(f'Epoch: {epoch+1:4d}, Cost: {cost:.3f}')
	

In [None]:
# Extract the learned embedding values as a NumPy array (detached from the
# autograd graph and moved to the CPU first).
embedding_matrix = word2vec.embedding.weight.detach().cpu().numpy()

# Pair vocabulary entries with embedding rows positionally.
token_to_embedding = dict(zip(vocab, embedding_matrix))

# Inspect one token; change `index` to look at a different vocabulary entry.
index = 30
token = vocab[index]
token_embedding = token_to_embedding[token]

print(token)
print(token_embedding)


In [6]:
from gensim.models import Word2Vec

# Train a gensim Word2Vec model on the tokenized reviews. This rebinds the
# name `word2vec`, replacing any object previously stored under it.
# NOTE(review): parameter meanings below follow the gensim Word2Vec API
# documentation — confirm against the installed gensim version.
word2vec = Word2Vec(
    sentences=tokens,		# training corpus: one token list per review
    vector_size = 128,  # dimensionality of the word vectors
    window=5,  # context window size
    min_count=1,  # minimum token frequency to be kept
    sg=1,  # 1 selects skip-gram (per gensim docs)
    epochs=3,  # number of passes over the corpus
    max_final_vocab=10000  # upper bound on the final vocabulary size
)

In [7]:
# Save the trained model.
from pathlib import Path

# Make sure the output folder exists so the save below does not fail on a
# missing "./models" directory.
Path("./models").mkdir(parents=True, exist_ok=True)
word2vec.save("./models/word2vec.model")
