## 5.2. CBOW 임베딩 학습하기

In [1]:
import os
from argparse import Namespace
from collections import Counter
import json
import re
import string

import numpy as np
import pandas as pd
import torch 
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
import tqdm

In [2]:
class CBOWDataset(Dataset):
    def __init__(self, cbow_df, vectorizer):
        """
        매개변수:
            cbow_df (pandas.DataFrame): 데이터셋
            vectorizer (CBOWVectorizer): 데이터셋에서 만든 CBOWVectorizer 객체
        """
        self.cbow_df = cbow_df
        self._vectorizer = vectorizer
        
        measure_len = lambda context: len(context.split(" ")) # 각 문맥의 토큰 수
        self._max_seq_length = max(map(measure_len, cbow_df.context)) # 여러 문맥들의 최대 토큰 수
        
        self.train_df = self.cbow_df[self.cbow_df.split=='train']
        self.train_size = len(self.train_df)

        self.val_df = self.cbow_df[self.cbow_df.split=='val']
        self.validation_size = len(self.val_df)

        self.test_df = self.cbow_df[self.cbow_df.split=='test']
        self.test_size = len(self.test_df)

        self._lookup_dict = {'train': (self.train_df, self.train_size),
                             'val': (self.val_df, self.validation_size),
                             'test': (self.test_df, self.test_size)}

        self.set_split('train')

    @classmethod
    def load_dataset_and_make_vectorizer(cls, cbow_csv):
        """데이터셋을 로드하고 처음부터 새로운 Vectorizer 만들기
        
        매개변수:
            cbow_csv (str): 데이터셋의 위치
        반환값:
            CBOWDataset의 인스턴스
        """
        cbow_df = pd.read_csv(cbow_csv)
        train_cbow_df = cbow_df[cbow_df.split=='train']
        return cls(cbow_df, CBOWVectorizer.from_dataframe(train_cbow_df))

    @classmethod
    def load_dataset_and_load_vectorizer(cls, cbow_csv, vectorizer_filepath):
        """ 데이터셋을 로드하고 새로운 CBOWVectorizer 객체를 만듭니다.
        캐시된 CBOWVectorizer 객체를 재사용할 때 사용합니다.
        
        매개변수:
            cbow_csv (str): 데이터셋의 위치
            vectorizer_filepath (str): CBOWVectorizer 객체의 저장 위치
        반환값:
            CBOWVectorizer의 인스턴스
        """
        cbow_df = pd.read_csv(cbow_csv)
        vectorizer = cls.load_vectorizer_only(vectorizer_filepath)
        return cls(cbow_df, vectorizer)

    @staticmethod
    def load_vectorizer_only(vectorizer_filepath):
        """파일에서 CBOWVectorizer 객체를 로드하는 정적 메서드
        
        매개변수:
            vectorizer_filepath (str): 직렬화된 CBOWVectorizer 객체의 위치
        반환값:
            CBOWVectorizer의 인스턴스
        """
        with open(vectorizer_filepath) as fp:
            return CBOWVectorizer.from_serializable(json.load(fp))

    def save_vectorizer(self, vectorizer_filepath):
        """CBOWVectorizer 객체를 json 형태로 디스크에 저장합니다
        
        매개변수:
            vectorizer_filepath (str): CBOWVectorizer 객체의 저장 위치
        """
        with open(vectorizer_filepath, "w") as fp:
            json.dump(self._vectorizer.to_serializable(), fp)

    def get_vectorizer(self):
        """ 벡터 변환 객체를 반환합니다 """
        return self._vectorizer
        
    def set_split(self, split="train"):
        """ 데이터프레임에 있는 열을 사용해 분할 세트를 선택합니다 """
        self._target_split = split
        self._target_df, self._target_size = self._lookup_dict[split]

    def __len__(self):
        return self._target_size

    def __getitem__(self, index):
        """파이토치 데이터셋의 주요 진입 메서드
        
        매개변수:
            index (int): 데이터 포인트의 인덱스
        반환값:
            데이터 포인트의 특성(x_data)과 레이블(y_target)로 이루어진 딕셔너리
        """
        row = self._target_df.iloc[index]

        context_vector = \
            self._vectorizer.vectorize(row.context, self._max_seq_length)
        target_index = self._vectorizer.cbow_vocab.lookup_token(row.target)

        return {'x_data': context_vector,
                'y_target': target_index}

    def get_num_batches(self, batch_size):
        """배치 크기가 주어지면 데이터셋으로 만들 수 있는 배치 개수를 반환합니다
        
        매개변수:
            batch_size (int)
        반환값:
            배치 개수
        """
        return len(self) // batch_size
    
def generate_batches(dataset, batch_size, shuffle=True,
                     drop_last=True, device="cpu"): 
    """
    파이토치 DataLoader를 감싸고 있는 제너레이터 함수.
    걱 텐서를 지정된 장치로 이동합니다.
    """
    dataloader = DataLoader(dataset=dataset, batch_size=batch_size,
                            shuffle=shuffle, drop_last=drop_last)

    for data_dict in dataloader:
        out_data_dict = {}
        for name, tensor in data_dict.items():
            out_data_dict[name] = data_dict[name].to(device)
        yield out_data_dict

In [3]:
class Vocabulary(object):
    """ 매핑을 위해 텍스트를 처리하고 어휘 사전을 만드는 클래스 """

    def __init__(self, token_to_idx=None, mask_token="<MASK>", add_unk=True, unk_token="<UNK>"):
        """
        매개변수:
            token_to_idx (dict): 기존 토큰-인덱스 매핑 딕셔너리
            mask_token (str): Vocabulary에 추가할 MASK 토큰.
                모델 파라미터를 업데이트하는데 사용하지 않는 위치를 나타냅니다.
            add_unk (bool): UNK 토큰을 추가할지 지정하는 플래그
            unk_token (str): Vocabulary에 추가할 UNK 토큰
        """

        if token_to_idx is None:
            token_to_idx = {}
        self._token_to_idx = token_to_idx

        self._idx_to_token = {idx: token 
                              for token, idx in self._token_to_idx.items()}
        
        self._add_unk = add_unk
        self._unk_token = unk_token
        self._mask_token = mask_token
        
        self.mask_index = self.add_token(self._mask_token)
        self.unk_index = -1
        if add_unk:
            self.unk_index = self.add_token(unk_token) 
        
    def to_serializable(self):
        """ 직렬화할 수 있는 딕셔너리를 반환합니다 """
        return {'token_to_idx': self._token_to_idx, 
                'add_unk': self._add_unk, 
                'unk_token': self._unk_token, 
                'mask_token': self._mask_token}

    @classmethod
    def from_serializable(cls, contents):
        """ 직렬화된 딕셔너리에서 Vocabulary 객체를 만듭니다 """
        return cls(**contents)

    def add_token(self, token):
        """ 토큰을 기반으로 매핑 딕셔너리를 업데이트합니다

        매개변수:
            token (str): Vocabulary에 추가할 토큰
        반환값:
            index (int): 토큰에 상응하는 정수
        """
        if token in self._token_to_idx:
            index = self._token_to_idx[token]
        else:
            index = len(self._token_to_idx)
            self._token_to_idx[token] = index
            self._idx_to_token[index] = token
        return index
            
    def add_many(self, tokens):
        """ 토큰 리스트를 Vocabulary에 추가합니다.
        
        매개변수:
            tokens (list): 문자열 토큰 리스트
        반환값:
            indices (list): 토큰 리스트에 상응되는 인덱스 리스트
        """
        return [self.add_token(token) for token in tokens]

    def lookup_token(self, token):
        """ 토큰에 대응하는 인덱스를 추출합니다.
        토큰이 없으면 UNK 인덱스를 반환합니다.
        
        매개변수:
            token (str): 찾을 토큰 
        반환값:
            index (int): 토큰에 해당하는 인덱스
        노트:
            UNK 토큰을 사용하려면 (Vocabulary에 추가하기 위해)
            `unk_index`가 0보다 커야 합니다.
        """
        if self.unk_index >= 0:
            return self._token_to_idx.get(token, self.unk_index)
        else:
            return self._token_to_idx[token]

    def lookup_index(self, index):
        """ 인덱스에 해당하는 토큰을 반환합니다.
        
        매개변수: 
            index (int): 찾을 인덱스
        반환값:
            token (str): 인텍스에 해당하는 토큰
        에러:
            KeyError: 인덱스가 Vocabulary에 없을 때 발생합니다.
        """
        if index not in self._idx_to_token:
            raise KeyError("the index (%d) is not in the Vocabulary" % index)
        return self._idx_to_token[index]

    def __str__(self):
        return "<Vocabulary(size=%d)>" % len(self)

    def __len__(self):
        return len(self._token_to_idx)

In [4]:
class CBOWVectorizer(object):
    """ 어휘 사전을 생성하고 관리합니다 """
    def __init__(self, cbow_vocab):
        """
        매개변수:
            cbow_vocab (Vocabulary): 단어를 정수에 매핑합니다
        """
        self.cbow_vocab = cbow_vocab

    def vectorize(self, context, vector_length=-1):
        """
        매개변수:
            context (str): 공백으로 나누어진 단어 문자열
            vector_length (int): 인덱스 벡터의 길이 매개변수
        """

        indices = [self.cbow_vocab.lookup_token(token) for token in context.split(' ')]
        if vector_length < 0:
            vector_length = len(indices)

        out_vector = np.zeros(vector_length, dtype=np.int64)
        out_vector[:len(indices)] = indices
        out_vector[len(indices):] = self.cbow_vocab.mask_index

        return out_vector
    
    @classmethod
    def from_dataframe(cls, cbow_df):
        """데이터셋 데이터프레임에서 Vectorizer 객체를 만듭니다
        
        매개변수::
            cbow_df (pandas.DataFrame): 타깃 데이터셋
        반환값:
            CBOWVectorizer 객체
        """
        cbow_vocab = Vocabulary()
        for index, row in cbow_df.iterrows():
            for token in row.context.split(' '):
                cbow_vocab.add_token(token)
            cbow_vocab.add_token(row.target)
            
        return cls(cbow_vocab)

    @classmethod
    def from_serializable(cls, contents):
        cbow_vocab = \
            Vocabulary.from_serializable(contents['cbow_vocab'])
        return cls(cbow_vocab=cbow_vocab)

    def to_serializable(self):
        return {'cbow_vocab': self.cbow_vocab.to_serializable()}

## 모델

In [5]:
class CBOWClassifier(nn.Module): # Simplified cbow Model
    def __init__(self, vocabulary_size, embedding_size, padding_idx=0):
        """
        매개변수:
            vocabulary_size (int): 어휘 사전 크기, 임베딩 개수와 예측 벡터 크기를 결정합니다
            embedding_size (int): 임베딩 크기
            padding_idx (int): 기본값 0; 임베딩은 이 인덱스를 사용하지 않습니다
        """
        super(CBOWClassifier, self).__init__()
        
        self.embedding =  nn.Embedding(num_embeddings=vocabulary_size, 
                                       embedding_dim=embedding_size,
                                       padding_idx=padding_idx)
        self.fc1 = nn.Linear(in_features=embedding_size,
                             out_features=vocabulary_size)

    def forward(self, x_in, apply_softmax=False):
        """분류기의 정방향 계산
        
        매개변수:
            x_in (torch.Tensor): 입력 데이터 텐서 
                x_in.shape는 (batch, input_dim)입니다.
            apply_softmax (bool): 소프트맥스 활성화 함수를 위한 플래그
                크로스-엔트로피 손실을 사용하려면 False로 지정합니다
        반환값:
            결과 텐서. tensor.shape은 (batch, output_dim)입니다.
        """
        x_embedded_sum = F.dropout(self.embedding(x_in).sum(dim=1), 0.3)
        y_out = self.fc1(x_embedded_sum)
        
        if apply_softmax:
            y_out = F.softmax(y_out, dim=1)
            
        return y_out

## 모델 훈련

In [6]:
def make_train_state(args):
    return {'stop_early': False,
            'early_stopping_step': 0,
            'early_stopping_best_val': 1e8,
            'learning_rate': args.learning_rate,
            'epoch_index': 0,
            'train_loss': [],
            'train_acc': [],
            'val_loss': [],
            'val_acc': [],
            'test_loss': -1,
            'test_acc': -1,
            'model_filename': args.model_state_file}

def update_train_state(args, model, train_state):
    """ 훈련 상태를 업데이트합니다.

    Components:
     - 조기 종료: 과대 적합 방지
     - 모델 체크포인트: 더 나은 모델을 저장합니다

    :param args: 메인 매개변수
    :param model: 훈련할 모델
    :param train_state: 훈련 상태를 담은 딕셔너리
    :returns:
        새로운 훈련 상태
    """

    # 적어도 한 번 모델을 저장합니다
    if train_state['epoch_index'] == 0:
        torch.save(model.state_dict(), train_state['model_filename'])
        train_state['stop_early'] = False

    # 성능이 향상되면 모델을 저장합니다
    elif train_state['epoch_index'] >= 1:
        loss_tm1, loss_t = train_state['val_loss'][-2:]

        # 손실이 나빠지면
        if loss_t >= train_state['early_stopping_best_val']:
            # 조기 종료 단계 업데이트
            train_state['early_stopping_step'] += 1
        # 손실이 감소하면
        else:
            # 최상의 모델 저장
            if loss_t < train_state['early_stopping_best_val']:
                torch.save(model.state_dict(), train_state['model_filename'])

            # 조기 종료 단계 재설정
            train_state['early_stopping_step'] = 0

        # 조기 종료 여부 확인
        train_state['stop_early'] = \
            train_state['early_stopping_step'] >= args.early_stopping_criteria

    return train_state

def compute_accuracy(y_pred, y_target):
    _, y_pred_indices = y_pred.max(dim=1)
    n_correct = torch.eq(y_pred_indices, y_target).sum().item()
    return n_correct / len(y_pred_indices) * 100

In [7]:
def set_seed_everywhere(seed, cuda):
    np.random.seed(seed)
    torch.manual_seed(seed)
    if cuda:
        torch.cuda.manual_seed_all(seed)

def handle_dirs(dirpath):
    if not os.path.exists(dirpath):
        os.makedirs(dirpath)

In [8]:
args = Namespace(
    # 날짜와 경로 정보
    cbow_csv="data/books/frankenstein_with_splits.csv",
    vectorizer_file="vectorizer.json",
    model_state_file="model.pth",
    save_dir="model_storage/ch5/cbow",
    # 모델 하이퍼파라미터
    embedding_size=50,
    # 훈련 하이퍼파라미터
    seed=1337,
    num_epochs=100,
    learning_rate=0.0001,
    batch_size=32,
    early_stopping_criteria=5,
    # 실행 옵션
    cuda=True,
    catch_keyboard_interrupt=True,
    reload_from_files=False,
    expand_filepaths_to_save_dir=True
)

if args.expand_filepaths_to_save_dir:
    args.vectorizer_file = os.path.join(args.save_dir,
                                        args.vectorizer_file)

    args.model_state_file = os.path.join(args.save_dir,
                                         args.model_state_file)
    
    print("파일 경로: ")
    print("\t{}".format(args.vectorizer_file))
    print("\t{}".format(args.model_state_file))
    

# CUDA 체크
if not torch.cuda.is_available():
    args.cuda = False

args.device = torch.device("cuda" if args.cuda else "cpu")
    
print("CUDA 사용여부: {}".format(args.cuda))

# 재현성을 위해 시드 설정
set_seed_everywhere(args.seed, args.cuda)

# 디렉토리 처리
handle_dirs(args.save_dir)

파일 경로: 
	model_storage/ch5/cbow\vectorizer.json
	model_storage/ch5/cbow\model.pth
CUDA 사용여부: False


In [9]:
if args.reload_from_files:
    print("데이터셋과 Vectorizer를 로드합니다")
    dataset = CBOWDataset.load_dataset_and_load_vectorizer(args.cbow_csv,
                                                           args.vectorizer_file)
else:
    print("데이터셋을 로드하고 Vectorizer를 만듭니다")
    dataset = CBOWDataset.load_dataset_and_make_vectorizer(args.cbow_csv)
    dataset.save_vectorizer(args.vectorizer_file)
    
vectorizer = dataset.get_vectorizer()

classifier = CBOWClassifier(vocabulary_size=len(vectorizer.cbow_vocab), 
                            embedding_size=args.embedding_size)


데이터셋을 로드하고 Vectorizer를 만듭니다


In [10]:
classifier = classifier.to(args.device)
    
loss_func = nn.CrossEntropyLoss()
optimizer = optim.Adam(classifier.parameters(), lr=args.learning_rate)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer=optimizer,
                                                 mode='min', factor=0.5,
                                                 patience=1)
train_state = make_train_state(args)

epoch_bar = tqdm.notebook.tqdm(desc='training routine', 
                               total=args.num_epochs,
                               position=0)

dataset.set_split('train')
train_bar = tqdm.notebook.tqdm(desc='split=train',
                               total=dataset.get_num_batches(args.batch_size), 
                               position=1, 
                               leave=True)
dataset.set_split('val')
val_bar = tqdm.notebook.tqdm(desc='split=val',
                             total=dataset.get_num_batches(args.batch_size), 
                             position=1, 
                             leave=True)

try:
    for epoch_index in range(args.num_epochs):
        train_state['epoch_index'] = epoch_index

        # 훈련 세트에 대한 순회

        # 훈련 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('train')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.0
        running_acc = 0.0
        classifier.train()

        for batch_index, batch_dict in enumerate(batch_generator):
            # 훈련 과정은 5단계로 이루어집니다

            # --------------------------------------
            # 단계 1. 그레이디언트를 0으로 초기화합니다
            optimizer.zero_grad()

            # 단계 2. 출력을 계산합니다
            y_pred = classifier(x_in=batch_dict['x_data'])

            # 단계 3. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 4. 손실을 사용해 그레이디언트를 계산합니다
            loss.backward()

            # 단계 5. 옵티마이저로 가중치를 업데이트합니다
            optimizer.step()
            # -----------------------------------------
            
            # 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)

            # 진행 바 업데이트
            train_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            train_bar.update()

        train_state['train_loss'].append(running_loss)
        train_state['train_acc'].append(running_acc)

        # 검증 세트에 대한 순회

        # 검증 세트와 배치 제너레이터 준비, 손실과 정확도를 0으로 설정
        dataset.set_split('val')
        batch_generator = generate_batches(dataset, 
                                           batch_size=args.batch_size, 
                                           device=args.device)
        running_loss = 0.
        running_acc = 0.
        classifier.eval()

        for batch_index, batch_dict in enumerate(batch_generator):

            # 단계 1. 출력을 계산합니다
            y_pred =  classifier(x_in=batch_dict['x_data'])

            # 단계 2. 손실을 계산합니다
            loss = loss_func(y_pred, batch_dict['y_target'])
            loss_t = loss.item()
            running_loss += (loss_t - running_loss) / (batch_index + 1)

            # 단계 3. 정확도를 계산합니다
            acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
            running_acc += (acc_t - running_acc) / (batch_index + 1)
            val_bar.set_postfix(loss=running_loss, acc=running_acc, 
                            epoch=epoch_index)
            val_bar.update()

        train_state['val_loss'].append(running_loss)
        train_state['val_acc'].append(running_acc)

        train_state = update_train_state(args=args, model=classifier,
                                         train_state=train_state)

        scheduler.step(train_state['val_loss'][-1])

        if train_state['stop_early']:
            break

        train_bar.n = 0
        val_bar.n = 0
        epoch_bar.update()
except KeyboardInterrupt:
    print("Exiting loop")

training routine:   0%|          | 0/100 [00:00<?, ?it/s]

split=train:   0%|          | 0/1984 [00:00<?, ?it/s]

split=val:   0%|          | 0/425 [00:00<?, ?it/s]

## 모델 성능 평가
### test set 손실&정확도 계산

In [42]:
# 가장 좋은 모델을 사용해 테스트 세트의 손실과 정확도를 계산합니다
classifier.load_state_dict(torch.load(train_state['model_filename']))
classifier = classifier.to(args.device)
loss_func = nn.CrossEntropyLoss()

dataset.set_split('test')
batch_generator = generate_batches(dataset, 
                                   batch_size=args.batch_size, 
                                   device=args.device)
running_loss = 0.
running_acc = 0.
classifier.eval()

for batch_index, batch_dict in enumerate(batch_generator):
    # 출력을 계산합니다
    y_pred =  classifier(x_in=batch_dict['x_data'])
    
    # 손실을 계산합니다
    loss = loss_func(y_pred, batch_dict['y_target'])
    loss_t = loss.item()
    running_loss += (loss_t - running_loss) / (batch_index + 1)

    # 정확도를 계산합니다
    acc_t = compute_accuracy(y_pred, batch_dict['y_target'])
    running_acc += (acc_t - running_acc) / (batch_index + 1)

train_state['test_loss'] = running_loss
train_state['test_acc'] = running_acc

print("테스트 손실: {};".format(train_state['test_loss']))
print("테스트 정확도: {}".format(train_state['test_acc']))

테스트 손실: 7.669673863579242;
테스트 정확도: 13.227941176470592


### 훈련된 임베딩

In [43]:
def pretty_print(results):
    """
    임베딩 결과를 출력합니다
    """
    for item in results:
        print ("...[%.2f] - %s"%(item[1], item[0]))

def get_closest(target_word, word_to_idx, embeddings, n=5):
    """
    n개의 최근접 단어를 찾습니다.
    """

    # 다른 모든 단어까지 거리를 계산합니다
    word_embedding = embeddings[word_to_idx[target_word.lower()]]
    distances = []
    for word, index in word_to_idx.items():
        if word == "<MASK>" or word == target_word:
            continue
        distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
    results = sorted(distances, key=lambda x: x[1])[1:n+2]
    return results

word = input('단어를 입력해 주세요: ')
embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx
pretty_print(get_closest(word, word_to_idx, embeddings, n=5))

단어를 입력해 주세요: monster
...[7.57] - cares
...[7.70] - griefs
...[7.74] - saw
...[7.78] - confused
...[7.81] - without
...[7.82] - truly


In [62]:
word_embedding = embeddings[word_to_idx["saw".lower()]]
distances = []
for word, index in word_to_idx.items():
    if word == "<MASK>" or word == target_word:
        continue
    distances.append((word, torch.dist(word_embedding, embeddings[index])))
    
results = sorted(distances, key=lambda x: x[1])
print(results)

[('saw', tensor(0.)), ('without', tensor(6.1729)), ('unveiled', tensor(6.2092)), ('us', tensor(6.2110)), ('calmed', tensor(6.3558)), ('jaws', tensor(6.3924)), ('mountains', tensor(6.4001)), ('perish', tensor(6.4593)), ('beaming', tensor(6.4789)), ('fitness', tensor(6.4866)), ('order', tensor(6.5305)), ('enounced', tensor(6.5311)), ('dungeon', tensor(6.5351)), ('beneficence', tensor(6.5360)), ('exploit', tensor(6.5584)), ('unsullied', tensor(6.5904)), ('urged', tensor(6.6033)), ('portion', tensor(6.6164)), ('surprising', tensor(6.6257)), ('mandate', tensor(6.6296)), ('benevolently', tensor(6.6335)), ('conveniently', tensor(6.6564)), ('company', tensor(6.6725)), ('inaction', tensor(6.6825)), ('whither', tensor(6.7026)), ('intercourse', tensor(6.7181)), ('respected', tensor(6.7233)), ('joys', tensor(6.7315)), ('assassinated', tensor(6.7422)), ('conducted', tensor(6.7459)), ('grievously', tensor(6.7834)), ('spread', tensor(6.7844)), ('encouraged', tensor(6.7946)), ('rain', tensor(6.8084)),

In [61]:
torch.dist(embeddings[100], embeddings[10])

tensor(10.6257)

In [58]:
embeddings[1100]

tensor([-0.1638, -0.2707,  0.4938,  0.6275, -0.7259,  1.4611,  0.9130, -0.0573,
        -2.3009,  0.3934,  0.5329,  0.8500,  0.6159,  0.0756,  0.0046, -0.5524,
         0.0709, -1.9389, -0.7591, -0.0961, -1.1743, -0.9154, -0.6987,  1.3813,
         0.2470, -1.0582, -0.4292, -0.2194, -0.8064,  0.1861, -0.2469,  1.1752,
        -0.6806, -0.7201,  0.3207, -0.7042, -0.1947, -0.3473,  1.4104,  1.7768,
        -1.1297,  0.5590, -0.4131, -0.2548,  0.7149, -2.1836,  0.6149,  0.2252,
         1.0097, -0.2807])

In [59]:
embeddings[1100].shape

torch.Size([50])

In [56]:
pretty_print(get_closest("saw", word_to_idx, embeddings, n=5))

...[6.21] - unveiled
...[6.21] - us
...[6.36] - calmed
...[6.39] - jaws
...[6.40] - mountains
...[6.46] - perish


In [45]:
embeddings

tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.3514, -0.2759, -1.5108,  ..., -1.3651, -0.1655,  0.9623],
        [-0.2085, -0.2980, -0.5346,  ..., -0.6181, -0.5156,  0.1985],
        ...,
        [ 1.9641,  0.4243,  0.9431,  ..., -0.4961,  0.8868,  0.0615],
        [ 0.5473, -0.2489,  0.6948,  ..., -0.1109, -1.4194, -0.3190],
        [-1.0696,  1.8218,  0.3448,  ...,  0.9619, -0.3866,  0.5445]])

In [46]:
embeddings.shape

torch.Size([6138, 50])

In [48]:
classifier

CBOWClassifier(
  (embedding): Embedding(6138, 50, padding_idx=0)
  (fc1): Linear(in_features=50, out_features=6138, bias=True)
)

In [47]:
classifier.embedding

Embedding(6138, 50, padding_idx=0)

In [49]:
classifier.embedding.weight

Parameter containing:
tensor([[ 0.0000,  0.0000,  0.0000,  ...,  0.0000,  0.0000,  0.0000],
        [ 1.3514, -0.2759, -1.5108,  ..., -1.3651, -0.1655,  0.9623],
        [-0.2085, -0.2980, -0.5346,  ..., -0.6181, -0.5156,  0.1985],
        ...,
        [ 1.9641,  0.4243,  0.9431,  ..., -0.4961,  0.8868,  0.0615],
        [ 0.5473, -0.2489,  0.6948,  ..., -0.1109, -1.4194, -0.3190],
        [-1.0696,  1.8218,  0.3448,  ...,  0.9619, -0.3866,  0.5445]],
       requires_grad=True)

In [44]:
target_words = ['frankenstein', 'monster', 'science', 'sickness', 'lonely', 'happy']

embeddings = classifier.embedding.weight.data
word_to_idx = vectorizer.cbow_vocab._token_to_idx

for target_word in target_words: 
    print(f"======={target_word}=======")
    if target_word not in word_to_idx:
        print("Not in vocabulary")
        continue
    pretty_print(get_closest(target_word, word_to_idx, embeddings, n=5))

...[7.24] - irradiated
...[7.68] - enslaved
...[7.71] - men
...[7.75] - gush
...[7.76] - mode
...[7.76] - austria
...[7.57] - cares
...[7.70] - griefs
...[7.74] - saw
...[7.78] - confused
...[7.81] - without
...[7.82] - truly
...[7.02] - mutual
...[7.02] - impression
...[7.06] - mist
...[7.16] - swelling
...[7.24] - darkened
...[7.30] - tempted
...[6.21] - while
...[6.59] - awoke
...[6.60] - foundations
...[6.66] - consoles
...[6.69] - literally
...[6.69] - know
...[6.77] - excessive
...[6.85] - moonlight
...[6.90] - ought
...[7.10] - bed
...[7.12] - three
...[7.20] - superhuman
...[6.33] - bottom
...[6.42] - penetrated
...[6.44] - wand
...[6.52] - chivalry
...[6.52] - joys
...[6.53] - altered


In [11]:
len(vectorizer.cbow_vocab)

6138

In [12]:
y_pred

tensor([[ -5.9272,  -6.0011,   3.6612,  ...,  -6.1245,  -1.2402,  -5.9829],
        [ -7.6743,  -7.7506,   1.6867,  ...,  -2.5098,  -2.3850,  -7.7347],
        [ -4.3471,  -4.4243,   2.1376,  ...,  -0.7625,   1.5628,  -4.4088],
        ...,
        [ -8.9114,  -8.9893,   2.5032,  ...,  -4.7501,  -5.5929,  -8.9697],
        [-15.9812, -16.0588,   3.4174,  ...,  -6.6900,  -5.7756, -16.0391],
        [-10.8336, -10.9087,   4.5460,  ...,  -6.6397,  -6.6852, -10.8904]],
       grad_fn=<AddmmBackward0>)

In [13]:
y_pred.shape

torch.Size([32, 6138])

In [19]:
batch_dict

{'x_data': tensor([[   1,   39,    1, 1132, 1023,   39],
         [   4,    1,    2, 4792,  230,  208],
         [  50,  868,   15,    0,    0,    0],
         [ 226,  620,  173, 2490, 3710,  230],
         [   4, 1054,    1,  334,  589, 1082],
         [1177, 3260,   72,    4,  496, 3286],
         [3253,  235,   49,   19,  559,   19],
         [ 319,    1,    8, 1353,  413,   15],
         [ 319,   27,  115,    4, 3246,   33],
         [ 909,   19, 2762,   50, 1360,   49],
         [ 337, 2487,  255,    0,    0,    0],
         [   4,    1,  632, 4319, 3387,    2],
         [  82,    2, 2523,    0,    0,    0],
         [1193,   82, 2550,  666,   49, 1360],
         [1637,  204,   50,    2,   49, 3412],
         [ 319,    2,  215, 1300,   50, 1553],
         [  90,    4, 4308,  157, 1245,    2],
         [  27,  677,   36,  209,  224,    2],
         [ 355,  255,    4,    0,    0,    0],
         [1973, 4685,    8, 3811,  160,  127],
         [ 319,  173,  819,   15,   43,    0],
   

In [21]:
batch_dict['x_data'].shape

torch.Size([32, 6])

In [22]:
batch_dict['y_target'].shape

torch.Size([32])

In [30]:
dataloader = DataLoader(dataset=dataset, batch_size=32,
                            shuffle=True, drop_last=True)

c = 0

for data_dict in dataloader:
    print(data_dict)
    c += 1
    
print(c) # 425

{'x_data': tensor([[ 732,  845,  474,    0,    0,    0],
        [   1,   33,  230,   15,   43,    0],
        [ 334,  290, 5631,    0,    0,    0],
        [ 598,    8,   72, 4608,  225,   48],
        [ 590,   60, 5721,  229,   49, 3025],
        [   2,    1,  901, 1907,    1,  550],
        [   1,   33, 3457,    0,    0,    0],
        [5835,   39,   50,   27,  264,   50],
        [   4, 4718,   33,   15,   43,    0],
        [  43, 5520,   53,   72,    0,    0],
        [   2,  492, 1106,    1,   15,   43],
        [   8, 1229,   48,   72, 1921,  894],
        [  72,    1,  319,  516,    4,  256],
        [  49,   72,  314,    2,   36,   48],
        [ 268,   90,  215,   77, 1263, 2853],
        [5997,    2,   49, 1884,  338, 4618],
        [  48,   49,    1,  766,    0,    0],
        [  23,    2,   86, 3462,    3, 3375],
        [4104,    2, 3389,    4,  559,   33],
        [1914, 3398,   33,  688,  760,   15],
        [   4,    1,   15,    0,    0,    0],
        [1132, 2627, 13

{'x_data': tensor([[1628,  417,    4,   33,   50,  337],
        [  48,   37,   19,   15,   43,    0],
        [   2,  225,   34, 1928,  209, 3599],
        [ 215,  216,   33,    3,  218,   15],
        [ 398, 1101, 5626,   19,    4, 4692],
        [  48,  272,   19,   77,  174,  334],
        [ 160,  137, 1448,  151,  565,    2],
        [  60,   27,  396,  908, 1555,   49],
        [  50,  337,  338,   49, 2493,  326],
        [  24,  218, 4236,    1,   49, 3620],
        [ 482,    4,    1,   72, 4319, 3387],
        [  90,   50,  200,   49,   50, 3292],
        [ 319, 2241,   49,   15,    1, 1388],
        [4979,    2,   48,   37,  666,    2],
        [ 273,    2,  239,   28, 2490, 4355],
        [ 215,    2,    4, 3868, 3869,   49],
        [  82, 3770,   33, 1343,    2,   49],
        [  48,   60,    4,  796,    0,    0],
        [  48,  319,   60, 1279,   33, 2682],
        [  68,   48,  620,   15,   43,    0],
        [   2,  255,   72,  482, 1008, 1131],
        [2934, 1894,   

{'x_data': tensor([[ 230, 4104, 3195,  230,    1,    1],
        [1935,    1,   50,   92,    4,  154],
        [ 109,    4, 1992,   48,  334,  770],
        [ 319, 1031,   15,    0,    0,    0],
        [  50, 1237,   49,    1,    1,   19],
        [  72, 2618, 3075,  290,   50, 4842],
        [  23,  152,    2,  239,    1,    0],
        [4310,    2,  239, 3228,   33,  491],
        [  60, 2553,  417,  536,   15,   43],
        [ 982, 1802, 1082,    1,   49,    1],
        [  33,  199,   49,    2,   36,    1],
        [  90,    4, 2080,   68,   65,   68],
        [2452,   33,  345,   43,    0,    0],
        [ 496,   33,   50, 5501,  115,   19],
        [3159,   19,   82,   49,  358,   33],
        [   4,  124, 3210,    4,  132,    0],
        [2785,   82,    4, 4500,  326, 1513],
        [  49, 1543,    2,    4,    1,   39],
        [ 137,  622,  319,  402,   33,  137],
        [ 151, 6121,    2, 2217,   19, 5176],
        [  27,   23, 3462,  278,   19,   82],
        [ 895,    2,  7

{'x_data': tensor([[ 417,  178,    3,  474,   43,    0],
        [ 819,    2, 2805,   50,    1,    2],
        [   4,    1,   33, 3329,   48,   24],
        [3906,    2,   48,    4, 2042,   13],
        [  27,  115, 1174,   48,  334, 1213],
        [ 620,  173, 3123,   90,  115, 5162],
        [2831,   27,   40, 4204,    4, 1159],
        [ 127,    1, 4537,    4, 4500,    4],
        [2477,  136, 2474,   43,    0,    0],
        [  37,   19, 1006,   43,    0,    0],
        [  24,  564,   60,   49, 1360,    2],
        [  27, 1136,    1,   50,  200,    2],
        [  48,    4, 1714,    2,    0,    0],
        [   1,   90,    4,  556,   50,  362],
        [   2,  377,  228,   23,  864,   82],
        [   1, 3053,   68,   48,    0,    0],
        [  48,   28, 2490,   60,    4,  796],
        [   4,  402,   90, 1082,    2,   49],
        [   2,   49,   48, 4316,  939,   33],
        [2918,   27,  326, 1513,  715,    4],
        [   1,    4,  150,  981,   49, 5526],
        [  60,  140, 10

{'x_data': tensor([[  72, 4702,    1,  335, 3455,   49],
        [ 257,   72, 4042,  471,  819, 4365],
        [  85, 5520,   53,  185, 3752, 1017],
        [ 160,    4,    1,  255,    4,  396],
        [   4, 4500,    2,  160,    1,  212],
        [ 151, 1733,  251,  160,  716,    0],
        [  27,   48,  409,   82, 1737,   33],
        [ 145,   19, 1336, 2209,   33,    4],
        [ 819, 4365, 1081,    2,   60,   36],
        [ 334,    1,    4, 3398,   33,   72],
        [3253, 3060,   49,    2,    1,  474],
        [1099,   49,    1,   33, 1992,   15],
        [   4,  879,   33,    1, 3867,   15],
        [ 115,  356, 4578,    0,    0,    0],
        [3863,   68,  212,  543,   72,  968],
        [  60,   85, 1070,  239,  225,   48],
        [ 209,  116,  319,    2, 1040,  455],
        [1802, 1082,   19, 4702,    1,   48],
        [   1,  470,    2, 1533,   48,   63],
        [ 136,   72, 2632,   49,   48,  783],
        [  48,  666,   33,  334,  390,    2],
        [  48,  334, 60

{'x_data': tensor([[2195,    2,  139, 1095,   19,    4],
        [  48,  783, 1584,  326, 2533,   19],
        [   1,  239,  149,  334, 2416,   60],
        [1194,   48,  140,   33,    4,  496],
        [  27,   33, 6049,   19,  567,  165],
        [ 326,   28, 3835,    0,    0,    0],
        [  72,  968, 1234, 1762,   60,   50],
        [1139,   50,  271,  140,  349,   60],
        [3833,   90,   50,   15,   43,    0],
        [   2,   60,   36,  226,  326,    1],
        [ 420,   85,    1,  239,    2, 3422],
        [  48,  830,  406,    2,   49,    2],
        [  53,   64, 1607,   43,    0,    0],
        [   2,   48,  319,   60,   72, 5466],
        [ 136,  765,   49,   39, 1263,   83],
        [1132,  845,   15,    0,    0,    0],
        [  49,    1,    2,  136,  140, 4429],
        [ 334,  393,   60,    0,    0,    0],
        [1013, 3688,   90, 5620,   60, 3318],
        [ 230, 2593,  235,   19,  720,    0],
        [ 770, 1555,  438, 2970,   33,   72],
        [6125,  215,   

{'x_data': tensor([[1221,   49,  569,    1,   15,   43],
        [  39, 1637,   48, 3798,    0,    0],
        [1008,   37,   64, 1320,    0,    0],
        [4355,   33, 1774,    0,    0,    0],
        [   2, 1078, 4495,    4, 4837,    2],
        [ 319,   19,  109, 3052,   15,   43],
        [ 264,  358, 1337,   48,    0,    0],
        [ 337,  319, 5079,    0,    0,    0],
        [  24,  109,   39,  160,  137,    1],
        [ 549,   27,  620, 1819,   15,   43],
        [4387, 1087, 1278, 4475,   19,    0],
        [   4, 3823,   33,  496, 1851, 4277],
        [  19, 1439,  192,   15,   43,    0],
        [  72, 5211,    2,    2,    1,    8],
        [ 567,   23,   60,  417,   72,  536],
        [  49, 3053,   68,    0,    0,    0],
        [  49,   39,   72,   33, 3228,  173],
        [1245,  160, 1192,   43,    0,    0],
        [1967, 1993,   49,  313,   51, 1234],
        [   4,   33,    1, 2169,    0,    0],
        [ 832,    4, 1068,   50, 1940,    2],
        [  82,  278,  2

{'x_data': tensor([[ 319,   50, 4252,    0,    0,    0],
        [4632,   39,  230, 2610,   15,   43],
        [  33,  762, 2947, 2621,    4, 3996],
        [  37,  290,  358,   60,    4, 3241],
        [  92,    4, 3260,   49, 1299,   72],
        [ 906,    2,  492,  567,   27,   48],
        [   2,   68,   72,    1,  136,   82],
        [ 398,   68,    1,   72,  292,    2],
        [ 559,   33,    4,    2,  482,   34],
        [ 226,  334,  290,  553,    4,  329],
        [   1,   19,  137,   15,   43,    0],
        [   4, 1983, 1719,   82,   39,    0],
        [   4,  787,  319,    0,    0,    0],
        [2017,   77,   82,    0,    0,    0],
        [  48,  319,  598,   72,  845,   33],
        [5514,    2,   49,  444, 3365,    2],
        [2017, 3060,   19,  799,   39, 3501],
        [1095,   19,    4,   48, 5259,    2],
        [  27,   48,  319,    4,    1, 5839],
        [1234,  204,  151, 5176,  160,  313],
        [ 414, 1116,  136,  521,   49,    1],
        [  19,  345,   

{'x_data': tensor([[  50, 5207,   49,    1,  326,  213],
        [3457,   49, 4040,  471, 2425,   82],
        [  49,  264,   23,   82,   60,   85],
        [2404, 1357,    4,   33,   72,  688],
        [   1, 1175,   85, 3742,    4,  271],
        [ 264,  136,  358,   48,  334,  666],
        [  19, 1970,  192,   49,   19,    1],
        [   4,   33,  766, 1640,    0,    0],
        [ 226,  395, 3788,    8,  230, 2490],
        [1637,   33, 1992,   43,    0,    0],
        [ 766, 1476,  766, 1283,    0,    0],
        [3262,  365,  601, 5640,   49,    1],
        [ 235,    2,   48, 1584,   36,  326],
        [ 239,  159,   28, 3813,    0,    0],
        [   8,    4, 3811,  127,    1,    1],
        [ 592,   48, 4678,  136,  351,    1],
        [1279,   33,  282, 2653,   19, 1992],
        [ 564,   50,  345,   27, 5413, 4579],
        [2995,    4,  229,  109,  806, 1657],
        [  27,    2, 4095,   68,   23,  145],
        [ 595,   19,    1, 1733,   34,    1],
        [2008,    2,  1

{'x_data': tensor([[  60,    4,    1, 4378,  157,  329],
        [  49,   72,  536,   50,    1,   48],
        [ 154,   49,    1, 2615,   36,   48],
        [  50,  337,   15,    0,    0,    0],
        [4422,   82,  386,    1,  257, 1264],
        [1854,   33,  331,   43,    0,    0],
        [  68,   48, 1499, 2080,   48,  335],
        [ 417,    4,  496, 1609, 3365,   82],
        [1193,   23,  157,   60,  137,  808],
        [  82,    2,   49,   72, 2726,  349],
        [1174,   19,   82,    4,  178,   33],
        [   4, 3996,  563,  666,   27,   50],
        [ 319,  742,    8,    0,    0,    0],
        [   4,   99,   36,  334, 1474,    0],
        [ 225,   48, 1030,   19, 1239,   48],
        [3329,  338, 2970,  160,  766, 3797],
        [ 732,    1,   19,  266,   48,   86],
        [ 189,   68,   72,    2,   72, 4046],
        [1312,  115,  326,  614,    2,  136],
        [   2,  239,    8,   48, 2824,   72],
        [1466,    2,   48,   19,  955,  235],
        [2515,  474,   

In [31]:
425*32

13600

In [32]:
dataset.validation_size

13605