In [1]:
%matplotlib inline

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np
import matplotlib.pyplot as plt
import time

dev = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
dev

device(type='cuda')

In [2]:
txt_path = '../data/tokenized/corpus_mecab.txt'
corpus = [sent.strip().split() for sent in open(txt_path, 'r', encoding='utf-8').readlines()]

In [None]:
%%time
corpus = []
with open(txt_path, 'r', encoding='utf-8') as ifp:
    for sent in ifp:
        # do some jobs for line
        sent = sent.strip().split()
        corpus.append(sent)

In [4]:
vocab = set()
for i in range(len(corpus)):
    vocab.update(corpus[i])
    
print('단어 갯수 : ' + str(len(vocab)))

word_to_id = dict()
for word in vocab:
    word_to_id[word]=len(word_to_id)    
    
id_to_word = []
for word in word_to_id.keys():
    id_to_word.append(word)

단어 갯수 : 1170688


In [5]:
def sent_to_ngram(n, sent):
    ngrams = []
    
    for i in range(n, len(sent)-n):
        ngram = []
        for j in range(-n,n+1):
            ngram.append(sent[i+j])
        ngrams.append(ngram)
            
    return ngrams

def sent_to_vec(sent):
    vec = []
    for word in sent:
        vec.append(word_to_id[word])
    return vec

In [6]:
N = 3 # 윈도우 갯수
VEC_DIM = 100 # 임베딩 차원

In [18]:
%%time
train_input = []
train_target = []

for i in range(len(corpus)):
    n_grams = sent_to_ngram(N, corpus[i])
    for n_gram in n_grams:
        train_input.append(sent_to_vec(n_gram[:N])+sent_to_vec(n_gram[N+1:]))
        train_target.append(word_to_id[n_gram[N]])

MemoryError: 

In [19]:
BATCH_SIZE = 64
train = torch.utils.data.TensorDataset(torch.tensor(train_input), torch.tensor(train_target))
input_loader = torch.utils.data.DataLoader(train, batch_size=BATCH_SIZE)

RuntimeError: [enforce fail at ..\c10\core\CPUAllocator.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 1115647344 bytes. Buy new RAM!


In [None]:
class CBOW(nn.Module):
    def __init__(self, vec_dim):
        super(CBOW,self).__init__()
        self.emb = nn.Embedding(len(vocab), vec_dim)
        self.lin = nn.Linear(vec_dim, len(vocab), bias=False)
        
    def forward(self, x):
        x = torch.mean(self.emb(x), dim=1)
        x = self.lin(x).view(-1,len(vocab))
        return x

In [None]:
model = CBOW(VEC_DIM).to(dev)
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_f = nn.CrossEntropyLoss()

In [None]:
check = 10000
epoch = 1

# GPU time
start = time.time()

loss_list = []

for epo in range(epoch):
    loss_sum = 0
    for i, (x, y) in enumerate(input_loader):

        x,y = x.to(dev), y.to(dev)
        optimizer.zero_grad()
        
        output = model(x)
        
        loss = loss_f(output, y)
        loss.backward()
        optimizer.step()
        
        loss_sum += loss.item()
        
        
        if i % check == 0 && i != 0 :        
            elap = int(time.time() - start)
            loss_list.append(loss_sum/check)
            print('Epoch : {}, Iteration : {}, Loss : {:.2f}, Elapsed time : {:.0f}h {:.0f}m {}s'.format(\
                epo+1, i, loss_sum / check, elap // 3600, (elap % 3600) // 60, str(int((elap % 3600) % 60))))
            loss_sum = 0

In [None]:
%%time
train_input = []
train_target = []
for data in train_data:
    target = sent_to_vec(data[:N])+sent_to_vec(data[N+1:])
    for i in range(len(target)):
        train_input.append(word_to_id[data[N]])
        train_target.append(target[i])


In [10]:
train_input

[[1117922, 1164407, 402567, 402567, 76263, 35784],
 [1164407, 402567, 294068, 76263, 35784, 1075904],
 [402567, 294068, 402567, 35784, 1075904, 684204],
 [294068, 402567, 76263, 1075904, 684204, 1072681],
 [402567, 76263, 35784, 684204, 1072681, 289854],
 [76263, 35784, 1075904, 1072681, 289854, 985250],
 [35784, 1075904, 684204, 289854, 985250, 343396],
 [1075904, 684204, 1072681, 985250, 343396, 422915],
 [684204, 1072681, 289854, 343396, 422915, 1072681],
 [1072681, 289854, 985250, 422915, 1072681, 96706],
 [289854, 985250, 343396, 1072681, 96706, 753199],
 [985250, 343396, 422915, 96706, 753199, 1072681],
 [343396, 422915, 1072681, 753199, 1072681, 643922],
 [422915, 1072681, 96706, 1072681, 643922, 688084],
 [1072681, 96706, 753199, 643922, 688084, 804930],
 [96706, 753199, 1072681, 688084, 804930, 202691],
 [753199, 1072681, 643922, 804930, 202691, 883902],
 [1072681, 643922, 688084, 202691, 883902, 243305],
 [643922, 688084, 804930, 883902, 243305, 576163],
 [688084, 804930, 202

In [11]:
train_target

[294068,
 402567,
 76263,
 35784,
 1075904,
 684204,
 1072681,
 289854,
 985250,
 343396,
 422915,
 1072681,
 96706,
 753199,
 1072681,
 643922,
 688084,
 804930,
 202691,
 883902,
 243305,
 576163,
 39389,
 289854,
 183417,
 197562,
 312181,
 1075904,
 1072681,
 289854,
 792341,
 792341,
 422915,
 688084,
 1072681,
 289854,
 1157852,
 1072681,
 422915,
 804930,
 1165135,
 218589,
 47385,
 294068,
 76263,
 202691,
 675781,
 137659,
 934596,
 272835,
 175201,
 487273,
 1010595,
 590893,
 112806,
 218589,
 47385,
 675781,
 160810,
 322967,
 155077,
 411448,
 608833,
 426541,
 504319,
 218589,
 47385,
 211798,
 727810,
 554391,
 495028,
 992138,
 140766,
 899183,
 296546,
 899183,
 261195,
 1094734,
 965799,
 403470,
 643922,
 426541,
 504319,
 218589,
 47385,
 1072681,
 289854,
 359618,
 39389,
 422915,
 576163,
 554391,
 185454,
 1117412,
 512654,
 426541,
 504319,
 762619,
 879733,
 495554,
 899183,
 108520,
 1087853,
 805716,
 963372,
 1000029,
 722263,
 302521,
 805716,
 45717,
 5663