In [1]:
class Configs:
    """Hyperparameters and file paths shared by the cells of this notebook.

    Every option has a default. Any of them can be overridden by keyword,
    e.g. ``Configs(batch_size=32)``; a keyword that does not match an existing
    option raises ``TypeError`` so typos are not silently ignored.
    """

    def __init__(self, **overrides):
        # Data-side options. ``labels_path`` is read when the tokenizer is
        # built; the remaining ones are presumably consumed by DataModule,
        # which receives this whole object (not visible here -- confirm).
        self.manifest_file = "total_am.txt"
        self.labels_path = "aihub_labels.csv"
        self.train_ratio = 0.8
        self.num_workers = 4
        self.batch_size = 64
        self.sample_mode = 'random'  # alternative value noted by the author: 'smart'
        self.teacher_forcing_ratio = 0.0

        # Model options, forwarded to the Transformer_LM constructor.
        self.num_classes = 2001
        self.d_model = 512
        self.d_ff = 2048
        self.num_heads = 4
        self.num_layers = 3
        self.model_name = "BERT"

        # Apply caller overrides last so they win over the defaults above.
        for name, value in overrides.items():
            if not hasattr(self, name):
                raise TypeError(f"Configs got an unexpected option {name!r}")
            setattr(self, name, value)


configs = Configs()

In [2]:
from Tokenizer import Tokenizer
from data_module import DataModule


# Build the tokenizer from the label file, then request the training and
# validation loaders (in that order) from one DataModule instance.
tokenizer = Tokenizer(label_file=configs.labels_path)
data_module = DataModule(configs, tokenizer)
train_dataloader, valid_dataloader = (
    data_module.get_dl(split) for split in ("train", "valid")
)

In [5]:
from Model import Transformer_LM

# Instantiate the language model; every architecture option is taken from
# the shared ``configs`` object.
model = Transformer_LM(**{
    "num_classes": configs.num_classes,
    "d_model": configs.d_model,
    "d_ff": configs.d_ff,
    "num_heads": configs.num_heads,
    "num_layers": configs.num_layers,
    "model": configs.model_name,
})

In [6]:
# Move the model to the GPU. The training and inference cells also call
# .cuda() on their input tensors, so model and data end up on the same device.
model = model.cuda()

In [7]:
from criterion import CrossEntropyLoss, Perplexity
from torch.optim import Adam

# Adam over all model parameters, and the loss object (constructed from the
# tokenizer) that the training cell uses for both train and validation batches.
optimizer = Adam(params=model.parameters(), lr=1e-4)
Loss = CrossEntropyLoss(tokenizer)

In [None]:
from torch.utils.tensorboard import SummaryWriter
import torch


writer = SummaryWriter('runs/bert')

# Cadence and size of the periodic validation pass.
VALIDATE_EVERY = 1000        # validate every this many training iterations
LAST_VALIDATION_BATCH = 100  # validation uses batches 0..100 of valid_dataloader


def _mean_validation_loss(net, dataloader, loss_fn, last_batch_index):
    """Return the mean loss over the first batches of ``dataloader``.

    Batches with index 0..``last_batch_index`` (inclusive) are evaluated with
    the network in eval mode and gradient tracking disabled. The network's
    previous train/eval mode is restored afterwards, even on error.
    Returns ``None`` when the loader yields no batches, instead of dividing
    by zero.
    """
    was_training = net.training
    net.eval()
    loss_sum, batch_count = 0.0, 0
    try:
        with torch.no_grad():
            for i, (val_inputs, val_lengths, val_targets) in enumerate(dataloader):
                if i > last_batch_index:
                    break
                val_logits, _ = net(val_inputs.cuda(), val_lengths)
                # .item() keeps the running sum a plain float.
                loss_sum += loss_fn(val_logits, val_targets.cuda()).item()
                batch_count += 1
    finally:
        net.train(was_training)
    return loss_sum / batch_count if batch_count else None


# Make sure the model is in training mode, whatever an earlier cell left it in.
model.train()
try:
    for iteration, (inputs, seq_lengths, targets) in enumerate(train_dataloader):
        inputs = inputs.cuda()
        targets = targets.cuda()
        optimizer.zero_grad()
        logits, preds = model(inputs, seq_lengths)
        loss = Loss(logits, targets)

        # Log plain floats; detach so the perplexity computation does not
        # extend the autograd graph.
        train_loss = loss.item()
        train_perplexity = torch.exp(loss.detach()).item()
        writer.add_scalar("train_loss", train_loss, iteration)
        writer.add_scalar("train_perplexity", train_perplexity, iteration)

        loss.backward()
        optimizer.step()

        if iteration % VALIDATE_EVERY == 0 and iteration != 0:
            validation_loss = _mean_validation_loss(
                model, valid_dataloader, Loss, LAST_VALIDATION_BATCH
            )
            if validation_loss is not None:
                validation_perplexity = torch.exp(torch.tensor(validation_loss)).item()
                writer.add_scalar("validation_loss", validation_loss, iteration)
                writer.add_scalar("validation_perplexity", validation_perplexity, iteration)
finally:
    # Flush and close the event file even if training is interrupted.
    writer.close()

In [8]:
# Write the model's state dict (not the module object itself) to "bert.pt".
bert_state_dict = model.state_dict()
torch.save(bert_state_dict, "bert.pt")

In [84]:
# Test Bert Model
import random  # used below; not imported by any other cell of this notebook

MASK_TOKEN_ID = 2000  # token id this notebook uses for the mask symbol
NUM_MASKS = 3         # how many positions to mask

sample = "이 좋은 날씨에 난 사무실에 앉아서 뭘 하는거람"

# Give the mask id a printable form so decode() can render it.
tokenizer.idx2char[MASK_TOKEN_ID] = "<mask>"

mask_indexes = random.sample(range(len(sample)), NUM_MASKS)

inputs = tokenizer.encode(sample)
for idx in mask_indexes:
    inputs[idx] = MASK_TOKEN_ID

# NOTE(review): the [1:-1] slices here and at the end drop the first and last
# token. No <sos>/<eos> is added to `inputs` in this cell, and the recorded
# output is missing the first and last character of `sample` -- confirm
# whether the slices are still wanted.
print(tokenizer.decode(inputs[1:-1]))

input_length = [len(inputs)]

# Build the integer batch of size 1 directly (no float round-trip) on the GPU.
inputs = torch.tensor(inputs, dtype=torch.int32).unsqueeze(0).cuda()

# Inference only: switch to eval mode and disable gradient tracking, then
# restore whatever mode the model was in before.
was_training = model.training
model.eval()
try:
    with torch.no_grad():
        logits, preds = model(inputs, input_length)
finally:
    model.train(was_training)

for idx in mask_indexes:
    # Skip <sos>: the prediction for input position idx is read from idx + 1.
    # NOTE(review): this assumes `preds` is shifted by one relative to
    # `inputs`; if both have the same length, idx + 1 can run past the end
    # when the last position is masked -- confirm against the model.
    inputs[0][idx] = preds[0][idx+1]

result = tokenizer.decode(inputs[0][1:-1])
print(result)

 좋은 날씨<mask><mask>난 사무실에 앉아서<mask>뭘<mask>하는거
 좋은 날씨  난 사무실에 앉아서 뭘 하는거


In [6]:
import torch

# Read the saved state dict from "gpt.pt" and load it into the current model.
# load_state_dict stays the last expression so its result is shown as output.
gpt_state_dict = torch.load("gpt.pt")
model.load_state_dict(gpt_state_dict)

<All keys matched successfully>

In [22]:
def text_generator(sentence: str, length: int, filler_token: int = 5, verbose: bool = True):
    """Extend ``sentence`` by ``length`` tokens, one model prediction at a time.

    Each step appends a placeholder token, runs the model on the whole
    sequence, and overwrites the placeholder with the model's prediction for
    the last position. Uses the notebook-level ``tokenizer`` and ``model``
    and requires a CUDA device.

    Args:
        sentence: Prompt text; encoded with ``tokenizer.encode``.
        length: Number of tokens to append. Zero or negative appends nothing.
        filler_token: Token id used as the temporary placeholder (any valid
            id works, since it is overwritten before the next step).
        verbose: When true, print the decoded sequence after every step.

    Returns:
        ``tokenizer.decode`` applied to the final ``(1, seq_len)`` id tensor.
    """
    token_ids = tokenizer.encode(sentence)
    inputs = torch.tensor(token_ids, dtype=torch.long).unsqueeze(0).cuda()

    # Loop-invariant placeholder: append an arbitrary token so the model also
    # produces a prediction for the next position.
    blank = torch.full((1, 1), filler_token, dtype=torch.long, device=inputs.device)

    # Generation is inference: use eval mode and restore the previous mode after.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(length):
                inputs = torch.cat((inputs, blank), dim=1)
                # A one-element tensor holding the current sequence length.
                # (torch.Tensor(n) would allocate n uninitialised values instead.)
                input_lengths = torch.tensor(
                    [inputs.size(1)], dtype=torch.long, device=inputs.device
                )
                logits, preds = model(inputs, input_lengths)

                # Earlier tokens must stay as they are, so only the prediction
                # for the last position is written back.
                inputs[0][-1] = preds[0][-1]
                if verbose:
                    print(tokenizer.decode(inputs))
    finally:
        model.train(was_training)
    return tokenizer.decode(inputs)

# Demo: append 10 predicted tokens to the prompt. As the cell's last
# expression, the returned string is also shown as the cell output.
text_generator("이건 근의 공식을", 10)

이건 근의 공식을 
이건 근의 공식을  
이건 근의 공식을  립
이건 근의 공식을  립요
이건 근의 공식을  립요어
이건 근의 공식을  립요어지
이건 근의 공식을  립요어지 
이건 근의 공식을  립요어지 까
이건 근의 공식을  립요어지 까 
이건 근의 공식을  립요어지 까  


'이건 근의 공식을  립요어지 까  '

In [8]:
# Write the model's state dict (not the module object itself) to "gpt.pt".
gpt_weights = model.state_dict()
torch.save(gpt_weights, "gpt.pt")