In [None]:
# Colab-specific setup: mount Google Drive at /content/drive so the data files
# and checkpoints referenced below ("drive/MyDrive/...") can be read and written.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import ReduceLROnPlateau
    
import math
import time
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings(action='ignore')

# Global run configuration: one seed shared by every RNG, and the batch size.
SEED = 1234
BATCH_SIZE = 256

# Seed Python, NumPy and PyTorch (CPU, then the current CUDA device) in turn.
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    _seed_fn(SEED)
del _seed_fn

# Ask cuDNN to pick deterministic algorithms.
torch.backends.cudnn.deterministic = True

# Run on the first GPU when one is visible, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")

In [None]:
import os
import numpy as np
# from konlpy.tag import Mecab
from torchtext.legacy import data, datasets
from sklearn.model_selection import train_test_split

def generate_source_and_target(lines, split_cond, fpath="data"):
    """Write the next-token-prediction source/target files for one data split.

    Each entry of ``lines`` yields one line in ``<split_cond>.src`` and one in
    ``<split_cond>.trg``. The source line holds every token except the last,
    the target line every token except the first, so the i-th target token is
    the token that follows the i-th source token.

    Args:
        lines: Iterable of sentences. An entry may be a sequence of tokens or
            a plain string; a string is split on whitespace into tokens first.
        split_cond: Split name used as the file stem (e.g. "train").
        fpath: Directory handed to ``write_txt`` for both files.
    """
    src = []
    trg = []

    for line in lines:
        # Slicing and joining a raw str works on single characters, which
        # would silently produce character-level data. Tokenise strings so
        # the shift by one position is always a shift by one token.
        tokens = line.split() if isinstance(line, str) else line
        src.append(' '.join(tokens[:-1]) + '\n')
        trg.append(' '.join(tokens[1:]) + '\n')

    write_txt(split_cond + ".src", src, fpath)
    write_txt(split_cond + ".trg", trg, fpath)
    
def write_txt(fname, lines, fpath):
    """Write ``lines`` to the file ``fname`` inside directory ``fpath``.

    Args:
        fname: Name of the file to create (an existing file is overwritten).
        lines: Iterable of strings written as-is; each entry must already end
            with its own newline.
        fpath: Directory the file is created in.
    """
    # Always write UTF-8 so the non-ASCII corpus is stored the same way on
    # every platform instead of depending on the locale's default encoding.
    with open(os.path.join(fpath, fname), "w", encoding="utf-8") as f:
        f.writelines(lines)

# One-off data preparation: the split files are only generated when
# train.src is not yet present in the Drive data folder.
if not os.path.exists("drive/MyDrive/data/train.src"):
    with open("drive/MyDrive/data/petitions_splited_mecab.txt", "r") as f:
        # Read every line of the corpus and drop its newline characters.
        corpus = [str(raw_line).replace("\n", "") for raw_line in f.readlines()]

    # Hold out 5% of all lines for testing, then 1/19 of the remaining lines
    # for validation; both splits use the same fixed random_state.
    train_lines, test_lines = train_test_split(corpus, test_size=0.05, random_state=1234)
    train_lines, valid_lines = train_test_split(train_lines, test_size=1/19, random_state=1234)

    # Write <split>.src / <split>.trg for each split, in the order train, val, test.
    for split_name, split_lines in (("train", train_lines),
                                    ("val", valid_lines),
                                    ("test", test_lines)):
        generate_source_and_target(split_lines, split_name, fpath="drive/MyDrive/data")

In [None]:
class ELMODataset:
    """Loads the .src/.trg split files with torchtext and builds batch iterators.

    After construction the instance exposes the two fields (``SRC``, ``TRG``),
    the three datasets (``train_data``, ``valid_data``, ``test_data``) and the
    matching iterators (``train_iterator``, ``valid_iterator``,
    ``test_iterator``).

    Args:
        filepath: Directory containing the split files read by
            ``TranslationDataset.splits`` with extensions ``.src`` / ``.trg``.
        batch_size: Batch size used for all three iterators.
        device: Device the iterators place their batches on.
    """

    def __init__(self, filepath, batch_size, device):
        self.batch_size = batch_size
        self.device = device

        def split_on_space(text):
            # Tokens in the data files are separated by single spaces.
            return text.split(' ')

        # Both fields share the same preprocessing options.
        field_options = dict(tokenize=split_on_space,
                             init_token='<sos>',
                             eos_token='<eos>',
                             pad_token='<pad>',
                             lower=True,
                             batch_first=True,
                             fix_length=20)

        self.SRC = data.Field(include_lengths=False, **field_options)
        self.TRG = data.Field(**field_options)

        loaded_splits = datasets.TranslationDataset.splits(
            path=filepath,
            exts=('.src', '.trg'),
            fields=(self.SRC, self.TRG),
        )
        self.train_data, self.valid_data, self.test_data = loaded_splits

        # Vocabularies must exist before the iterators numericalise batches.
        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        iterators = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data),
            sort=True,
            sort_within_batch=True,
            batch_size=self.batch_size,
            device=self.device,
        )
        self.train_iterator, self.valid_iterator, self.test_iterator = iterators

    def build_vocab(self, min_freq=5):
        """Build the SRC and TRG vocabularies from the training split only.

        Args:
            min_freq: Minimum frequency passed to ``Field.build_vocab``.
        """
        for field in (self.SRC, self.TRG):
            field.build_vocab(self.train_data, min_freq=min_freq)

        print(f"Unique tokens in source vocabulary: {len(self.SRC.vocab)}")
        print(f"Unique tokens in target vocabulary: {len(self.TRG.vocab)}")

# Load the split files from the Drive data folder, build both vocabularies and
# create the train/valid/test iterators with the global batch size and device.
elmo_dataset = ELMODataset(filepath="drive/MyDrive/data", batch_size=BATCH_SIZE, device=device)

Unique tokens in source vocabulary: 1610
Unique tokens in target vocabulary: 1602
number of training data : 205654
number of valid data : 11426
number of test data : 11426


In [None]:
class ELMO_Embedding(nn.Module):
    """Bidirectional LSTM language model returning per-direction vocabulary logits.

    Token ids are embedded, run through a bidirectional LSTM, and the hidden
    states of each direction are projected separately by one shared linear
    layer.

    Args:
        vocab_size: Number of rows in the embedding table.
        emb_dim: Embedding size.
        hid_dim: Hidden size of the LSTM, per direction.
        output_dim: Size of the output logits.
        pad_idx: Index of the padding token; its embedding is kept at zero
            and receives no gradient.
        n_layers: Number of stacked LSTM layers.
        bidirectional: Must be True; the model always returns one prediction
            per direction.

    Raises:
        ValueError: If ``bidirectional`` is False.

    NOTE(review): with ``n_layers > 1`` every LSTM layer above the first takes
    the concatenated outputs of both directions as input (nn.LSTM semantics),
    so the top-layer "forward" states can already contain information from
    later positions. Confirm this is acceptable for the language-model loss.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, pad_idx, n_layers=2, bidirectional=True):
        super(ELMO_Embedding, self).__init__()
        if not bidirectional:
            # forward() splits the LSTM output into two directions and could
            # never work on a unidirectional output; fail early and clearly.
            raise ValueError("ELMO_Embedding requires bidirectional=True")

        self.hid_dim   = hid_dim
        self.embedding = nn.Embedding(vocab_size, emb_dim, padding_idx=pad_idx)
        self.rnn       = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=bidirectional, batch_first=True)
        self.fc_out    = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        """Compute logits for both directions.

        Args:
            src: Integer tensor of token ids, shape (batch, seq_len).

        Returns:
            Tuple ``(forward_pred, backward_pred)``, each of shape
            (batch, seq_len, output_dim).
        """
        embedding = self.embedding(src)
        output, _ = self.rnn(embedding)

        batch_size, seq_len, _ = output.size()
        # The last dimension is [forward states | backward states], each
        # hid_dim wide, so the direction axis must come BEFORE the feature
        # axis. Reshaping to (..., hid_dim, 2) would interleave both directions.
        directions = output.reshape(batch_size, seq_len, 2, self.hid_dim)
        forward_hid, backward_hid = directions[:, :, 0, :], directions[:, :, 1, :]

        forward_pred  = self.fc_out(forward_hid)
        backward_pred = self.fc_out(backward_hid)

        return forward_pred, backward_pred

In [None]:
def train(model, iterator, optimizer, criterion, output_dim, batch_size, device, clip=1):
    """Run one training pass over ``iterator`` and return the mean batch loss.

    Args:
        model: Module returning ``(forward_pred, backward_pred)`` for a batch
            of source ids.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        optimizer: Optimizer stepped once per batch.
        criterion: Loss applied to flattened logits and flattened targets.
        output_dim: Not used; the logit width is read from the predictions.
        batch_size: Not used.
        device: Device the batch tensors are moved to.
        clip: Max gradient norm passed to ``clip_grad_norm_``.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.

    NOTE(review): both directions are scored against the same ``batch.trg``
    targets; confirm that is the intended objective for the backward
    direction.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(iterator):
        inputs = batch.src.to(device)
        targets = batch.trg.to(device).reshape(-1).long()

        optimizer.zero_grad()
        forward_logits, backward_logits = model(inputs)

        # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
        n_classes = forward_logits.shape[-1]
        loss = (criterion(forward_logits.reshape(-1, n_classes), targets)
                + criterion(backward_logits.reshape(-1, n_classes), targets))

        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)


def evaluate(model, iterator, criterion, output_dim, batch_size, device):
    """Compute the mean batch loss over ``iterator`` without updating the model.

    Args:
        model: Module returning ``(forward_pred, backward_pred)`` for a batch
            of source ids.
        iterator: Sized iterable of batches exposing ``.src`` and ``.trg``.
        criterion: Loss applied to flattened logits and flattened targets.
        output_dim: Not used; the logit width is read from the predictions.
        batch_size: Not used.
        device: Device the batch tensors are moved to.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.

    NOTE(review): both directions are scored against the same ``batch.trg``
    targets; confirm that is the intended objective for the backward
    direction.
    """
    model.eval()
    running_loss = 0

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for batch in iterator:
            inputs = batch.src.to(device)
            targets = batch.trg.to(device).reshape(-1).long()

            forward_logits, backward_logits = model(inputs)

            # Flatten (batch, seq, classes) -> (batch * seq, classes) for the loss.
            n_classes = forward_logits.shape[-1]
            loss = (criterion(forward_logits.reshape(-1, n_classes), targets)
                    + criterion(backward_logits.reshape(-1, n_classes), targets))

            running_loss += loss.item()

    return running_loss / len(iterator)

In [None]:
# Model and optimisation hyper-parameters.
# Vocabulary sizes are taken from len(vocab), the same value build_vocab
# prints. NOTE(review): in torchtext.legacy `vocab.stoi` is a defaultdict that
# appears to gain an entry whenever an unknown token is looked up, so
# len(vocab.stoi) can change once batches have been numericalised and would
# make a re-run of this cell build a differently shaped model.
SRC_VOCAB_SIZE = len(elmo_dataset.SRC.vocab)
TRG_VOCAB_SIZE = len(elmo_dataset.TRG.vocab)
EMB_DIM         = 200
HID_DIM         = 512
PAD_IDX         = elmo_dataset.SRC.vocab.stoi['<pad>']   # pad index of the model inputs
TRG_PAD_IDX     = elmo_dataset.TRG.vocab.stoi['<pad>']   # pad index of the loss targets

model     = ELMO_Embedding(SRC_VOCAB_SIZE, EMB_DIM, HID_DIM, TRG_VOCAB_SIZE, PAD_IDX, n_layers=2, bidirectional=True).to(device)
# Targets are TRG-vocabulary indices, so padding must be ignored using the
# TRG pad index rather than the SRC one.
criterion = nn.CrossEntropyLoss(ignore_index=TRG_PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.005)
# Lower the learning rate when the monitored (validation) loss stops decreasing.
scheduler = ReduceLROnPlateau(optimizer, 'min')

In [None]:
# Training driver: train until the validation loss has not improved for
# PAITIENCE consecutive epochs (or N_EPOCHS is reached), keeping the best
# weights on disk and restoring them at the end.
N_EPOCHS  = 1000
PAITIENCE = 30
BEST_MODEL_PATH = 'drive/MyDrive/data/ELMO-LM_best.pt'

n_paitience = 0                  # epochs since the last validation improvement
best_valid_loss = float('inf')
# NOTE(review): a step without gradients; presumably kept so the optimizer has
# been stepped before the first scheduler.step() call — confirm it is needed.
optimizer.zero_grad()
optimizer.step()

for epoch in range(N_EPOCHS):
    train_loss = train(model, elmo_dataset.train_iterator, optimizer, criterion, TRG_VOCAB_SIZE, BATCH_SIZE, device)
    valid_loss = evaluate(model, elmo_dataset.valid_iterator, criterion, TRG_VOCAB_SIZE, BATCH_SIZE, device)

    scheduler.step(valid_loss)

    print(f'Epoch: {epoch + 1:02}')
    print(f'Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    # This line reports validation metrics, so it is labelled "Valid PPL".
    print(f'Valid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}')

    if valid_loss < best_valid_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
        n_paitience = 0
    else:
        n_paitience += 1
        # Stop as soon as the patience budget is used up, instead of paying
        # for one more epoch whose result would be thrown away.
        if n_paitience >= PAITIENCE:
            print("Early stop!")
            break

# Restore the best checkpoint whether the loop stopped early or ran all
# N_EPOCHS; skipped when no checkpoint was written during this run.
if math.isfinite(best_valid_loss):
    model.load_state_dict(torch.load(BEST_MODEL_PATH, map_location=device))

HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 01
Train Loss: 2.139 | Train PPL:   8.495
Valid Loss: 0.504 | Train PPL:   1.656


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 02
Train Loss: 0.410 | Train PPL:   1.506
Valid Loss: 0.410 | Train PPL:   1.507


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 03
Train Loss: 0.345 | Train PPL:   1.412
Valid Loss: 0.397 | Train PPL:   1.487


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 04
Train Loss: 0.325 | Train PPL:   1.384
Valid Loss: 0.378 | Train PPL:   1.459


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 05
Train Loss: 0.313 | Train PPL:   1.368
Valid Loss: 0.355 | Train PPL:   1.426


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 06
Train Loss: 0.305 | Train PPL:   1.357
Valid Loss: 0.344 | Train PPL:   1.411


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 07
Train Loss: 0.293 | Train PPL:   1.340
Valid Loss: 0.350 | Train PPL:   1.419


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 08
Train Loss: 0.289 | Train PPL:   1.335
Valid Loss: 0.349 | Train PPL:   1.417


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 09
Train Loss: 0.283 | Train PPL:   1.327
Valid Loss: 0.325 | Train PPL:   1.385


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 10
Train Loss: 0.277 | Train PPL:   1.319
Valid Loss: 0.323 | Train PPL:   1.381


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 11
Train Loss: 0.269 | Train PPL:   1.309
Valid Loss: 0.351 | Train PPL:   1.420


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 12
Train Loss: 0.270 | Train PPL:   1.311
Valid Loss: 0.328 | Train PPL:   1.388


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 13
Train Loss: 0.265 | Train PPL:   1.303
Valid Loss: 0.327 | Train PPL:   1.386


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 14
Train Loss: 0.260 | Train PPL:   1.297
Valid Loss: 0.328 | Train PPL:   1.388


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 15
Train Loss: 0.259 | Train PPL:   1.295
Valid Loss: 0.324 | Train PPL:   1.382


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 16
Train Loss: 0.256 | Train PPL:   1.292
Valid Loss: 0.338 | Train PPL:   1.402


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 17
Train Loss: 0.255 | Train PPL:   1.290
Valid Loss: 0.329 | Train PPL:   1.390


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 18
Train Loss: 0.254 | Train PPL:   1.290
Valid Loss: 0.321 | Train PPL:   1.378


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 19
Train Loss: 0.253 | Train PPL:   1.288
Valid Loss: 0.325 | Train PPL:   1.384


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 20
Train Loss: 0.252 | Train PPL:   1.287
Valid Loss: 0.324 | Train PPL:   1.383


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 21
Train Loss: 0.251 | Train PPL:   1.285
Valid Loss: 0.327 | Train PPL:   1.387


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 22
Train Loss: 0.254 | Train PPL:   1.289
Valid Loss: 0.331 | Train PPL:   1.392


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 23
Train Loss: 0.253 | Train PPL:   1.287
Valid Loss: 0.327 | Train PPL:   1.387


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 24
Train Loss: 0.251 | Train PPL:   1.286
Valid Loss: 0.428 | Train PPL:   1.535


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 25
Train Loss: 0.261 | Train PPL:   1.298
Valid Loss: 0.330 | Train PPL:   1.391


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 26
Train Loss: 0.258 | Train PPL:   1.295
Valid Loss: 0.335 | Train PPL:   1.398


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 27
Train Loss: 0.254 | Train PPL:   1.290
Valid Loss: 0.330 | Train PPL:   1.391


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 28
Train Loss: 0.252 | Train PPL:   1.287
Valid Loss: 0.328 | Train PPL:   1.388


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 29
Train Loss: 0.251 | Train PPL:   1.286
Valid Loss: 0.332 | Train PPL:   1.393


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 30
Train Loss: 0.233 | Train PPL:   1.263
Valid Loss: 0.314 | Train PPL:   1.369


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 31
Train Loss: 0.216 | Train PPL:   1.241
Valid Loss: 0.311 | Train PPL:   1.365


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 32
Train Loss: 0.205 | Train PPL:   1.227
Valid Loss: 0.311 | Train PPL:   1.364


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 33
Train Loss: 0.196 | Train PPL:   1.217
Valid Loss: 0.310 | Train PPL:   1.363


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 34
Train Loss: 0.189 | Train PPL:   1.208
Valid Loss: 0.310 | Train PPL:   1.364


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 35
Train Loss: 0.183 | Train PPL:   1.201
Valid Loss: 0.311 | Train PPL:   1.364


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 36
Train Loss: 0.177 | Train PPL:   1.194
Valid Loss: 0.311 | Train PPL:   1.365


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 37
Train Loss: 0.172 | Train PPL:   1.187
Valid Loss: 0.312 | Train PPL:   1.366


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 38
Train Loss: 0.167 | Train PPL:   1.181
Valid Loss: 0.313 | Train PPL:   1.368


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 39
Train Loss: 0.162 | Train PPL:   1.176
Valid Loss: 0.314 | Train PPL:   1.369


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 40
Train Loss: 0.157 | Train PPL:   1.170
Valid Loss: 0.315 | Train PPL:   1.371


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 41
Train Loss: 0.153 | Train PPL:   1.165
Valid Loss: 0.317 | Train PPL:   1.373


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 42
Train Loss: 0.148 | Train PPL:   1.160
Valid Loss: 0.319 | Train PPL:   1.375


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 43
Train Loss: 0.144 | Train PPL:   1.155
Valid Loss: 0.321 | Train PPL:   1.378


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 44
Train Loss: 0.141 | Train PPL:   1.151
Valid Loss: 0.322 | Train PPL:   1.380


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 45
Train Loss: 0.136 | Train PPL:   1.146
Valid Loss: 0.321 | Train PPL:   1.379


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 46
Train Loss: 0.134 | Train PPL:   1.143
Valid Loss: 0.321 | Train PPL:   1.379


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 47
Train Loss: 0.132 | Train PPL:   1.142
Valid Loss: 0.321 | Train PPL:   1.379


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 48
Train Loss: 0.131 | Train PPL:   1.140
Valid Loss: 0.322 | Train PPL:   1.379


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 49
Train Loss: 0.131 | Train PPL:   1.139
Valid Loss: 0.322 | Train PPL:   1.380


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 50
Train Loss: 0.130 | Train PPL:   1.139
Valid Loss: 0.322 | Train PPL:   1.380


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 51
Train Loss: 0.129 | Train PPL:   1.138
Valid Loss: 0.322 | Train PPL:   1.380


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 52
Train Loss: 0.128 | Train PPL:   1.137
Valid Loss: 0.323 | Train PPL:   1.381


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 53
Train Loss: 0.128 | Train PPL:   1.136
Valid Loss: 0.323 | Train PPL:   1.381


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 54
Train Loss: 0.127 | Train PPL:   1.135
Valid Loss: 0.323 | Train PPL:   1.382


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 55
Train Loss: 0.126 | Train PPL:   1.135
Valid Loss: 0.324 | Train PPL:   1.382


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 56
Train Loss: 0.126 | Train PPL:   1.134
Valid Loss: 0.323 | Train PPL:   1.382


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))


Epoch: 57
Train Loss: 0.126 | Train PPL:   1.134
Valid Loss: 0.323 | Train PPL:   1.382


HBox(children=(FloatProgress(value=0.0, max=804.0), HTML(value='')))