In [63]:
%%capture
!pip install Korpora
# !pip install python-mecab-ko

from Korpora import Korpora
Korpora.fetch("namuwikitext", root_dir='/content')

In [86]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR

import torchtext

from torchtext.data.utils import get_tokenizer
from torchtext.legacy.data import Dataset, Field, BucketIterator
    
import math
import time
import mecab
import random
import linecache
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import defaultdict, Counter

SEED = 1234
BATCH_SIZE = 128

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [87]:
# train_df = pd.read_table("namuwikitext/namuwikitext_20200302.test", header=None, sep="\t")

# tokenizer = mecab.MeCab()

# TEXT = Field(
#     sequential=True,
#     use_vocab=True,
#     tokenize=tokenizer.morphs,
#     init_token='<sos>',
#     eos_token='<eos>',
#     unk_token='<unk>',
#     pad_token='<pad>',
#     lower=True, 
#     batch_first=True
#     ) 

# train_ds = DataFrameDataset.splits(
#   text_field=TEXT, train_df=train_df)

In [88]:
# # train_data, test_data = TabularDataset.splits(
# #     path='drive/MyDrive/', 
# #     train='train_data_cleaned.txt',
# #     test='test_data_cleaned.txt',
# #     format='tsv', 
# #     fields=[('text', TEXT)]
# #     )

# # train_data = TabularDataset(path='drive/MyDrive/sample_text.csv',
# #                             format='csv',
# #                             fields=[('text', TEXT)])

# train_data = TabularDataset(path='/content/namuwikitext/namuwikitext_20200302.test',
#                             format='tsv',
#                             fields=[('text', TEXT)])

# TEXT.build_vocab(train_data)

In [89]:
def get_special_list(first, pad, tot_len):
    ret = [first]
    ret = ret + [pad for _ in range(tot_len-1)]
    return ret


class KoDataset(Dataset):
    def __init__(self, data_path, max_character_length = 10, max_character_size=2000, max_vocab_size = 10000):
        """
        data_paths = list of de path, en data path
        """
        self.ko_path = data_path 
        self.ko_vocab = Vocabs(tokenizer = 'mecab')
        self.ko_vocab.update_vocabs_to_file(self.ko_path)
        self.ko_vocab.set_most_common_dict(size=max_vocab_size)

        self.id2word_dict = self.ko_vocab.get_index_dict()
        self.character_dict = self.ko_vocab.get_character_dict(size = max_character_size)
        self.max_character_length = max_character_length

        with open(self.ko_path, "r") as f:
            self._total_data = len(f.readlines())
        
    def __len__(self):
        return self._total_data 

    def __getitem__(self, idx):     
        raw_ko = linecache.getline(self.ko_path, idx + 1).strip()
        ko_tensor_ = torch.tensor([2]+[self.ko_vocab.vocab_dict[token] for token in self.ko_vocab.tokenizer(raw_ko)]+[3]).long()
        return ko_tensor_
    
    def collate_fn(self, data_batch, pad_idx=0, sos_idx=2, eos_idx=3):
        ko_batch = []
        char_batch = []

        token_max_len = self.max_character_length
        max_seq_len = 0
        for each_item in data_batch:
            # token_max_len = max(token_max_len, max([len(self.id2word_dict[int(i)]) for i in each_item]))
            max_seq_len = max(max_seq_len, len(each_item))
        
        for each_item in data_batch:
            ko_batch.append(each_item)
            chars = []
            
            for index in each_item:
                if index in [sos_idx, eos_idx]:
                    padded_each_characters = get_special_list(int(index), pad_idx, token_max_len)
                else:
                    # 안녕 -> padded_each_characters(index_안 + index_녕 + index_pad)
                    padded_each_characters = []
                    word = self.id2word_dict[int(index)]
                    for char in word:
                        padded_each_characters.append(self.character_dict[char])
                    if len(padded_each_characters) > token_max_len:
                        padded_each_characters = padded_each_characters[:token_max_len]
                    else:
                        left_length = token_max_len-len(padded_each_characters)
                        padded_each_characters = padded_each_characters + [0]*left_length
                chars.append(padded_each_characters)

            # max_sequence_length padding
            chars = chars + [[0 for _ in range(token_max_len)] for _ in range(max_seq_len - len(chars))]
            char_batch.append(chars)

        padded_ko_index_batch = pad_sequence(ko_batch, padding_value=pad_idx, batch_first=True)
        padded_ko_char_batch = torch.Tensor(char_batch).long()
        return padded_ko_index_batch, padded_ko_char_batch

In [92]:
class Vocabs:
    def __init__(self, tokenizer=None):
        if tokenizer is None:
            self.tokenizer = lambda x: x.split(" ")
        elif tokenizer == "mecab":
            self.tokenizer = mecab.MeCab().morphs
        else:
            self.tokenizer = tokenizer
        self.index_dict = None

        self.pad_idx = 0
        self.unk_idx = 1
        self.sos_idx = 2
        self.eos_idx = 3
        self._index = 4
        self.vocab_dict = defaultdict(lambda: self.unk_idx)
        self.vocab_dict["<PAD>"] = self.pad_idx
        self.vocab_dict["<UNK>"] = self.unk_idx
        self.vocab_dict["<SOS>"] = self.sos_idx
        self.vocab_dict["<EOS>"] = self.eos_idx

        self.character_dict = None

    def __len__(self):
        return len(self.vocab_dict)


    def build_character_dict(self, size = 2000):
        char_cnt_dict = defaultdict(lambda:0)

        ret = defaultdict(lambda: self.unk_idx)
        ret["<PAD>"] = self.pad_idx
        ret["<UNK>"] = self.unk_idx
        ret["<SOS>"] = self.sos_idx
        ret["<EOS>"] = self.eos_idx
        _index = 4
        for i in self.vocab_dict:
            if i not in ['<PAD>','<UNK>','<SOS>','<EOS>']:
                for char in i:
                    char_cnt_dict[char] += 1

        for k, v in Counter(char_cnt_dict).most_common(size):
            ret[k] = _index
            _index += 1

        return ret

    def get_character_dict(self, size = 2000):
        if self.character_dict is None:
            self.character_dict = self.build_character_dict(size = size)
        return self.character_dict

    def set_most_common_dict(self, size):
        new_dict = defaultdict(lambda: self.unk_idx)
        new_dict["<PAD>"] = self.pad_idx
        new_dict["<UNK>"] = self.unk_idx
        new_dict["<SOS>"] = self.sos_idx
        new_dict["<EOS>"] = self.eos_idx
        _index = 4
        for k, v in Counter(self.count_dict).most_common(size):
            new_dict[k] = _index
            _index += 1
        self.vocab_dict = new_dict

    def update_vocabs_to_file(self, filepath):
        count_dict = defaultdict(lambda: 1)
        with open(filepath, encoding="utf8") as f:
            for string_ in f:
                for token in self.tokenizer(string_.replace("\n","").lower()):
                    if token in self.vocab_dict:
                        count_dict[token] += 1
                        pass
                    else:
                        count_dict[token] = 1
                        self.vocab_dict[token] = self._index
                        self._index += 1
        self.count_dict = count_dict

    def __len__(self):
        return len(self.vocab_dict)

    def build_vocabs(self, sentence_list):
        for sentence in sentence_list:
            tokens_list = self.tokenizer(sentence)
            for word in tokens_list:
                if word in self.vocab_dict:
                    pass
                else:
                    self.vocab_dict[word] = self._index
                    self._index += 1

    def build_index_dict(self):
        self.index_dict = {v: k for k, v in self.vocab_dict.items()}

    def get_index_dict(self):
        if self.index_dict == None:
            self.build_index_dict()
        return self.index_dict


In [138]:
TrainDataset = KoDataset("drive/MyDrive/sample_text.csv", 
            max_character_length = 5,
            max_character_size = 1000,
            max_vocab_size = 10000
            )

TrainDataloader = DataLoader(TrainDataset, batch_size=2, shuffle=True, collate_fn=TrainDataset.collate_fn)

In [149]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 pad_idx, dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):       
        embedded = self.embedding(text)
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        
        return self.fc(cat)

In [150]:
class Highway(nn.Module):
    def __init__(self, size, n_layers, f):
        super(Highway, self).__init__()

        self.n_layers = n_layers
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.f = f

    def forward(self, x):
        for layer in range(self.n_layers):
            gate = F.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x

In [151]:
class ELMO_Embedding(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, pad_idx, n_layers=2, bidirectional=True):
        super().__init__()

        n_filters = 100
        filter_sizes = [3, 4, 5]

        self.embedding = CNN1d(vocab_size, emb_dim, n_filters, filter_sizes, emb_dim, pad_idx)
        self.highway   = Highway(size=emb_dim, n_layers=1, f=F.relu)
        self.rnn       = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=bidirectional)        
        self.fc_out    = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        embedding               = self.embedding(src)
        highway                 = self.highway(embedding)

        print(highway.shape)

        output, (hidden, state) = self.rnn(highway)

        batch_size, seq_len, _  = output.size()
        output                  = output.reshape(batch_size, seq_len, -1, 2)

        forward_hid, backward_hid = output[:, :, :, 0], output[:, :, :, 1]
        
        # parameter sharing?
        forward_pred  = self.fc_out(forward_hid)
        backward_pred = self.fc_out(backward_hid)

        return forward_pred, backward_pred

In [152]:
def train(model, iterator, optimizer, criterion, output_dim, clip=1):
    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):
        src = batch.text

        optimizer.zero_grad()
        fpred, bpred = model(src)

        forward_loss = criterion(fpred.reshape(-1, output_dim), trg.reshape(-1))
        backward_loss = criterion(bpred.reshape(-1, output_dim), trg.reshape(-1))
        loss = forward_loss + backward_loss
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [153]:
IN_DIM  = len(TrainDataloader.dataset.ko_vocab.vocab_dict)
OUT_DIM = len(TrainDataloader.dataset.ko_vocab.vocab_dict)
EMB_DIM = 256
HID_DIM = 1024
PAD_IDX = TrainDataloader.dataset.ko_vocab.pad_idx

model = ELMO_Embedding(IN_DIM, EMB_DIM, HID_DIM, OUT_DIM, PAD_IDX, n_layers=2, bidirectional=True)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [154]:
import warnings

N_EPOCHS = 2

warnings.filterwarnings(action='ignore')

best_valid_loss = float('inf')
optimizer.zero_grad()
optimizer.step()

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion, OUT_DIM)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

RuntimeError: ignored