In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR
    
import math
import time
import random
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm

import warnings
warnings.filterwarnings(action='ignore')

SEED = 1234
BATCH_SIZE = 256
MAX_WORD_LENGTH_IN_SENT = 25
MAX_CHAR_LENGTH_IN_WORD = 6

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

# device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = 'cpu'

In [2]:
import os
import numpy as np
from konlpy.tag import Mecab
from torchtext import data, datasets
from sklearn.model_selection import train_test_split

def generate_source_and_target(lines, split_cond, fpath="data"):
    src = []
    trg = []

    for line in lines:
        src.append(line[:-1] + '\n')
        trg.append(line[1:] + '\n')
    
    write_txt(split_cond + ".src", src, fpath)
    write_txt(split_cond + ".trg", trg, fpath)
    
def write_txt(fname, lines, fpath):
    with open(os.path.join(fpath, fname), "w") as f:
        f.writelines(lines)

if not os.path.exists("data/train.src"):
    with open("data/petitions_splited_mecab.txt", "r") as f:
         corpus = f.readlines()

    corpus = list(map(lambda x: str(x).replace("\n", ""), corpus))

    train_lines, test_lines = train_test_split(corpus, test_size=0.05, random_state=1234)
    train_lines, valid_lines = train_test_split(train_lines, test_size=1/19, random_state=1234)

    generate_source_and_target(train_lines, "train", fpath="data")
    generate_source_and_target(valid_lines, "val", fpath="data")
    generate_source_and_target(test_lines, "test", fpath="data")

In [3]:
class ELMODataset:
    def __init__(self, filepath, batch_size, max_length, device):
        self.batch_size = batch_size
        self.device = device

        self.SRC = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              include_lengths=False,
                              fix_length=max_length)
        
        self.TRG = data.Field(tokenize=lambda x: x.split(' '),
                              init_token='<sos>',
                              eos_token='<eos>',
                              pad_token='<pad>',
                              lower=True,
                              batch_first=True,
                              fix_length=max_length)

        self.train_data, self.valid_data, self.test_data = \
            datasets.TranslationDataset.splits(path=filepath, exts=('.src', '.trg'),
                                               fields=(self.SRC, self.TRG))

        self.build_vocab()

        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))

        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data), sort=True, sort_within_batch=True,
            batch_size=self.batch_size, device=self.device)
 
    def build_vocab(self, min_freq=5):
        self.SRC.build_vocab(self.train_data, min_freq=min_freq)
        self.TRG.build_vocab(self.train_data, min_freq=min_freq)
        
        print(f"Unique tokens in source vocabulary: {len(self.SRC.vocab)}")
        print(f"Unique tokens in target vocabulary: {len(self.TRG.vocab)}")

elmo_dataset = ELMODataset(filepath="data", batch_size=BATCH_SIZE, max_length=MAX_WORD_LENGTH_IN_SENT, device=device)

Unique tokens in source vocabulary: 1610
Unique tokens in target vocabulary: 1602
number of training data : 205654
number of valid data : 11426
number of test data : 11426


In [45]:
class CharacterDecomposer:
    def __init__(self, elmo_dataset, max_word_in_sent, max_char_in_word, special_token_idx=[0, 1, 2, 3]):
        self.elmo_dataset = elmo_dataset
        self.max_word_in_sent = max_word_in_sent
        self.max_char_in_word = max_char_in_word
        self.special_token_idx = special_token_idx
        
        self.build_char_vocab()
        
    def build_char_vocab(self):
        char_vocab = set([char for word in self.elmo_dataset.SRC.vocab.itos for char in word])
        self.ctoi = {}
        self.itoc = {}
        
        
        
        for idx + 4, char in enumerate(char_vocab):
            self.ctoi[char] = idx
            self.itoc[idx]  = char
            
    def decompose(self, src):
        # pad token이 1로 되어 있음 주의
        batch_char_embedding = np.ones((src.shape[0], self.max_word_in_sent, self.max_char_in_word)).astype(int)
        
        for batch_order_idx, sent in enumerate(src):
            for word_order_idx, s in enumerate(sent):
                if word_order_idx < self.max_word_in_sent - 1:
                    if s in self.special_token_idx:
                        batch_char_embedding[batch_order_idx, word_order_idx, 0] = s
#                         if s == 0:
#                             # unk token
#                             batch_char_embedding[batch_order_idx, word_order_idx, 0] = 1
#                         elif s == 2 or s == 3:
#                             batch_char_embedding[batch_order_idx, word_order_idx, 0] = s
                    elif s not in self.special_token_idx:
                        for char_order_idx, char in enumerate(self.elmo_dataset.SRC.vocab.itos[s]):
                            if char_order_idx < self.max_char_in_word - 1:
                                batch_char_embedding[batch_order_idx, word_order_idx, char_order_idx] = self.ctoi[char] + 4
                                                                                    
        return torch.LongTensor(batch_char_embedding)
    
character_decomposer = CharacterDecomposer(elmo_dataset, max_word_in_sent=MAX_WORD_LENGTH_IN_SENT, max_char_in_word=MAX_CHAR_LENGTH_IN_WORD)


In [51]:
len(elmo_dataset.SRC.vocab.itos)

1610

In [50]:
character_decomposer.ctoi

{'과': 0,
 '규': 1,
 '럿': 2,
 '했': 3,
 '론': 4,
 '썽': 5,
 '딩': 6,
 '결': 7,
 '?': 8,
 '갰': 9,
 '섣': 10,
 '쩍': 11,
 '냔': 12,
 '텋': 13,
 '돕': 14,
 '텼': 15,
 '선': 16,
 '갚': 17,
 '째': 18,
 '츠': 19,
 '오': 20,
 '뎅': 21,
 '돼': 22,
 '딥': 23,
 '탠': 24,
 '뼛': 25,
 '려': 26,
 '써': 27,
 '템': 28,
 '웹': 29,
 '꿉': 30,
 '징': 31,
 '얏': 32,
 '환': 33,
 '팁': 34,
 '뻣': 35,
 '찔': 36,
 '짙': 37,
 '캄': 38,
 '거': 39,
 '허': 40,
 '잃': 41,
 '비': 42,
 '빵': 43,
 '잣': 44,
 '씀': 45,
 '집': 46,
 '혓': 47,
 '삿': 48,
 '쩌': 49,
 '섞': 50,
 '윗': 51,
 '잏': 52,
 '켓': 53,
 '샅': 54,
 '숭': 55,
 '팟': 56,
 '대': 57,
 '것': 58,
 '쵸': 59,
 '혜': 60,
 '휼': 61,
 '삽': 62,
 '웁': 63,
 '레': 64,
 '빈': 65,
 '휠': 66,
 '꿰': 67,
 '컴': 68,
 '짊': 69,
 '꿈': 70,
 '택': 71,
 '투': 72,
 '행': 73,
 '누': 74,
 '쪼': 75,
 '젼': 76,
 '톡': 77,
 '낌': 78,
 '死': 79,
 '압': 80,
 'z': 81,
 '긁': 82,
 '든': 83,
 '빼': 84,
 '죗': 85,
 '원': 86,
 '컹': 87,
 '낱': 88,
 '액': 89,
 '척': 90,
 '헤': 91,
 '입': 92,
 '싸': 93,
 '끙': 94,
 '낫': 95,
 '졍': 96,
 '6': 97,
 '껍': 98,
 '깍': 99,
 '썩': 100,

In [16]:
class CLSDataset:
    def __init__(self, elmo_dataset, filepath, batch_size, max_length, device):
        self.elmo_dataset = elmo_dataset
        self.batch_size = batch_size
        self.device = device
        
        self.title = data.Field(tokenize=lambda x: x.split(' '),
                                init_token='<sos>',
                                eos_token='<eos>',
                                pad_token='<pad>',
                                lower=True,
                                batch_first=True,
                                include_lengths=False, 
                                fix_length=max_length)
        
        self.label = data.Field(lower=True,
                                batch_first=True)
        
        fields = [('label', self.label), ('title', self.title)]
        self.train_data, self.valid_data, self.test_data = data.TabularDataset.splits(path=filepath,
                                                                                      train='train_tokenized.ynat',
                                                                                      validation='val_tokenized.ynat',
                                                                                      test='test_tokenized.ynat',
                                                                                      format='tsv',
                                                                                      fields=fields)
        
        self.build_vocab()
        print('number of training data : {}'.format(len(self.train_data)))
        print('number of valid data : {}'.format(len(self.valid_data)))
        print('number of test data : {}'.format(len(self.test_data)))
        
        self.train_iterator, self.valid_iterator, self.test_iterator = data.BucketIterator.splits(
            (self.train_data, self.valid_data, self.test_data), sort=True, sort_within_batch=True,
            batch_size=self.batch_size, device=self.device, sort_key=lambda x: len(x.title))
        
    def build_vocab(self):
#         self.title.build_vocab(self.train_data, min_freq=1)
        self.title.vocab = self.elmo_dataset.SRC.vocab
        self.label.build_vocab(self.train_data, min_freq=1)
        
        print(f"Unique tokens in title vocabulary: {len(self.title.vocab)}")
        print(f"Unique tokens in label vocabulary: {len(self.label.vocab)}")
        
cls_dataset = CLSDataset(elmo_dataset, filepath="data", batch_size=BATCH_SIZE, max_length=MAX_WORD_LENGTH_IN_SENT, device=device)

Unique tokens in title vocabulary: 1610
Unique tokens in label vocabulary: 9
number of training data : 45678
number of valid data : 9107
number of test data : 10


In [18]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, pad_idx, dropout=0.2):
        super(CNN1d, self).__init__()   
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels  = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size  = fs)
                                    for fs in filter_sizes
                                    ])
        
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, src):       
        embedded = self.embedding(src)
        batch_size, word_len, char_len, emb_dim = embedded.size()
        
        # [batch * word_len, char_len, emb_dim]
        embedded = embedded.reshape(-1, char_len, emb_dim)  
        embedded = embedded.permute(0, 2, 1)
        
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat    = self.dropout(torch.cat(pooled, dim=1))
        
        output = self.fc(cat)
        output = output.reshape(batch_size, word_len, -1)
        
        return output
    
class Highway(nn.Module):
    def __init__(self, size, n_layers, f):
        super(Highway, self).__init__()

        self.n_layers = n_layers
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.f = f

    def forward(self, x):
        for layer in range(self.n_layers):
            gate = F.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x
    
class ELMO_Embedding(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, pad_idx, n_layers=2, bidirectional=True, dropout_rate=0.2):
        super(ELMO_Embedding, self).__init__()

        n_filters = 100
        filter_sizes = [3, 4, 5]

        self.embedding = CNN1d(vocab_size, embedding_dim, n_filters, filter_sizes, embedding_dim, pad_idx, dropout_rate)
        self.highway   = Highway(size=embedding_dim, n_layers=1, f=F.relu)
        self.rnn       = nn.LSTM(embedding_dim, hidden_dim, n_layers, bidirectional=bidirectional, batch_first=True)        
        self.fc_out    = nn.Linear(hidden_dim, output_dim)

    def forward(self, src, max_word_len=20):
        hiddens = []
        embeddings = []
        steps = max_word_len
        
        for step in range(steps):
            inputs = src[:, i, :].unsqueeze(1)
            embedding = self.highway(self.embedding(inputs))
            output, (hidden, state) = self.rnn(embedding)
            
            embeddings.append(embedding)
            hiddens.append(hidden)

        embeddings = torch.stack(embeddings)
        hiddens = torch.stack(hiddens)

        return embeddings, hiddens


    

In [None]:
class ELMO_Task(nn.Module):
    def __init__(self, vocab_size, LM_embedding_dim, LM_hidden_dim, LM_output_dim, TASK_embedding_dim, TASK_hidden_dim, pad_idx):
        super(ELMO_Task, self).__init__()
        # eval 을 사용하고 task 관련 부분을 다른 subclass로 나누면 해결할 수 있을 것으로 보임
        # save - load 는 모델의 구조까지 다 관리 가능함 / 굳이 load_state_dict 쓰지 않아도 됨
        self.elmo_embedding = ELMO_Embedding(vocab_size, LM_embedding_dim, LM_hidden_dim, LM_output_dim, pad_idx, dropout_rate=0)
        self.elmo_embedding.load_state_dict(torch.load('weights/ELMO-LM_best.pt'))

        for param in self.elmo_embedding.features.parameters():
            param.require_grad = False
        
        self.layer_coef = nn.Parameter(torch.ones(1, 3 , require_grad=True))
        self.scale_coef = nn.Parameter(torch.ones(1, require_grad=True))
        self.softmax    = nn.Softmax(dim=0)
        self.sigmoid    = nn.sigmoid()
        
        self.rnn = nn.GRU(TASK_embedding_dim, TASK_hidden_dim, n_layers=2, batch_first=True)
#         self.fc  = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        
    def forward(self, src):
        src_embedding, src_hiddens = self.elmo_embedding(src)
        elmo_representation = torch.cat([src_embedding, src_hiddens], dim=-1)
        representation = torch.matmul(self.softmax(self.layer_coef), elmo_representation)
        # sigmoid check
        representation *= self.sigmoid(self.scale_coef)
        representation = torch.sum(representation)
        
        pred = self.fc(self.rnn(representation))

        return pred
        

In [55]:
for batch in cls_dataset.train_iterator: 
    title = character_decomposer.decompose(batch.title)
    
    print(sum([t for t in title]))
    
    break


tensor([[  512,   256,   256,   256,   256,   256],
        [11275,   256,   256,   256,   256,   256],
        [66522,   256,   256,   256,   256,   256],
        [49775,   256,   256,   256,   256,   256],
        [ 9772,   256,   256,   256,   256,   256],
        [  572,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  256,   256,   256,   256,   256,   256],
        [  2

In [49]:
cls_dataset

<__main__.CLSDataset at 0x7f3a61e2d7d0>