In [None]:
%%capture
!pip install Korpora
# !pip install python-mecab-ko

from Korpora import Korpora
Korpora.fetch("namuwikitext", root_dir='/content')

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR

import torchtext

from torchtext.data.utils import get_tokenizer
from torchtext.legacy.data import Field, BucketIterator
    
import math
import time
import mecab
import random
import linecache
import numpy as np
import matplotlib.pyplot as plt

from tqdm.notebook import tqdm
from collections import defaultdict, Counter

SEED = 1234
BATCH_SIZE = 128

random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True

In [62]:
tokenizer = mecab.MeCab()

TEXT = Field(
    sequential=True,
    use_vocab=True,
    tokenize=tokenizer.morphs,
    init_token='<sos>',
    eos_token='<eos>',
    unk_token='<unk>',
    pad_token='<pad>',
    lower=True, 
    batch_first=True
    ) 

# train_data, test_data = TabularDataset.splits(
#     path='drive/MyDrive/', 
#     train='train_data_cleaned.txt',
#     test='test_data_cleaned.txt',
#     format='tsv', 
#     fields=[('text', TEXT)]
#     )

# train_data = TabularDataset(path='drive/MyDrive/sample_text.csv',
#                             format='csv',
#                             fields=[('text', TEXT)])

train_data = TabularDataset(path='drive/MyDrive/test_data_cleaned.txt',
                            format='csv',
                            fields=[('text', TEXT)])

TEXT.build_vocab(train_data)

AttributeError: ignored

In [None]:
class CNN1d(nn.Module):
    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, 
                 pad_idx, dropout=0.5):
        super().__init__()
        
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
                                    nn.Conv1d(in_channels = embedding_dim, 
                                              out_channels = n_filters, 
                                              kernel_size = fs)
                                    for fs in filter_sizes
                                    ])
        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)
        
    def forward(self, text):       
        embedded = self.embedding(text)
        embedded = embedded.permute(0, 2, 1)
        conved = [F.relu(conv(embedded)) for conv in self.convs]
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim = 1))
        
        return self.fc(cat)

In [None]:
class Highway(nn.Module):
    def __init__(self, size, n_layers, f):
        super(Highway, self).__init__()

        self.n_layers = n_layers
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.f = f

    def forward(self, x):
        for layer in range(self.n_layers):
            gate = F.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x

In [None]:
class ELMO_Embedding(nn.Module):
    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, pad_idx, n_layers=2, bidirectional=True):
        super().__init__()

        n_filters = 100
        filter_sizes = [3, 4, 5]

        self.embedding = CNN1d(vocab_size, emb_dim, n_filters, filter_sizes, output_dim, pad_idx)
        self.highway   = Highway(size=0, n_layers=3, f=F.relu())
        self.rnn       = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=bidirectional)        
        self.fc_out    = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        embedding               = self.embedding(src)
        highway                 = self.highway(embedding)
        output, (hidden, state) = self.rnn(highway)

        batch_size, seq_len, _  = output.size()
        output                  = output.reshape(batch_size, seq_len, -1, 2)

        forward_hid, backward_hid = output[:, :, :, 0], output[:, :, :, 1]
        
        # parameter sharing?
        forward_pred  = self.fc_out(forward_hid)
        backward_pred = self.fc_out(backward_hid)

        return forward_pred, backward_pred

In [None]:
def train(model, iterator, optimizer, criterion, clip=1, output_dim=output_dim):
    output_dim = output_dim

    model.train()
    epoch_loss = 0
    
    for i, batch in enumerate(iterator):

        src = batch.src
        trg = batch.trg

        optimizer.zero_grad()
        fpred, bpred = model(src)

        forward_loss = criterion(fpred.reshape(-1, output_dim), trg.reshape(-1))
        backward_loss = criterion(bpred.reshape(-1, output_dim), trg.reshape(-1))
        loss = forward_loss + backward_loss
        
        loss.backward()
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        
        optimizer.step()
        
        epoch_loss += loss.item()
        
    return epoch_loss / len(iterator)

In [None]:
IN_DIM  = len(TEXT.vocab)
OUT_DIM = len(TEXT.vocab)
EMB_DIM = 300
HID_DIM = 1024
PAD_IDX = TEXT.vocab.stoi['<pad>']

model = ELMO_Embedding(IN_DIM, EMB_DIM, HID_DIM, OUT_DIM, PAD_IDX, n_layers=2, bidirectional=True)
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [None]:
import warnings

warnings.filterwarnings(action='ignore')

best_valid_loss = float('inf')
optimizer.zero_grad()
optimizer.step()

for epoch in range(N_EPOCHS):
    start_time = time.time()

    train_loss = train(model, train_iterator, optimizer, criterion)
    
    end_time = time.time()
    
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)

    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')

torch.Size([128, 36])
torch.Size([128, 30])
torch.Size([128, 36, 256])
torch.Size([128, 36, 1024, 2])
torch.Size([128, 36, 5893])
torch.Size([128, 36, 5893])


ValueError: ignored