In [1]:
# Mount Google Drive at /content/drive (requires the Google Colab runtime).
# Later cells read their data through relative paths under drive/MyDrive/,
# which presumably relies on the working directory being /content -- TODO confirm.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import Dataset, DataLoader, TensorDataset
from torch.optim.lr_scheduler import _LRScheduler, ReduceLROnPlateau, StepLR, LambdaLR

import torchtext

from torchtext.data.utils import get_tokenizer
from torchtext.legacy.data import Dataset, Field, BucketIterator
    
import math
import time
import random
import pickle
import linecache
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm
from collections import defaultdict, Counter

import warnings

# NOTE: this silences *every* warning for the rest of the notebook, including
# deprecation warnings from torch/torchtext.
warnings.filterwarnings(action='ignore')

SEED = 1234
BATCH_SIZE = 256

# Seed every RNG the notebook draws from so a Restart & Run All is repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Seed all visible CUDA devices, not only the current one
# (this call is silently ignored when CUDA is unavailable).
torch.cuda.manual_seed_all(SEED)
# Deterministic cuDNN kernels; the autotuner (benchmark mode) must also be
# disabled, otherwise it may pick different algorithms from run to run.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

In [2]:
# --- Vocabularies --------------------------------------------------------
# Each pickle holds an (index -> token, token -> index) pair.
with open('drive/MyDrive/data/word_vocabulary.p', 'rb') as f:
    index_to_word, word_to_index = pickle.load(f)

with open('drive/MyDrive/data/char_vocabulary.p', 'rb') as f:
    index_to_char, char_to_index = pickle.load(f)

# --- Encoded corpus ------------------------------------------------------
# char_array is the model input, word_array the prediction target.
# word_array carries a singleton axis 1 on disk, which is dropped here.
char_array = np.load("drive/MyDrive/data/char_array.npy")
word_array = np.squeeze(np.load("drive/MyDrive/data/word_array.npy"), axis=1)

# --- 80/20 split, moved to `device` as int64 tensors ---------------------
# train_test_split yields [X_train, X_test, y_train, y_test] in that order.
X_train, X_test, y_train, y_test = (
    torch.tensor(split, dtype=torch.long, device=device)
    for split in train_test_split(char_array, word_array, test_size=0.2, random_state=SEED)
)

train_dataset = TensorDataset(X_train, y_train)
test_dataset  = TensorDataset(X_test, y_test)

# Only the training batches are shuffled; the held-out order stays fixed.
train_dataloader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
test_dataloader  = DataLoader(test_dataset, batch_size=BATCH_SIZE, shuffle=False)

In [3]:
class CNN1d(nn.Module):
    """Character-level CNN that encodes every word of a sequence into a vector.

    Each word is given as a sequence of character ids. The characters are
    embedded, passed through parallel 1-D convolutions of different widths,
    max-pooled over the character axis, concatenated and projected to
    ``output_dim`` by a linear layer.

    Args:
        vocab_size: number of distinct character ids.
        embedding_dim: size of each character embedding.
        n_filters: number of output channels of every convolution.
        filter_sizes: iterable of kernel widths, one convolution per width.
        output_dim: size of the returned per-word vector.
        pad_idx: character id used for padding (its embedding stays zero).
        dropout: dropout probability applied to the pooled features.
    """

    def __init__(self, vocab_size, embedding_dim, n_filters, filter_sizes, output_dim, pad_idx, dropout=0.2):
        super(CNN1d, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=pad_idx)
        self.convs = nn.ModuleList([
            nn.Conv1d(in_channels=embedding_dim,
                      out_channels=n_filters,
                      kernel_size=fs)
            for fs in filter_sizes
        ])

        self.fc = nn.Linear(len(filter_sizes) * n_filters, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        """Encode a batch of character-id tensors.

        Args:
            text: LongTensor of shape [batch, word_len, char_len]; ``char_len``
                must be at least the largest kernel width.

        Returns:
            FloatTensor of shape [batch, word_len, output_dim].
        """
        # [batch, word_len, char_len, emb_dim]
        embedded = self.embedding(text)
        batch_size, word_len, char_len, emb_dim = embedded.size()

        # Fold words into the batch axis: [batch * word_len, char_len, emb_dim]
        embedded = embedded.reshape(-1, char_len, emb_dim)
        # Conv1d expects channels first: [batch * word_len, emb_dim, char_len]
        embedded = embedded.permute(0, 2, 1)

        conved = [F.relu(conv(embedded)) for conv in self.convs]
        # Max over the whole character axis -> [batch * word_len, n_filters] each
        pooled = [F.max_pool1d(conv, conv.shape[2]).squeeze(2) for conv in conved]
        cat = self.dropout(torch.cat(pooled, dim=1))
        output = self.fc(cat)

        # Unfold the projected features (not the raw pooled ones) back to
        # [batch, word_len, output_dim].
        output = output.reshape(batch_size, word_len, -1)

        return output

In [4]:
class Highway(nn.Module):
    """Stack of gated highway-style layers operating on the last dimension.

    Every layer computes ``gate * f(W_n x) + (1 - gate) * (W_l x)`` with
    ``gate = sigmoid(W_g x)``. Note that the carry path is a learned linear
    projection of ``x`` rather than ``x`` itself.

    Args:
        size: feature size of both input and output.
        n_layers: number of stacked layers (0 returns the input unchanged).
        f: activation applied on the non-linear path (e.g. ``F.relu``).
    """

    def __init__(self, size, n_layers, f):
        super(Highway, self).__init__()

        self.n_layers = n_layers
        self.nonlinear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.linear = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.gate = nn.ModuleList([nn.Linear(size, size) for _ in range(n_layers)])
        self.f = f

    def forward(self, x):
        """Apply all layers in order; input and output are [..., size]."""
        for layer in range(self.n_layers):
            # torch.sigmoid replaces the deprecated F.sigmoid.
            gate = torch.sigmoid(self.gate[layer](x))

            nonlinear = self.f(self.nonlinear[layer](x))
            linear = self.linear[layer](x)

            x = gate * nonlinear + (1 - gate) * linear

        return x

In [5]:
class ELMO_Embedding(nn.Module):
    """Bidirectional LSTM language model on top of a character-CNN word encoder.

    Pipeline: ``CNN1d`` (characters -> word vectors) -> ``Highway`` ->
    multi-layer bidirectional LSTM -> one shared linear layer that maps the
    forward and the backward hidden states to vocabulary logits.

    Args:
        vocab_size: number of distinct character ids (input vocabulary).
        emb_dim: size of the per-word vector fed to the highway and the LSTM.
        hid_dim: LSTM hidden size per direction.
        output_dim: number of output classes (word vocabulary size).
        pad_idx: padding character id.
        n_layers: number of stacked LSTM layers.
        bidirectional: must be True; the model returns one prediction per
            direction and cannot be built unidirectionally.

    Raises:
        ValueError: if ``bidirectional`` is False.
    """

    def __init__(self, vocab_size, emb_dim, hid_dim, output_dim, pad_idx, n_layers=2, bidirectional=True):
        super(ELMO_Embedding, self).__init__()

        if not bidirectional:
            # forward() always splits the LSTM output into two directions.
            raise ValueError("ELMO_Embedding requires bidirectional=True")

        n_filters = 100
        filter_sizes = [3, 4, 5]

        self.hid_dim   = hid_dim
        self.embedding = CNN1d(vocab_size, emb_dim, n_filters, filter_sizes, emb_dim, pad_idx)
        self.highway   = Highway(size=emb_dim, n_layers=1, f=F.relu)
        # The encoder emits [batch, word_len, emb_dim], so the LSTM must be
        # batch-first; otherwise it would recur over the batch axis.
        self.rnn       = nn.LSTM(emb_dim, hid_dim, n_layers, bidirectional=bidirectional, batch_first=True)
        self.fc_out    = nn.Linear(hid_dim, output_dim)

    def forward(self, src):
        """Return per-position logits of the forward and the backward LM.

        Args:
            src: character ids; CNN1d in this file unpacks its embedded input
                as [batch, word_len, char_len, emb], i.e. ``src`` is
                [batch, word_len, char_len].

        Returns:
            Tuple ``(forward_pred, backward_pred)``, each of shape
            [batch, word_len, output_dim].
        """
        embedding = self.embedding(src)
        highway   = self.highway(embedding)

        # output: [batch, word_len, 2 * hid_dim]
        output, _ = self.rnn(highway)

        batch_size, seq_len, _ = output.size()
        # The last axis is the concatenation [forward | backward], so the
        # direction axis has to come *before* the hidden axis when splitting.
        output = output.reshape(batch_size, seq_len, 2, self.hid_dim)

        forward_hid, backward_hid = output[:, :, 0, :], output[:, :, 1, :]

        forward_pred  = self.fc_out(forward_hid)
        backward_pred = self.fc_out(backward_hid)

        return forward_pred, backward_pred

In [6]:
def train(model, iterator, optimizer, criterion, output_dim, clip=1):
    """Run one optimisation pass over ``iterator`` and return the mean batch loss.

    Args:
        model: module returning a ``(forward_pred, backward_pred)`` pair.
        iterator: iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target; must support ``len()``.
        optimizer: optimizer stepping the parameters of ``model``.
        criterion: loss taking ``(logits [N, output_dim], targets [N])``.
        output_dim: size of the last axis of the predictions.
        clip: maximum gradient norm applied before every optimizer step.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for batch in tqdm(iterator):
        inputs, targets = batch[0], batch[1]

        optimizer.zero_grad()
        fpred, bpred = model(inputs)

        # Both directions are scored against the same flattened targets.
        flat_targets = targets.reshape(-1)
        loss = (
            criterion(fpred.reshape(-1, output_dim), flat_targets)
            + criterion(bpred.reshape(-1, output_dim), flat_targets)
        )

        loss.backward()
        # Clip the global gradient norm before updating.
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += loss.item()

    return running_loss / len(iterator)

In [7]:
def evaluate(model, iterator, criterion, output_dim):
    """Compute the mean batch loss over ``iterator`` without updating ``model``.

    Args:
        model: module returning a ``(forward_pred, backward_pred)`` pair.
        iterator: iterable of batches where ``batch[0]`` is the input and
            ``batch[1]`` the target; must support ``len()``.
        criterion: loss taking ``(logits [N, output_dim], targets [N])``.
        output_dim: size of the last axis of the predictions.

    Returns:
        Sum of the per-batch losses divided by ``len(iterator)``.
    """
    model.eval()
    batch_losses = []

    with torch.no_grad():
        for batch in iterator:
            inputs, targets = batch[0], batch[1]
            fpred, bpred = model(inputs)

            # Both directions are scored against the same flattened targets.
            flat_targets = targets.reshape(-1)
            loss = (
                criterion(fpred.reshape(-1, output_dim), flat_targets)
                + criterion(bpred.reshape(-1, output_dim), flat_targets)
            )
            batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)

In [8]:
# Vocabulary sizes come from the index -> token tables loaded earlier.
CHAR_VOCAB_SIZE = len(index_to_char)
WORD_VOCAB_SIZE = len(index_to_word)

# Model hyperparameters.
EMB_DIM = 300
HID_DIM = 1024
PAD_IDX = 0

# Characters in, word-vocabulary logits out.
model = ELMO_Embedding(
    vocab_size=CHAR_VOCAB_SIZE,
    emb_dim=EMB_DIM,
    hid_dim=HID_DIM,
    output_dim=WORD_VOCAB_SIZE,
    pad_idx=PAD_IDX,
    n_layers=2,
    bidirectional=True,
).to(device)

# Targets equal to PAD_IDX do not contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=0.0005)

In [None]:
N_EPOCHS  = 100
PAITIENCE = 10  # epochs without validation improvement tolerated before stopping

n_paitience = 0
best_valid_loss = float('inf')

for epoch in range(N_EPOCHS):
    train_loss = train(model, train_dataloader, optimizer, criterion, WORD_VOCAB_SIZE)
    # The notebook defines no separate validation split, so the held-out
    # split (test_dataloader) is used for model selection.
    valid_loss = evaluate(model, test_dataloader, criterion, WORD_VOCAB_SIZE)

    print(f'Epoch: {epoch + 1:02}')
    print(f'Train Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'Valid Loss: {valid_loss:.3f} | Valid PPL: {math.exp(valid_loss):7.3f}')

    if valid_loss < best_valid_loss:
        # New best: checkpoint the weights and reset the patience counter.
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), 'ELMO-LM_best.pt')
        n_paitience = 0
    else:
        # Also reached for a NaN loss, so a diverged run still terminates.
        n_paitience += 1
        if n_paitience >= PAITIENCE:
            # Stop immediately instead of training one more full epoch.
            print("Early stop!")
            break

# Restore the best checkpoint whether training stopped early or ran all
# epochs; skipped only when no checkpoint was written during this run.
if best_valid_loss != float('inf'):
    model.load_state_dict(torch.load('ELMO-LM_best.pt', map_location=device))

HBox(children=(FloatProgress(value=0.0, max=540.0), HTML(value='')))