### Model training -- GRU

In [27]:
import os 
import pandas as pd
import numpy as np
import string
import re
import random

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torchtext.legacy.data import Field, BucketIterator
from torchtext.legacy.data import TabularDataset

# Prefer the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [28]:
from spacy.lang.sv import Swedish

# Swedish language object from spaCy; only its tokenizer is used below.
nlp = Swedish()


def tokenize(text):
    """Split ``text`` into a list of token strings with the Swedish tokenizer."""
    pieces = []
    for token in nlp.tokenizer(text):
        pieces.append(token.text)
    return pieces

In [29]:
# Source field. include_lengths=True makes ``batch.src`` a (tensor, lengths)
# pair, which the training code unpacks as ``src, src_len``.
SRC = Field(
    tokenize=tokenize,
    init_token='<SOS>',
    eos_token='<EOS>',
    include_lengths=True,
)

# Target field: same tokenizer and sentence markers, casing left untouched.
TGT = Field(
    tokenize=tokenize,
    init_token='<SOS>',
    eos_token='<EOS>',
    lower=False,
)

# Each TSV row is read as two columns: source text, then target text.
train_data, valid_data, test_data = TabularDataset.splits(
    path='./term2/data/training',
    train='train.tsv',  # seed training corpus
    validation='valid.tsv',
    test='test.tsv',
    format='tsv',
    fields=[('src', SRC), ('tgt', TGT)],
    skip_header=True,
)

# Both vocabularies are built from the training split only.
SRC.build_vocab(train_data)
TGT.build_vocab(train_data)

# Batches are bucketed by source length and sorted inside each batch, which is
# what the encoder's packed-sequence handling expects.
BATCH_SIZE = 32
train_iterator, valid_iterator, test_iterator = BucketIterator.splits(
    (train_data, valid_data, test_data),
    batch_size=BATCH_SIZE,
    sort_key=lambda x: len(x.src),
    sort_within_batch=True,
)

In [30]:
# Report how many examples each split contains.
for _template, _dataset in (
    ('Num of train data : {}', train_data),
    ('Num of valid data : {}', valid_data),
    ('Num of test data : {}', test_data),
):
    print(_template.format(len(_dataset)))

Num of train data : 4451
Num of valid data : 636
Num of test data : 1272


### The code below closely follows the "packed padded sequences" tutorial from https://github.com/bentrevett/pytorch-seq2seq.

In [79]:
class Encoder(nn.Module):
    """Single-layer bidirectional GRU encoder over packed, padded source batches.

    Args:
        input_dim: number of rows in the source embedding table (vocabulary size).
        emb_dim: embedding width.
        hid_dim: GRU hidden size per direction; also the width of the returned
            initial decoder state.
        dropout: dropout probability applied to the embeddings.
    """

    def __init__(self, input_dim, emb_dim, hid_dim, dropout):
        super().__init__()
        self.hid_dim = hid_dim
        self.embedding = nn.Embedding(input_dim, emb_dim)
        self.rnn = nn.GRU(emb_dim, hid_dim, bidirectional=True)
        self.fc = nn.Linear(hid_dim * 2, hid_dim)
        self.drop_out = nn.Dropout(dropout)

    def forward(self, src, src_len):
        """Encode a batch of source sequences.

        Args:
            src: token indices, [src len, batch size].
            src_len: 1-D tensor with the unpadded length of every sequence,
                sorted in decreasing order (pack_padded_sequence default).

        Returns:
            outputs: [src len, batch size, hid dim * 2]; padded steps are zeros.
            hidden: [batch size, hid dim], tanh-projected concatenation of the
                final forward and backward GRU states.
        """
        # src = [src len, batch size]
        embedded = self.drop_out(self.embedding(src))
        # embedded = [src len, batch size, emb dim]

        # pack_padded_sequence requires the lengths to live on the CPU, even
        # when the embeddings themselves are on the GPU.
        packed_embedded = nn.utils.rnn.pack_padded_sequence(embedded, src_len.cpu())
        packed_outputs, hidden = self.rnn(packed_embedded)
        # hidden = [n directions, batch size, hid dim]

        outputs, _ = nn.utils.rnn.pad_packed_sequence(packed_outputs)
        # outputs = [src len, batch size, hid dim * 2]

        # hidden[-2] is the last forward state, hidden[-1] the last backward one.
        hidden = torch.tanh(self.fc(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1)))
        # hidden = [batch size, hid dim]

        return outputs, hidden
    
class Attention(nn.Module):
    """Additive attention scoring every encoder step against the decoder state.

    Args:
        enc_hid_dim: encoder hidden size per direction (encoder outputs are
            twice this wide).
        dec_hid_dim: decoder hidden size.
    """

    def __init__(self, enc_hid_dim, dec_hid_dim):
        super().__init__()
        self.attn = nn.Linear((enc_hid_dim * 2) + dec_hid_dim, dec_hid_dim)
        self.v = nn.Linear(dec_hid_dim, 1, bias = False)

    def forward(self, hidden, encoder_outputs, mask):
        """Return attention weights of shape [batch size, src len].

        Args:
            hidden: decoder state, [batch size, dec hid dim].
            encoder_outputs: [src len, batch size, enc hid dim * 2].
            mask: [batch size, src len]; positions equal to 0 are excluded.
        """
        n_steps = encoder_outputs.size(0)

        # Put the batch dimension first on both operands.
        enc_batch_first = encoder_outputs.transpose(0, 1)
        # enc_batch_first = [batch size, src len, enc hid dim * 2]
        tiled_hidden = hidden.unsqueeze(1).repeat(1, n_steps, 1)
        # tiled_hidden = [batch size, src len, dec hid dim]

        features = torch.cat((tiled_hidden, enc_batch_first), dim = 2)
        energy = torch.tanh(self.attn(features))
        # energy = [batch size, src len, dec hid dim]

        scores = self.v(energy).squeeze(2)
        # scores = [batch size, src len]

        # Masked positions get a very negative score so softmax drives them to ~0.
        scores = scores.masked_fill(mask == 0, -1e10)
        return scores.softmax(dim = 1)
    
class Decoder(nn.Module):
    """Single-step GRU decoder that attends over the encoder outputs.

    Args:
        output_dim: target vocabulary size (width of the prediction logits).
        emb_dim: target embedding width.
        hid_dim: decoder hidden size; encoder outputs are expected to be
            ``hid_dim * 2`` wide.
        dropout: dropout probability applied to the embeddings.
        attention: callable ``(hidden, encoder_outputs, mask) ->
            [batch size, src len]`` attention weights.
    """

    def __init__(self, output_dim, emb_dim, hid_dim, dropout, attention):
        super().__init__()

        self.hid_dim = hid_dim
        self.output_dim = output_dim
        self.attention = attention
        self.embedding = nn.Embedding(output_dim, emb_dim)
        self.rnn = nn.GRU((hid_dim * 2) + emb_dim, hid_dim)
        self.fc_out = nn.Linear((hid_dim * 2) + hid_dim + emb_dim, output_dim)
        self.drop_out = nn.Dropout(dropout)

    def forward(self, input, hidden, encoder_outputs, mask):
        """Decode one target step.

        Args:
            input: previous target token indices, [batch size].
            hidden: previous decoder state, [batch size, hid dim].
            encoder_outputs: [src len, batch size, hid dim * 2].
            mask: [batch size, src len]; 0 marks source padding.

        Returns:
            prediction: logits, [batch size, output dim].
            hidden: new decoder state, [batch size, hid dim].
            a: attention weights, [batch size, src len].
        """
        input = input.unsqueeze(0)
        # input = [1, batch size]

        embedded = self.drop_out(self.embedding(input))
        # embedded = [1, batch size, emb dim]

        a = self.attention(hidden, encoder_outputs, mask)
        a = a.unsqueeze(1)
        # a = [batch size, 1, src len]

        encoder_outputs = encoder_outputs.permute(1, 0, 2)
        # Attention-weighted sum of the encoder outputs (the context vector).
        weighted = torch.bmm(a, encoder_outputs)
        weighted = weighted.permute(1, 0, 2)
        # weighted = [1, batch size, hid dim * 2]

        rnn_input = torch.cat((embedded, weighted), dim=2)
        output, hidden = self.rnn(rnn_input, hidden.unsqueeze(0))

        # One layer, one direction, one time step: output and hidden coincide.
        assert (output==hidden).all()

        embedded = embedded.squeeze(0)
        output = output.squeeze(0)
        weighted = weighted.squeeze(0)

        # The original line was missing its closing parenthesis (SyntaxError).
        prediction = self.fc_out(torch.cat((output, weighted, embedded), dim=1))
        return prediction, hidden.squeeze(0), a.squeeze(1)
    
class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper that decodes step by step with teacher forcing.

    Args:
        encoder: module returning ``(encoder_outputs, hidden)`` for
            ``(src, src_len)``; must expose ``hid_dim``.
        decoder: module returning ``(logits, hidden, attention)`` for
            ``(input, hidden, encoder_outputs, mask)``; must expose
            ``hid_dim`` and ``output_dim``.
        src_pad_idx: index of the padding token in the source vocabulary.
    """

    def __init__(self, encoder, decoder, src_pad_idx):
        super().__init__()

        self.encoder = encoder
        self.decoder = decoder
        self.src_pad_idx = src_pad_idx

        assert encoder.hid_dim == decoder.hid_dim

    def create_mask(self, src):
        """Return a [batch size, src len] mask that is False at padding positions."""
        mask = (src != self.src_pad_idx).permute(1, 0)
        return mask

    def forward(self, src, src_len, trg, teacher_forcing_ratio=0.5):
        """Run the decoder over every target position.

        Args:
            src: source indices, [src len, batch size].
            src_len: unpadded source lengths, [batch size].
            trg: target indices, [trg len, batch size]; row 0 holds the start tokens.
            teacher_forcing_ratio: probability, per step, of feeding the gold
                token instead of the model's own prediction.

        Returns:
            Logits of shape [trg len, batch size, output dim]; row 0 is left
            as zeros because decoding starts at position 1.
        """
        trg_len = trg.shape[0]
        batch_size = trg.shape[1]
        trg_vocab_size = self.decoder.output_dim

        # Allocate on the same device as the targets so the loss can be
        # computed without a device mismatch when the model runs on a GPU.
        outputs = torch.zeros(trg_len, batch_size, trg_vocab_size, device=trg.device)

        encoder_outputs, hidden = self.encoder(src, src_len)

        # First decoder input is the start-of-sequence row.
        step_input = trg[0, :]
        mask = self.create_mask(src)

        for t in range(1, trg_len):
            output, hidden, _ = self.decoder(step_input, hidden, encoder_outputs, mask)
            outputs[t] = output

            teacher_force = random.random() < teacher_forcing_ratio

            top1 = output.argmax(1)
            step_input = trg[t] if teacher_force else top1

        return outputs

In [80]:
# Vocabulary sizes fix the width of the embedding tables and of the output layer.
INPUT_DIM = len(SRC.vocab)
OUTPUT_DIM = len(TGT.vocab)
EMB_DIM = 256
HID_DIM = 512
DROPOUT = 0.3
# Index of the source padding token; Seq2Seq.create_mask compares against it.
SRC_PAD_IDX = SRC.vocab.stoi[SRC.pad_token]

# Encoder and decoder share HID_DIM (Seq2Seq asserts that both hid_dim values match).
attn = Attention(HID_DIM, HID_DIM)
enc = Encoder(INPUT_DIM, EMB_DIM, HID_DIM, DROPOUT)
dec = Decoder(OUTPUT_DIM, EMB_DIM, HID_DIM, DROPOUT, attn)
model = Seq2Seq(enc, dec, SRC_PAD_IDX)

In [81]:
def init_weights(m):
    """Re-draw every parameter of ``m`` (weights and biases) from N(0, 0.01^2)."""
    for param in m.parameters():
        nn.init.normal_(param.data, mean=0, std=0.01)

In [82]:
# Padding positions in the target must not contribute to the loss.
TGT_PAD_IDX = TGT.vocab.stoi[TGT.pad_token]
criterion = nn.CrossEntropyLoss(ignore_index=TGT_PAD_IDX)

# Re-initialise every sub-module in place, then optimise with Adam defaults.
model.apply(init_weights)
optimizer = optim.Adam(model.parameters())

In [83]:
def train(model, iterator, optimizer, criterion, clip):
    """Run one training epoch and return the mean per-batch loss.

    Args:
        model: called as ``model(src, src_len, trg)``; returns logits of shape
            [trg len, batch size, output dim].
        iterator: sized iterable of batches exposing ``.src`` (a
            ``(tensor, lengths)`` pair) and ``.tgt``.
        optimizer: optimizer over the model's parameters.
        criterion: loss taking 2-D logits and 1-D targets.
        clip: maximum gradient norm passed to ``clip_grad_norm_``.
    """
    model.train()
    running_loss = 0

    for batch in iterator:
        src, src_len = batch.src
        gold = batch.tgt
        # gold = [trg len, batch size]

        optimizer.zero_grad()
        logits = model(src, src_len, gold)
        # logits = [trg len, batch size, output dim]
        vocab_size = logits.shape[-1]

        # The loss function only accepts 2-D input, so drop the start-token
        # row and flatten the time and batch dimensions together.
        flat_logits = logits[1:].view(-1, vocab_size)
        flat_gold = gold[1:].view(-1)
        # flat_gold = [(trg len - 1) * batch size]
        # flat_logits = [(trg len - 1) * batch size, output dim]

        step_loss = criterion(flat_logits, flat_gold)
        step_loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), clip)
        optimizer.step()

        running_loss += step_loss.item()

    return running_loss / len(iterator)

In [84]:
def evaluate(model, iterator, criterion):
    """Return the mean per-batch loss over ``iterator`` without updating the model.

    The model is switched to eval mode and called with a teacher-forcing ratio
    of 0, inside ``torch.no_grad()``.
    """
    model.eval()
    running_loss = 0

    with torch.no_grad():
        for batch in iterator:
            src, src_len = batch.src
            gold = batch.tgt
            # gold = [trg len, batch size]

            # teacher_forcing_ratio = 0: the model must not be shown any gold token.
            logits = model(src, src_len, gold, 0)
            # logits = [trg len, batch size, output dim]
            vocab_size = logits.shape[-1]

            # Drop the start-token row and flatten time and batch together.
            flat_logits = logits[1:].view(-1, vocab_size)
            flat_gold = gold[1:].view(-1)
            # flat_gold = [(trg len - 1) * batch size]
            # flat_logits = [(trg len - 1) * batch size, output dim]

            running_loss += criterion(flat_logits, flat_gold).item()

    return running_loss / len(iterator)

In [85]:
def epoch_time(start_time: int,
               end_time: int):
    """Split the span between two timestamps (in seconds) into (minutes, seconds).

    Both parts are truncated to whole numbers; the training loop passes
    ``time.time()`` values here.
    """
    total_secs = end_time - start_time
    whole_mins = int(total_secs / 60)
    leftover_secs = int(total_secs - (whole_mins * 60))
    return whole_mins, leftover_secs

In [88]:
import time
import math

# Number of passes over the training data and the gradient-norm clipping threshold.
N_EPOCHS = 3 
CLIP = 1

# Lowest validation loss seen so far; decides when a checkpoint is written.
best_valid_loss = float('inf')
# Per-epoch loss history.
train_loss_ = []
valid_loss_ = []

for epoch in range(N_EPOCHS):
    
    start_time = time.time()
    
    train_loss = train(model, train_iterator, optimizer, criterion, CLIP)
    valid_loss = evaluate(model, valid_iterator, criterion)
    train_loss_.append(train_loss)
    valid_loss_.append(valid_loss)
    
    end_time = time.time()
    # The reported time covers both the training and the validation pass.
    epoch_mins, epoch_secs = epoch_time(start_time, end_time)
    
    # Keep only the weights of the best epoch by validation loss.
    if valid_loss < best_valid_loss:
        best_valid_loss = valid_loss
        torch.save(model.state_dict(), './term2/model/GRU_base.pt')
    
    # PPL (perplexity) is exp(loss).
    print(f'Epoch: {epoch+1:02} | Time: {epoch_mins}m {epoch_secs}s')
    print(f'\tTrain Loss: {train_loss:.3f} | Train PPL: {math.exp(train_loss):7.3f}')
    print(f'\t Val. Loss: {valid_loss:.3f} |  Val. PPL: {math.exp(valid_loss):7.3f}')

Epoch: 01 | Time: 58m 57s
	Train Loss: 1.839 | Train PPL:   6.291
	 Val. Loss: 3.723 |  Val. PPL:  41.393
Epoch: 02 | Time: 58m 11s
	Train Loss: 1.309 | Train PPL:   3.702
	 Val. Loss: 3.371 |  Val. PPL:  29.100
Epoch: 03 | Time: 58m 16s
	Train Loss: 1.036 | Train PPL:   2.819
	 Val. Loss: 3.184 |  Val. PPL:  24.151


In [None]:
#model.load_state_dict(torch.load('./term2/model/GRU_base.pt'))  # reload the best checkpoint saved by the training loop

# Evaluation for the model

In [89]:
def translate_sentence(model, sentence, src_field, tgt_field, max_length=221):
    """Greedily decode one already-tokenised source sentence.

    Args:
        model: module exposing ``encoder``, ``decoder`` and ``create_mask``.
        sentence: sequence of source tokens; it is not tokenised again here.
        src_field: source field providing ``init_token``, ``eos_token`` and
            ``vocab.stoi``.
        tgt_field: target field providing ``init_token``, ``eos_token``,
            ``vocab.stoi`` and ``vocab.itos``.
        max_length: maximum number of decoder steps.

    Returns:
        ``(indices, tokens)`` for the generated sequence without the initial
        start token. The end token is included when the decoder produced it
        within ``max_length`` steps.
    """
    model.eval()

    # Build inputs on whatever device the model's parameters live on.
    first_param = next(model.parameters(), None)
    run_device = first_param.device if first_param is not None else torch.device('cpu')

    tokens = [src_field.init_token] + list(sentence) + [src_field.eos_token]
    src_index = [src_field.vocab.stoi[token] for token in tokens]
    src_tensor = torch.LongTensor(src_index).unsqueeze(1).to(run_device)
    # src_tensor = [src len, 1]; the lengths stay on the CPU for sequence packing.
    src_len = torch.LongTensor([len(src_index)])

    # Looked up once instead of on every decoding step.
    eos_index = tgt_field.vocab.stoi[tgt_field.eos_token]
    tgt_index = [tgt_field.vocab.stoi[tgt_field.init_token]]

    with torch.no_grad():
        encoder_outputs, hidden = model.encoder(src_tensor, src_len)
        mask = model.create_mask(src_tensor)

        for _ in range(max_length):
            # Feed back the most recent prediction (start token on the first step).
            tgt_tensor = torch.LongTensor([tgt_index[-1]]).to(run_device)
            output, hidden, _attention = model.decoder(tgt_tensor, hidden, encoder_outputs, mask)
            pred_token = output.argmax(1).item()
            tgt_index.append(pred_token)
            if pred_token == eos_index:
                break

    tgt_tokens = [tgt_field.vocab.itos[i] for i in tgt_index]

    return tgt_index[1:], tgt_tokens[1:]


def f_measure(data, model, src_field, tgt_field, max_length=221):
    """Translate every example and average token-level precision/recall/F-score.

    Each prediction is scored against its reference with NLTK's
    ``chrf_precision_recall_fscore_support`` using ``n=1`` and ``beta=0.5``
    on the token lists.

    Args:
        data: non-empty iterable of examples whose ``src`` and ``tgt``
            attributes are token lists.
        model: model passed through to ``translate_sentence``.
        src_field: source field passed through to ``translate_sentence``.
        tgt_field: target field; also supplies the end token that is stripped
            from predictions.
        max_length: maximum number of decoding steps per sentence.

    Returns:
        ``(predicted_sents, target_sents, precision, recall, f_score)`` where
        the three scores are means over all examples.

    Raises:
        ZeroDivisionError: if ``data`` is empty.
    """
    from nltk.translate.chrf_score import chrf_precision_recall_fscore_support

    precisions = []
    recalls = []
    f_scores = []

    predicted_sents = []
    target_sents = []

    for example in data:
        src = vars(example)["src"]
        tgt = vars(example)["tgt"]

        # Use the tokens decoded with tgt_field (not a module-level field) and
        # honour the caller's max_length.
        _, prediction = translate_sentence(model, src, src_field, tgt_field, max_length)

        # Strip the end token only when decoding actually ended with it; a
        # prediction cut off at max_length keeps its last real token.
        if prediction and prediction[-1] == tgt_field.eos_token:
            prediction = prediction[:-1]

        target_sents.append(tgt)
        predicted_sents.append(prediction)

        precision_, recall_, f_score, _ = chrf_precision_recall_fscore_support(
            tgt, prediction, n=1, beta=0.5, epsilon=1e-16)

        precisions.append(precision_)
        recalls.append(recall_)
        f_scores.append(f_score)

    final_p = sum(precisions)/len(precisions)
    final_r = sum(recalls)/len(recalls)
    final_f = sum(f_scores)/len(f_scores)
    return predicted_sents, target_sents, final_p, final_r, final_f

In [None]:
%%time
# Decode every test example and average the per-sentence precision, recall and F-score.
prediction, target, test_precision, test_recall, test_fscore = f_measure(test_data.examples, model, SRC, TGT)

print(f'test precision: {test_precision}')
print(f'test recall: {test_recall}')
print(f'test f_measure: {test_fscore}')