# Finetuning FacQA
Question Answering (modul Answer Finder) adalah sebuah task NLP (Natural Language Processing) yang bertujuan menghasilkan jawaban (factoid) berdasarkan dua input yang tersedia, yaitu pertanyaan pengguna dan paragraf/kalimat yang merupakan sumber jawaban. Jawaban bertipe factoid adalah jawaban berbentuk serangkaian kata yang merupakan bagian dari sebuah kalimat. Tipe factoid yang tersedia pada dataset ini adalah `Person`, `Organization`, `Location`, `Datetime`, dan `Quantity`.

In [1]:
%%capture
# Each requirement is quoted: an unquoted ">" is parsed by the shell as output
# redirection, which drops the version constraint and creates stray files.
!pip install "transformers>=2.9.0" "pandas>=0.25.3" "numpy>=1.17.4" "scikit-learn>=0.22.1" "nltk==3.4.5" "unidecode>=1.1.1"

In [49]:
import os, sys
# sys.path.append('../')
# os.chdir('../')

import re
import random
import torch
import string
import numpy as np
import pandas as pd
from torch import optim
from tqdm import tqdm

from transformers import BertConfig, BertTokenizer, AutoTokenizer
from nltk.tokenize import word_tokenize
from conlleval import conll_evaluation

from torch.utils.data import Dataset, DataLoader
from torch.optim.lr_scheduler import StepLR
from modules.word_classification import BertForWordClassification

In [50]:
###
# common functions
###
def set_seed(seed):
    """Seed every random number generator this notebook relies on.

    Applies ``seed``, in order, to ``random.seed``, ``np.random.seed``,
    ``torch.manual_seed`` and ``torch.cuda.manual_seed``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the number of scalar parameters held by ``module``.

    When ``trainable`` is true, only parameters whose ``requires_grad`` flag
    is set are counted; otherwise every parameter is counted.
    """
    params = module.parameters()
    if trainable:
        params = (p for p in params if p.requires_grad)
    return sum(p.numel() for p in params)
    
def get_lr(optimizer):
    """Return the learning rate of the optimizer's first parameter group.

    Returns ``None`` when ``optimizer.param_groups`` is empty.
    """
    first_group = next(iter(optimizer.param_groups), None)
    if first_group is None:
        return None
    return first_group['lr']

def metrics_to_string(metric_dict):
    """Render a metric mapping as space-separated ``key:value`` pairs.

    Values are formatted with two decimal places and appear in the mapping's
    iteration order.
    """
    return ' '.join(
        '{}:{:.2f}'.format(name, score) for name, score in metric_dict.items()
    )

In [51]:
# Forward function for word classification
def forward_word_classification(model, batch_data, i2w, is_test=False, device='cpu', **kwargs):
    """Run one batch through a word-classification model and decode labels.

    Args:
        model: Callable invoked as ``model(subwords, subword_to_word_indices,
            attention_mask=..., token_type_ids=..., labels=...)``. The first
            two items of its return value are used as ``(loss, logits)``.
        batch_data: Sequence of 4 items ``(subwords, mask,
            subword_to_word_indices, labels)``, or 5 items with
            ``token_type_ids`` inserted after ``mask``.
        i2w: Mapping from label index to label string.
        is_test: Unused; kept for interface compatibility.
        device: Device the batch tensors are moved to before the forward
            pass, e.g. ``'cpu'``, ``'cuda'`` or ``'cuda:1'``.
        **kwargs: Ignored.

    Returns:
        ``(loss, list_hyps, list_labels)``. The two lists hold, for every
        sequence in the batch, the predicted and gold label strings up to
        (not including) the first gold label equal to ``-100``.

    Raises:
        ValueError: If ``batch_data`` does not contain 4 or 5 items.
    """
    # Unpack batch data; token_type_ids are optional
    if len(batch_data) == 4:
        (subword_batch, mask_batch, subword_to_word_indices_batch, label_batch) = batch_data
        token_type_batch = None
    elif len(batch_data) == 5:
        (subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, label_batch) = batch_data
    else:
        raise ValueError(
            'batch_data must contain 4 or 5 items, got {}'.format(len(batch_data))
        )

    # Prepare input & label directly on the requested device. Using .to()
    # instead of comparing against the literal "cuda" also honours values
    # such as 'cuda:1' or a torch.device instance.
    subword_batch = torch.LongTensor(subword_batch).to(device)
    mask_batch = torch.FloatTensor(mask_batch).to(device)
    if token_type_batch is not None:
        token_type_batch = torch.LongTensor(token_type_batch).to(device)
    subword_to_word_indices_batch = torch.LongTensor(subword_to_word_indices_batch).to(device)
    label_batch = torch.LongTensor(label_batch).to(device)

    # Forward model
    outputs = model(subword_batch, subword_to_word_indices_batch, attention_mask=mask_batch, token_type_ids=token_type_batch, labels=label_batch)
    loss, logits = outputs[:2]

    # Generate prediction & label lists from the highest-scoring class
    list_hyps = []
    list_labels = []
    hyps_list = torch.topk(logits, k=1, dim=-1)[1].squeeze(dim=-1)
    for hyps, labels in zip(hyps_list.tolist(), label_batch.tolist()):
        list_hyp, list_label = [], []
        for hyp, label in zip(hyps, labels):
            # -100 is the label padding value; everything after it is padding
            if label == -100:
                break
            list_hyp.append(i2w[hyp])
            list_label.append(i2w[label])
        list_hyps.append(list_hyp)
        list_labels.append(list_label)

    return loss, list_hyps, list_labels

In [52]:
def qa_factoid_metrics_fn(list_hyp, list_label):
    """Score predictions with ``conll_evaluation`` and return a metrics dict.

    ``conll_evaluation`` returns seven values. ``ACC`` is the first one, and
    ``PRE``/``REC``/``F1`` are taken from the last three (named ``tm_*`` in
    this code); the second to fourth values are discarded.
    """
    acc, _pre, _rec, _f1, tm_pre, tm_rec, tm_f1 = conll_evaluation(list_hyp, list_label)
    return {"ACC": acc, "F1": tm_f1, "REC": tm_rec, "PRE": tm_pre}

In [53]:
#####
# QA Factoid ITB
#####
class QAFactoidDataset(Dataset):
    """Factoid QA (ITB) dataset encoded as ``[CLS] question [SEP] passage [SEP]``.

    The CSV file must provide three columns: ``question`` and ``passage``
    (stringified lists of words) and ``seq_label`` (a stringified list of
    ``'O'``/``'B'``/``'I'`` tags; presumably one tag per passage word --
    TODO confirm against the data).

    The tokenizer must expose ``cls_token_id``, ``sep_token_id`` and
    ``encode(word, add_special_tokens=False)``.
    """
    # Label vocabulary shared by every instance
    LABEL2INDEX = {'O':0, 'B':1, 'I':2}
    INDEX2LABEL = {0:'O', 1:'B', 2:'I'}
    NUM_LABELS = 3

    def load_dataset(self, path):
        """Read the CSV at ``path`` and parse its stringified list columns.

        Returns a DataFrame whose ``question`` and ``passage`` cells are lists
        of words and whose ``seq_label`` cells are lists of label indices.

        Raises:
            ValueError / SyntaxError: If a cell is not a valid Python literal.
            KeyError: If a tag is not in ``LABEL2INDEX``.
        """
        # Imported locally so this class does not rely on a module-level import.
        import ast

        dataset = pd.read_csv(path)

        # NOTE: the cells come from an external file, so they are parsed with
        # ast.literal_eval (literals only) rather than eval, which would
        # execute arbitrary expressions stored in the CSV.
        dataset['question'] = dataset['question'].apply(ast.literal_eval)
        dataset['passage'] = dataset['passage'].apply(ast.literal_eval)
        dataset['seq_label'] = dataset['seq_label'].apply(
            lambda x: [self.LABEL2INDEX[l] for l in ast.literal_eval(x)]
        )

        return dataset

    def __init__(self, dataset_path, tokenizer, *args, **kwargs):
        """Load the dataset eagerly; extra positional/keyword args are ignored."""
        self.data = self.load_dataset(dataset_path)
        self.tokenizer = tokenizer

    def __getitem__(self, index):
        """Encode row ``index``.

        Returns a 5-tuple ``(subwords, token_type_ids,
        subword_to_word_indices, seq_label, raw_text)`` where the first four
        are NumPy arrays and ``raw_text`` is ``"<question>|<passage>"`` with
        words joined by single spaces.
        """
        row = self.data.loc[index, :]
        question, passage, seq_label = row['question'], row['passage'], row['seq_label']

        # [CLS]: not aligned to any passage word (-1), segment 0
        subwords = [self.tokenizer.cls_token_id]
        subword_to_word_indices = [-1]
        token_type_ids = [0]

        # Question subwords: never aligned to a passage word, segment 0
        for word in question:
            pieces = self.tokenizer.encode(word, add_special_tokens=False)
            subwords += pieces
            subword_to_word_indices += [-1] * len(pieces)
            token_type_ids += [0] * len(pieces)

        # Intermediate [SEP] closes segment 0
        subwords += [self.tokenizer.sep_token_id]
        subword_to_word_indices += [-1]
        token_type_ids += [0]

        # Passage subwords: each one points back to its word index, segment 1
        for word_idx, word in enumerate(passage):
            pieces = self.tokenizer.encode(word, add_special_tokens=False)
            subwords += pieces
            subword_to_word_indices += [word_idx] * len(pieces)
            token_type_ids += [1] * len(pieces)

        # Final [SEP] closes segment 1
        subwords += [self.tokenizer.sep_token_id]
        subword_to_word_indices += [-1]
        token_type_ids += [1]

        return np.array(subwords), np.array(token_type_ids), np.array(subword_to_word_indices), np.array(seq_label), ' '.join(question) + "|" + ' '.join(passage)

    def __len__(self):
        """Return the number of rows in the loaded CSV."""
        return len(self.data)
        
class QAFactoidDataLoader(DataLoader):
    """DataLoader that pads ``QAFactoidDataset`` items into rectangular batches.

    Args:
        max_seq_len: Upper bound on the subword sequence length; longer
            sequences are truncated on the right.
        *args, **kwargs: Forwarded to ``torch.utils.data.DataLoader``. Any
            ``collate_fn`` passed here is replaced by ``_collate_fn``.
    """
    def __init__(self, max_seq_len=512, *args, **kwargs):
        super(QAFactoidDataLoader, self).__init__(*args, **kwargs)
        self.collate_fn = self._collate_fn
        self.max_seq_len = max_seq_len

    def _collate_fn(self, batch):
        """Pad (and, if needed, truncate) a list of dataset items.

        Returns ``(subword_batch, mask_batch, token_type_batch,
        subword_to_word_indices_batch, seq_label_batch, seq_list)``. Subword
        ids, mask and token types are padded with 0, word indices with -1 and
        labels with -100. ``seq_list`` holds the raw text of each item.
        """
        batch_size = len(batch)
        longest = max(len(item[0]) for item in batch)
        max_seq_len = min(self.max_seq_len, longest)
        max_tgt_len = max(len(item[3]) for item in batch)

        subword_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        mask_batch = np.zeros((batch_size, max_seq_len), dtype=np.float32)
        token_type_batch = np.zeros((batch_size, max_seq_len), dtype=np.int64)
        subword_to_word_indices_batch = np.full((batch_size, max_seq_len), -1, dtype=np.int64)
        seq_label_batch = np.full((batch_size, max_tgt_len), -100, dtype=np.int64)

        seq_list = []
        for i, (subwords, token_type_ids, subword_to_word_indices, seq_label, raw_seq) in enumerate(batch):
            # All three per-subword arrays must be cut to the same length.
            # token_type_ids used to be left untruncated, which raised a
            # broadcast error for any sequence longer than max_seq_len.
            subwords = subwords[:max_seq_len]
            token_type_ids = token_type_ids[:max_seq_len]
            subword_to_word_indices = subword_to_word_indices[:max_seq_len]
            n_subwords = len(subwords)

            subword_batch[i, :n_subwords] = subwords
            mask_batch[i, :n_subwords] = 1
            token_type_batch[i, :n_subwords] = token_type_ids
            subword_to_word_indices_batch[i, :n_subwords] = subword_to_word_indices
            # Labels are per word, not per subword, so they are not truncated
            seq_label_batch[i, :len(seq_label)] = seq_label

            seq_list.append(raw_seq)

        return subword_batch, mask_batch, token_type_batch, subword_to_word_indices_batch, seq_label_batch, seq_list

In [14]:
# Fix the RNG seeds (random, NumPy, PyTorch) before the model is instantiated
set_seed(42)

# Load IndoBERT Model

In [15]:
# Load the tokenizer and config of the pretrained IndoBERT checkpoint
tokenizer = BertTokenizer.from_pretrained('indobenchmark/indobert-base-p1')
config = BertConfig.from_pretrained('indobenchmark/indobert-base-p1')
# Three output labels: O / B / I
config.num_labels = QAFactoidDataset.NUM_LABELS

# Instantiate the model from the same checkpoint. As the load warning below
# shows, the classifier weights are not in the checkpoint and start untrained.
model = BertForWordClassification.from_pretrained('indobenchmark/indobert-base-p1', config=config)

Downloading:   0%|          | 0.00/229k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.00 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.53k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/498M [00:00<?, ?B/s]

Some weights of BertForWordClassification were not initialized from the model checkpoint at indobenchmark/indobert-base-p1 and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Echo the model so the notebook prints its module tree
model

BertForWordClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(50000, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise

In [17]:
# Total parameter count (trainable=False, so every parameter is counted)
count_param(model)

124443651

# Prepare Factoid QA Dataset

In [39]:
# Preprocessed CSV splits of the facqa_qa-factoid-itb dataset.
# These are absolute paths; adjust them to the local environment.
train_dataset_path = '/home/facqa/dataset/facqa_qa-factoid-itb/train_preprocess.csv'
valid_dataset_path = '/home/facqa/dataset/facqa_qa-factoid-itb/valid_preprocess.csv'
# Alternative test file (masked labels, per its file name), currently disabled:
# test_dataset_path = '/home/facqa/dataset/facqa_qa-factoid-itb/test_preprocess_masked_label.csv'
test_dataset_path = '/home/facqa/dataset/facqa_qa-factoid-itb/test_preprocess.csv'

In [40]:
# One dataset per split, built in train / valid / test order.
# NOTE(review): `lowercase=True` is swallowed by QAFactoidDataset.__init__'s
# **kwargs and never read, so it currently has no effect.
train_dataset, valid_dataset, test_dataset = (
    QAFactoidDataset(split_path, tokenizer, lowercase=True)
    for split_path in (train_dataset_path, valid_dataset_path, test_dataset_path)
)

# Same loader settings for every split; only the training split is shuffled.
train_loader, valid_loader, test_loader = (
    QAFactoidDataLoader(
        dataset=split,
        max_seq_len=512,
        batch_size=8,
        num_workers=7,
        shuffle=do_shuffle,
    )
    for split, do_shuffle in (
        (train_dataset, True),
        (valid_dataset, False),
        (test_dataset, False),
    )
)

In [41]:
# Label <-> index lookup tables used when decoding predictions
w2i = QAFactoidDataset.LABEL2INDEX
i2w = QAFactoidDataset.INDEX2LABEL
print(w2i, i2w, sep='\n')

{'O': 0, 'B': 1, 'I': 2}
{0: 'O', 1: 'B', 2: 'I'}


# Test model on sample sentences

In [54]:
def word_subword_tokenize(question, passage, tokenizer, return_token_type_ids=False):
    """Encode a question/passage pair as ``[CLS] question [SEP] passage [SEP]``.

    Args:
        question: List of question words.
        passage: List of passage words.
        tokenizer: Object exposing ``cls_token_id``, ``sep_token_id`` and
            ``encode(word, add_special_tokens=False)``.
        return_token_type_ids: When true, also return the segment ids
            (0 for ``[CLS]``, the question and the first ``[SEP]``; 1 for the
            passage and the final ``[SEP]``). Defaults to ``False`` so
            existing two-value callers keep working.

    Returns:
        ``(subwords, subword_to_word_indices)`` as NumPy arrays, followed by
        ``token_type_ids`` when ``return_token_type_ids`` is true.
        ``subword_to_word_indices`` holds the passage word index of each
        passage subword and -1 for every other position.
    """
    # [CLS]: not aligned to any passage word (-1), segment 0
    subwords = [tokenizer.cls_token_id]
    subword_to_word_indices = [-1]
    token_type_ids = [0]

    # Question subwords: never aligned to a passage word, segment 0
    for word in question:
        pieces = tokenizer.encode(word, add_special_tokens=False)
        subwords += pieces
        subword_to_word_indices += [-1] * len(pieces)
        token_type_ids += [0] * len(pieces)

    # Intermediate [SEP] closes segment 0
    subwords += [tokenizer.sep_token_id]
    subword_to_word_indices += [-1]
    token_type_ids += [0]

    # Passage subwords: each one points back to its word index, segment 1
    for word_idx, word in enumerate(passage):
        pieces = tokenizer.encode(word, add_special_tokens=False)
        subwords += pieces
        subword_to_word_indices += [word_idx] * len(pieces)
        token_type_ids += [1] * len(pieces)

    # Final [SEP] closes segment 1
    subwords += [tokenizer.sep_token_id]
    subword_to_word_indices += [-1]
    token_type_ids += [1]

    if return_token_type_ids:
        return np.array(subwords), np.array(subword_to_word_indices), np.array(token_type_ids)
    return np.array(subwords), np.array(subword_to_word_indices)

In [43]:
# Sample question/passage pair, already split into words
question = ['Siapakah', 'pelatih', 'ganda', 'putra', 'bulu', 'tangkis', 'yang', 'tidak', 'meragukan', 'tekad', 'Candra', '/', 'Sigit', 'untuk', 'bekerja', 'lebih', 'keras', 'pada', 'perebutan', 'piala', 'Thomas']
passage = ['Pelatih', 'ganda', 'putra', ',', 'Christian', 'Hadinata', ',', 'tak', 'meragukan', 'tekad', 'Candra', '/', 'Sigit', '.', 'Bila', 'dia', 'sudah', 'ngomong', 'begitu', ',', 'saya', 'tidak', 'ragu', 'lagi', ',', 'ujarnya', '.']
subwords, subword_to_word_indices = word_subword_tokenize(question, passage, tokenizer)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)

# Inference only, so skip autograd bookkeeping.
# NOTE(review): forward_word_classification passes token_type_ids to the model
# during training, but this call does not -- confirm this is intended.
with torch.no_grad():
    logits = model(subwords, subword_to_word_indices)[0]

# .cpu() keeps this cell working when the model has already been moved to the
# GPU; .numpy() fails on a CUDA tensor.
preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]

pd.DataFrame({'words': passage, 'label': labels})

Unnamed: 0,words,label
0,Pelatih,I
1,ganda,O
2,putra,O
3,",",I
4,Christian,O
5,Hadinata,O
6,",",O
7,tak,I
8,meragukan,O
9,tekad,O


# Fine Tuning & Evaluation

In [55]:
###
# Training & Evaluation Function
###

# Evaluate function for validation and test
def evaluate(model, data_loader, forward_fn, metrics_fn, i2w, is_test=False, device='cuda'):
    """Score ``model`` on every batch of ``data_loader``.

    Args:
        model: Model to evaluate; it is switched to eval mode.
        data_loader: Iterable of batches whose last item is the list of raw
            sequences. The remaining items are handed to ``forward_fn``.
        forward_fn: Called as ``forward_fn(model, batch, i2w=..., device=...)``
            and expected to return ``(loss, batch_hyp, batch_label)``.
        metrics_fn: Called as ``metrics_fn(list_hyp, list_label)`` on all
            predictions gathered so far; must return a dict of floats.
        i2w: Mapping from label index to label string.
        is_test: Selects the progress-bar prefix and the return shape.
        device: Device forwarded to ``forward_fn``. Defaults to ``'cuda'``,
            the value that was previously hard-coded.

    Returns:
        ``(total_loss, metrics)`` when ``is_test`` is false, otherwise
        ``(total_loss, metrics, list_hyp, list_label, list_seq)``.
        ``total_loss`` is the sum of batch losses, not their mean.

    Note:
        ``metrics`` is only assigned inside the batch loop, so an empty
        ``data_loader`` is not supported.
    """
    model.eval()
    total_loss = 0

    list_hyp, list_label, list_seq = [], [], []
    description = "TEST LOSS:{:.4f} {}" if is_test else "VALID LOSS:{:.4f} {}"

    pbar = tqdm(iter(data_loader), leave=True, total=len(data_loader))
    # No parameter update happens here, so autograd is disabled to avoid
    # keeping activations alive for a backward pass that never runs.
    with torch.no_grad():
        for i, batch_data in enumerate(pbar):
            batch_seq = batch_data[-1]
            loss, batch_hyp, batch_label = forward_fn(model, batch_data[:-1], i2w=i2w, device=device)

            # Accumulate the loss
            total_loss = total_loss + loss.item()

            # Metrics are recomputed over everything seen so far so the
            # progress bar shows running scores
            list_hyp += batch_hyp
            list_label += batch_label
            list_seq += batch_seq
            metrics = metrics_fn(list_hyp, list_label)

            pbar.set_description(description.format(total_loss/(i+1), metrics_to_string(metrics)))

    if is_test:
        return total_loss, metrics, list_hyp, list_label, list_seq
    else:
        return total_loss, metrics

# Training function and trainer
def train(model, train_loader, valid_loader, optimizer, forward_fn, metrics_fn, valid_criterion, i2w, n_epochs, evaluate_every=1, early_stop=3, step_size=1, gamma=0.5, model_dir="", exp_id=None):
    """Fine-tune ``model`` with periodic validation, checkpointing and early stopping.

    Every epoch runs each batch of ``train_loader`` through ``forward_fn`` on
    ``'cuda'`` (hard-coded), clips the gradient norm to 10.0 and steps
    ``optimizer``. Every ``evaluate_every`` epochs the model is scored with
    ``evaluate`` on ``valid_loader``. When ``valid_criterion`` improves, the
    state dict is saved; otherwise a patience counter is incremented.

    Args:
        model: Model to train.
        train_loader: Training batches; the last item of each batch (the raw
            sequences) is dropped before calling ``forward_fn``.
        valid_loader: Validation batches passed to ``evaluate``.
        optimizer: Optimizer already bound to the model parameters.
        forward_fn: Returns ``(loss, batch_hyp, batch_label)`` for a batch.
        metrics_fn: Maps ``(list_hyp, list_label)`` to a dict of metrics.
        valid_criterion: Key of the validation metric used for model
            selection; higher is treated as better.
        i2w: Mapping from label index to label string.
        n_epochs: Maximum number of epochs.
        evaluate_every: Validate every this many epochs.
        early_stop: Stop after this many consecutive validations without
            improvement.
        step_size, gamma: Accepted for compatibility but unused; the StepLR
            scheduler they configured is disabled.
        model_dir: Directory receiving the checkpoint. It must already exist.
            An empty string means the current working directory.
        exp_id: Optional suffix; the checkpoint is named
            ``best_model_<exp_id>.th``, or ``best_model.th`` when ``None``.

    Returns:
        None.
    """
    # The checkpoint location is fixed for the whole run. os.path.join keeps
    # an empty model_dir relative to the working directory instead of
    # producing "/best_model.th" in the filesystem root.
    if exp_id is not None:
        checkpoint_name = "best_model_" + str(exp_id) + ".th"
    else:
        checkpoint_name = "best_model.th"
    checkpoint_path = os.path.join(model_dir, checkpoint_name)

    best_val_metric = -100
    count_stop = 0

    for epoch in range(n_epochs):
        model.train()
        total_train_loss = 0
        list_hyp, list_label = [], []

        train_pbar = tqdm(iter(train_loader), leave=True, total=len(train_loader))
        for i, batch_data in enumerate(train_pbar):
            loss, batch_hyp, batch_label = forward_fn(model, batch_data[:-1], i2w=i2w, device='cuda')

            optimizer.zero_grad()
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), 10.0)
            optimizer.step()

            total_train_loss = total_train_loss + loss.item()

            # Keep predictions so train metrics can be computed per epoch
            list_hyp += batch_hyp
            list_label += batch_label

            train_pbar.set_description("(Epoch {}) TRAIN LOSS:{:.4f} LR:{:.8f}".format((epoch+1),
                total_train_loss/(i+1), get_lr(optimizer)))

        metrics = metrics_fn(list_hyp, list_label)
        print("(Epoch {}) TRAIN LOSS:{:.4f} {} LR:{:.8f}".format((epoch+1),
            total_train_loss/(i+1), metrics_to_string(metrics), get_lr(optimizer)))

        # Skip validation on epochs that are not a multiple of evaluate_every
        if ((epoch+1) % evaluate_every) != 0:
            continue

        val_loss, val_metrics = evaluate(model, valid_loader, forward_fn, metrics_fn, i2w, is_test=False)

        # Model selection and early stopping
        val_metric = val_metrics[valid_criterion]
        if best_val_metric < val_metric:
            best_val_metric = val_metric
            torch.save(model.state_dict(), checkpoint_path)
            count_stop = 0
        else:
            count_stop += 1
            print("count stop:", count_stop)
            if count_stop == early_stop:
                break

In [45]:
# Move the model to the GPU first so the optimizer is constructed over the
# parameters in their final location, as the torch.optim documentation advises
model = model.cuda()
optimizer = optim.Adam(model.parameters(), lr= 1e-5)

In [46]:
# Directory that receives the checkpoint and the evaluation outputs
model_dir = '{}/{}/{}'.format("/home/facqa/save","qa-factoid-itb","exp")
# exist_ok=True already makes this a no-op for an existing directory, so a
# separate os.path.exists check is redundant and race-prone.
os.makedirs(model_dir, exist_ok=True)

In [56]:
# Reset all RNG seeds immediately before training
set_seed(42)
train(
    model,
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    forward_fn=forward_word_classification,
    metrics_fn=qa_factoid_metrics_fn,
    valid_criterion='F1',  # metrics key used for checkpoint selection and early stopping
    i2w=i2w,
    n_epochs=25,
    evaluate_every=1,
    early_stop=12,  # stop after 12 consecutive validations without improvement
    step_size=1,  # accepted by train() but not used
    gamma=0.5,  # accepted by train() but not used
    model_dir=model_dir,
    exp_id=0  # checkpoint is written as best_model_0.th
)

(Epoch 1) TRAIN LOSS:0.2399 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.46it/s]


(Epoch 1) TRAIN LOSS:0.2399 ACC:0.95 F1:0.04 REC:0.02 PRE:0.15 LR:0.00001000


VALID LOSS:0.2239 ACC:0.95 F1:0.05 REC:0.03 PRE:0.18: 100%|██████████| 39/39 [00:02<00:00, 16.08it/s]
(Epoch 2) TRAIN LOSS:0.1922 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.44it/s]


(Epoch 2) TRAIN LOSS:0.1922 ACC:0.95 F1:0.24 REC:0.20 PRE:0.30 LR:0.00001000


VALID LOSS:0.1986 ACC:0.94 F1:0.30 REC:0.37 PRE:0.25: 100%|██████████| 39/39 [00:02<00:00, 16.78it/s]
(Epoch 3) TRAIN LOSS:0.1463 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.29it/s]


(Epoch 3) TRAIN LOSS:0.1463 ACC:0.97 F1:0.41 REC:0.41 PRE:0.42 LR:0.00001000


VALID LOSS:0.1618 ACC:0.96 F1:0.40 REC:0.43 PRE:0.37: 100%|██████████| 39/39 [00:02<00:00, 15.71it/s]
(Epoch 4) TRAIN LOSS:0.1092 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 14.08it/s]


(Epoch 4) TRAIN LOSS:0.1092 ACC:0.98 F1:0.55 REC:0.57 PRE:0.53 LR:0.00001000


VALID LOSS:0.1484 ACC:0.96 F1:0.41 REC:0.42 PRE:0.41: 100%|██████████| 39/39 [00:02<00:00, 14.98it/s]
(Epoch 5) TRAIN LOSS:0.0849 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.46it/s]


(Epoch 5) TRAIN LOSS:0.0849 ACC:0.98 F1:0.65 REC:0.68 PRE:0.62 LR:0.00001000


VALID LOSS:0.1509 ACC:0.96 F1:0.43 REC:0.43 PRE:0.44: 100%|██████████| 39/39 [00:02<00:00, 16.87it/s]
(Epoch 6) TRAIN LOSS:0.0653 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.41it/s]


(Epoch 6) TRAIN LOSS:0.0653 ACC:0.99 F1:0.74 REC:0.77 PRE:0.72 LR:0.00001000


VALID LOSS:0.1446 ACC:0.97 F1:0.51 REC:0.54 PRE:0.49: 100%|██████████| 39/39 [00:02<00:00, 16.10it/s]
(Epoch 7) TRAIN LOSS:0.0554 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 14.06it/s]


(Epoch 7) TRAIN LOSS:0.0554 ACC:0.99 F1:0.78 REC:0.80 PRE:0.75 LR:0.00001000


VALID LOSS:0.1652 ACC:0.97 F1:0.52 REC:0.55 PRE:0.50: 100%|██████████| 39/39 [00:02<00:00, 15.66it/s]
(Epoch 8) TRAIN LOSS:0.0432 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.53it/s]


(Epoch 8) TRAIN LOSS:0.0432 ACC:0.99 F1:0.82 REC:0.84 PRE:0.80 LR:0.00001000


VALID LOSS:0.1689 ACC:0.97 F1:0.51 REC:0.53 PRE:0.50: 100%|██████████| 39/39 [00:02<00:00, 16.85it/s]

count stop: 1



(Epoch 9) TRAIN LOSS:0.0375 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.33it/s]


(Epoch 9) TRAIN LOSS:0.0375 ACC:0.99 F1:0.84 REC:0.86 PRE:0.83 LR:0.00001000


VALID LOSS:0.1862 ACC:0.97 F1:0.54 REC:0.59 PRE:0.50: 100%|██████████| 39/39 [00:02<00:00, 15.86it/s]
(Epoch 10) TRAIN LOSS:0.0317 LR:0.00001000: 100%|██████████| 312/312 [00:24<00:00, 12.86it/s]


(Epoch 10) TRAIN LOSS:0.0317 ACC:0.99 F1:0.87 REC:0.88 PRE:0.85 LR:0.00001000


VALID LOSS:0.1608 ACC:0.97 F1:0.49 REC:0.50 PRE:0.48: 100%|██████████| 39/39 [00:02<00:00, 14.02it/s]

count stop: 1



(Epoch 11) TRAIN LOSS:0.0297 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.72it/s]


(Epoch 11) TRAIN LOSS:0.0297 ACC:0.99 F1:0.87 REC:0.89 PRE:0.85 LR:0.00001000


VALID LOSS:0.1686 ACC:0.97 F1:0.55 REC:0.60 PRE:0.51: 100%|██████████| 39/39 [00:02<00:00, 16.51it/s]
(Epoch 12) TRAIN LOSS:0.0271 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.48it/s]


(Epoch 12) TRAIN LOSS:0.0271 ACC:1.00 F1:0.90 REC:0.91 PRE:0.89 LR:0.00001000


VALID LOSS:0.1590 ACC:0.97 F1:0.52 REC:0.54 PRE:0.50: 100%|██████████| 39/39 [00:02<00:00, 16.37it/s]

count stop: 1



(Epoch 13) TRAIN LOSS:0.0228 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.44it/s]


(Epoch 13) TRAIN LOSS:0.0228 ACC:1.00 F1:0.90 REC:0.91 PRE:0.89 LR:0.00001000


VALID LOSS:0.1842 ACC:0.97 F1:0.55 REC:0.60 PRE:0.51: 100%|██████████| 39/39 [00:02<00:00, 16.76it/s]
(Epoch 14) TRAIN LOSS:0.0208 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.47it/s]


(Epoch 14) TRAIN LOSS:0.0208 ACC:1.00 F1:0.91 REC:0.92 PRE:0.90 LR:0.00001000


VALID LOSS:0.1495 ACC:0.97 F1:0.52 REC:0.58 PRE:0.46: 100%|██████████| 39/39 [00:02<00:00, 15.24it/s]

count stop: 1



(Epoch 15) TRAIN LOSS:0.0184 LR:0.00001000: 100%|██████████| 312/312 [00:21<00:00, 14.53it/s]


(Epoch 15) TRAIN LOSS:0.0184 ACC:1.00 F1:0.92 REC:0.93 PRE:0.90 LR:0.00001000


VALID LOSS:0.1860 ACC:0.97 F1:0.54 REC:0.60 PRE:0.49: 100%|██████████| 39/39 [00:02<00:00, 15.41it/s]

count stop: 2



(Epoch 16) TRAIN LOSS:0.0166 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.12it/s]


(Epoch 16) TRAIN LOSS:0.0166 ACC:1.00 F1:0.94 REC:0.95 PRE:0.93 LR:0.00001000


VALID LOSS:0.1951 ACC:0.96 F1:0.53 REC:0.58 PRE:0.49: 100%|██████████| 39/39 [00:02<00:00, 15.64it/s]


count stop: 3


(Epoch 17) TRAIN LOSS:0.0165 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.74it/s]


(Epoch 17) TRAIN LOSS:0.0165 ACC:1.00 F1:0.93 REC:0.94 PRE:0.91 LR:0.00001000


VALID LOSS:0.1786 ACC:0.97 F1:0.55 REC:0.59 PRE:0.52: 100%|██████████| 39/39 [00:02<00:00, 15.32it/s]
(Epoch 18) TRAIN LOSS:0.0147 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.68it/s]


(Epoch 18) TRAIN LOSS:0.0147 ACC:1.00 F1:0.93 REC:0.94 PRE:0.92 LR:0.00001000


VALID LOSS:0.1645 ACC:0.96 F1:0.51 REC:0.55 PRE:0.48: 100%|██████████| 39/39 [00:02<00:00, 15.28it/s]

count stop: 1



(Epoch 19) TRAIN LOSS:0.0132 LR:0.00001000: 100%|██████████| 312/312 [00:25<00:00, 12.37it/s]


(Epoch 19) TRAIN LOSS:0.0132 ACC:1.00 F1:0.94 REC:0.95 PRE:0.93 LR:0.00001000


VALID LOSS:0.1720 ACC:0.97 F1:0.53 REC:0.58 PRE:0.49: 100%|██████████| 39/39 [00:02<00:00, 15.79it/s]

count stop: 2



(Epoch 20) TRAIN LOSS:0.0115 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.62it/s]


(Epoch 20) TRAIN LOSS:0.0115 ACC:1.00 F1:0.95 REC:0.96 PRE:0.94 LR:0.00001000


VALID LOSS:0.1669 ACC:0.97 F1:0.54 REC:0.58 PRE:0.50: 100%|██████████| 39/39 [00:02<00:00, 17.20it/s]

count stop: 3



(Epoch 21) TRAIN LOSS:0.0104 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.57it/s]


(Epoch 21) TRAIN LOSS:0.0104 ACC:1.00 F1:0.95 REC:0.96 PRE:0.94 LR:0.00001000


VALID LOSS:0.1885 ACC:0.96 F1:0.51 REC:0.57 PRE:0.47: 100%|██████████| 39/39 [00:02<00:00, 16.49it/s]

count stop: 4



(Epoch 22) TRAIN LOSS:0.0114 LR:0.00001000: 100%|██████████| 312/312 [00:22<00:00, 13.60it/s]


(Epoch 22) TRAIN LOSS:0.0114 ACC:1.00 F1:0.95 REC:0.96 PRE:0.94 LR:0.00001000


VALID LOSS:0.1939 ACC:0.97 F1:0.57 REC:0.62 PRE:0.54: 100%|██████████| 39/39 [00:02<00:00, 16.24it/s]
(Epoch 23) TRAIN LOSS:0.0094 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.29it/s]


(Epoch 23) TRAIN LOSS:0.0094 ACC:1.00 F1:0.96 REC:0.96 PRE:0.95 LR:0.00001000


VALID LOSS:0.1880 ACC:0.97 F1:0.55 REC:0.56 PRE:0.53: 100%|██████████| 39/39 [00:02<00:00, 15.75it/s]

count stop: 1



(Epoch 24) TRAIN LOSS:0.0090 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.48it/s]


(Epoch 24) TRAIN LOSS:0.0090 ACC:1.00 F1:0.96 REC:0.97 PRE:0.95 LR:0.00001000


VALID LOSS:0.2154 ACC:0.97 F1:0.56 REC:0.59 PRE:0.53: 100%|██████████| 39/39 [00:02<00:00, 16.78it/s]

count stop: 2



(Epoch 25) TRAIN LOSS:0.0090 LR:0.00001000: 100%|██████████| 312/312 [00:23<00:00, 13.32it/s]


(Epoch 25) TRAIN LOSS:0.0090 ACC:1.00 F1:0.95 REC:0.96 PRE:0.94 LR:0.00001000


VALID LOSS:0.2090 ACC:0.97 F1:0.56 REC:0.60 PRE:0.52: 100%|██████████| 39/39 [00:02<00:00, 16.01it/s]

count stop: 3





In [57]:
# Restore the checkpoint that train() saved for the best validation score (exp_id=0)
model.load_state_dict(torch.load(model_dir + "/best_model_0.th"))

# Score the restored model on the test split
print("=========== EVALUATION PHASE ===========")
test_loss, test_metrics, test_hyp, test_label, test_seq = evaluate(
    model,
    data_loader=test_loader,
    forward_fn=forward_word_classification,
    metrics_fn=qa_factoid_metrics_fn,
    i2w=i2w,
    is_test=True,
)

# Disabled alternative: re-score against labels read from the CSV
# dftrue = pd.read_csv("./dataset/facqa_qa-factoid-itb/test_preprocess.csv")
# test_label = dftrue["seq_label"]
# test_metrics = qa_factoid_metrics_fn(test_hyp, test_label)

# Single-run containers, kept as lists so further runs could be appended
metrics_scores = [test_metrics]
result_dfs = [
    pd.DataFrame({'seq': test_seq, 'hyp': test_hyp, 'label': test_label}),
]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)

print('== Prediction Result ==', result_df.head(), '', sep='\n')

print('== Model Performance ==', metric_df.describe(), sep='\n')

# Write the per-example predictions and the metric summary next to the checkpoint
result_df.to_csv(model_dir + "/prediction_result.csv")
metric_df.describe().to_csv(model_dir + "/evaluation_result.csv")



TEST LOSS:0.1801 ACC:0.97 F1:0.55 REC:0.58 PRE:0.52: 100%|██████████| 39/39 [00:02<00:00, 16.35it/s]

== Prediction Result ==
                                                 seq  \
0  Siapakah pelatih ganda putra bulu tangkis yang...   
1  Siapa nama Perdana Menteri Inggris ?|Curtis me...   
2  Berapakah catatan waktu terbaik Tonique Willia...   
3  Apakah nama latin dari kijang|Dalam waktu sebu...   
4  Siapa nama presiden Indonesia sekarang ?|Presi...   

                                                 hyp  \
0  [O, O, O, O, B, I, O, O, O, O, O, O, O, O, O, ...   
1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...   
4  [O, B, I, I, O, O, O, B, I, O, O, O, O, O, O, ...   

                                               label  
0  [O, O, O, O, B, I, O, O, O, O, O, O, O, O, O, ...  
1  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
2  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
3  [O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ...  
4  [O, B, I, I, O, O, O, O,




# Test fine-tuned model with sample sentences

In [58]:
# Same sample pair as before fine-tuning, to compare predictions
question = ['Siapakah', 'pelatih', 'ganda', 'putra', 'bulu', 'tangkis', 'yang', 'tidak', 'meragukan', 'tekad', 'Candra', '/', 'Sigit', 'untuk', 'bekerja', 'lebih', 'keras', 'pada', 'perebutan', 'piala', 'Thomas']
passage = ['Pelatih', 'ganda', 'putra', ',', 'Christian', 'Hadinata', ',', 'tak', 'meragukan', 'tekad', 'Candra', '/', 'Sigit', '.', 'Bila', 'dia', 'sudah', 'ngomong', 'begitu', ',', 'saya', 'tidak', 'ragu', 'lagi', ',', 'ujarnya', '.']
subwords, subword_to_word_indices = word_subword_tokenize(question, passage, tokenizer)
subwords = torch.LongTensor(subwords).view(1, -1).to(model.device)
subword_to_word_indices = torch.LongTensor(subword_to_word_indices).view(1, -1).to(model.device)

# Make the cell independent of whatever ran last: force eval mode (train()
# leaves the model in training mode when its final epoch is not validated)
# and skip autograd bookkeeping.
# NOTE(review): forward_word_classification passes token_type_ids to the model
# during training, but this call does not -- confirm this is intended.
model.eval()
with torch.no_grad():
    logits = model(subwords, subword_to_word_indices)[0]

preds = torch.topk(logits, k=1, dim=-1)[1].squeeze().cpu().numpy()
labels = [i2w[preds[i]] for i in range(len(preds))]
print("Question: ", " ".join(question))
pd.DataFrame({'words': passage, 'label': labels})

Question:  Siapakah pelatih ganda putra bulu tangkis yang tidak meragukan tekad Candra / Sigit untuk bekerja lebih keras pada perebutan piala Thomas


Unnamed: 0,words,label
0,Pelatih,O
1,ganda,O
2,putra,O
3,",",O
4,Christian,B
5,Hadinata,I
6,",",O
7,tak,O
8,meragukan,O
9,tekad,O
