In [1]:
import os, sys
# sys.path.append('../')
# os.chdir('../')

import torch
import shutil
import random
import datasets
import numpy as np
import pandas as pd
from torch import optim
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration

from modules.tokenization_indonlg import IndoNLGTokenizer
from utils.train_eval import train, evaluate
from utils.metrics import generation_metrics_fn
from utils.forward_fn import forward_generation
from utils.data_utils import MachineTranslationDataset, GenerationDataLoader

import nltk
nltk.download('punkt')

  bertscore = datasets.load_metric('bertscore')
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
###
# common functions
###
def set_seed(seed):
    """Seed Python's `random`, NumPy and PyTorch (CPU and CUDA) with `seed`."""
    # Order is kept: Python, NumPy, torch CPU, torch CUDA.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the number of parameter elements held by `module`.

    With `trainable` truthy, parameters whose `requires_grad` is False are
    left out of the count; otherwise every parameter is counted.
    """
    total = 0
    for param in module.parameters():
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
    
# Fix the seed (42) of the Python, NumPy and PyTorch RNGs via set_seed
set_seed(42)

In [38]:
# Shared evaluation configuration, read by the evaluation cells.
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart')
# Forwarded to GenerationDataLoader and evaluate() as `max_seq_len`
max_seq_len = 128
# ParaphraseDataset does not read this flag (its __init__ discards extra arguments)
no_special_token = False
model_type = 'indo-bart'
# Beam width handed to evaluate()
beam_size = 5

# Special-token ids; ParaphraseDataset does not use them
# (its __init__ discards extra arguments)
separator_id = 4
speaker_1_id = 5
speaker_2_id = 6

# Only test_batch_size is used by the visible evaluation cells;
# the train/valid sizes are not referenced there.
train_batch_size = 8
valid_batch_size = 8
test_batch_size = 32

# Source and target are the same language: the task is Indonesian paraphrasing
source_lang = "[indonesian]"
target_lang = "[indonesian]"

# Language-id token ids looked up in the tokenizer's special-token table;
# tgt_lid is also used by load_models() as the decoder start token id.
src_lid = tokenizer.special_tokens_to_ids[source_lang]
tgt_lid = tokenizer.special_tokens_to_ids[target_lang]

In [27]:
def load_models():
    """Return a freshly loaded pretrained 'indobenchmark/indobart' model.

    The model's `decoder_start_token_id` is set to the module-level `tgt_lid`.
    """
    model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart')
    # Decoding starts from the target language-id token.
    model.config.decoder_start_token_id = tgt_lid
    return model

In [28]:
# Restrict cuDNN to deterministic algorithms so GPU runs are repeatable.
# NOTE(review): this flag only covers cuDNN; other CUDA ops may still be
# non-deterministic - confirm whether that matters for these evaluations.
torch.backends.cudnn.deterministic = True

In [29]:
# Checkpoint directories to evaluate. The evaluation loops load
# "<dir>/best_model_0.th" from each one and write their result CSVs back
# into the same directory.
all_models = [
    './save/filtered_liputan6-indolem',
    './save/filtered_paracotta',
    './save/full_paracotta',
    './save/full_liputan6-merge',
    './save/full_liputan6-indolem',
    './save/filtered_liputan6-merge',
    './save/filtered_merge_all'
]

In [30]:
# Absolute base path and its "paraphrase" subdirectory.
# NOTE(review): machine-specific absolute path - adjust when running elsewhere.
PATH = "/workspace/bertshare"
MAIN_PATH = PATH+"/paraphrase"

In [36]:
class ParaphraseDataset(Dataset):
    """indoNLI test split served as `(id, input_subwords, label_subwords)` items.

    On load the indoNLI columns are renamed: the NLI `label` becomes `id`,
    `premise` becomes `text` and `hypothesis` becomes `label`.

    By default the reference (`label_subwords`) of every item is the tokenized
    *source* text itself, i.e. generated paraphrases are scored against their
    own input. Pass `use_hypothesis_as_label=True` to score against the
    tokenized hypothesis instead.

    NOTE(review): the original code tokenized the hypothesis but always
    returned the source as reference. That default is preserved here so that
    previously saved results stay comparable; confirm it is intended.
    """

    # Class-level default so instances created without __init__ still work.
    use_hypothesis_as_label = False

    def load_dataset(self, is_expert=True):
        """Load the indoNLI `test_expert` (or `test_lay`) split with renamed columns."""
        split = "test_expert" if is_expert else "test_lay"
        data = datasets.load_dataset(path="indonli", split=split)
        # Rename order matters: the NLI `label` must move out of the way
        # before `hypothesis` can take the name `label`.
        data = data.rename_column("label", "id")
        data = data.rename_column("premise", "text")
        data = data.rename_column("hypothesis", "label")
        return data

    def __init__(self, tokenizer, is_expert=False, *args, use_hypothesis_as_label=False, **kwargs):
        """Create the dataset.

        Args:
            tokenizer: object exposing `encode(text, add_special_tokens=False)`.
            is_expert: load `test_expert` when truthy, otherwise `test_lay`.
            use_hypothesis_as_label: keyword-only; when True the hypothesis is
                returned as the reference instead of the source text.
            *args, **kwargs: accepted for call-site compatibility and ignored.
        """
        self.data = self.load_dataset(is_expert)
        self.tokenizer = tokenizer
        self.use_hypothesis_as_label = use_hypothesis_as_label

    def __getitem__(self, index):
        """Return `(id, input_subwords, label_subwords)` for row `index`.

        Text is lower-cased before tokenization and no special tokens are added.
        """
        row = self.data[index]
        input_subwords = self.tokenizer.encode(row['text'].lower(), add_special_tokens=False)
        if self.use_hypothesis_as_label:
            label_subwords = self.tokenizer.encode(row['label'].lower(), add_special_tokens=False)
        else:
            # Default: the source is its own reference, so the hypothesis is
            # not tokenized at all (the original tokenized it and threw the
            # result away).
            label_subwords = input_subwords
        return row['id'], input_subwords, label_subwords

    def __len__(self):
        """Number of rows in the loaded split."""
        return len(self.data)

## indoNLI-test

In [37]:
# Evaluate every checkpoint in `all_models` on the indoNLI `test_lay` split.
# Predictions and metrics are written into each checkpoint's directory.

# The dataset and loader do not depend on the checkpoint, so build them once
# instead of once per model. ParaphraseDataset only uses `tokenizer` and
# `is_expert`; any other keyword arguments are ignored by its __init__.
test_dataset = ParaphraseDataset(tokenizer, is_expert=False)
test_loader = GenerationDataLoader(dataset=test_dataset, model_type=model_type, tokenizer=tokenizer, max_seq_len=max_seq_len,
                                   batch_size=test_batch_size, src_lid_token_id=src_lid, tgt_lid_token_id=tgt_lid, num_workers=8, shuffle=False)

# Run everything on GPU 0.
torch.cuda.set_device(0)
device = "cuda"

for saved_models in all_models:
    model_dir = saved_models
    model = load_models()
    # A missing checkpoint makes torch.load raise; no directory is created
    # beforehand, so a wrong path no longer leaves an empty folder behind.
    # map_location="cpu" avoids depending on the GPU index the checkpoint was
    # saved from; the model is moved to the selected GPU right after.
    model.load_state_dict(torch.load(model_dir + "/best_model_0.th", map_location="cpu"))
    model = model.cuda()

    test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation,
                                                             metrics_fn=generation_metrics_fn, model_type=model_type,
                                                             tokenizer=tokenizer, beam_size=beam_size,
                                                             max_seq_len=max_seq_len, is_test=True,
                                                             device=device)

    # One run per checkpoint, so each list holds a single entry.
    metrics_scores = [test_metrics]
    result_dfs = [pd.DataFrame({
        'hyp': test_hyp,
        'label': test_label
    })]

    result_df = pd.concat(result_dfs)
    metric_df = pd.DataFrame.from_records(metrics_scores)

    result_df.to_csv(model_dir + "/indoNLI-test-prediction_result.csv")
    # NOTE(review): describe() over a single record yields count=1 and NaN std;
    # kept so the output file format stays the same.
    metric_df.describe().to_csv(model_dir + "/indoNLI-test-evaluation_result.csv")

Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:32<00:00,  2.21s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/24 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890025.52 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [01:40<00:00,  1.46s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/26 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890147.81 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:18<00:00,  2.00s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/26 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890308.21 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:15<00:00,  1.96s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/22 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890465.60 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:26<00:00,  2.12s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/21 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890634.02 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:12<00:00,  1.92s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/25 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890789.12 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 69/69 [02:07<00:00,  1.84s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/25 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/35 [00:00<?, ?it/s]

done in 11890939.22 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


## indoNLI-test_expert

In [39]:
# Evaluate every checkpoint in `all_models` on the indoNLI `test_expert` split.
# Predictions and metrics are written into each checkpoint's directory.

# The dataset and loader do not depend on the checkpoint, so build them once
# instead of once per model. ParaphraseDataset only uses `tokenizer` and
# `is_expert`; any other keyword arguments are ignored by its __init__.
test_dataset = ParaphraseDataset(tokenizer, is_expert=True)
test_loader = GenerationDataLoader(dataset=test_dataset, model_type=model_type, tokenizer=tokenizer, max_seq_len=max_seq_len,
                                   batch_size=test_batch_size, src_lid_token_id=src_lid, tgt_lid_token_id=tgt_lid, num_workers=8, shuffle=False)

# Run everything on GPU 0.
torch.cuda.set_device(0)
device = "cuda"

for saved_models in all_models:
    model_dir = saved_models
    model = load_models()
    # A missing checkpoint makes torch.load raise; no directory is created
    # beforehand, so a wrong path no longer leaves an empty folder behind.
    # map_location="cpu" avoids depending on the GPU index the checkpoint was
    # saved from; the model is moved to the selected GPU right after.
    model.load_state_dict(torch.load(model_dir + "/best_model_0.th", map_location="cpu"))
    model = model.cuda()

    test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation,
                                                             metrics_fn=generation_metrics_fn, model_type=model_type,
                                                             tokenizer=tokenizer, beam_size=beam_size,
                                                             max_seq_len=max_seq_len, is_test=True,
                                                             device=device)

    # One run per checkpoint, so each list holds a single entry.
    metrics_scores = [test_metrics]
    result_dfs = [pd.DataFrame({
        'hyp': test_hyp,
        'label': test_label
    })]

    result_df = pd.concat(result_dfs)
    metric_df = pd.DataFrame.from_records(metrics_scores)

    # NOTE(review): the two file names use different patterns
    # ("indoNLI-test_expert-..." vs "indoNLI_expert-test-..."); kept as-is
    # because downstream consumers may rely on them.
    result_df.to_csv(model_dir + "/indoNLI-test_expert-prediction_result.csv")
    # NOTE(review): describe() over a single record yields count=1 and NaN std;
    # kept so the output file format stays the same.
    metric_df.describe().to_csv(model_dir + "/indoNLI_expert-test-evaluation_result.csv")

Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [06:30<00:00,  4.15s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11890889.80 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [05:28<00:00,  3.49s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/17 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11891243.42 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [08:46<00:00,  5.60s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/17 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11891795.63 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [06:36<00:00,  4.22s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11892218.31 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [09:07<00:00,  5.83s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/15 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11892792.03 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.
Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [06:44<00:00,  4.30s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11893223.81 seconds, 0.00 sentences/sec


Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)
TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 94/94 [06:47<00:00,  4.33s/it]


calculating scores...
computing bert embedding.


  0%|          | 0/16 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/47 [00:00<?, ?it/s]

done in 11893657.22 seconds, 0.00 sentences/sec


That's 100 lines that end in a tokenized period ('.')
It looks like you forgot to detokenize your test data, which may hurt your score.
If you insist your data is detokenized, or don't care, you can suppress this message with the `force` parameter.


In [41]:
import datasets

PATH = "/workspace/bertshare"
MAIN_PATH = PATH+"/paraphrase"

# One `datasets.load_dataset` keyword set per CSV under MAIN_PATH/data.
# The seven configs differ only in the file name.
all_data = [
    {
        "path": "csv",
        "data_dir": MAIN_PATH+"/data",
        "data_files": MAIN_PATH+"/data/"+csv_name,
    }
    for csv_name in (
        "filtered_liputan6-indolem.csv",
        "filtered_paracotta.csv",
        "full_paracotta.csv",
        "full_liputan6-merge.csv",
        "filtered_liputan6-merge.csv",
        "full_liputan6-indolem.csv",
        "filtered_merge_all.csv",
    )
]

res = []

# Print "<file name> <row count>" for the `train` split of every CSV.
for dataset_conf in all_data:
    data = datasets.load_dataset(split="train", **dataset_conf)
    print(dataset_conf["data_files"].rsplit("/", 1)[-1], len(data))

Using custom data configuration default-c7f499497ea4cf6e
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-c7f499497ea4cf6e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


filtered_liputan6-indolem.csv 146030


Using custom data configuration default-cd701d93a4de31b6
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-cd701d93a4de31b6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


filtered_paracotta.csv 1706560


Using custom data configuration default-fb0c6b302ad70e50
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-fb0c6b302ad70e50/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


full_paracotta.csv 5753296


Using custom data configuration default-0580a31987a377b5
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-0580a31987a377b5/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


full_liputan6-merge.csv 581088


Using custom data configuration default-8ce1983df59c04fc
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-8ce1983df59c04fc/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


filtered_liputan6-merge.csv 145666


Using custom data configuration default-4ebd3bf74bf855ec
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-4ebd3bf74bf855ec/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


full_liputan6-indolem.csv 583520


Using custom data configuration default-f1b9b1103821a011
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-f1b9b1103821a011/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


filtered_merge_all.csv 1998256
