# Paraphrase

In [1]:
import os, sys
# sys.path.append('../')
# os.chdir('../')

import torch
import shutil
import random
import datasets
import numpy as np
import pandas as pd
from torch import optim
from torch.utils.data import Dataset, DataLoader
from transformers import MBartForConditionalGeneration

from modules.tokenization_indonlg import IndoNLGTokenizer
from utils.train_eval import train, evaluate
from utils.metrics import generation_metrics_fn
from utils.forward_fn import forward_generation
from utils.data_utils import MachineTranslationDataset, GenerationDataLoader

  # Load the 'bertscore' metric object through the `datasets` metric loader.
  # NOTE(review): `bertscore` is not referenced by any other visible cell —
  # confirm it is still needed before keeping this (potentially slow) load.
  bertscore = datasets.load_metric('bertscore')


In [2]:
###
# common functions
###
def set_seed(seed):
    """Apply `seed` to every random number generator used in this notebook.

    Calls, in order: `random.seed`, `np.random.seed`, `torch.manual_seed`
    and `torch.cuda.manual_seed`, each with the same `seed` value.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    
def count_param(module, trainable=False):
    """Return the number of parameter elements held by `module`.

    Args:
        module: object exposing `parameters()`, each yielded item providing
            `numel()` and a `requires_grad` attribute.
        trainable: when truthy, count only parameters with `requires_grad` set;
            otherwise count every parameter.

    Returns:
        The summed `numel()` of the selected parameters.
    """
    total = 0
    for param in module.parameters():
        # Frozen parameters are skipped only when the caller asked for trainable ones.
        if trainable and not param.requires_grad:
            continue
        total += param.numel()
    return total
    
# Set random seed
# set_seed(26092020)

# Load Model

In [3]:
# Load the pretrained weights and the matching tokenizer from the same
# 'indobenchmark/indobart' checkpoint id. `model` and `bart_model` are two
# names for the same object.
model = bart_model = MBartForConditionalGeneration.from_pretrained('indobenchmark/indobart')
tokenizer = IndoNLGTokenizer.from_pretrained('indobenchmark/indobart')

# Bare expression: shows the module structure as the cell output.
model

MBartForConditionalGeneration(
  (model): MBartModel(
    (shared): Embedding(40004, 768, padding_idx=1)
    (encoder): MBartEncoder(
      (embed_tokens): Embedding(40004, 768, padding_idx=1)
      (embed_positions): MBartLearnedPositionalEmbedding(1026, 768, padding_idx=1)
      (layers): ModuleList(
        (0): MBartEncoderLayer(
          (self_attn): MBartAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (fc1): Linear(in_features=768, out_features=3072, bias=True)
          (fc2): Linear(in_features=3072, out_features=768, bias=True)
          (final_layer_norm): LayerNorm((768,), eps=1e-05, elementwise_affine=Tru

In [4]:
# Total parameter count, frozen parameters included.
count_param(model, trainable=False)

131543040

# Prepare Dataset

In [9]:
# configs and args
# Everything a reader might tune lives in this cell: optimisation settings,
# generation settings, batch sizes, language tokens, output and data paths.

lr = 1e-4
gamma = 0.9
lower = True
step_size = 1
beam_size = 5
max_norm = 10
early_stop = 5

max_seq_len = 512
grad_accumulate = 1
no_special_token = False
swap_source_target = True
model_type = 'indo-bart'
valid_criterion = 'SacreBLEU'

separator_id = 4
speaker_1_id = 5
speaker_2_id = 6

train_batch_size = 8
valid_batch_size = 8
test_batch_size = 8

# Paraphrasing stays within one language, so source and target share a token.
source_lang = "[indonesian]"
target_lang = "[indonesian]"

src_lid = tokenizer.special_tokens_to_ids[source_lang]
tgt_lid = tokenizer.special_tokens_to_ids[target_lang]

# Decoding starts from the target language-id token.
model.config.decoder_start_token_id = tgt_lid

# Make sure cuda is deterministic
torch.backends.cudnn.deterministic = True

# create directory (exist_ok makes a separate existence check unnecessary)
model_dir = './save/filtered_liputan6-indolem'
os.makedirs(model_dir, exist_ok=True)

device = "cuda0"
# set a specific cuda device
if "cuda" in device:
    # Accept "cuda0" as well as "cuda:0"; a bare "cuda" keeps the current device.
    device_index = device[4:].lstrip(":")
    if device_index:
        torch.cuda.set_device(int(device_index))
    device = "cuda"
    model = model.cuda()

# Build the optimizer only after the model sits on its final device, as
# recommended by the torch.optim documentation.
optimizer = optim.Adam(model.parameters(), lr=lr)

# Location of the paraphrase CSV consumed by `datasets.load_dataset`.
PATH = "/workspace/bertshare"
MAIN_PATH = PATH+"/paraphrase"
dataset_conf: dict = {
    "path": "csv",
    "data_dir": MAIN_PATH+"/data",
    "data_files": MAIN_PATH+"/data/filtered_liputan6-indolem.csv"
}
# CSV columns that ParaphraseDataset renames to "text" and "label".
col1 = "summary"
col2 = "generated_summary"

In [14]:
class ParaphraseDataset(Dataset):
    """Paraphrase pairs loaded through `datasets.load_dataset`.

    The final `valid_size` rows of the loaded data form the validation split;
    every earlier row belongs to the training split. Each item is a tuple
    `(id, source_subwords, target_subwords)`.
    """

    def load_dataset(self, dataset_conf):
        """Load the "train" split described by `dataset_conf` and rename columns.

        The "Unnamed: 0" column becomes "id"; the columns named by the
        module-level variables `col1` and `col2` become "text" and "label".
        """
        data = datasets.load_dataset(split="train", **dataset_conf)
        data = data.rename_column("Unnamed: 0", "id")
        data = data.rename_column(col1, "text")
        data = data.rename_column(col2, "label")
        return data

    def __init__(self, dataset_conf, tokenizer, swap_source_target, is_valid=False, *args,
                 lowercase=True, valid_size=100, **kwargs):
        """Load the data and keep either the training or the validation rows.

        Args:
            dataset_conf: keyword arguments forwarded to `datasets.load_dataset`.
            tokenizer: object whose `encode(text, add_special_tokens=False)` is
                used to turn text into subword ids.
            swap_source_target: when truthy, items use the "label" column as
                source and the "text" column as target.
            is_valid: select the last `valid_size` rows instead of the rest.
            lowercase: lowercase both texts before encoding (keyword-only;
                defaults to True, the previous hard-coded behaviour).
            valid_size: number of trailing rows held out for validation
                (keyword-only; defaults to 100).
            *args, **kwargs: accepted and ignored so that callers may pass
                options intended for other dataset classes.
        """
        full_data = self.load_dataset(dataset_conf)
        # Clamp at 0 so a dataset smaller than `valid_size` never yields a
        # negative range start.
        split_at = max(full_data.num_rows - valid_size, 0)
        if is_valid:
            rows = range(split_at, full_data.num_rows)
        else:
            rows = range(split_at)
        self.data = full_data.select(rows)
        self.tokenizer = tokenizer
        self.swap_source_target = swap_source_target
        self.lowercase = lowercase

    def __getitem__(self, index):
        """Return `(id, source_subwords, target_subwords)` for row `index`."""
        row = self.data[index]
        text, label = row['text'], row['label']
        if self.lowercase:
            text, label = text.lower(), label.lower()
        input_subwords = self.tokenizer.encode(text, add_special_tokens=False)
        label_subwords = self.tokenizer.encode(label, add_special_tokens=False)
        if self.swap_source_target:
            return row['id'], label_subwords, input_subwords
        return row['id'], input_subwords, label_subwords

    def __len__(self):
        """Return the number of rows in the selected split."""
        return len(self.data)


In [15]:
# Keyword arguments shared by every dataset split.
dataset_kwargs = dict(
    lowercase=lower,
    no_special_token=no_special_token,
    speaker_1_id=speaker_1_id,
    speaker_2_id=speaker_2_id,
    separator_id=separator_id,
    max_token_length=max_seq_len,
    swap_source_target=swap_source_target,
)

train_dataset = ParaphraseDataset(dataset_conf, tokenizer, is_valid=False, **dataset_kwargs)
valid_dataset = ParaphraseDataset(dataset_conf, tokenizer, is_valid=True, **dataset_kwargs)
# NOTE(review): the test split is built with is_valid=True, exactly like
# valid_dataset, so both are constructed with identical arguments.
test_dataset = ParaphraseDataset(dataset_conf, tokenizer, is_valid=True, **dataset_kwargs)

# Keyword arguments shared by every loader; only the dataset, batch size and
# shuffling differ between splits.
loader_kwargs = dict(
    model_type=model_type,
    tokenizer=tokenizer,
    max_seq_len=max_seq_len,
    src_lid_token_id=src_lid,
    tgt_lid_token_id=tgt_lid,
    num_workers=8,
)

train_loader = GenerationDataLoader(dataset=train_dataset, batch_size=train_batch_size,
                                    shuffle=True, **loader_kwargs)
valid_loader = GenerationDataLoader(dataset=valid_dataset, batch_size=valid_batch_size,
                                    shuffle=False, **loader_kwargs)
test_loader = GenerationDataLoader(dataset=test_dataset, batch_size=test_batch_size,
                                   shuffle=False, **loader_kwargs)

Using custom data configuration default-c7f499497ea4cf6e
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-c7f499497ea4cf6e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-c7f499497ea4cf6e
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-c7f499497ea4cf6e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)
Using custom data configuration default-c7f499497ea4cf6e
Found cached dataset csv (/root/.cache/huggingface/datasets/csv/default-c7f499497ea4cf6e/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317)


# Fine Tuning & Evaluation

In [28]:
# Download NLTK's 'punkt' resource before training/evaluation runs.
# NOTE(review): presumably required by the metric code in utils.metrics — confirm.
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [29]:
# Train

n_epochs = 1

# All keyword arguments for the training run, grouped by concern.
train_kwargs = dict(
    # data and optimisation
    train_loader=train_loader,
    valid_loader=valid_loader,
    optimizer=optimizer,
    # forward pass and metrics
    forward_fn=forward_generation,
    metrics_fn=generation_metrics_fn,
    valid_criterion=valid_criterion,
    tokenizer=tokenizer,
    # schedule
    n_epochs=n_epochs,
    evaluate_every=1,
    early_stop=early_stop,
    grad_accum=grad_accumulate,
    step_size=step_size,
    gamma=gamma,
    max_norm=max_norm,
    # generation
    model_type=model_type,
    beam_size=beam_size,
    max_seq_len=max_seq_len,
    # bookkeeping
    model_dir=model_dir,
    exp_id=0,
    fp16="",
    device=device,
)

train(model, **train_kwargs)

(Epoch 1) TRAIN LOSS:2.9394 LR:0.00010000: 100%|██████████████████████████████████████████| 2/2 [00:00<00:00,  5.06it/s]


(Epoch 1) TRAIN LOSS:2.9394 BLEU:43.90 SacreBLEU:45.64 ROUGE1:67.82 ROUGE2:49.24 ROUGEL:65.00 ROUGELsum:65.09 LR:0.00010000


VALID LOSS:2.8453: 100%|██████████████████████████████████████████████████████████████████| 2/2 [00:00<00:00,  9.15it/s]


(Epoch 1) VALID LOSS:2.8453 BLEU:49.39 SacreBLEU:50.93 ROUGE1:72.40 ROUGE2:57.54 ROUGEL:72.68 ROUGELsum:72.47


In [16]:
# Load best model
# Read the checkpoint stored under `model_dir` first, then copy it into `model`.
best_state = torch.load(f"{model_dir}/best_model_0.th")
model.load_state_dict(best_state)

<All keys matched successfully>

In [17]:
# Evaluate
# Runs generation on the test loader and returns the loss, the metric dict,
# the generated hypotheses and the reference labels.
# `device` comes from the config cell so that evaluation runs on the same
# device as training instead of a separately hard-coded one.
test_loss, test_metrics, test_hyp, test_label = evaluate(model, data_loader=test_loader, forward_fn=forward_generation, 
                                                         metrics_fn=generation_metrics_fn, model_type=model_type, 
                                                         tokenizer=tokenizer, beam_size=beam_size, 
                                                         max_seq_len=max_seq_len, is_test=True, 
                                                         device=device)

TESTING... : 100%|██████████████████████████████████████████████████████████████████████| 13/13 [00:06<00:00,  2.02it/s]

calculating scores...
computing bert embedding.





  0%|          | 0/3 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/2 [00:00<?, ?it/s]

done in 11457774.90 seconds, 0.00 sentences/sec


In [18]:
# The single evaluation run is wrapped in one-element lists so the
# aggregation below operates on collections.
metrics_scores = [test_metrics]
result_dfs = [pd.DataFrame({'hyp': test_hyp, 'label': test_label})]

result_df = pd.concat(result_dfs)
metric_df = pd.DataFrame.from_records(metrics_scores)
# Summary statistics are computed once and reused for printing and saving.
metric_summary = metric_df.describe()

print('== Prediction Result ==')
print(result_df.head())
print()

print('== Model Performance ==')
print(metric_summary)

# Persist predictions and the metric summary next to the model checkpoint.
result_df.to_csv(f"{model_dir}/prediction_result.csv")
metric_summary.to_csv(f"{model_dir}/evaluation_result.csv")

== Prediction Result ==
                                                 hyp  \
0   ratusan massa yang berasal dari fkppi dan ppm...   
1   hercules dan puluhan anak buahnya ditangkap p...   
2   kehadiran gilang dirga di pentas hiburan buka...   
3   agus martowardojo tampaknya kian mantap melan...   
4   niat baik mencari rezeki halal tak selamanya ...   

                                               label  
0   ratusan massa yang berasal dari fkppi dan ppm...  
1   hercules rosario marshal atau yang dikenal de...  
2   kehadiran gilang dirga di pentas hiburan buka...  
3   agus martowardojo tampaknya semakin mantap me...  
4   berusaha cari tambahan halal dengan berdagang...  

== Model Performance ==
       BERTSCORE      IBLEU  ISacreBLEU     ROUGE1     ROUGE2     ROUGEL  \
count       1.00   1.000000    1.000000   1.000000   1.000000   1.000000   
mean       84.07  58.649313   58.517794  57.108277  45.762358  55.403803   
std          NaN        NaN         NaN        NaN      