In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
%%capture
!pip install datasets evaluate
!pip install rouge_score sacrebleu sacremoses

In [3]:
from datasets import load_dataset, DatasetDict
import evaluate
import torch
import pandas as pd
from tqdm import tqdm

from sacremoses import MosesTokenizer
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

In [4]:
""" Generation metrics """
bleu = evaluate.load('bleu')
rouge = evaluate.load('rouge')
sacrebleu = evaluate.load('sacrebleu')
chrf = evaluate.load('chrf')
ter = evaluate.load('ter')
mt = MosesTokenizer(lang='id')

Downloading builder script:   0%|          | 0.00/5.94k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/1.55k [00:00<?, ?B/s]

Downloading extra modules:   0%|          | 0.00/3.34k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.01k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/9.99k [00:00<?, ?B/s]

In [5]:
device = "cuda" if torch.cuda.is_available() else "cpu"
device

'cuda'

In [6]:
# PATH_DATA = "/content/drive/MyDrive/Research & Project/magister-kecerdasan-ai/my-thesis/instruction-tuning-mkn"
# dataset = DatasetDict.load_from_disk(PATH_DATA + '/instruction-data/baseline-instruct/parallel_20')
# dataset

PATH_DATA = "/content/drive/MyDrive/instruction-tuning-mkn"
dataset = DatasetDict.load_from_disk(PATH_DATA + '/instruction/base-instruct/parallel_20')
dataset

DatasetDict({
    train: Dataset({
        features: ['prompt', 'input', 'output'],
        num_rows: 40688
    })
    test: Dataset({
        features: ['prompt', 'input', 'output'],
        num_rows: 10173
    })
})

In [7]:
path_finetuned = PATH_DATA + '/model/'

In [10]:
tokenizer = AutoTokenizer.from_pretrained(path_finetuned+"/checkpoint-8138", truncation_side='right', trust_remote_code=True)
tokenizer.padding_side = "right"
if tokenizer.pad_token is None:
  tokenizer.pad_token = tokenizer.bos_token if tokenizer.bos_token is not None else tokenizer.eos_token

In [None]:
model = AutoModelForSeq2SeqLM.from_pretrained(path_finetuned, resume_download=True)
model.to(device)

In [11]:
def predict_generation(prompts, tokenizer, model):
  inputs = tokenizer(prompts, return_tensors="pt", padding=True, truncation=True, max_length=1024).to(device)
  input_size = inputs["input_ids"].shape[1]

  outputs = model.generate(**inputs, do_sample=True, min_length=1, max_length=100)
  preds = tokenizer.batch_decode(outputs, skip_special_tokens=True)

  return preds

In [13]:
import time

model.eval()
metrics = {"dataset": []}
inputs = []
preds = []
golds = []

start_time = time.time()

for i, data in enumerate(tqdm(dataset['test'])):
  with torch.inference_mode():
    batch_preds = predict_generation(data["input"], tokenizer, model)
    inputs.append(data["input"])
    preds.append(batch_preds[0])
    golds.append(data["output"])

end_time = time.time()
inference_time = end_time - start_time

print(f"\nInference time: {inference_time} seconds")

100%|██████████| 10173/10173 [29:39<00:00,  5.72it/s]


Inference time: 1779.2219672203064 seconds





In [24]:
n = 1002
print("Input: ", inputs[n])
print("Prediction: ", preds[n])
print("Gold: ", golds[n])

Input:  Terjemahkan teks berikut dari bahasa Indonesia ke bahasa Melayu Kupang.
Teks: Ya, aku tahu.
Terjemahan:
Prediction:  hmm.
Gold:  0 he em...


In [15]:
def generation_metrics_fn(list_hyp, list_label):
  # hyp and label are both list of string
  # list_hyp_bleu = list(map(lambda x: mt.tokenize(x), list_hyp))
  # list_label_bleu = list(map(lambda x: [mt.tokenize(x)], list_label))
  # list_label_sacrebleu = list(map(lambda x: [x], list_label))
  # print(list_hyp)

  metrics = {}
  metrics["BLEU"] = bleu.compute(predictions=list_hyp, references=list_label)['bleu'] * 100
  metrics["SacreBLEU"] = sacrebleu.compute(predictions=list_hyp, references=list_label)['score']
  metrics["chrF++"] = chrf.compute(predictions=list_hyp, references=list_label)['score']
  metrics["ter"] = ter.compute(predictions=list_hyp, references=list_label)['score']

  rouge_score = rouge.compute(predictions=list_hyp, references=list_label)
  metrics["ROUGE1"] = rouge_score['rouge1'] * 100
  metrics["ROUGE2"] = rouge_score['rouge2'] * 100
  metrics["ROUGEL"] = rouge_score['rougeL'] * 100
  metrics["ROUGELsum"] = rouge_score['rougeLsum'] * 100

  return metrics

In [16]:
eval_metric = generation_metrics_fn(preds, golds)

In [17]:
eval_metric

{'BLEU': 5.3256897547150945,
 'SacreBLEU': 5.325689754715093,
 'chrF++': 22.459658210332346,
 'ter': 102.43777702011592,
 'ROUGE1': 23.99102661559654,
 'ROUGE2': 3.0882585046573703,
 'ROUGEL': 23.111859445961546,
 'ROUGELsum': 23.090226449845364}