In [2]:
%%capture
!pip install evaluate==0.3.0 rouge-score==0.1.2 sacrebleu==2.3.1 bert-score==0.3.12 openpyxl==3.0.10

In [3]:
import glob
import datasets
import evaluate
import statistics
import pandas as pd

In [4]:
# Folder that is globbed for the per-model "*-test-extraction_results.csv" files;
# the "<name>-test-evaluation_results.xlsx" spreadsheets are written back here.
# NOTE(review): absolute, machine-specific path - adjust when running elsewhere.
EVAL_DATA_FOLDER = "/home/indobertshare-main/summarization"

In [5]:
# Load the three metric implementations used by evalds, in a fixed order.
rouge, bertscore, sacrebleu = (
    evaluate.load(metric_id) for metric_id in ("rouge", "bertscore", "sacrebleu")
)

Downloading builder script:   0%|          | 0.00/6.27k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/7.95k [00:00<?, ?B/s]

Downloading builder script:   0%|          | 0.00/8.15k [00:00<?, ?B/s]

In [6]:
def evalds(name="xtreme", device="cuda:0"):
    """Score every model's generated summaries for one dataset variant.

    Each file matching ``*-{name}-test-extraction_results.csv`` under
    ``EVAL_DATA_FOLDER`` is loaded as a CSV dataset, and its
    ``generated_summary`` column is scored against its ``summary`` column with
    ROUGE, BERTScore and inverse BLEU (``100 - BLEU``). One row per file is
    written to ``{EVAL_DATA_FOLDER}/{name}-test-evaluation_results.xlsx``.

    All scores are expressed on a 0-100 scale and rounded to two decimals.

    Args:
        name: Dataset variant embedded in the result file names.
        device: Device string forwarded to the BERTScore computation.

    Returns:
        None. The scores are only written to the Excel file.
    """
    results = {
        "models": [],
        "rouge1": [],
        "rouge2": [],
        "rougeL": [],
        "rougeLsum": [],
        "bs_precision": [],
        "bs_recall": [],
        "bs_f1": [],
        "inverse_BLEU": []
    }
    # Sort the matches so the row order of the spreadsheet is reproducible.
    for f in sorted(glob.glob(f"{EVAL_DATA_FOLDER}/*-{name}-test-extraction_results.csv")):
        # The model label is the second hyphen-separated token of the file's base name.
        results["models"].append(f.split("/")[-1].split(".")[0].split("-")[1])

        # load dataset and read the two text columns once
        ds = datasets.load_dataset("csv", data_files=f)
        predictions = ds["train"]["generated_summary"]
        references = ds["train"]["summary"]

        # calculate rouge score
        r: dict = rouge.compute(predictions=predictions, references=references)
        for k, v in r.items():
            results[k].append(round(v * 100, 2))

        # calculate bert score
        bs: dict = bertscore.compute(
            predictions=predictions,
            references=references,
            verbose=True,
            device=device,
            lang="id",
            model_type="bert-base-multilingual-cased",
            num_layers=9
        )
        # "hashcode" is not a per-sample score list, so drop it before averaging.
        bs.pop("hashcode")
        for k, v in bs.items():
            results["bs_" + k].append(round(statistics.mean(v) * 100, 2))

        # calculate inverse BLEU
        sc = sacrebleu.compute(predictions=predictions, references=references, lowercase=True)
        # Append (not assign): assigning replaced the list with a scalar, which
        # pandas then broadcast so every model row showed the last file's score.
        results["inverse_BLEU"].append(round(100 - sc["score"], 2))

    df = pd.DataFrame(results)
    df.to_excel(f"{EVAL_DATA_FOLDER}/{name}-test-evaluation_results.xlsx", index=False)
        

In [17]:
# Score all "xtreme" result files and write xtreme-test-evaluation_results.xlsx.
evalds(name="xtreme")



  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 1651975.22 seconds, 0.00 sentences/sec




  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 1651988.76 seconds, 0.00 sentences/sec




  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 1652002.42 seconds, 0.00 sentences/sec




  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 1652016.24 seconds, 0.00 sentences/sec




  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 1652029.88 seconds, 0.00 sentences/sec


In [7]:
# Score all "canonical" result files and write canonical-test-evaluation_results.xlsx.
evalds(name="canonical")

Using custom data configuration default-62a37c88c8a01ea7


Downloading and preparing dataset csv/default to /home/.cache/huggingface/datasets/csv/default-62a37c88c8a01ea7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/.cache/huggingface/datasets/csv/default-62a37c88c8a01ea7/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


Downloading:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/714M [00:00<?, ?B/s]

calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 1651381.46 seconds, 0.01 sentences/sec




Downloading and preparing dataset csv/default to /home/.cache/huggingface/datasets/csv/default-6cc9bd6ce6eee410/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/.cache/huggingface/datasets/csv/default-6cc9bd6ce6eee410/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 1651418.66 seconds, 0.01 sentences/sec




Downloading and preparing dataset csv/default to /home/.cache/huggingface/datasets/csv/default-616a1ac8f5743243/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /home/.cache/huggingface/datasets/csv/default-616a1ac8f5743243/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 1651530.13 seconds, 0.01 sentences/sec
