In [1]:
%%capture
!pip install evaluate==0.3.0 rouge-score==0.1.2 sacrebleu==2.3.1 bert-score==0.3.12 openpyxl==3.0.10

In [2]:
import glob
import datasets
import evaluate
import statistics
import pandas as pd

In [3]:
# Directory that evalds() scans for `*-<name>-test-extraction_results.csv` inputs
# and where it writes the `<name>-test-evaluation_results.xlsx` report.
EVAL_DATA_FOLDER = "/workspace/bertshare/summarization"

In [4]:
# Load the three metrics once at notebook level; evalds() reuses these objects
# for every result file instead of reloading them per file.
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")
sacrebleu = evaluate.load("sacrebleu")

In [5]:
def evalds(name="xtreme"):
    """Score every result file of one dataset variant and export a summary table.

    For each file matching
    ``{EVAL_DATA_FOLDER}/*-{name}-test-extraction_results.csv`` this computes
    ROUGE, BERTScore and an "inverse BLEU" (``100 - BLEU``) between the
    ``generated_summary`` and ``summary`` columns. One row per file is written
    to ``{EVAL_DATA_FOLDER}/{name}-test-evaluation_results.xlsx``.

    Args:
        name: Dataset variant; used in the input glob pattern and in the name
            of the output Excel file.

    Returns:
        None. The scores are persisted to the Excel file only.
    """
    results = {
        "models": [],
        "rouge1": [],
        "rouge2": [],
        "rougeL": [],
        "rougeLsum": [],
        "bs_precision": [],
        "bs_recall": [],
        "bs_f1": [],
        "inverse_BLEU": []
    }

    # glob returns files in arbitrary order; sort so the row order of the
    # exported table is the same on every run.
    pattern = f"{EVAL_DATA_FOLDER}/*-{name}-test-extraction_results.csv"
    for f in sorted(glob.glob(pattern)):
        # The model identifier is the second "-"-separated token of the file stem.
        results["models"].append(f.split("/")[-1].split(".")[0].split("-")[1])

        # load dataset (a single CSV ends up in the default "train" split)
        ds = datasets.load_dataset("csv", data_files=f)
        # Pull the two columns once instead of re-extracting them per metric.
        predictions = ds["train"]["generated_summary"]
        references = ds["train"]["summary"]

        # calculate rouge score (reported as percentages)
        r: dict = rouge.compute(predictions=predictions, references=references)
        for k, v in r.items():
            results[k].append(round(v * 100, 2))

        # calculate bert score
        bs: dict = bertscore.compute(
            predictions=predictions,
            references=references,
            verbose=True,
            device="cuda:0",
            lang="id",
            model_type="bert-base-multilingual-cased",
            num_layers=9
        )
        # BERTScore returns one value per example for each of these keys (plus
        # a "hashcode" entry that is not a score); report the mean as a percentage.
        for k in ("precision", "recall", "f1"):
            results["bs_" + k].append(round(statistics.mean(bs[k]) * 100, 2))

        # calculate inverse BLEU
        sc = sacrebleu.compute(predictions=predictions, references=references, lowercase=True)
        # Append (not assign): assigning replaced the list with a scalar, which
        # pandas then broadcast to every row, so all files showed the value of
        # the last file processed.
        results["inverse_BLEU"].append(round(100 - sc["score"], 2))

    df = pd.DataFrame(results)
    df.to_excel(f"{EVAL_DATA_FOLDER}/{name}-test-evaluation_results.xlsx", index=False)

In [6]:
# Score all "*-xtreme-test-extraction_results.csv" files and write
# "xtreme-test-evaluation_results.xlsx" into EVAL_DATA_FOLDER.
evalds(name="xtreme")

Using custom data configuration default-6943feaa4e71c61d


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-6943feaa4e71c61d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-6943feaa4e71c61d/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 11239685.01 seconds, 0.00 sentences/sec




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-a0ff8afe67dc0da8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-a0ff8afe67dc0da8/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 11239704.41 seconds, 0.00 sentences/sec




  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 11239722.71 seconds, 0.00 sentences/sec




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-e306b4fe71ee7da6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-e306b4fe71ee7da6/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/121 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/61 [00:00<?, ?it/s]

done in 11239741.91 seconds, 0.00 sentences/sec


In [6]:
# Score all "*-canonical-test-extraction_results.csv" files and write
# "canonical-test-evaluation_results.xlsx" into EVAL_DATA_FOLDER.
evalds(name="canonical")

Using custom data configuration default-1293cc750f614795


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-1293cc750f614795/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-1293cc750f614795/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 11238882.91 seconds, 0.00 sentences/sec




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-732fa79fc0364677/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-732fa79fc0364677/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 11238931.92 seconds, 0.00 sentences/sec




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-aaff69719011205b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-aaff69719011205b/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 11238983.12 seconds, 0.00 sentences/sec




Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-38335b5424b63444/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-38335b5424b63444/0.0.0/6b34fb8fcf56f7c8ba51dc895bfa2bfbe43546f190a60fcf74bb5e8afdcc2317. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:absl:Using default tokenizer.


calculating scores...
computing bert embedding.


  0%|          | 0/343 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/172 [00:00<?, ?it/s]

done in 11239034.91 seconds, 0.00 sentences/sec
