In [1]:
%%capture
!pip install evaluate==0.3.0 rouge-score==0.1.2 sacrebleu==2.3.1 bert-score==0.3.12

In [3]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [19]:
import glob
import torch
import datasets
import evaluate
import statistics
import pandas as pd

from datetime import datetime
from functools import cached_property

In [17]:
FILTER_DATA_FOLDER = "/workspace/indobertshare-main/summarization"
CODE_NAME = "indoNLI"

# Filtration Config
filter_data_conf: dict = {
    "path": "indonli",
    "split": "train",
}
filter_num_layer = 9
filter_batch_size = 16

In [5]:
extract_data = datasets.load_dataset(**filter_data_conf)

Downloading builder script:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/7.04k [00:00<?, ?B/s]

Downloading and preparing dataset indonli/indonli (download: 6.65 MiB, generated: 3.93 MiB, post-processed: Unknown size, total: 10.58 MiB) to /root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62...


Downloading data:   0%|          | 0.00/930k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/185k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/187k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/372k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/10330 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/2197 [00:00<?, ? examples/s]

Generating test_lay split:   0%|          | 0/2201 [00:00<?, ? examples/s]

Generating test_expert split:   0%|          | 0/2984 [00:00<?, ? examples/s]

Dataset indonli downloaded and prepared to /root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62. Subsequent calls will reuse this data.


In [6]:
extract_data = extract_data.select(range(filter_batch_size))

In [7]:
extract_data

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 32
})

In [14]:
class FilterDS():
    def __init__(self, col1='references', col2='paraphrase'):
        self.col1 = col1
        self.col2 = col2
        self.res = {
                col1: [],
                col2: [],
                "score": []
            }
        self.bertscore_res = {
                col1: [],
                col2: [],
                "score": []
            }
        self.sacrebleu_res = {
                col1: [],
                col2: [],
                "score": []
            }
    
    @cached_property
    def bertscore(self):
        return evaluate.load("bertscore")
    
    @cached_property
    def sacrebleu(self):
        return evaluate.load("sacrebleu")
    
    def calculate_bertscore(self, batch):
        results = self.bertscore.compute(
            predictions=batch[self.col1],
            references=batch[self.col2],
            verbose=True,
            device="cuda:0",
            lang="id",
            model_type="bert-base-multilingual-cased",
            num_layers=9,
            use_fast_tokenizer=False
        )
        self.bertscore_res[self.col1] += batch[self.col1]
        self.bertscore_res[self.col2] += batch[self.col2]
        self.bertscore_res["score"] += results["f1"]

        return None
    
    def calculate_ibleu(self, batch):
        results = self.sacrebleu.compute(
            predictions=batch[self.col1],
            references=[[i] for i in batch[self.col2]],
        )
        self.sacrebleu_res[self.col1].append(batch[self.col1])
        self.sacrebleu_res[self.col2].append(batch[self.col2])
        self.sacrebleu_res["score"].append(100 - results["score"])

        return None

In [15]:
fltr = FilterDS(col1='premise', col2='hypothesis')

In [21]:
# phase I.1: calculate BERTScore for all data
torch.cuda.empty_cache()
_ = extract_data.map(
    fltr.calculate_bertscore,
    batched=True,
    batch_size=filter_batch_size,
    remove_columns=[fltr.col1, fltr.col2],
)
fname = f"{CODE_NAME}-{str(int(datetime.now().timestamp()))}"
df = pd.DataFrame(fltr.bertscore_res)
df.to_csv(f"{fname}-phase1.csv", index=False)

  0%|          | 0/2 [00:00<?, ?ba/s]

calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 10883587.91 seconds, 0.00 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 10883588.31 seconds, 0.00 sentences/sec


In [22]:
# phase I.2: get average BERTScore from all data
avg_bertscore = statistics.mean(fltr.bertscore_res["score"])
print(avg_bertscore)
with open(f"{fname}-avg_bertscore.txt", "w") as f:
    f.write(str(avg_bertscore))

0.7600548770278692


In [23]:
# phase I.3: filter data by score > average BERTScore
extract_data = datasets.Dataset.from_pandas(df)
extract_data = extract_data.filter(lambda x: x["score"] > avg_bertscore)

  0%|          | 0/1 [00:00<?, ?ba/s]

In [38]:
def padding(text):
    lh = len(text[fltr.col1])
    lp = len(text[fltr.col2])
    if lh > lp:
        return {
           fltr.col1: text[fltr.col1],
            fltr.col2: text[fltr.col2] + "#"*(lh-lp),
            'score': text['score']
        }
    return {
       fltr.col1: text[fltr.col1] + "#"*(lp-lh),
        fltr.col2: text[fltr.col2],
        'score': text['score']
    }

In [39]:
extract_data = extract_data.map(padding, batched=False, remove_columns=[fltr.col1, fltr.col2, 'score'])



  0%|          | 0/54 [00:00<?, ?ex/s]

In [24]:
extract_data[fltr.col2]

['Masker sekali pakai banyak dipakai di tingkat rumah tangga.',
 'Nielsen Music mencatat pada akhir minggu ini.',
 'Paket internet sahur tidak ditujukan untuk saat sahur.',
 'Ada negatif mengidap COVID-19 secara tidak resmi.',
 'Iko mengaku senang dengan pengalamannya tersebut.',
 'Blue Mountains termasuk rute bus ulang alik swasta.',
 'Kellner memiliki kurang dari 6% YouGov.',
 'Tina Anselmi bekerja pada Serikat Katolik.',
 'Jumlah pasien meninggal karena penyakit jantung lebih sedikit dari penyakit ginjal.',
 'Saya tidak dapat mengatakan apa yang saya mau.',
 'Terdapat penjaga pantai di pantai-pantai.',
 'Tidak ada orang yang memakai kacamata di foto tersebut.',
 'Pasien memiliki penyakit jantung dan penyakit ginjal.',
 'Saya tidak bisa melepaskan mereka.',
 'Negara lain mengalami Revolusi Industri setelah Britania Raya.',
 'Romelu Lukaku kini justru berada di puncak teratas pencetak gol terbanyak.',
 'Jatiluwih adalah nama sawah.']

In [25]:
# phase II.1: calculate inverse BLEU for all data
_ = extract_data.map(fltr.calculate_ibleu, batched=False, remove_columns=[fltr.col1, fltr.col2, 'score'])
df = pd.DataFrame(fltr.sacrebleu_res)
df.to_csv(f"{fname}-phase2.csv", index=False)

  0%|          | 0/17 [00:00<?, ?ex/s]

ValueError: Mismatch in the number of predictions (141) and references (59)

In [42]:
from sacrebleu.metrics import BLEU

In [60]:
bleu = BLEU()

In [47]:
df.iloc[0][fltr.col1]

'Burton tidak pernah bersabar kecuali ketika itu benar-benar diperlukan dan sering kali tidak.'

In [48]:
 df.iloc[0][fltr.col2]

'Burton tidak pernah sabar kecuali bila itu benar-benar diperlukan dan sering tidak kemudian.#'

In [71]:
bleu.corpus_score(['Burton tidak pernah bersabar kecuali ketika itu benar-benar diperlukan dan sering kali tidak.'], 
                  [['Burton tidak pernah sabar kecuali bila itu benar-benar diperlukan dan sering tidak kemudian.']])

BLEU = 38.50 78.6/46.2/33.3/18.2 (BP = 1.000 ratio = 1.000 hyp_len = 14 ref_len = 14)

In [70]:
bleu.corpus_score(['aku mau pergi ke.'], 
                  [['aku mau pergi ke.']])

BLEU = 100.00 100.0/100.0/100.0/100.0 (BP = 1.000 ratio = 1.000 hyp_len = 5 ref_len = 5)

In [51]:
bleu.get_signature()

nrefs:1|case:mixed|eff:no|tok:13a|smooth:exp|version:2.3.1

In [81]:
sacrebleu.compute(
    predictions=df.iloc[0][fltr.col1],
    references=df.iloc[0][fltr.col2].replace("####", "[PAD]")[:len(df.iloc[0][fltr.col1])]
)

{'score': 0.0,
 'counts': [7, 0, 0, 0],
 'totals': [121, 0, 0, 0],
 'precisions': [5.785123966942149, 0.0, 0.0, 0.0],
 'bp': 0.9055855266482071,
 'sys_len': 121,
 'ref_len': 133}

In [None]:
# phase II.2: get average inverse BLEU from all data
avg_ibleu = statistics.mean(fltr.sacrebleu_res["score"])
with open(f"{fname}-avg_ibleu.txt", "w") as f:
    f.write(str(avg_ibleu))

In [None]:
# phase III.3: filter data by score > average inverse BLEU
extract_data = datasets.Dataset.from_pandas(df)
extract_data = extract_data.filter(lambda x: x["score"] > avg_ibleu)

In [None]:
# save the final result
extract_data.save_to_disk(f"./{fname}")
extract_data.to_csv(f"{fname}-final_result.csv")