In [1]:
%%capture
!pip install evaluate==0.3.0 rouge-score==0.1.2 sacrebleu==2.3.1 bert-score==0.3.12

In [3]:
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: [32mOK[0m


In [114]:
import glob
import torch
import datasets
import evaluate
import statistics
import pandas as pd

from datetime import datetime
from functools import cached_property

In [244]:
FILTER_DATA_FOLDER = "/workspace/indobertshare-main/summarization"
CODE_NAME = "paracotta_full"

# Filtration Config
filter_data_from_csv = False
filter_data_path = "paracotta_full.csv"
filter_data_conf: dict = {
    "path": "indonli",
    "split": "train",
}
filter_num_layer = 9
filter_batch_size = 32

In [15]:
# %%capture
# df = pd.read_csv("generated-par.full.id", delimiter="\t", names=["score", "references", "paraphrase"], usecols = ["references", "paraphrase"], error_bad_lines=False)

In [23]:
# df.to_csv("paracotta_full.csv", index=False)

In [245]:
if filter_data_from_csv:
    df = pd.read_csv(filter_data_path)
    extract_data = datasets.Dataset.from_pandas(df)
else:
    extract_data = datasets.load_dataset(**filter_data_conf)

Found cached dataset indonli (/root/.cache/huggingface/datasets/indonli/indonli/1.1.0/d34041bd1d1a555a4bcb4ffdb9fe904778da6f7c5343209fc1485dd68121cb62)


In [102]:
df.head()

Unnamed: 0,references,paraphrase
0,Burton tidak pernah bersabar kecuali ketika it...,Burton tidak pernah sabar kecuali bila itu ben...
1,Anna tahu kalau Kolom Kelima semakin kuat.,Anna tahu bahwa kelima kolom semakin kuat.
2,"Hanya saja, dia benar-benar meminta kita ... t...","Hanya saja, ia benar-benar meminta kami ... ti..."
3,Tak ada yang seperti meninggalkan rumah sakit ...,Tidak ada yang cukup mirip meninggalkan rumah ...
4,Leibniz berhenti dan mempertimbangkan masalah ...,Leibniz berhenti dan memikirkan hal-hal.


In [159]:
len(df)

5959558

In [246]:
extract_data2 = extract_data.select(range(filter_batch_size))

In [247]:
extract_data2

Dataset({
    features: ['premise', 'hypothesis', 'label'],
    num_rows: 32
})

In [249]:
class FilterDS():
    def __init__(self, col1='references', col2='paraphrase'):
        self.col1 = col1
        self.col2 = col2
        self.res = {
                col1: [],
                col2: [],
                "bert_score": []
            }
    
    @cached_property
    def bertscore(self):
        return evaluate.load("bertscore")
    
    @cached_property
    def sacrebleu(self):
        return evaluate.load("sacrebleu")
    
    def calculate_bertscore(self, batch):
#         results = self.bertscore.compute(
#             predictions=batch[self.col1],
#             references=batch[self.col2],
#             verbose=True,
#             device="cuda:0",
#             lang="id",
#             model_type="bert-base-multilingual-cased",
#             num_layers=9,
#             use_fast_tokenizer=False
#         )
        self.res[self.col1] += batch[self.col1]
        self.res[self.col2] += batch[self.col2]
        self.res["bert_score"] += [0 for _ in range(len(batch[self.col1]))] #results["f1"]

        return None
    
    def calculate_ibleu(self, text):
        res = self.sacrebleu.compute(predictions=[text[self.col1]], references=[text[self.col2]])
        self.res["ibleu_score"].append(100 - res["score"])

        return None

In [248]:
fltr = FilterDS('premise', 'hypothesis')

In [250]:
%%capture
torch.cuda.empty_cache()
# phase I.1: calculate BERTScore for all data
_ = extract_data2.map(
    fltr.calculate_bertscore,
    batched=True,
    batch_size=filter_batch_size,
    remove_columns=[fltr.col1, fltr.col2],
)
fname = f"{CODE_NAME}-{str(int(datetime.now().timestamp()))}"
dfs = pd.DataFrame(fltr.res)
# dfs.to_csv(f"{fname}-phase1.csv", index=False)

In [251]:
# phase I.2: get average BERTScore from all data
avg_bertscore = statistics.mean(fltr.res["bert_score"])
print(avg_bertscore)
# with open(f"{fname}-avg_bertscore.txt", "w") as f:
#     f.write(str(avg_bertscore))

0


In [252]:
%%capture
# phase I.3: filter data by score > average BERTScore
extract_data2 = datasets.Dataset.from_pandas(dfs)
extract_data2 = extract_data2.filter(lambda x: x["bert_score"] >= avg_bertscore)

In [253]:
%%capture
# phase II.1: calculate inverse BLEU for all data
fltr.res["ibleu_score"] = []
_ = extract_data2.map(fltr.calculate_ibleu, batched=False, remove_columns=[fltr.col1, fltr.col2])
dfs = pd.DataFrame(fltr.res)
dfs.to_csv(f"{fname}-phase2.csv", index=False)

In [255]:
# phase II.2: get average inverse BLEU from all data
avg_ibleu = statistics.mean(fltr.res["ibleu_score"])
print(avg_ibleu)
# with open(f"{fname}-avg_ibleu.txt", "w") as f:
#     f.write(str(avg_ibleu))

92.69773719258346


In [256]:
%%capture
# phase III.3: filter data by score > average inverse BLEU
extract_data2 = datasets.Dataset.from_pandas(dfs)
extract_data2 = extract_data2.filter(lambda x: x["ibleu_score"] > avg_ibleu)

In [243]:
%%capture
# save the final result
# extract_data.save_to_disk(f"./{fname}")
extract_data2.to_csv(f"{fname}-final_result.csv")