In [None]:
import json
import os, csv
import shutil
from datasets import Dataset, load_from_disk, concatenate_datasets, load_dataset

import pandas as pd

from transformers import AutoTokenizer
# Load a tokenizer from an absolute filesystem path.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook, and the hard-coded absolute path only resolves where that directory
# exists -- confirm this load is still needed.
tokenizer = AutoTokenizer.from_pretrained('/scratch/gpfs/JHA/mb5157/tokenizers/biomedbert_fast_tokenizer')

In [None]:
def get_chunks(dataset_path, dataset_len, step, suffix):
    """Build the expected chunk directory paths for a dataset saved in pieces.

    The index range ``[0, dataset_len)`` is cut into consecutive windows of
    ``step`` examples; the final window is clipped to ``dataset_len``. Each
    window maps to ``<dataset_path>/<suffix>_<start>-<end>``.

    Note that ``suffix`` is placed at the *start* of the directory name.
    Paths are only constructed here; nothing checks that they exist.

    Args:
        dataset_path: directory that contains the chunk directories.
        dataset_len: total number of examples covered by all chunks.
        step: number of examples per chunk.
        suffix: leading part of every chunk directory name.

    Returns:
        List of chunk paths ordered by start index.
    """
    window_starts = range(0, dataset_len, step)
    return [
        os.path.join(dataset_path, f'{suffix}_{start}-{min(start + step, dataset_len)}')
        for start in window_starts
    ]

def unite_chunks(paths):
    """Load every chunk directory in ``paths`` and concatenate them into one dataset.

    Chunks are loaded with ``load_from_disk`` in the order given, a
    ``loaded from <path>`` line is printed after each load returns, and the
    loaded chunks are then merged with ``concatenate_datasets``.

    Args:
        paths: iterable of chunk directory paths (e.g. the output of ``get_chunks``).

    Returns:
        The result of ``concatenate_datasets`` over the loaded chunks, in input order.
    """
    def _load_and_report(chunk_path):
        # Report only after the load returned, so a failing path is never logged as loaded.
        loaded = load_from_disk(chunk_path)
        print(f'loaded from {chunk_path}')
        return loaded

    return concatenate_datasets([_load_and_report(chunk_path) for chunk_path in paths])

## Unite FactScore chunks

In [None]:
# ---- FactScore parameters: edit this section ----
# Directory that contains the chunk directories.
dataset_path = '../outputs/350abstracts/qwen32/score0.55/span/rel29/bs256_lr_0.0004/predictions/top_20'

# Leading part of every chunk directory name:
#   accepted_*         -- based on sequence + general knowledge
#   accepted_seq_only  -- based on sequence only
suffix = 'accepted_seq_only_qwen3-32b'

# Length of the dataset before running the FactScore evaluation.
dataset_len = 109_293
# Saving step: batch_size * num_batches.
step = 100_000

# Empty placeholders; nothing in this cell reads them.
datasets = []
suffices = []
# ---- end of parameters ----

chunk_paths = get_chunks(dataset_path=dataset_path, dataset_len=dataset_len, step=step, suffix=suffix)
united_dataset = unite_chunks(paths=chunk_paths)

# FactScore = number of examples in the united chunks / dataset length before evaluation.
len_accepted = len(united_dataset)
factscore = round(len_accepted / dataset_len, 3)

print('accepted:', len_accepted)
print(f'FactScore: {factscore}')

In [None]:
# save united chunks
# Writes `united_dataset` to "<dataset_path>/<suffix>_all", using the current
# values of `dataset_path` and `suffix`. Run this before the chunk-removal cell,
# which deletes the chunk directories the united dataset was loaded from.
output_path = os.path.join(dataset_path, f'{suffix}_all')
united_dataset.save_to_disk(output_path)
print(f'saved to {output_path}')

In [None]:
# careful: THIS REMOVES ALL CHUNKS
# make sure you saved united_dataset before running this cell
# Permanently deletes every directory listed in `chunk_paths` via shutil.rmtree.
# shutil.rmtree raises when a path no longer exists, so re-running this cell after
# a successful run fails on the first path.
for path in chunk_paths:
    shutil.rmtree(path)
    print(f'removed {path}')

## Unite ValidityScore chunks

In [None]:
# === set your parameters here
# Directory that contains the chunk directories.
dataset_path = '/projects/JHA/shared/dataset/qwen32b/alpha_0.55/span/cleaned_graphs'
# Leading part of every chunk directory name.
suffix = 'validated_gemini-2.0-flash'

# Empty placeholders kept so the names stay defined; this cell does not read them.
datasets = []
suffices = []

# Length of the dataset before running the evaluation.
# Get it from the output log line
#   "This job predicts tails for examples from .. to .. out of <dataset_len>"
# or just look at the last chunk name.
dataset_len = 139565
# saving step = batch_size * num_batches
step = 1000
# =========

validity_chunk_paths = get_chunks(dataset_path, dataset_len, step, suffix)

# The united validity dataset comes straight from the loaded chunks. Do not rebuild
# it with concatenate_datasets(datasets): `datasets` is an empty list in this cell,
# and concatenating an empty list raises instead of returning a dataset.
united_dataset_val = unite_chunks(validity_chunk_paths)
# Later cells read both names, so keep them bound to the same object.
united_dataset = united_dataset_val
print('accepted:', len(united_dataset_val))

In [None]:
# Write `united_dataset` to "<dataset_path>/<suffix>_all", using the current
# values of `dataset_path` and `suffix`.
# NOTE(review): a later cell saves `united_dataset_val` to this same output_path --
# confirm that both saves are needed.
output_path = os.path.join(dataset_path, f'{suffix}_all')
united_dataset.save_to_disk(output_path)
print(f'saved to {output_path}')

In [None]:
# ValidityScore is the fraction of "yes" responses.
# Every share below is taken over the same dataset that is being filtered, so the
# numerator and the denominator can never come from different objects.
total_validated = len(united_dataset_val)

for key_word in ('yes', 'no', 'maybe', ''):
    # "" -- includes missing values
    # NOTE(review): the comparison is against a one-element list, which assumes
    # "response" is stored as a list holding a single string -- confirm against the data.
    dataset_keyword = united_dataset_val.filter(lambda ex: ex["response"] == [key_word],
                                 desc=f"Keep only {key_word}")

    print(len(dataset_keyword))
    # An empty dataset reports 0.0 instead of raising ZeroDivisionError.
    share = round(len(dataset_keyword) / total_validated, 3) if total_validated else 0.0
    print(f'{key_word}:', share)
    print()

In [None]:
# Write `united_dataset_val` to "<dataset_path>/<suffix>_all", using the current
# values of `dataset_path` and `suffix`.
# NOTE(review): an earlier cell already saves `united_dataset` to this same
# output_path -- confirm that this second save is intended.
output_path = os.path.join(dataset_path, f'{suffix}_all')
united_dataset_val.save_to_disk(output_path)
print(f'saved to {output_path}')

In [None]:
# careful: THIS REMOVES ALL CHUNKS
# make sure you saved united_dataset before running this cell
# Permanently deletes every directory listed in `validity_chunk_paths` via
# shutil.rmtree. shutil.rmtree raises when a path no longer exists, so re-running
# this cell after a successful run fails on the first path.
for path in validity_chunk_paths:
    shutil.rmtree(path)
    print(f'removed {path}')

In [None]:
# careful: THIS REMOVES ALL CHUNKS listed in `chunk_paths`
# make sure the dataset united from these chunks was saved before running this cell
# Another cell in this notebook deletes the same `chunk_paths`, and shutil.rmtree
# raises on a directory that no longer exists. Paths that are already gone are
# therefore skipped here instead of aborting the loop on the first one.
for path in chunk_paths:
    if not os.path.exists(path):
        print(f'skipped {path} (not found)')
        continue
    shutil.rmtree(path)
    print(f'removed {path}')