In [10]:
import json
import os
import random
from typing import Optional

def load_json(file_path):
    """Read a JSON document from disk and return the decoded Python object.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` decodes from the file (list, dict, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the content is not valid JSON.
    """
    # Decode explicitly as UTF-8 instead of relying on the platform's locale
    # default, which is not UTF-8 everywhere and would garble or reject
    # non-ASCII text.
    with open(file_path, 'r', encoding='utf-8') as file:
        return json.load(file)

def save_json(data, file_path):
    """Serialize ``data`` as indented JSON and write it to ``file_path``.

    Any missing parent directories of ``file_path`` are created first, so
    writing into a not-yet-existing output folder does not fail.

    Args:
        data: A JSON-serializable object.
        file_path: Destination path; an existing file is overwritten.

    Raises:
        TypeError: If ``data`` contains values ``json.dump`` cannot encode.
        OSError: If the directory or file cannot be created.
    """
    # dirname is '' for a bare filename; makedirs('') would raise, so guard it.
    parent_dir = os.path.dirname(file_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    # Explicit encoding keeps the written file independent of the locale.
    with open(file_path, 'w', encoding='utf-8') as file:
        json.dump(data, file, indent=4)

def sample_data(data, sample_size, seed: Optional[int] = None):
    """Return up to ``sample_size`` items drawn from ``data`` without replacement.

    Args:
        data: Sequence of items to sample from (e.g. a list of records).
        sample_size: Maximum number of items to return.
        seed: Optional seed for a private random generator. When given, the
            same ``data`` and ``sample_size`` always yield the same sample and
            the module-level ``random`` state is left untouched. When ``None``
            the module-level generator is used.

    Returns:
        A new list. If ``data`` has ``sample_size`` items or fewer, it is a
        shallow copy of ``data`` in its original order; otherwise it is a
        random selection of ``sample_size`` items.
    """
    if len(data) <= sample_size:
        # Copy so both branches hand back a fresh list; the caller can then
        # modify the result without also modifying ``data``.
        return list(data)

    rng = random if seed is None else random.Random(seed)
    return rng.sample(data, sample_size)

def remove_key(data, key):
    """Delete ``key`` from every record in ``data`` that contains it.

    The records are modified in place and nothing is returned; records that
    do not have ``key`` are left untouched.
    """
    for record in (entry for entry in data if key in entry):
        del record[key]


In [11]:
# Tunable settings for this step.
SEED = 42                 # fixed seed so the same subset is drawn on every run
BRAHE_SAMPLE_SIZE = 3640  # number of examples to draw from brahe.json

# Seed the module-level generator that sample_data() draws from.
random.seed(SEED)

data1 = load_json('../brahe/brahe.json')  # file to sample from
data2 = load_json('../booksum_paragraph-level-summary-alignments/booksum_article_filtered.json')  # file to concatenate with

sampled_data1 = sample_data(data1, BRAHE_SAMPLE_SIZE)

# Remove 'alignment_scores' from the booksum records (data2), not from the
# sampled brahe records.
# NOTE(review): presumably this makes both sources share the same keys — confirm.
remove_key(data2, 'alignment_scores')

# Sampled brahe records first, followed by every booksum record.
concatenated_data = sampled_data1 + data2

save_json(concatenated_data, 'fiction.json')
print("Concatenated data saved to fiction.json")
print(f"Total number of examples in the output file: {len(concatenated_data)}")


Concatenated data saved to fiction.json
Total number of examples in the output file: 10000


Split the dataset into train, validation, and test sets.

In [12]:
def split_data(file_path: str, train_file: str, valid_file: str, test_file: str, train_ratio: float = 0.8, valid_ratio: float = 0.1, seed: Optional[int] = None):
    """Shuffle a JSON list and write it out as train / validation / test files.

    The list loaded from ``file_path`` is shuffled, then cut into three
    consecutive slices: the first ``train_ratio`` share goes to training, the
    next ``valid_ratio`` share to validation, and everything that remains to
    test. Slice sizes are truncated to whole numbers, so any rounding
    remainder ends up in the test set.

    Args:
        file_path: JSON file holding the list of examples to split.
        train_file: Output path for the training split.
        valid_file: Output path for the validation split.
        test_file: Output path for the test split.
        train_ratio: Fraction of examples for training (default 0.8).
        valid_ratio: Fraction of examples for validation (default 0.1).
        seed: Optional seed for a private random generator, making the split
            reproducible. When ``None`` the module-level ``random`` generator
            is used.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Validate before touching the disk. A ratio sum above 1 would otherwise
    # silently produce an empty test set and a shortened validation set.
    # The small tolerance absorbs floating-point noise in sums that are
    # meant to be exactly 1.
    if train_ratio < 0 or valid_ratio < 0 or train_ratio + valid_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and valid_ratio must be non-negative and sum to at most 1 "
            f"(got train_ratio={train_ratio}, valid_ratio={valid_ratio})"
        )

    data = load_json(file_path)

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(data)

    total_size = len(data)
    train_size = int(total_size * train_ratio)
    valid_size = int(total_size * valid_ratio)

    train_data = data[:train_size]
    valid_data = data[train_size:train_size + valid_size]
    # The test split is whatever is left after the first two slices.
    test_data = data[train_size + valid_size:]

    save_json(train_data, train_file)
    save_json(valid_data, valid_file)
    save_json(test_data, test_file)

    print(f"Training data saved to {train_file}")
    print(f"Validation data saved to {valid_file}")
    print(f"Test data saved to {test_file}")
    print(f"Total number of examples: {total_size}")
    print(f"Training set size: {len(train_data)}")
    print(f"Validation set size: {len(valid_data)}")
    print(f"Test set size: {len(test_data)}")

In [13]:
# Input: the combined dataset that is to be split.
file_path = 'fiction.json'

# Outputs: one JSON file per split.
train_file = 'article_collections/train_article.json'
valid_file = 'article_collections/valid_article.json'
test_file = 'article_collections/test_article.json'

# No ratios are passed, so split_data's defaults apply.
split_data(
    file_path=file_path,
    train_file=train_file,
    valid_file=valid_file,
    test_file=test_file,
)


Training data saved to article_collections/train_article.json
Validation data saved to article_collections/valid_article.json
Test data saved to article_collections/test_article.json
Total number of examples: 10000
Training set size: 8000
Validation set size: 1000
Test set size: 1000
