In [1]:
from datasets import load_dataset

# Load the XSum corpus through the `datasets` library.
ds = load_dataset("EdinburghNLP/xsum")

# Bind each of the three splits to its own module-level name.
train_dataset, validation_dataset, test_dataset = (
    ds[split_name] for split_name in ('train', 'validation', 'test')
)

In [2]:
# Report the size of every split, one line per split.
print(
    f"Number of training samples: {len(train_dataset)}",
    f"Number of validation samples: {len(validation_dataset)}",
    f"Number of test samples: {len(test_dataset)}",
    sep="\n",
)

Number of training samples: 204045
Number of validation samples: 11332
Number of test samples: 11334


In [4]:
import random
import json

def save_to_disk(data, filename):
    """Serialise ``data`` as JSON and write it to ``filename``.

    The target file is opened in text write mode, so any existing
    content is replaced.

    Args:
        data: Any JSON-serialisable object.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, 'w') as out:
        out.write(json.dumps(data))

# Number of valid examples to keep, and the seed that makes the shuffle
# (and therefore the selected subset) reproducible across kernel restarts.
NUM_SELECTED_EXAMPLES = 10000
SHUFFLE_SEED = 42

# Combine all splits into one flat list of example dicts.
combined_dataset = []
for split in (train_dataset, validation_dataset, test_dataset):
    combined_dataset.extend(split)

# Shuffle with a dedicated, seeded generator so the global `random` state is
# left untouched and re-running this cell always yields the same order.
random.Random(SHUFFLE_SEED).shuffle(combined_dataset)

# Select the first NUM_SELECTED_EXAMPLES valid examples from the shuffled list.
selected_examples = []
for example in combined_dataset:
    # `or ""` also covers a field that is present but None.
    document = example.get('document') or ""
    summary = example.get('summary') or ""

    # Skip examples whose document or summary is empty / whitespace-only.
    if not document.strip() or not summary.strip():
        continue

    # Keep only the fields needed downstream.
    selected_examples.append({
        'document': document,
        'summary': summary,
        'id': example['id'],
    })

    # Stop once enough valid examples have been collected.
    if len(selected_examples) >= NUM_SELECTED_EXAMPLES:
        break

# Save the selected examples to disk.
save_to_disk(selected_examples, 'selected_examples.json')


In [5]:
from sklearn.model_selection import train_test_split

# Read back the examples that were previously written to JSON.
with open('selected_examples.json', 'r') as fh:
    data = json.load(fh)

# Express the desired 8000 / 1000 / 1000 partition as fractions of the data.
total_examples = len(data)
train_size = 8000 / total_examples
test_size = 1000 / total_examples
valid_size = 1000 / total_examples

# Stage 1: carve off the combined test + validation portion.
holdout_size = test_size + valid_size
train_data, remaining_data = train_test_split(
    data,
    test_size=holdout_size,
    random_state=42,
)

# Stage 2: divide that portion between the test and validation sets.
test_data, valid_data = train_test_split(
    remaining_data,
    test_size=(valid_size / holdout_size),
    random_state=42,
)


In [6]:
# Write each split to its own JSON file: training, then testing, then validation.
for split_path, split_records in (
    ('train_article.json', train_data),
    ('test_article.json', test_data),
    ('valid_article.json', valid_data),
):
    with open(split_path, 'w') as split_file:
        json.dump(split_records, split_file)

In [11]:
# Save the training articles to a text file for OpenIE triple extraction.
# The file is written as UTF-8 explicitly so non-ASCII article text does not
# depend on the platform's default encoding.
with open('train_cnn.txt', 'w', encoding='utf-8') as f:
    # Write each article followed by a blank line as separator.
    for example in train_data:
        # The records built earlier in this notebook carry the article text
        # under 'document' (there is no 'article' key).
        article = example['document']
        f.write(article + '\n\n')