In [1]:
from datasets import load_dataset

In [24]:
writing500 = load_dataset("alvinming/writing_500", split="test")

In [25]:
writing500 = writing500.map(
    lambda example: {"question": example["question"].replace("\n", " ")},
    batched=False,
    remove_columns=["qid","citation_numbers", "faithful_answer_w_citation", "gold_doc_ids", "contexts", "unfaithful_answer", "unfaithful_justification"]
)

writing500 = writing500.rename_columns({"question": "inputs", "answer": "labels"})

In [28]:
writing500[0]

{'inputs': 'is it appropriate to use the salutation dear all in a work email?',
 'labels': 'Using "Dear all" is perfectly acceptable for addressing a group, as is "Dear Colleagues"; the choice between them really hinges on the desired level of formality and the common practices within your particular work environment.'}

In [26]:
corrections = load_dataset("jhu-clsp/jfleg", split="validation")

In [27]:
import random

corrections = corrections.map(
    lambda example: {"corrections": random.choice(example["corrections"])},
    batched=False,
)

corrections = corrections.rename_columns({"corrections": "labels", "sentence": "inputs"})

In [29]:
corrections[0]

{'inputs': 'So I think we can not live if old people could not find siences and tecnologies and they did not developped . ',
 'labels': 'So I think we would not be alive if our ancestors did not develop sciences and technologies . '}

In [31]:
mlsum = load_dataset("GEM/mlsum", "de", split="validation", trust_remote_code=True)

Generating train split: 100%|██████████| 220748/220748 [00:00<00:00, 354967.28 examples/s]
Generating validation split: 100%|██████████| 11392/11392 [00:00<00:00, 340183.62 examples/s]
Generating test split: 100%|██████████| 10695/10695 [00:00<00:00, 315393.35 examples/s]
Generating challenge_train_sample split: 100%|██████████| 500/500 [00:00<00:00, 99560.96 examples/s]
Generating challenge_validation_sample split: 100%|██████████| 500/500 [00:00<00:00, 89411.72 examples/s]
Generating challenge_test_covid split: 100%|██████████| 5058/5058 [00:00<00:00, 297350.79 examples/s]


In [None]:
mlsum = mlsum.map(
    batched=False,
    remove_columns=['gem_id', 'gem_parent_id', 'topic', 'url', 'title', 'date', 'references'],
)

mlsum = mlsum.rename_columns({"text": "inputs", "target": "labels"})

Map: 100%|██████████| 11392/11392 [00:00<00:00, 30282.96 examples/s]


In [46]:
mlsum = mlsum.filter(lambda example: len(example["inputs"]) < 1000)

Filter: 100%|██████████| 11392/11392 [00:00<00:00, 114510.91 examples/s]


In [47]:
max_length = max(len(text) for text in mlsum["inputs"])
max_length

999

In [57]:
xsum = load_dataset("EdinburghNLP/xsum", split="validation")

In [None]:
xsum = xsum.map(
    lambda example: {"inputs": example["document"].replace("\n", " "), "labels": example["summary"]},  
    batched=False,
    remove_columns=["id", "summary", "document"],
)

xsum = xsum.filter(lambda example: len(example["inputs"]) < 1000)

Map: 100%|██████████| 11332/11332 [00:00<00:00, 48375.01 examples/s]
Filter: 100%|██████████| 11332/11332 [00:00<00:00, 348556.44 examples/s]


In [59]:
xsum[0]

{'inputs': 'The ex-Reading defender denied fraudulent trading charges relating to the Sodje Sports Foundation - a charity to raise money for Nigerian sport. Mr Sodje, 37, is jointly charged with elder brothers Efe, 44, Bright, 50 and Stephen, 42. Appearing at the Old Bailey earlier, all four denied the offence. The charge relates to offences which allegedly took place between 2008 and 2014. Sam, from Kent, Efe and Bright, of Greater Manchester, and Stephen, from Bexley, are due to stand trial in July. They were all released on bail.',
 'labels': 'Former Premier League footballer Sam Sodje has appeared in court alongside three brothers accused of charity fraud.'}

In [72]:
from datasets import concatenate_datasets

merged = concatenate_datasets([writing500, corrections, mlsum, xsum])
merged = merged.shuffle(seed=0)

In [73]:
merged = merged.filter(lambda example: example["inputs"].strip() != "" and example["labels"].strip() != "")

In [None]:
merged = merged.map(
    lambda example: {"inputs": example["inputs"].replace("\'", "") and example["inputs"].replace('"', "")},
    batched=False
)

Map: 100%|██████████| 4458/4458 [00:00<00:00, 27452.07 examples/s]


In [82]:
merged.to_csv("merged1.csv", index=False)

Creating CSV from Arrow format: 100%|██████████| 5/5 [00:00<00:00, 137.26ba/s]


2884989