## Datasets

### Load from Hugging Face

In [None]:
from datasets import load_dataset

# Load the `fka/awesome-chatgpt-prompts` dataset by its Hugging Face repo id.
dataset = load_dataset("fka/awesome-chatgpt-prompts")
# Trailing bare expression: the notebook displays it as the cell output.
dataset

> The dataset above only has a `train` split. Let's look at another one that has `train`, `validation`, and `test` splits.

In [None]:
# Load the `knkarthick/samsum` dataset. This rebinds `dataset`, replacing the
# prompts dataset loaded in the earlier cell.
dataset = load_dataset("knkarthick/samsum")
# Trailing bare expression: the notebook displays it as the cell output.
dataset

### Preprocessing Methods

In [None]:
# Reload the original dataset: `dataset` was rebound to the samsum dataset in
# an earlier cell, so load the prompts dataset again before preprocessing it.
dataset = load_dataset("fka/awesome-chatgpt-prompts")
# Display the first record of the `train` split.
dataset["train"][0]

In [None]:
# Shuffle the train split with a fixed seed, then keep the first 100 rows.
# NOTE: `dataset` is rebound from the full dataset to this 100-row sample, so
# this cell expects the freshly loaded dataset from the previous cell.
shuffled_train = dataset["train"].shuffle(seed=43)
dataset = shuffled_train.select(range(100))
dataset

In [None]:
# Create test dataset: split the sampled rows into train and test parts.
# train_size=0.8 asks for 80% of the rows in the train part; the fixed seed
# keeps the split the same on every run.
dataset = dataset.train_test_split(train_size=0.8, seed=49)
# Trailing bare expression: the notebook displays it as the cell output.
dataset

**Let's make our own dataset** from the `reuters21578/*.sgm` files. The archive was downloaded from https://archive.ics.uci.edu/ml/machine-learning-databases/reuters21578-mld/reuters21578.tar.gz and extracted into `./reuters21578/`, which is where the next cell looks for the files.

In [None]:
## Get the title and body of all the articles
import glob
import os  # needed for os.path.join below; without it a fresh kernel raises NameError
from bs4 import BeautifulSoup

dir_path = "./reuters21578/"
files = os.path.join(dir_path, "*.sgm")
articles = []

# glob returns paths in arbitrary order. Sorting keeps `articles` in a stable
# order, which matters because the train/valid/test split further down slices
# this list by position.
sgm_paths = sorted(glob.glob(files))
if not sgm_paths:
    # Fail with a clear message instead of an IndexError at `articles[0]`.
    raise FileNotFoundError(
        f"No .sgm files found in {dir_path!r}; download and extract the archive first."
    )

for filepath in sgm_paths:
    # The files are read as latin-1, as in the original cell.
    with open(filepath, "r", encoding="latin-1") as file:
        soup = BeautifulSoup(file, "html.parser")

    # Each <REUTERS> element is one article.
    for r in soup.find_all("reuters"):
        # get_text() always returns a plain str. `.string` is None for an empty
        # tag or a tag with several children, which would slip past the
        # empty-article filter below and store nulls in the dataset.
        title = r.title.get_text() if r.title else ""
        body = r.body.get_text() if r.body else ""

        ## Clean up the results: skip articles with neither a title nor a body
        if title == "" and body == "":
            continue

        articles.append({
            "title": title,
            "body": body
        })

print(f"Articles: {len(articles):,}")
articles[0]

In [None]:
## Now let's make our own dataset from these articles
import json

# Fractions of the article list used for training and validation; whatever is
# left over becomes the test set.
TRAIN_PCT = 0.8
VALID_PCT = 0.1

# Slice boundaries: TRAIN_NUM is where the training slice ends and VALID_NUM is
# where the validation slice ends (both are indices into `articles`).
num_articles = len(articles)
TRAIN_NUM = int(num_articles * TRAIN_PCT)
VALID_NUM = int(num_articles * (TRAIN_PCT + VALID_PCT))

# Split the data by position: [0, TRAIN_NUM) / [TRAIN_NUM, VALID_NUM) / [VALID_NUM, end)
train_articles = articles[:TRAIN_NUM]
valid_articles = articles[TRAIN_NUM:VALID_NUM]
test_articles = articles[VALID_NUM:]

# Report the size of each split.
print(f"Training dataset: {len(train_articles):,}")
print(f"Validation dataset: {len(valid_articles):,}")
print(f"Test dataset: {len(test_articles):,}")

def save_as_jsonl(data, filename):
    """Write `data` to `filename` in JSON Lines format.

    Each item of `data` is serialized with `json.dumps` and written on its own
    line. An existing file is overwritten, and an empty `data` produces an
    empty file. A confirmation line is printed once writing has finished.

    Args:
        data: Iterable of JSON-serializable records.
        filename: Path of the file to create or overwrite.
    """
    with open(filename, "w") as out:
        # Serialize lazily so records are written one at a time.
        out.writelines(json.dumps(record) + "\n" for record in data)
    print(f"Wrote {filename}")

# Write each split to its own JSONL file, in train / valid / test order.
for split_articles, jsonl_name in (
    (train_articles, "train.jsonl"),
    (valid_articles, "valid.jsonl"),
    (test_articles, "test.jsonl"),
):
    save_as_jsonl(split_articles, jsonl_name)

In [None]:
## Load them as a dataset
# Map each split name to the JSONL file written for it in the previous cell.
data_files = dict(
    train="train.jsonl",
    validation="valid.jsonl",
    test="test.jsonl",
)
# "json" selects the JSON loader; `data_files` supplies one file per split.
dataset = load_dataset("json", data_files=data_files)
dataset

In [None]:
## Log in to Hugging Face
# NOTE(review): notebook_login() presumably prompts for an access token inside
# the notebook — confirm the token has write access, since the next cell uploads.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
## Upload dataset to Hugging Face
# Pushes the dataset loaded from the JSONL files to the repo "reuters-articles".
# NOTE(review): the repo id has no namespace, so it presumably lands under the
# logged-in user's account — confirm before running.
dataset.push_to_hub("reuters-articles")