# Publishing Data to Hugging Face Hub

This notebook contains code for:
- Loading generated synthetic data
- Splitting data into train, validation and test sets
- Publishing the dataset and its card to Hugging Face Hub

In [4]:
from datasets import load_dataset
from datasets import DatasetDict
from huggingface_hub import HfApi

In [None]:
# Loads the generated synthetic data and splits it into train, validation and test sets.
# Publishing the splits to the hub is optional and controlled by PUSH_TO_HUB below.
# Data split proportions:
# - 80% training data
# - 10% validation data
# - 10% test data

# Fixed seed so that Restart & Run All reproduces the same split; without it
# train_test_split shuffles differently on every run.
SEED = 42

# Disabled by default because the dataset is already pushed to the hub.
# NOTE(review): the copy on the hub was presumably created from an unseeded split,
# so re-publishing with PUSH_TO_HUB = True would replace it with this seeded split.
PUSH_TO_HUB = False

raw_dataset = load_dataset("json", data_files="../data/synthetic_data.json", field="data", split="train")

# First cut: 80% train / 20% held out.
train_test_dataset = raw_dataset.train_test_split(test_size=0.2, seed=SEED)

# Second cut: halve the held-out 20% -> its "train" part becomes validation (10%),
# its "test" part becomes the final test set (10%).
train_val_dataset = train_test_dataset["test"].train_test_split(test_size=0.5, seed=SEED)

final_dataset = DatasetDict({
    "train": train_test_dataset["train"],
    "validation": train_val_dataset["train"],
    "test": train_val_dataset["test"]
})

# Sanity check: the three splits together must account for every loaded record.
assert sum(len(split) for split in final_dataset.values()) == len(raw_dataset), \
    "split sizes do not add up to the size of the raw dataset"

print(f"Training set size: {len(final_dataset['train'])}")
print(f"Validation set size: {len(final_dataset['validation'])}")
print(f"Test set size: {len(final_dataset['test'])}")


if PUSH_TO_HUB:
    final_dataset.push_to_hub("kurkowski/synthetic-contextual-anonymizer-dataset")

In [None]:
# Publish the dataset card: the local markdown file is uploaded as README.md
# of the dataset repository on the hub.
api = HfApi()

card_upload_kwargs = dict(
    path_or_fileobj="./../dataset_card.md",
    path_in_repo="README.md",
    repo_id="kurkowski/synthetic-contextual-anonymizer-dataset",
    repo_type="dataset",
)

# Best-effort upload: any failure is reported as a message instead of raising.
try:
    api.upload_file(**card_upload_kwargs)
    print("Dataset card published successfully!")
except Exception as upload_error:
    print(f"Error occurred while publishing dataset card: {str(upload_error)}")

In [None]:
# Verify that the dataset was published correctly by downloading it from the hub
# and inspecting what was actually stored there.
verification_dataset = load_dataset("kurkowski/synthetic-contextual-anonymizer-dataset")
print("Published dataset structure:")
print(verification_dataset)

# Samples are taken from the downloaded copy (not from the local in-memory splits),
# so this cell checks the published data and does not depend on the split cell having run.
print("\nSample data from each split:")
for split_name, heading in (
    ("train", "Training set"),
    ("validation", "Validation set"),
    ("test", "Test set"),
):
    print(f"\n{heading}:")
    # Raises KeyError if the split is missing on the hub, IndexError if it is empty.
    print(verification_dataset[split_name][0])