<a href="https://colab.research.google.com/github/ilsilfverskiold/transformers-nlp-docs/blob/main/cook/datasets/push_custom_dataset_huggingface.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# install dependencies
!pip install -U huggingface_hub
!pip install -U datasets



In [2]:
# connect to drive
# from google.colab import drive
# drive.mount('/content/drive')

In [5]:
import pandas as pd

# set the correct path to your csv file
file_path = './titles.csv'
df = pd.read_csv(file_path)
df.head()

Unnamed: 0,text,label
0,Scientists Discover New Form of Dark Matter,science
1,Climate Change Could Trigger 'Domino Effect' o...,science
2,Tiny Particles Reveal Hidden Forces in Quantum...,science
3,AI Revolutionizing Drug Discovery: Faster and ...,science
4,"Ancient Underwater Ruins Found, Uncovering Sec...",science


In [9]:
from sklearn.model_selection import train_test_split

# (optional) filter out any null values before creating the test, validation and training set
# df = df[df['column_names'].notnull()]

# Split dataset into training and temp (for validation and testing) - set at 15% (7.5% each)
train_df, temp_df = train_test_split(df, test_size=0.15, random_state=42)

# Split temp into validation and testing
val_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=42)

In [10]:
from datasets import Dataset, DatasetDict

# setup your sets
train_dataset = Dataset.from_pandas(train_df)
val_dataset = Dataset.from_pandas(val_df)
test_dataset = Dataset.from_pandas(test_df)

  from .autonotebook import tqdm as notebook_tqdm


In [11]:
# create a dict dataset
dataset_dict = DatasetDict({
    'train': train_dataset,
    'validation': val_dataset,
    'test': test_dataset
})

# look at the set
dataset_dict

DatasetDict({
    train: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 2434
    })
    validation: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 215
    })
    test: Dataset({
        features: ['text', 'label', '__index_level_0__'],
        num_rows: 215
    })
})

In [21]:
# login - remember to get your token from the Hugging Face hub
%pip install ipywidgets

Note: you may need to restart the kernel to use updated packages.


In [23]:
# run in terminal
!huggingface-cli login


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Enter your token (input will not be visible): Traceback (most recent call last):
  File "/Users/baskerville/Development/ll_env/bin/huggingface-cli", line 8, in <module>
    sys.exit(main())
  File "/Users/baskerville/Development/ll_env/lib/python3.8/site-packages/huggingface_hub/commands/huggingface_cli.py", line 52, in 

In [24]:
# you do not have to create the dataset before pushing
dataset_dict.push_to_hub("jamesbaskerville/article-titles")

Creating parquet from Arrow format: 100%|██████████| 3/3 [00:00<00:00, 1169.09ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.61it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 507.66ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
Creating parquet from Arrow format: 100%|██████████| 1/1 [00:00<00:00, 1222.83ba/s]
Uploading the dataset shards: 100%|██████████| 1/1 [00:00<00:00,  2.15it/s]


CommitInfo(commit_url='https://huggingface.co/datasets/jamesbaskerville/article-titles/commit/c046700658e531fd0b87070a002fce1e0f879d1d', commit_message='Upload dataset', commit_description='', oid='c046700658e531fd0b87070a002fce1e0f879d1d', pr_url=None, pr_revision=None, pr_num=None)

In [25]:
dataset_dict.save_to_disk("article-titles.hf")

Saving the dataset (1/1 shards): 100%|██████████| 2434/2434 [00:00<00:00, 459427.39 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 215/215 [00:00<00:00, 59668.85 examples/s]
Saving the dataset (1/1 shards): 100%|██████████| 215/215 [00:00<00:00, 64293.12 examples/s]
