# Big data? Streaming to the rescue!

Install the Transformers and Datasets libraries to run this notebook.

In [None]:
!pip install datasets transformers[sentencepiece]

In [None]:
!pip install zstandard

In [None]:
from datasets import load_dataset

# Zstandard-compressed JSON Lines file holding the YouTube subtitles data.
data_files = "https://the-eye.eu/public/AI/pile_preliminary_components/yt_subs.jsonl.zst"

# Heads-up: this call takes a few minutes to finish, so go grab a coffee :)
youtube_dataset = load_dataset(
    "json",
    data_files=data_files,
    split="train",
)

# Display the dataset summary as the cell output.
youtube_dataset

Dataset({
    features: ['text', 'meta'],
    num_rows: 173651
})

In [None]:
# Show the first example to see what a record looks like
youtube_dataset[0]

{'meta': {'id': 'ujBi9Ba8hqs'},
 'text': 'Music\nMusic\nNarrator: What safeguards our solar system...\nis our star.\nThe sun provides a shield, stretching beyond the last\nplanet in its orbit: a force field that deflects these\n"cosmic rays...'}

In [None]:
import psutil

# memory_info().rss is a byte count; dividing by 1024 ** 2 expresses it in megabytes
rss_bytes = psutil.Process().memory_info().rss
print(f"RAM used: {rss_bytes / 1024 ** 2:.2f} MB")

RAM used: 195.76 MB

In [None]:
# `num_rows` is the number of examples (one per record), which is what the first
# line reports; `dataset_size` is a byte count and is only used for the GB figure.
print(f"Number of files in dataset : {youtube_dataset.num_rows}")
print(f"Dataset size (cache file) : {youtube_dataset.dataset_size / (1024 ** 3):.2f} GB")

Number of files in dataset : 173651
Dataset size (cache file) : 3.74 GB

In [None]:
import timeit

# Statement to benchmark: walk over the whole dataset in slices of 1,000 examples.
code_snippet = """batch_size = 1000

for idx in range(0, len(youtube_dataset), batch_size):
    _ = youtube_dataset[idx:idx + batch_size]
"""

# Dataset size in GB, computed here so the cell runs on a fresh kernel
# (the report below needs `size`, which no earlier cell defines).
size = youtube_dataset.dataset_size / (1024 ** 3)

# number=1: a single full pass; globals() lets the snippet see `youtube_dataset`.
time = timeit.timeit(stmt=code_snippet, number=1, globals=globals())
print(
    f"Iterated over {len(youtube_dataset)} examples (about {size:.1f} GB) in "
    f"{time:.1f}s, i.e. {size/time:.3f} GB/s"
)

Iterated over 173651 examples (about 3.7 GB) in 8.4s, i.e. 0.446 GB/s

In [None]:
# Same source and split as the earlier load_dataset call, but with streaming=True
youtube_dataset_streamed = load_dataset("json", data_files=data_files, split="train", streaming=True)

In [None]:
# The streamed dataset is consumed through an iterator; fetch its first example
next(iter(youtube_dataset_streamed))

{'meta': {'id': 'ujBi9Ba8hqs'},
 'text': 'Music\nMusic\nNarrator: What safeguards our solar system...\nis our star.\nThe sun provides a shield, stretching beyond the last\nplanet in its orbit: a force field that deflects these\n"cosmic rays...'}

In [None]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")


def tokenize_text(example):
    """Run the tokenizer on the "text" field of a single example."""
    return tokenizer(example["text"])


# Apply the tokenization to the streamed dataset and peek at the first result.
tokenized_dataset = youtube_dataset_streamed.map(tokenize_text)
next(iter(tokenized_dataset))

{'input_ids': [101, 2189, 2189, 11185, 1024, ...], 'attention_mask': [1, 1, 1, 1, 1, ...]}

In [None]:
# Shuffle the streamed dataset (buffer_size=10_000, fixed seed so reruns use the
# same seed), then fetch the first example of the shuffled stream
shuffled_dataset = youtube_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
# Shuffle the streamed YouTube dataset with a fixed seed and fetch one example.
# The source is `youtube_dataset_streamed`, the streamed dataset this notebook
# defines; no variable named `streamed_dataset` exists.
shuffled_dataset = youtube_dataset_streamed.shuffle(buffer_size=10_000, seed=42)
next(iter(shuffled_dataset))

In [None]:
# take(5) keeps the first five examples of the stream; list() materializes them for display
dataset_head = youtube_dataset_streamed.take(5)
list(dataset_head)

[{'meta': {'id': 'ujBi9Ba8hqs'},
  'text': 'Music\nMusic\nNarrator: What safeguards our solar system...\nis our star.\nThe sun provides a shield, stretching beyond the last\nplanet in its orbit: a force field that deflects these\n"cosmic rays...'},
 {'meta': {'id': '68-JMZZLw0g'},
  'text': "Ferrous Corp sorumlu değildi..."},
 {'meta': {'id': 'VXr0LMzPZDw'},
  'text': "Hey Guys welcome to Parkour meets Trial Bike\nToday Daniel is in Berlin again ..."},
 {'meta': {'id': 'p_friY7NrVM'},
  'text': "Hey Youtube Boy.... Get in the ring! Now get\nout the ring! ..."},
 {'meta': {'id': 'SFCM0Quorsg'},
  'text': 'THIS IS AN EDUCATIONAL VIDEO REVIEW UNDER FAIR USE ACT AND WE WILL SEE THAT OUR SCIENCE GURUS ARE EXTORTIONISTS,WITHOLDING THE TRUE NATURE OF THE FIXED EARTH AND THE HEAVENS ABOUT US ...'}]

In [None]:
# Validation set: the first 1,000 examples of the shuffled stream
validation_dataset = shuffled_dataset.take(1_000)
# Training set: everything that comes after those first 1,000 examples
train_dataset = shuffled_dataset.skip(1_000)

In [None]:
# Stream the FreeLaw opinions file the same way as the YouTube subtitles.
law_dataset_streamed = load_dataset(
    "json",
    data_files="https://the-eye.eu/public/AI/pile_preliminary_components/FreeLaw_Opinions.jsonl.zst",
    split="train",
    streaming=True,
)

# Peek at the first example.
next(iter(law_dataset_streamed))

{'meta': {'case_ID': '110921.json',
  'case_jurisdiction': 'scotus.tar.gz',
  'date_created': '2010-04-28T17:12:49Z'},
 'text': '\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued January 19, 1983.\nDecided April 26, 1983.\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}

In [None]:
from itertools import islice
from datasets import interleave_datasets

# Combine the two streamed datasets into a single stream.
combined_dataset = interleave_datasets(
    [youtube_dataset_streamed, law_dataset_streamed]
)

# islice stops after two examples, so only the head of the stream is consumed.
list(islice(combined_dataset, 2))

[{'meta': {'id': 'ujBi9Ba8hqs'},
  'text': 'Music\nMusic\nNarrator: What safeguards our solar system...\nis our star.\nThe sun provides a shield, stretching beyond the last\nplanet in its orbit: a force field that deflects these\n"cosmic rays...'},
 {'meta': {'case_ID': '110921.json',
   'case_jurisdiction': 'scotus.tar.gz',
   'date_created': '2010-04-28T17:12:49Z'},
  'text': '\n461 U.S. 238 (1983)\nOLIM ET AL.\nv.\nWAKINEKONA\nNo. 81-1581.\nSupreme Court of United States.\nArgued January 19, 1983.\nDecided April 26, 1983.\nCERTIORARI TO THE UNITED STATES COURT OF APPEALS FOR THE NINTH CIRCUIT\n*239 Michael A. Lilly, First Deputy Attorney General of Hawaii, argued the cause for petitioners. With him on the brief was James H. Dannenberg, Deputy Attorney General...'}]

In [None]:
base_url = "https://the-eye.eu/public/AI/pile/"

# Train split: 30 shards named 00.jsonl.zst .. 29.jsonl.zst under train/;
# validation and test are single files directly under the base URL.
data_files = {
    "train": [f"{base_url}train/{shard:02d}.jsonl.zst" for shard in range(30)],
    "validation": f"{base_url}val.jsonl.zst",
    "test": f"{base_url}test.jsonl.zst",
}

# No `split` argument here: the result is indexed by split name below.
pile_dataset = load_dataset("json", data_files=data_files, streaming=True)
next(iter(pile_dataset["train"]))

{'meta': {'pile_set_name': 'Pile-CC'},
 'text': 'It is done, and submitted. You can play “Survival of the Tastiest” on Android, and on the web...'}