In [1]:
from datasets import load_dataset
from torch.utils.data import IterableDataset, DataLoader
import torch
import os
from transformers import LlamaTokenizerFast

In [2]:
# Show which Hugging Face cache-related environment variables are set in this
# kernel; unset variables print as None.
for env_name in (
    "HF_HOME",
    "HF_DATASETS_CACHE",
    "TRANSFORMERS_CACHE",
    "HF_HUB_CACHE",
    "HF_DATASETS_HOME",  # deprecated
):
    print(f"{env_name} =", os.environ.get(env_name))


HF_HOME = /home/remote/u1138167/JoeyData/hf_home
HF_DATASETS_CACHE = None
TRANSFORMERS_CACHE = None
HF_HUB_CACHE = None
HF_DATASETS_HOME = None


In [3]:
# ⚙️ Config
# Tokens per training sequence; passed to the dataset as `chunk_size`.
CHUNK_SIZE = 512
BUFFER_TEXT_SIZE = 1000  # Number of samples to buffer before tokenizing (tune this)
# Sequences per batch; passed to the DataLoader as `batch_size`.
BATCH_SIZE = 32
# Passed to the DataLoader as `num_workers`.
NUM_WORKERS = 8

In [4]:
# 🔠 Load tokenizer
# This tokenizer is handed to the dataset below to turn text into token ids
# and is reused later to decode a sample sequence back to text.
tokenizer = LlamaTokenizerFast.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")

In [5]:
# 🌊 Load streaming dataset
# streaming=True returns an iterable dataset that yields examples lazily
# rather than materialising the whole split up front. Only the "text" field of
# each example is consumed further down.
hf_dataset = load_dataset(
    "HuggingFaceFW/fineweb",
    data_dir="sample/10BT",
    split="train",
    streaming=True
)

In [None]:
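# print(len(hf_dataset))  # NOTE(review): len() is not expected to work on a streaming dataset; the count recorded below presumably came from an earlier non-streaming load. Confirm, then replace this cell with a markdown note or remove it.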

14868862


In [18]:
class BufferedStreamTokenChunkDataset(IterableDataset):
    """Pack a stream of text examples into fixed-length next-token chunks.

    Examples are read from ``hf_streaming_dataset`` one at a time and their
    ``"text"`` fields are collected. Every ``buffer_text_size`` examples the
    collected texts are joined with a single space, tokenized in one call and
    appended to a running token buffer. The buffer is then cut into windows;
    each window is yielded as a dict with

    * ``"inputs"``: ``chunk_size`` token ids, and
    * ``"labels"``: the same window shifted one token to the right,

    both as 1-D ``torch.long`` tensors. Consecutive chunks advance by
    ``chunk_size`` tokens, so the last label of one chunk is the first input
    of the next and every token is predicted exactly once. Tokens left at the
    end of the stream that cannot fill a full ``chunk_size + 1`` window are
    dropped.

    NOTE: texts are joined with a plain space; this class does not insert any
    explicit end-of-document marker between examples.

    NOTE(review): no per-worker sharding is done here. When used with
    ``DataLoader(num_workers > 0)`` it is assumed that the wrapped dataset
    splits the stream across workers itself -- confirm for the dataset in use,
    otherwise every worker yields the same chunks.

    Args:
        hf_streaming_dataset: Iterable of mappings that each have a ``"text"``
            entry.
        tokenizer: Object exposing ``encode(str)`` that returns a list of
            token ids. It is always called with the text as its only argument.
        chunk_size: Number of tokens per yielded sequence; must be >= 1.
        buffer_text_size: Number of examples to accumulate before tokenizing.

    Raises:
        ValueError: If ``chunk_size`` is smaller than 1 (the chunking loop
            could never make progress).
    """

    def __init__(self, hf_streaming_dataset, tokenizer, chunk_size, buffer_text_size=10000):
        if chunk_size < 1:
            raise ValueError(f"chunk_size must be >= 1, got {chunk_size}")
        self.dataset = hf_streaming_dataset
        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.buffer_text_size = buffer_text_size

    def _encode_texts(self, texts):
        """Tokenize a batch of texts as one space-joined string.

        Used for both the periodic refill and the final flush so the whole
        stream is tokenized with identical settings.
        """
        return self.tokenizer.encode(" ".join(texts))

    def _pop_chunks(self, token_buffer):
        """Yield every complete chunk in ``token_buffer`` and trim it in place.

        Walks the buffer with an offset and deletes the consumed prefix once
        at the end, instead of re-slicing (and copying) the whole remaining
        buffer after every chunk.
        """
        size = self.chunk_size
        start = 0
        # A chunk needs size + 1 tokens: `size` inputs plus one extra token
        # so the labels can be shifted by one.
        while len(token_buffer) - start >= size + 1:
            window = token_buffer[start:start + size + 1]
            yield {
                "inputs": torch.tensor(window[:-1], dtype=torch.long),
                "labels": torch.tensor(window[1:], dtype=torch.long),
            }
            start += size
        # Keep the unconsumed tail (including the overlap token) for the next
        # refill.
        del token_buffer[:start]

    def __iter__(self):
        pending_texts = []
        token_buffer = []

        for example in self.dataset:
            pending_texts.append(example["text"])
            if len(pending_texts) >= self.buffer_text_size:
                token_buffer.extend(self._encode_texts(pending_texts))
                pending_texts = []
                yield from self._pop_chunks(token_buffer)

        # Final flush: tokenize whatever texts are still pending (fewer than
        # buffer_text_size) and emit the remaining complete chunks.
        if pending_texts:
            token_buffer.extend(self._encode_texts(pending_texts))
        yield from self._pop_chunks(token_buffer)

In [19]:
# Wrap the streaming dataset so it yields fixed-length {"inputs", "labels"}
# token chunks, using the sizes from the config cell.
dataset = BufferedStreamTokenChunkDataset(
    hf_streaming_dataset=hf_dataset,
    tokenizer=tokenizer,
    chunk_size=CHUNK_SIZE,
    buffer_text_size=BUFFER_TEXT_SIZE
)


In [20]:

# Batch the chunk stream. No collate_fn is given, so the default collation
# stacks the per-chunk tensors under the same dict keys. No shuffle argument
# is passed: the dataset is an IterableDataset, so order is whatever the
# stream produces.
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True
)


In [21]:
# Smoke test: pull a single batch.
# NOTE(review): each iter(dataloader) call starts a fresh set of worker
# processes, so re-running this cell repeatedly is expensive. The
# "can only test a child process" messages recorded below are raised from
# DataLoader worker shutdown (_shutdown_workers) -- presumably stale iterators
# from earlier runs of this cell; confirm on a fresh kernel.
one_batch = next(iter(dataloader))


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d37a77aaa20>
Traceback (most recent call last):
  File "/home/remote/u1138167/JoeyLLM/.venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1663, in __del__
    self._shutdown_workers()
  File "/home/remote/u1138167/JoeyLLM/.venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1646, in _shutdown_workers
    if w.is_alive():
       ^^^^^^^^^^^^
  File "/usr/lib/python3.12/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7d37a77aaa20>
Traceback (most recent call last):
  File "/home/remote/u1138167/JoeyLLM/.venv/lib/python3.12/site-packages/torch/utils/data/dataloader.py", line 1663, in __del__
    self._shutdown_workers()
  File "/home

In [33]:

# The batch should come back as a dict (one entry per key yielded by the dataset).
print(type(one_batch))

<class 'dict'>


In [34]:
# Expect the two keys the dataset yields: "inputs" and "labels".
print(one_batch.keys())


dict_keys(['inputs', 'labels'])


In [36]:
# Expect (BATCH_SIZE, CHUNK_SIZE).
print(one_batch['inputs'].size())

torch.Size([32, 512])


In [38]:
# Raw token ids of the first sequence in the batch.
print(one_batch['inputs'][0])

tensor([   91,   860,   287, 11579,  3962,  5659,    25, 57049, 28257,   369,
          279, 10563,   315,  7552,   220,   806,   339,  7511,    91,    43,
          321,  8651, 41691,   220,    16,   220,   679,    18,    11,   220,
         2545,    25,  2970,  6912,  7511,  8161,   956,  2512,   922, 60470,
        17146, 12315, 32801,   268, 12278,   268,    13,  4418,   956,  2512,
          922,  8388,    72,    11,  2216,    11,   719, 16026,   430,   584,
          636,  1063,  1695,   330, 87434,  2891, 17455,   480, 37420,  3001,
            1,  2947, 49121, 16290, 40720,  5518,   704,   315,   433,    13,
         1628,   330,   943, 15456,     1, 16024,    13,  5321,    13,  1442,
         1193,   627,   790, 15229, 55994,  3001, 27597,    37, 55994,    11,
        27597,    37, 55994,    11, 27597,    37, 55994, 17523,   551,  1557,
          261,   512,    91, 69780, 28257,   369,   279, 10563,   315,  7552,
          220,   806,   339,  9787, 83931,    25,  5513,    11, 

In [41]:
# Convert the first sequence to a plain Python list of ints for decoding.
token_ids = one_batch['inputs'][0].tolist()

In [42]:
# Decode with the same tokenizer object that was passed to the dataset.
decoded_text = tokenizer.decode(token_ids)

In [43]:
# Eyeball the decoded chunk; it can span several source documents because
# texts are space-joined before tokenization.
print(decoded_text)

|Viewing Single Post From: Spoilers for the Week of February 11th|
|Lil||Feb 1 2013, 09:58 AM|
Don't care about Chloe/Taniel/Jen-Jen. Don't care about Sami, really, but hoping that we get some good "SAMANTHA GENE!!" Marlena Death-Stares out of it. And "newfound" feelings. Please. If only.
STEFANO!! STEFANO, STEFANO, STEFANO!!!! :cheer:
|Spoilers for the Week of February 11th · DAYS: News, Spoilers & Discussion| *sigh* Fundamentalist community, let me pass on some advice to you I learned from the atheistic community:
If you have set yourself on fire, do not run.
Okay? Okay?? Please?
Look, D, you had two months to say to Harvard in private emails, "Im sorry, I shouldnt have been using that animation in my paid presentations. I wont use it again. I really do like 'Inner Life', though, and would love to use it in classroom presentations, from the BioVisions site, if that is acceptable."
I sat here, for two months, waiting for that to happen, anything to happen, and it didnt. Two months, on

In [44]:

# Round-trip check: re-encode the decoded text with the SAME tokenizer that
# produced the ids, so the count is comparable to CHUNK_SIZE. (The previous
# version used `tiktoken`, which is never imported in this notebook and uses a
# different vocabulary from `tokenizer`.)
# add_special_tokens=False keeps the tokenizer from prepending extra special
# tokens that were not part of the original chunk.
# NOTE: decode -> encode is not guaranteed to be lossless, so a count slightly
# different from CHUNK_SIZE is possible.
roundtrip_ids = tokenizer.encode(decoded_text, add_special_tokens=False)
print("Token count:", len(roundtrip_ids))


Token count: 512
