In [1]:
from datasets import load_dataset
from torch.utils.data import IterableDataset, DataLoader
import tiktoken
import torch
import os

In [2]:
for var in [
    "HF_HOME",
    "HF_DATASETS_CACHE",
    "TRANSFORMERS_CACHE",
    "HF_HUB_CACHE",
    "HF_DATASETS_HOME",  # deprecated
]:
    print(f"{var} =", os.getenv(var))


HF_HOME = /home/remote/u1138167/JoeyData/hf_home
HF_DATASETS_CACHE = None
TRANSFORMERS_CACHE = None
HF_HUB_CACHE = None
HF_DATASETS_HOME = None


In [3]:
# ⚙️ Config
CHUNK_SIZE = 512
BUFFER_TEXT_SIZE = 1000  # Number of samples to buffer before tokenizing (tune this)
BATCH_SIZE = 32
NUM_WORKERS = 8

In [4]:
# 🔠 Load tokenizer
tokenizer = tiktoken.get_encoding("cl100k_base")

In [5]:
# 🌊 Load streaming dataset
hf_dataset = load_dataset(
    "HuggingFaceFW/fineweb",
    data_dir="sample/10BT",
    split="train",
    streaming=False
)

Loading dataset shards:   0%|          | 0/102 [00:00<?, ?it/s]

In [None]:
# print(len(hf_dataset))

14868862


In [7]:
class BufferedStreamTokenChunkDataset(IterableDataset):
    def __init__(self, hf_streaming_dataset, tokenizer, chunk_size, buffer_text_size=10000):
        self.dataset = hf_streaming_dataset
        self.tokenizer = tokenizer
        self.chunk_size = chunk_size
        self.buffer_text_size = buffer_text_size

    def __iter__(self):
        buffer = []
        token_buffer = []

        for example in self.dataset:
            buffer.append(example["text"])
            if len(buffer) >= self.buffer_text_size:
                tokenized = self.tokenizer.encode(
                    " ".join(buffer),
                    allowed_special=self.tokenizer.special_tokens_set
                )
                token_buffer.extend(tokenized)
                buffer = []

                while len(token_buffer) >= self.chunk_size + 1:
                    input_ids = token_buffer[:self.chunk_size]
                    target_ids = token_buffer[1:self.chunk_size + 1]

                    yield {
                        "input_ids": torch.tensor(input_ids, dtype=torch.long),
                        "labels": torch.tensor(target_ids, dtype=torch.long)
                    }

                    token_buffer = token_buffer[self.chunk_size:]

        # Final flush
        if buffer:
            tokenized = self.tokenizer.encode(
                " ".join(buffer),
                allowed_special=self.tokenizer.special_tokens_set
            )
            token_buffer.extend(tokenized)

        
        while len(token_buffer) >= self.chunk_size + 1:
            input_ids = token_buffer[:self.chunk_size]
            target_ids = token_buffer[1:self.chunk_size + 1]

            yield {
                "input_ids": torch.tensor(input_ids, dtype=torch.long),
                "labels": torch.tensor(target_ids, dtype=torch.long)
            }   

            token_buffer = token_buffer[self.chunk_size:]

In [8]:
dataset = BufferedStreamTokenChunkDataset(
    hf_streaming_dataset=hf_dataset,
    tokenizer=tokenizer,
    chunk_size=CHUNK_SIZE,
    buffer_text_size=BUFFER_TEXT_SIZE
)


In [9]:

dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    num_workers=NUM_WORKERS,
    pin_memory=True
)




In [11]:
one_batch = next(iter(dataloader))


In [13]:

print(type(one_batch))

<class 'dict'>


In [15]:
print(one_batch)

tensor([[   91,   860,   287,  ...,  2019,   330,    40],
        [  574,  5076,     1,  ...,   339,   969,    13],
        [  578, 81960,  1131,  ...,   279, 43732,  4430],
        ...,
        [  323, 43641,   323,  ..., 89595, 14238, 14134],
        [  922, 55182,  7694,  ...,    18,     8,   482],
        [64477,  4476, 69131,  ...,   482,  2650,  5195]])


In [16]:
token_ids = one_batch[10].tolist()

In [17]:
decoded_text = tokenizer.decode(token_ids)

In [18]:
print(decoded_text)

 likely to vote tomorrow, Thursday, on the repeal of the FCC’s Net Neutrality power grab. Using the Congressional Review Act, the repeal of the Net Neutrality order can be accomplished in an expedited way. In particular this means the bill cannot be filibustered in the Senate, so passing it means something. As Seton Motley said: This is our first opportunity | Read More » Game Index |
Deeper into the DarklandsYour Next Campaign picks up the action at Act II, in Beneath a Granite Sky, Part II.
[ Read FAQ | Subscribe to RSS | Partner Sites | Contact Us | Advertise with Us ]
Copyright © 1996-2009 Skotos Tech, Inc. & individual authors, All Rights Reserved
Compilation copyright © 1996-2009 Skotos Tech, Inc.
RPGnet® is a registered trademark of Skotos Tech, Inc., all rights reserved. Great decorating addition
I have a grape/Italian theme in my kitchen. I purchased 5 of these. I decided to use them to put around my pull knobs on my overhead cabinets. Now I am ordering more to sprinkle around

In [19]:

enc = tiktoken.get_encoding("cl100k_base")
print("Token count:", len(enc.encode(decoded_text)))


Token count: 512
