In [1]:
from datasets import load_dataset

In [5]:
fw_sample = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", cache_dir="data/raw/", num_proc=12)

Resolving data files:   0%|          | 0/1630 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/9672101 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/104 [00:00<?, ?it/s]

In [7]:
fw_sample

DatasetDict({
    train: Dataset({
        features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
        num_rows: 9672101
    })
})

In [17]:
fw_filtered = fw_sample["train"].filter(lambda x: x["score"] >= 3.69, num_proc=14)
fw_filtered.num_rows

Filter (num_proc=14):   0%|          | 0/9672101 [00:00<?, ? examples/s]

682847

In [18]:
fw_filtered

Dataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_rows: 682847
})

In [19]:
train_test = fw_filtered.train_test_split(test_size=.1, seed=42)
train_set = train_test["train"]
val_set = train_test["test"]

In [23]:
train_set.save_to_disk("../data/train_fineweb_edu_369")
val_set.save_to_disk("../data/val_fineweb_edu_369")


Saving the dataset (0/7 shards):   0%|          | 0/614562 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/68285 [00:00<?, ? examples/s]

In [3]:
import os

In [7]:
os.listdir("../data")

['val_fineweb_edu_369', 'train_fineweb_edu_369']

In [6]:
os.listdir("../data/val_fineweb_edu_369")

['state.json', 'dataset_info.json', 'data-00000-of-00001.arrow']

In [17]:
base_path = "../data"
for dir in os.listdir(base_path):
    print(dir)
    for _file in os.listdir(os.path.join(base_path, dir)):
        print(_file)
        print(os.path.join(base_path, dir, _file))

val_fineweb_edu_369
state.json
../data/val_fineweb_edu_369/state.json
dataset_info.json
../data/val_fineweb_edu_369/dataset_info.json
data-00000-of-00001.arrow
../data/val_fineweb_edu_369/data-00000-of-00001.arrow
train_fineweb_edu_369
state.json
../data/train_fineweb_edu_369/state.json
data-00005-of-00007.arrow
../data/train_fineweb_edu_369/data-00005-of-00007.arrow
dataset_info.json
../data/train_fineweb_edu_369/dataset_info.json
data-00000-of-00007.arrow
../data/train_fineweb_edu_369/data-00000-of-00007.arrow
data-00002-of-00007.arrow
../data/train_fineweb_edu_369/data-00002-of-00007.arrow
data-00004-of-00007.arrow
../data/train_fineweb_edu_369/data-00004-of-00007.arrow
data-00006-of-00007.arrow
../data/train_fineweb_edu_369/data-00006-of-00007.arrow
data-00003-of-00007.arrow
../data/train_fineweb_edu_369/data-00003-of-00007.arrow
data-00001-of-00007.arrow
../data/train_fineweb_edu_369/data-00001-of-00007.arrow


In [14]:
os.path.join(base_path, dir)

'../data/val_fineweb_edu_369'

In [15]:
os.listdir(os.path.join(base_path, os.listdir(dir)))

FileNotFoundError: [Errno 2] No such file or directory: 'val_fineweb_edu_369'

In [53]:
from datasets import load_from_disk
import tiktoken
import numpy as np
import multiprocessing as mp
from tqdm import tqdm

In [36]:
shard_size = int(1e8) # shards fit 100mil tokens
n_shards_train = 7
n_shards_val = 1

In [41]:
tokenizer = tiktoken.get_encoding("gpt2")
ds_train = load_from_disk(dataset_path="../data/train_fineweb_edu_369")
ds_val = load_from_disk(dataset_path="../data/val_fineweb_edu_369")


In [39]:
def tokenize_single(text):
    tokens = [tokenizer._special_tokens['<|endoftext|>']]
    tokens.extend(tokenizer.encode_ordinary(text))
    tokens_np = np.array(tokens)
    return tokens_np.astype(np.uint16)

def write_file(filename, np_tokens):
    np.save(filename, np_tokens)

In [58]:
def preprocess(path: str) -> None:
    ds = load_from_disk(dataset_path=path)
    token_count = 0
    shard_count = 0
    np_tokens = np.empty((shard_size,), dtype=np.uint16)
    for text in tqdm(ds["text"]):
        tokens = tokenize_single(text)
        np_tokens[token_count:token_count+len(tokens)] = tokens
        if token_count + len(tokens) < shard_size:
            token_count += len(tokens)
        else:
            np_shard = os.path.join(path, f"np_tokens_{shard_count}")
            write_file(np_shard, np_tokens)
            shard_count += 1

In [57]:
preprocess("../data/train_fineweb_edu_369")

  0%|          | 0/5 [00:00<?, ?it/s]


ValueError: could not broadcast input array from shape (989,) into shape (4,)

In [32]:
sum(ds["token_count"])

663505152

In [33]:
ds

Dataset({
    features: ['text', 'id', 'dump', 'url', 'file_path', 'language', 'language_score', 'token_count', 'score', 'int_score'],
    num_rows: 614562
})

In [61]:
np_train_short = np.load("../data/train_fineweb_edu_369/train_np_tokens_000001.npy")

In [62]:
np_train_short.shape

(99999620,)

In [63]:
np_train_short[0]

np.uint16(50256)

In [68]:
import os
import torch

In [69]:
def load_tokens(filename):
    npt = np.load(filename).astype(np.int32)
    return torch.tensor(npt, dtype=torch.long)

In [73]:
class DataLoaderFine:
    def __init__(self, B, T, split, data_root) -> None:
        self.B = B
        self.T = T
        assert split in {"train", "val"}

        # get the shard filenames
        shards = os.listdir(data_root)
        print(shards)
        # we train on the numpy files
        shards = [s for s in shards if s.split(".")[-1]==".npy"]
        print(shards)
        shards = sorted(shards)
        self.shards = [os.path.join(data_root, s) for s in shards]
        assert len(shards) > 0, f"no shards found for split {split}"
        print(f"found {len(shards)} shards for split {split}")

        self.current_shard = 0
        self.tokens = load_tokens(self.shards[self.current_shard])
        self.current_position = self.B * self.T
    
    def next_batch(self) -> None:
        B, T = self.B, self.T
        buf = self.tokens[self.current_position : self.current_position+B*T+1]
        x = (buf[:-1]).view(B, T) # inputs
        y = (buf[1:]).view(B, T) # outputs
        # advance the position in the tensor
        self.current_position += B * T
        # if loading the next batch would be out of bounds, reset
        if self.current_position + (B * T + 1) > len(self.tokens):
            self.current_shard = (self.current_shard + 1) % len(self.shards)
            self.tokens = load_tokens(self.shards[self.current_shard])
            self.current_position = B * T
        return x, y     

In [74]:
val_loader = DataLoaderFine(B=10, T=1024, split="val", data_root="../data/val_fineweb_edu_369")

['state.json', 'dataset_info.json', 'val_np_tokens_000000.npy', 'data-00000-of-00001.arrow']
[]


AssertionError: no shards found for split val

In [75]:
data_root="../data/val_fineweb_edu_369"

shards_ = os.listdir(data_root)

print(shards_)

shards_ = [s for s in shards_ if s.split(".")[-1] == "npy"]

['state.json', 'dataset_info.json', 'val_np_tokens_000000.npy', 'data-00000-of-00001.arrow']
