In [1]:
import torch

In [2]:
torch.__version__

'2.0.1+cu118'

In [4]:
import os
from tqdm import tqdm
import numpy as np
import tiktoken
from datasets import load_dataset # huggingface datasets

# number of workers in .map() call
# good number to use is ~order number of cpu cores // 2
num_proc = 8

# takes 54GB in huggingface .cache dir, about 8M documents (8,013,769)
dataset = load_dataset("openwebtext")
small_dataset = dataset['train'].select(range(50000))
split_dataset = small_dataset["train"].train_test_split(test_size=0.05, seed=2357, shuffle=True)
# owt by default only contains the 'train' split, so create a test split
#split_dataset = dataset["train"].train_test_split(test_size=0.0005, seed=2357, shuffle=True)
#split_dataset['val'] = split_dataset.pop('test') # rename the test split to val


Found cached dataset openwebtext (/home/ubuntu/.cache/huggingface/datasets/openwebtext/plain_text/1.0.0/6f68e85c16ccc770c0dd489f4008852ea9633604995addd0cd76e293aed9e521)


  0%|          | 0/1 [00:00<?, ?it/s]

KeyError: "Column train not in the dataset. Current columns in the dataset: ['text']"

In [6]:
td = dataset['train']

In [None]:
split_dataset

In [7]:
td

Dataset({
    features: ['text'],
    num_rows: 8013769
})

In [19]:
small_set = td.select(range(40000))

In [20]:
small_set

Dataset({
    features: ['text'],
    num_rows: 40000
})

In [21]:
new_set = small_set.train_test_split(test_size=0.05, seed=2357, shuffle=True)

In [22]:
new_set

DatasetDict({
    train: Dataset({
        features: ['text'],
        num_rows: 38000
    })
    test: Dataset({
        features: ['text'],
        num_rows: 2000
    })
})

In [23]:
new_set['val'] = new_set.pop('test') 

In [24]:
split_dataset = new_set

In [25]:
enc = tiktoken.get_encoding("gpt2")
def process(example):
    ids = enc.encode_ordinary(example['text']) # encode_ordinary ignores any special tokens
    ids.append(enc.eot_token) # add the end of text token, e.g. 50256 for gpt2 bpe
    # note: I think eot should be prepended not appended... hmm. it's called "eot" though...
    out = {'ids': ids, 'len': len(ids)}
    return out

# tokenize the dataset
tokenized = split_dataset.map(
    process,
    remove_columns=['text'],
    desc="tokenizing the splits",
    num_proc=num_proc,
)

# concatenate all the ids in each dataset into one large file we can use for training
for split, dset in tokenized.items():
    arr_len = np.sum(dset['len'])
    filename = f'{split}.bin'
    dtype = np.uint16 # (can do since enc.max_token_value == 50256 is < 2**16)
    arr = np.memmap(filename, dtype=dtype, mode='w+', shape=(arr_len,))

    print(f"writing {filename}...")
    idx = 0
    for example in tqdm(dset):
        arr[idx : idx + example['len']] = example['ids']
        idx += example['len']
    arr.flush()

tokenizing the splits (num_proc=8):   0%|          | 0/38000 [00:00<?, ? examples/s]

tokenizing the splits (num_proc=8):   0%|          | 0/2000 [00:00<?, ? examples/s]

writing train.bin...


100%|██████████| 38000/38000 [00:18<00:00, 2028.24it/s]


writing val.bin...


100%|██████████| 2000/2000 [00:01<00:00, 1996.94it/s]
