In [19]:
import numpy as np
import torch
import pandas as pd
from datasets import load_dataset
import tiktoken

In [21]:
# Tokenizer: tiktoken's "gpt2" encoding.
enc = tiktoken.get_encoding("gpt2")

# Corpus: the "wikitext-2-raw-v1" configuration of the "wikitext" dataset.
dataset = load_dataset("wikitext", "wikitext-2-raw-v1")

def tokenize(example):
    """Encode one example's text into token ids.

    Args:
        example: Mapping with a ``"text"`` entry holding the string to encode.

    Returns:
        A dict with a single key, ``"input_ids"``, holding the value returned
        by ``enc.encode`` for that text.
    """
    # allowed_special=set() passes an empty allow-list for special tokens.
    # NOTE(review): with tiktoken's defaults this presumably means text that
    # contains a special-token string raises instead of being encoded - confirm.
    token_ids = enc.encode(example["text"], allowed_special=set())
    return {"input_ids": token_ids}

# Run the tokenizer over every split and drop the raw "text" column.
tokenized = dataset.map(
    function=tokenize,
    remove_columns=["text"],
)
tokenized

Map: 100%|██████████| 4358/4358 [00:00<00:00, 5356.56 examples/s]
Map: 100%|██████████| 36718/36718 [00:05<00:00, 6537.05 examples/s]
Map: 100%|██████████| 3760/3760 [00:01<00:00, 3490.53 examples/s]


DatasetDict({
    test: Dataset({
        features: ['input_ids'],
        num_rows: 4358
    })
    train: Dataset({
        features: ['input_ids'],
        num_rows: 36718
    })
    validation: Dataset({
        features: ['input_ids'],
        num_rows: 3760
    })
})

In [24]:
def create_chunks(dataset_split, chunk_size=256):
    """Concatenate all token ids of a split and cut them into fixed-size rows.

    The ``"input_ids"`` of every example are joined, in iteration order, into
    one flat token stream (nothing is inserted between examples). Trailing
    tokens that do not fill a complete chunk are dropped.

    Args:
        dataset_split: Iterable of examples, each a mapping whose
            ``"input_ids"`` entry is a sequence of integer token ids.
        chunk_size: Number of tokens per row; must be positive.

    Returns:
        ``np.ndarray`` of dtype ``int64`` with shape ``(n_chunks, chunk_size)``,
        where ``n_chunks = total_tokens // chunk_size``. When there are fewer
        than ``chunk_size`` tokens (including none at all) the shape is
        ``(0, chunk_size)``.

    Raises:
        ValueError: If ``chunk_size`` is not positive.
    """
    # Fail early with a clear message instead of a ZeroDivisionError (for 0)
    # or an opaque reshape error (for negative sizes) further down.
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be a positive integer, got {chunk_size}")

    all_tokens = []
    for example in dataset_split:
        all_tokens.extend(example["input_ids"])

    # Pin the dtype: an empty list would otherwise become a float64 array,
    # and the platform default integer width would leak into the result.
    all_tokens = np.asarray(all_tokens, dtype=np.int64)

    n_chunks = len(all_tokens) // chunk_size
    # Drop the incomplete tail, then reshape with an explicit row count so the
    # result shape never depends on -1 inference.
    return all_tokens[: n_chunks * chunk_size].reshape(n_chunks, chunk_size)

# Chunk every split with the same row length; each split is processed
# independently, so the order of the calls does not matter.
train_data, val_data, test_data = (
    create_chunks(tokenized[split_name], chunk_size=256)
    for split_name in ("train", "validation", "test")
)

# Shapes are (num_sequences, 256).
print(f"Train: {train_data.shape}")
print(f"Val: {val_data.shape}")

Train: (9343, 256)
Val: (965, 256)
