In [1]:
import json
import numpy as np
from tqdm import tqdm

from transformers import PreTrainedTokenizerFast

# Token budget per window. NOTE(review): the "+1" presumably leaves room for the
# one-token shift between inputs and targets — confirm against the training code.
MAX_SEQ_LEN = 128 + 1

# Fast tokenizer rebuilt from the serialized tokenizer definition on disk.
tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    add_prefix_space=False,
    max_len=MAX_SEQ_LEN,
)

In [2]:
def save_subsequences_positioning(
        source,
        dest,
        tokenizer,
        window_size,
        batch_size=1000
):
    """Write the character span of every token window of every story to a TSV file.

    ``source`` is a JSON file holding a list of story strings. Each story is
    tokenized with offset mapping enabled, and for every token index ``i`` with
    ``i + window_size`` still inside the story one row is written to ``dest``::

        <story index>\\t<start char of token i>\\t<end char of token i+window_size, plus 1>

    Stories with ``window_size`` tokens or fewer produce no rows.

    Args:
        source: Path of the JSON file with the list of stories.
        dest: Path of the tab-separated output file (overwritten).
        tokenizer: Object exposing ``batch_encode_plus`` that returns an
            ``"offset_mapping"`` entry (one list of ``(start, end)`` pairs per text).
        window_size: Distance, in tokens, between the first and last token of a window.
        batch_size: Number of stories tokenized per ``batch_encode_plus`` call.

    Returns:
        The number of stories read from ``source``.
    """
    def _flush(f_out, batch, first_line, tokenizer):
        """Tokenize ``batch``, append its window rows to ``f_out`` and empty it.

        ``first_line`` is the index (within the full story list) of ``batch[0]``.
        """
        offsets = tokenizer.batch_encode_plus(
            batch,
            max_length=999,
            truncation=False,
            padding=False,
            return_attention_mask=False,
            return_token_type_ids=False,
            return_offsets_mapping=True
        )["offset_mapping"]

        # Each story gets its OWN index: the batch's first index plus its position
        # in the batch. (Writing one shared index for the whole batch would make
        # every row point at the wrong story.)
        for line_idx, offset in enumerate(offsets, start=first_line):
            for i in range(len(offset) - window_size):
                start = offset[i][0]
                end = offset[i + window_size][1]

                # NOTE(review): the stored end is the last token's end offset plus
                # one — confirm the extra character is intended.
                f_out.write(f"{line_idx}\t{start}\t{end+1}\n")

        batch.clear()

    with open(source, "r", encoding="utf-8") as f_in:
        data = json.load(f_in)

    with open(dest, "w") as f_out:
        line_batch = []
        for i, line in enumerate(tqdm(data)):
            line_batch.append(line)
            if len(line_batch) >= batch_size:
                # ``i`` is the index of the last story in the batch.
                _flush(f_out, line_batch, i - len(line_batch) + 1, tokenizer)

        # Trailing partial batch: its stories are the last ones in ``data``.
        if len(line_batch) > 0:
            _flush(f_out, line_batch, len(data) - len(line_batch), tokenizer)

    return len(data)

In [6]:
# Build the window-position table for the training stories.
source = "data/train-sampled.json"
dest = "data/train-sampled-positions.csv"
save_subsequences_positioning(
    source=source,
    dest=dest,
    tokenizer=tokenizer,
    window_size=MAX_SEQ_LEN,
)

100%|██████████| 135884/135884 [00:53<00:00, 2550.03it/s]


135884

In [11]:
# Convert the tab-separated train positions into a binary .npy array.
source = "data/train-sampled-positions.csv"
dest = "data/train-sampled-positions.npy"
np.save(
    dest,
    np.loadtxt(source, dtype=np.int32, delimiter="\t"),
)

In [12]:
# Build the window-position table for the validation stories.
source = "data/valid-sampled.json"
dest = "data/valid-sampled-positions.csv"
save_subsequences_positioning(
    source=source,
    dest=dest,
    tokenizer=tokenizer,
    window_size=MAX_SEQ_LEN,
)

100%|██████████| 1381/1381 [00:00<00:00, 3418.34it/s]


1381

In [14]:
# Convert the tab-separated validation positions into a binary .npy array.
source = "data/valid-sampled-positions.csv"
dest = "data/valid-sampled-positions.npy"
# int32, same as the train positions: int16 tops out at 32767, which a story
# index or a character offset into a long story can exceed.
np.save(dest, np.loadtxt(source, delimiter="\t", dtype=np.int32))

In [3]:
import io
import zlib
import torch
import ijson
import tables
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import PreTrainedTokenizerFast

# Same window length and tokenizer setup as the notebook's first cell, repeated
# here next to the dataset code that consumes them.
MAX_SEQ_LEN = 128 + 1

tokenizer = PreTrainedTokenizerFast(
    tokenizer_file="models/tokenizer.json",
    unk_token="[UNK]",
    pad_token="[PAD]",
    add_prefix_space=False,
    max_len=MAX_SEQ_LEN,
)

In [4]:
class TinyStoriesDataset(Dataset):
    """Map-style dataset that tokenizes story sub-spans on demand.

    Every sample is described by one row of the positions array:
    ``(story index, start char, end char)``. The referenced slice of the story
    text is tokenized, padded/truncated to ``seq_len`` tokens, and padded
    positions are zeroed by multiplying the ids with the attention mask.

    Args:
        stories_file: JSON file containing a list of story strings.
        positions_file: ``.npy`` file with one ``(story, start, end)`` row per sample.
        tokenizer: Callable tokenizer returning ``input_ids`` and ``attention_mask``
            tensors and supporting ``.to(device)`` on its result.
        seq_len: Number of tokens every encoding is padded/truncated to.
        device: Device the encoded tensors are moved to.
    """

    def __init__(
            self,
            stories_file,
            positions_file,
            tokenizer, 
            seq_len,
            device="cpu"
        ):
        self.stories_file = stories_file
        self.positions_file = positions_file
        self.tokenizer = tokenizer
        self.seq_len = seq_len
        self.device = device

        self._load_data()

    def _load_data(self):
        """Read all stories and the positions array into memory."""
        with open(self.stories_file, "r", encoding="utf-8") as f_in:
            self.data = list(ijson.items(f_in, "item"))

        self.positions = np.load(self.positions_file)

    def __len__(self):
        """Number of samples, i.e. rows of the positions array."""
        return len(self.positions)

    def _get_and_encode_story(self, idx):
        """Tokenize the text span of sample ``idx`` to exactly ``seq_len`` tokens."""
        pos = self.positions[idx]
        return self.tokenizer(
            self.data[pos[0]][pos[1]:pos[2]],
            padding="max_length",
            return_token_type_ids=False,
            truncation=True,
            max_length=self.seq_len,
            return_tensors="pt"
        ).to(self.device)

    def _masked_tokens(self, idx):
        """Token ids of sample ``idx`` with masked-out (padding) positions set to 0.

        The result keeps the tokenizer's leading batch dimension of size 1.
        """
        encoded = self._get_and_encode_story(idx)
        return encoded["input_ids"] * encoded["attention_mask"]

    def __getitem__(self, idx):
        """Return ``(x, y)``: the tokens without the last one, and shifted by one.

        Both tensors hold ``seq_len - 1`` tokens, so ``y[k]`` is the token that
        follows ``x[k]``. (Slicing ``x`` with ``[:, :seq_len]`` would keep all
        ``seq_len`` tokens and leave ``x`` one element longer than ``y``.)
        """
        tokens = self._masked_tokens(idx)

        # squeeze(0) drops only the batch dimension, never the token dimension.
        x = tokens[:, :-1].squeeze(0)
        y = tokens[:, 1:].squeeze(0)

        return x, y

    def get_example_tokens(self, idx):
        """Return all ``seq_len`` masked token ids of sample ``idx`` as a 1-D tensor."""
        return self._masked_tokens(idx).squeeze(0)

In [5]:
def save_instances_tokens_compressed(
    dataset: TinyStoriesDataset,
    dest_file, 
    batch_size=1000
):
    """Materialize every sample's tokens and store them as a zlib-compressed tensor.

    A ``(len(dataset), dataset.seq_len)`` long tensor is filled batch by batch
    from ``dataset.get_example_tokens``, serialized with ``torch.save`` and the
    zlib-compressed bytes are written to ``f"{dest_file}.zlib"``.

    Args:
        dataset: Source of token rows (needs ``__len__``, ``seq_len``, ``device``
            and ``get_example_tokens``).
        dest_file: Output path; the ``.zlib`` suffix is appended to it.
        batch_size: Number of samples gathered per progress-bar step.
    """
    total = len(dataset)
    tokens = torch.empty(
        size=(total, dataset.seq_len),
        dtype=torch.long,
        device=dataset.device
    )

    # Ceiling division: one extra step for a trailing partial batch.
    whole, leftover = divmod(total, batch_size)
    steps = whole + (1 if leftover != 0 else 0)

    for step in tqdm(range(steps)):
        lo = step * batch_size
        hi = min(lo + batch_size, total)
        rows = [dataset.get_example_tokens(j) for j in range(lo, hi)]
        tokens[lo:hi] = torch.stack(rows)

    buffer = io.BytesIO()
    torch.save(tokens, buffer)
    payload = zlib.compress(buffer.getvalue())

    with open(f"{dest_file}.zlib", "wb") as f_out:
        f_out.write(payload)

In [6]:
def save_instances_tokens_to_hdf5(
    dataset: TinyStoriesDataset,
    dest_file:str,
    compression:tables.Filters,
    name:str="data",
    batch_size:int=1000
):
    """Materialize every sample's tokens into a chunked int64 array of an HDF5 file.

    Creates ``dest_file`` (overwriting it), adds a ``(len(dataset), dataset.seq_len)``
    CArray called ``name`` under the file root and fills it batch by batch from
    ``dataset.get_example_tokens``.

    Args:
        dataset: Source of token rows (needs ``__len__``, ``seq_len`` and
            ``get_example_tokens``).
        dest_file: Path of the HDF5 file to create.
        compression: PyTables filters applied to the file and to the array.
        name: Node name of the array under the file root.
        batch_size: Number of samples written per progress-bar step.

    Raises:
        ValueError: If ``batch_size`` is not positive.
    """
    # Validate before opening so a bad argument never truncates an existing file.
    if batch_size <= 0:
        raise ValueError(f"batch_size must be positive, got {batch_size}")

    with tables.open_file(dest_file, "w", filters=compression) as h5_file:
        n_rows, n_cols = len(dataset), dataset.seq_len

        data = h5_file.create_carray(
            h5_file.root,
            name=name,
            atom=tables.Int64Atom(),
            shape=(n_rows, n_cols),
            filters=compression
        )

        for start in tqdm(range(0, n_rows, batch_size)):
            stop = min(start + batch_size, n_rows)
            batch = torch.stack([
                dataset.get_example_tokens(j) for j in range(start, stop)
            ]).reshape(stop - start, n_cols)
            # .cpu() is required before .numpy() when the dataset produces
            # tensors on another device; it is a no-op for CPU tensors.
            data[start:stop] = batch.cpu().numpy()

        data.flush()
        # Leaving the ``with`` block closes the file (and its nodes), so no
        # explicit close() calls are needed here.


In [7]:
# Tokenize every training window once and persist the result as HDF5.
train_stories_file = "data/train-sampled.json"
train_positions_file = "data/train-sampled-positions.npy"
train_dataset = TinyStoriesDataset(
    stories_file=train_stories_file,
    positions_file=train_positions_file,
    tokenizer=tokenizer,
    seq_len=MAX_SEQ_LEN,
)
dest = "data/train-sampled.h5"
save_instances_tokens_to_hdf5(
    dataset=train_dataset,
    dest_file=dest,
    compression=tables.Filters(complib="blosc", complevel=1),
)

100%|██████████| 13457/13457 [2:10:53<00:00,  1.71it/s] 


In [6]:
# Tokenize every validation window once and persist the result as a
# zlib-compressed tensor (written to "<dest>.zlib").
valid_stories_file = "data/valid-sampled.json"
valid_positions_file = "data/valid-sampled-positions.npy"
valid_dataset = TinyStoriesDataset(
    stories_file=valid_stories_file,
    positions_file=valid_positions_file,
    tokenizer=tokenizer,
    seq_len=MAX_SEQ_LEN,
)
dest = "data/valid-sampled.pt"
save_instances_tokens_compressed(dataset=valid_dataset, dest_file=dest)

100%|██████████| 139/139 [01:13<00:00,  1.89it/s]


In [8]:
class TinyStoriesDatasetCompressed(Dataset):
    """Dataset backed by a zlib-compressed, ``torch.save``-serialized token tensor.

    The whole tensor is decompressed and loaded onto ``device`` at construction;
    each sample is one row split into inputs (all but the last token) and
    targets (all but the first token).

    Args:
        file: Path of the compressed tensor file.
        max_seq_len: Stored on the instance; not used by the methods shown here.
        device: Device the tensor is mapped to when loading.
    """

    def __init__(
        self,
        file,
        max_seq_len,
        device="cpu"
    ):
        self.max_seq_len = max_seq_len
        self.device = device

        self._load_data(file)

    def _load_data(self, file):
        """Read ``file``, inflate it and deserialize the tensor into ``self.data``."""
        with open(file, "rb") as f_in:
            raw = zlib.decompress(f_in.read())

        self.data = torch.load(
            io.BytesIO(raw),
            map_location=self.device,
            weights_only=True,
        )

    def __len__(self):
        """Number of rows in the loaded tensor."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``(inputs, targets)`` for row ``idx``, offset by one token."""
        row = self.data[idx]
        inputs = row[:-1]
        targets = row[1:]
        return inputs, targets

In [9]:
class TinyStoriesDatasetHDF5(Dataset):
    """Dataset reading token rows lazily from an array node of an HDF5 file.

    The file is opened read-only at construction and stays open so rows can be
    read on demand; call :meth:`close` (or use the instance as a context
    manager) to release the handle. Each sample is one row split into inputs
    (all but the last token) and targets (all but the first token).

    Args:
        file: Path of the HDF5 file.
        max_seq_len: Stored on the instance; not used by the methods shown here.
        arr_name: Name of the array node under the file root.
        compression: Filters forwarded to ``tables.open_file``.
        device: Device each row is moved to when it is read.
    """

    def __init__(
        self,
        file,
        max_seq_len,
        arr_name:str = "data",
        compression: tables.Filters=None,
        device="cpu"
    ):
        self.max_seq_len = max_seq_len
        self.device = device
        # Defined up front so close() is safe even if opening fails.
        self.file = None
        self.data = None

        self._load_data(file, arr_name, compression)

    def _load_data(self, file, name, compression):
        """Open ``file`` read-only and bind ``self.data`` to the node ``name``."""
        self.file = tables.open_file(file, mode="r", filters=compression)
        try:
            self.data = self.file.root[name]
        except Exception:
            # Do not leak the open handle when the node lookup fails.
            self.close()
            raise

    def close(self):
        """Close the underlying HDF5 file; calling it again is a no-op."""
        handle = getattr(self, "file", None)
        if handle is not None:
            handle.close()
        self.file = None
        self.data = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        return False

    def __len__(self):
        """Number of rows in the array node."""
        return len(self.data)

    def __getitem__(self, idx):
        """Read row ``idx`` and return ``(inputs, targets)``, offset by one token."""
        tokens = torch.from_numpy(self.data[idx]).to(self.device)
        return tokens[:-1], tokens[1:]

In [10]:
# Loader A: validation tokens, fully decompressed into a tensor on the GPU.
ds_a = TinyStoriesDatasetCompressed(
    file="data/valid-sampled.pt.zlib",
    max_seq_len=MAX_SEQ_LEN,
    device="cuda",
)
ld_a = DataLoader(ds_a, batch_size=128, shuffle=True)

# Loader B: training tokens, read row by row from the HDF5 file.
ds_b = TinyStoriesDatasetHDF5(
    file="data/train-sampled.h5",
    max_seq_len=MAX_SEQ_LEN,
    compression=tables.Filters(complib="blosc", complevel=4),
    device="cuda",
)
ld_b = DataLoader(ds_b, batch_size=128, shuffle=True)

In [13]:
from time import time

# Print the seconds each loader needs to produce its first shuffled batch:
# first the in-memory compressed loader, then the HDF5-backed one.
for loader in (ld_a, ld_b):
    t = time()
    next(iter(loader))
    print(time() - t)

0.016007423400878906
1.6069536209106445
