In [23]:
from pathlib import Path
import mlx.core as mx

train_data_dir = Path('./wikitext_data/train')
train_examples = []

for ex_path in sorted(train_data_dir.glob('*.npz')):
    train_examples.extend(mx.load(str(ex_path)).values())

len(train_examples)

27571

In [37]:
seq_len = 512
blk_size = seq_len + 1
train_examples_pad = []

for example in train_examples:
    example_pad = mx.pad(example, pad_width=[0, blk_size-example.size%blk_size], constant_values=0)
    train_examples_pad.append(example_pad)

In [None]:
from pathlib import Path
import mlx.core as mx

class WikiTextDataLoader:
    def __init__(self, bsz, seq_len, pad_token_id):
        train_data_dir = Path('./wikitext_data/train')
        train_examples = []
        for ex_path in sorted(train_data_dir.glob('*.npz')):
            train_examples.extend(mx.load(str(ex_path)).values())
    
        blk_size = seq_len + 1
        pad_example = lambda ex: mx.pad(ex, [0, blk_size - ex.size % blk_size], pad_token_id)
        train_examples = [*map(pad_example, train_examples)]
        self.train_examples = mx.concatenate(train_examples, axis=0)

        self.bsz = bsz
        self.blk_size = blk_size
        self.bblk_size = bsz * blk_size  # Batch block size
        self.total_batches = len(self.train_examples) - blk_size + 1

    def __len__(self):
        return self.total_batches

    def __getitem__(self, idx):
        batch_block = self.train_examples[i:i+self.bblk_size]
        batch_block = batch_block.reshape([self.bsz, self.blk_size])
        return batch_block[:, :-1], batch_block[:, 1:]

    def __iter__(self):
        for i in range(self.total_batches):
            yield self[i]

dataloader = WikiTextDataLoader(4, 512, 0)
xb, yb = next(dataloader)

In [1]:
from pathlib import Path
import mlx.core as mx

def config_dataloader(bsz, seq_len, pad_token_id):
    train_data_dir = Path('./wikitext_data/train')
    train_examples = []
    for ex_path in sorted(train_data_dir.glob('*.npz')):
        train_examples.extend(mx.load(str(ex_path)).values())

    blk_size = seq_len + 1
    pad_example = lambda ex: mx.pad(ex, [0, blk_size - ex.size % blk_size], pad_token_id)
    train_examples = [*map(pad_example, train_examples)]
    train_examples = mx.concatenate(train_examples, axis=0)

    bblk_size = bsz * blk_size  # Batch block size
    n_batches = len(train_examples) - blk_size + 1

    def load_data_():
        for i in range(n_batches):
            bblk = train_examples[i:i+bblk_size].reshape([bsz, blk_size])
            yield bblk[:, :-1], bblk[:, 1:]

    return load_data_

load_data = config_dataloader(4, 512, 0)
data_iter = iter(load_data())
xb, yb = next(data_iter)

In [2]:
from sentencepiece import SentencePieceProcessor

sp_model = SentencePieceProcessor(model_file='tokenizer.model')

In [3]:
sp_model.decode(xb[1, :].tolist())

"\n Mathews was decorated by several governments , receiving appointments as a Companion of the Order of St Michael and St George , Companion of the Order of the Bath and as a Knight Commander of the Order of St Michael and St George from the British government and membership in the Prussian Order of the Crown . Zanzibar also rewarded him and he was a member of the Grand Order of Hamondieh and a first class member of the Order of the Brilliant Star of Zanzibar . Mathews died of malaria in Zanzibar on 11 October 1901 . \n = = Early life and career = = \n Mathews was born at Funchal on Madeira on 7 March 1850 . His father , Captain William Matthews was Welsh , and his mother Jane Wallis Penfold , was the daughter of William Penfold and Sarah Gilbert . Her sister , Augusta Jane Robley née Penfold was the author of a famous book about the flora and fauna of Madeira , which is now in the Natural History Museum . Mathews became a cadet of the Royal Navy in 1863 and was appointed a midshipman

In [4]:
sp_model.decode(yb[1].tolist())

'the'

In [5]:
xb, yb = next(data_iter)

In [6]:
sp_model.decode(xb[1, :].tolist())

"Mathews was decorated by several governments , receiving appointments as a Companion of the Order of St Michael and St George , Companion of the Order of the Bath and as a Knight Commander of the Order of St Michael and St George from the British government and membership in the Prussian Order of the Crown . Zanzibar also rewarded him and he was a member of the Grand Order of Hamondieh and a first class member of the Order of the Brilliant Star of Zanzibar . Mathews died of malaria in Zanzibar on 11 October 1901 . \n = = Early life and career = = \n Mathews was born at Funchal on Madeira on 7 March 1850 . His father , Captain William Matthews was Welsh , and his mother Jane Wallis Penfold , was the daughter of William Penfold and Sarah Gilbert . Her sister , Augusta Jane Robley née Penfold was the author of a famous book about the flora and fauna of Madeira , which is now in the Natural History Museum . Mathews became a cadet of the Royal Navy in 1863 and was appointed a midshipman on

In [7]:
sp_model.decode(yb[1].tolist())

'African'

In [8]:
for xb, _ in load_data():
    if mx.any(xb == 0):
        print(xb)
        print(sp_model.decode(xb[0, :].tolist()))
        break

array([[314, 869, 910, ..., 408, 263, 937],
       [4509, 310, 278, ..., 17517, 869, 5345],
       [5652, 525, 9213, ..., 4096, 747, 279],
       [278, 4908, 5874, ..., 13, 2, 0]], dtype=uint16)
am . This dhow had around 100 slaves on board and was transporting them between Pemba and Zanzibar . Captain Brownrigg led a boarding party to release the slaves but bin Hattam 's men then attacked the sailors , killing Brownrigg and his party before sailing away . Mathews led a force to Wete on Pemba and , after a short battle , took a mortally wounded bin Hattem prisoner before returning to Zanzibar . 
 Mathews returned to the African mainland territories once more in 1884 when he landed with a force which intended to establish further garrisons there to dissuade German territorial claims . This attempt ultimately failed when five German warships steamed into Zanzibar Town harbour and threatened the Sultan into signing away the territories which would later form German East Africa . Further t