In [1]:
from functools import partial # There are going to be some things we want to initialize lazily to economize on resources and reuse constructor calls.
import torch
# everything will use the same tokenizer
from transformers import AutoTokenizer
# Hub id of the checkpoint whose tokenizer is reused; only the tokenizer is loaded in this cell.
mistral = "mistralai/Mistral-7B-v0.1"
# Right-side padding: pad tokens are appended after the real tokens.
tokenizer = AutoTokenizer.from_pretrained(mistral, padding_side = "right")
# Reuse EOS as the padding token (presumably because this tokenizer defines no pad token of its own).
# NOTE(review): with pad == eos, padded positions cannot be told apart from a real EOS by
# looking at input_ids alone, and batch_tokenize keeps only input_ids (no attention mask) --
# confirm the training loop does not compute loss on padding.
tokenizer.pad_token = tokenizer.eos_token

In [2]:
## Everything will use the same dataset and dataloaders
# Dataset repository on the Hugging Face Hub.
repo = "wikimedia/wikipedia"
import datasets
# "20231101.simple" selects the dataset configuration; presumably the 2023-11-01 snapshot
# of the Simple English edition -- confirm against the dataset card.
# Only the 'train' split of the returned object is used further down in this cell.
ds = datasets.load_dataset(repo, "20231101.simple")
def quick_estimate_tokens(ds, field="text", chars_per_token=2.7):
    """Estimate the token count of a dataset from character counts alone.

    No tokenizer is run: every row's ``field`` is measured with ``len`` and the
    character totals are divided by ``chars_per_token``.

    Args:
        ds: Iterable of rows; each row must support ``row[field]`` and the
            value must support ``len``.
        field: Key of the text column to measure.
        chars_per_token: Assumed average number of characters per token.

    Returns:
        A ``(estimated_total_tokens, lengths)`` tuple. ``lengths`` is a list of
        ``(char_length, row_count)`` pairs sorted from longest to shortest.

    Side effects:
        Prints the total character count, then the longest row's character
        length together with its estimated token count.
    """
    # Histogram: character length -> number of rows with exactly that length.
    histogram = {}
    for record in ds:
        n_chars = len(record[field])
        histogram[n_chars] = histogram.get(n_chars, 0) + 1

    # Both aggregates are recoverable from the histogram, so one pass over ds suffices.
    total_chars = sum(n_chars * count for n_chars, count in histogram.items())
    longest = max(histogram, default=0)

    print(f'{int(total_chars):_}')
    print(f'Max length: {longest}, estimated tokens: {int(longest / chars_per_token):_}')

    by_length_desc = sorted(histogram.items(), reverse=True)
    return int(total_chars / chars_per_token), by_length_desc

# Rough size check on the full corpus before splitting: prints character totals and
# returns the estimated token count plus the length histogram.
total, length = quick_estimate_tokens(ds['train'], field="text")

# Hold out 10% of the rows for evaluation. The split is random, so pin the seed:
# without it every kernel restart produces a different train/eval partition and
# evaluation numbers from separate runs are not comparable.
SPLIT_SEED = 0
ds = ds["train"].train_test_split(test_size=0.1, seed=SPLIT_SEED)

# Fixed sequence length, in tokens, that every example is padded/truncated to.
max_tokens = 512
def batch_tokenize(batch):
    """Tokenize one batch of rows for ``Dataset.map(..., batched=True)``.

    Args:
        batch: Mapping with a ``"text"`` entry holding the batch's texts.

    Returns:
        ``{"input_ids": ...}`` where every sequence is padded or truncated to
        ``max_tokens`` tokens. Only the ids are kept; any other tokenizer
        outputs are discarded.
    """
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    return {"input_ids": encoded.input_ids}

# Run batch_tokenize over every split in chunks of 1000 rows; adds an "input_ids" column.
tokenized = ds.map(batch_tokenize, batched=True, batch_size=1000)

from torch.utils.data import DataLoader

# Expose only 'input_ids', as torch tensors, when rows are read (e.g. by a DataLoader).
tokenized.set_format(type='torch', columns=['input_ids'])


267_477_061
Max length: 236695, estimated tokens: 87_664


Map:   0%|          | 0/217608 [00:00<?, ? examples/s]

Map:   0%|          | 0/24179 [00:00<?, ? examples/s]

In [42]:
%load_ext autoreload
%autoreload 2

batch_size = 32
train_loader = DataLoader(tokenized["train"], batch_size=batch_size, shuffle = True)
eval_loader = DataLoader(tokenized["test"], batch_size=32, shuffle = False)


import os
import sys

# Directory containing the local `mixers` / `train` modules. Set the
# AIMODELS_TOOLING_PATH environment variable to point at a different checkout;
# the default is the original author's machine-specific location.
path = os.environ.get(
    "AIMODELS_TOOLING_PATH",
    r'C:\Users\infin\OneDrive\Documents\GitHub\aimodels\projects\tooling',
)
# Guard the insert so re-running this cell does not pile up duplicate sys.path entries.
if path not in sys.path:
    sys.path.insert(0, path)
from mixers import MixerModel, EmbeddingVectorizer, EmbeddingAndPositionalVectorizer, AttentionMixer, GatedStateMixer
import torch
from functools import partial

# Model hyperparameters: hidden width, number of layers, attention heads.
model_dim, layers, heads = 256, 3, 4

mixer = MixerModel(
    model_size = model_dim,
    num_layers = layers,
    # Tie the model's maximum sequence length to the length the data was
    # padded/truncated to, instead of repeating the literal 512.
    max_seq_len = max_tokens,
    vectorizer = EmbeddingVectorizer,
    # Sequence mixer given as (class, constructor kwargs).
    seq_mixer = (AttentionMixer, {"num_heads": heads, "apply_rope": True}),
    tokenizer = tokenizer,
)

# Training utilities from the local `train` module (found via the sys.path entry added above).
from train import Trainer, SimpleTestCallback, ResidualGatingWarmupCallback, get_warmup_schedule, TimedStoppingCallback, PerplexityStoppingCallback
# Build the trainer for `mixer` on the shared train/eval loaders.
mixer_trainer = Trainer(
    mixer,
    train_loader,
    eval_loader = eval_loader,
    device = "cuda",  # hard-coded: this cell assumes a CUDA device is available
    tokenizer = tokenizer,
    # Alternative, less frequent logging interval kept for reference:
    #log_every = 1000,
    log_every = 100,
    # NOTE(review): the recorded run reports 6801 steps per epoch; if eval_every is
    # counted in steps, 10_000 is never reached during train(1) -- confirm whether
    # Trainer also evaluates at epoch end.
    eval_every = 10_000,
    schedule = get_warmup_schedule(),  # default warmup schedule from `train`
    autocast_dtype = torch.bfloat16,
    # Alternative stopping rule kept for reference (argument is 60*3; presumably seconds -- confirm):
    #callbacks = [TimedStoppingCallback(60*3)]
    # Presumably stops training once perplexity reaches 45.0 -- confirm in `train`.
    callbacks = [PerplexityStoppingCallback(45.0)]
)
# Train for one epoch.
mixer_trainer.train(1)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
note: tying weights
Training for 1 epochs starting from epoch 1; 6801 steps per epoch.
Beginning epoch 1
running cleanup routines


KeyboardInterrupt: 