In [1]:
# Alright, let's take a break from beating my head against the divergence wall and instead implement some things.  First off, warmup and scheduling.
# Okay nice, with gradient clipping it seems to not be diverging, at least after three step blocks.  So the one additional thing to try after this would be to disable warmup but keep gradient clipping.
# Yep, it didn't diverge finally!  Okay, but if we disable warmup, it does still diverge.  Which is what the literature suggests will happen.  Note that I still didn't use pre-Norm ordering.
# So what's next?  I could either incorporate some of these modifications I've been making in the forked module, or I could try to implement FixUp or similar things.
# Okay, I think I successfully refactored the MixerModel class to make it much more flexible and modular, and importantly, to make Pre-LN the default.
# So next would probably be warmup-replacing initialization schemes.
# I guess the other big thing is quantization and mixed precision.  I'm scared of that.  But I think I would learn a ton.  Actually, am I ready to re-baseline?
# Okay so one thing here: It's true that things like ReZero and FixUp are "tweaks", but they are tweaks that might be especially relevant to the testing scenario I'm using, where I want to use the early stages of training to get insight into the process.


In [2]:
from functools import partial # There are going to be some things we want to initialize lazily to economize on resources and reuse constructor calls.
# everything will use the same tokenizer
from transformers import AutoTokenizer
# Model id passed to `from_pretrained`; only its tokenizer is loaded in this cell.
mistral = "mistralai/Mistral-7B-v0.1"
# Pad on the right, so padding tokens follow the real tokens of each sequence.
tokenizer = AutoTokenizer.from_pretrained(mistral, padding_side = "right")
# Reuse EOS as the padding token (presumably this tokenizer ships without a pad token — confirm).
# NOTE(review): padded positions then carry the EOS id, so they cannot be told apart
# from a genuine EOS by id alone.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
## Everything will use the same dataset and dataloaders
# Dataset repository name handed to `datasets.load_dataset`.
repo = "wikimedia/wikipedia"
import datasets
# Load the "20231101.simple" configuration (presumably the 2023-11-01 Simple English
# Wikipedia dump — confirm). The result is indexed by split name (`ds['train']`) below.
ds = datasets.load_dataset(repo, "20231101.simple")
def quick_estimate_tokens(ds, field="text", chars_per_token=2.7):
    tally = 0
    max_len = 0
    lengths = {}
    for row in ds:
        l = len(row[field])
        tally += l
        lengths[l] = lengths.get(l, 0) + 1
        if l > max_len:
            max_len = l

    print(f'{int(tally):_}')
    print(f'Max length: {max_len}, estimated tokens: {int(max_len / chars_per_token):_}')
    lengths = list(lengths.items())
    lengths.sort(reverse=True)
    return int(tally/chars_per_token), lengths

# Rough corpus size before splitting: `total` is the estimated token count and
# `length` is the list of (character length, row count) pairs returned by the helper.
total, length = quick_estimate_tokens(ds['train'], field="text")

# Hold out 10% of the rows for evaluation. The seed is fixed so that the train/test
# partition is the same after every kernel restart; with an unseeded split each run
# trained and evaluated on a different partition, so losses were not comparable
# between runs.
split_seed = 42
ds = ds["train"].train_test_split(test_size=0.1, seed=split_seed)

# Every example is padded or truncated to exactly this many tokens.
max_tokens = 512


def batch_tokenize(batch):
    """Tokenize the ``"text"`` column of a batch into fixed-length id sequences.

    Args:
        batch: Mapping with a ``"text"`` entry holding the batch's raw strings.

    Returns:
        A dict with a single ``"input_ids"`` key containing the tokenizer's
        ids, padded to and truncated at ``max_tokens``.
    """
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    return {"input_ids": encoded.input_ids}

# Run `batch_tokenize` over the dataset, handing it 1000 rows per call.
tokenized = ds.map(batch_tokenize, batched=True, batch_size=1000)
# Expose only `input_ids`, formatted as torch tensors, when rows are read.
tokenized.set_format(type='torch', columns=['input_ids'])
# First 32,000 training rows for quicker experiments; in the visible cells it is only
# referenced by a commented-out DataLoader.
train_subset = tokenized["train"].select(range(32000))



267_477_061
Max length: 236695, estimated tokens: 87_664


Map:   0%|          | 0/217608 [00:00<?, ? examples/s]

Map:   0%|          | 0/24179 [00:00<?, ? examples/s]

In [4]:
batch_size = 32

import torch
from torch.utils.data import DataLoader
#train_loader = DataLoader(train_subset, batch_size=batch_size, shuffle = True)
# Shuffle with a dedicated, seeded generator so that the order of training batches is
# reproducible from run to run. Without it the shuffle order changes on every run,
# which makes early-training loss curves hard to compare between experiments.
shuffle_seed = 42
shuffle_generator = torch.Generator().manual_seed(shuffle_seed)
train_loader = DataLoader(
    tokenized["train"],
    batch_size=batch_size,
    shuffle=True,
    generator=shuffle_generator,
)
# Evaluation keeps its own fixed batch size of 32 and a stable, unshuffled order.
eval_loader = DataLoader(tokenized["test"], 32, shuffle = False)


%load_ext autoreload
%autoreload 2
import sys
path = r'C:\Users\infin\OneDrive\Documents\GitHub\aimodels\projects\tooling'
sys.path.insert(0, path)
#from mixers import MixerModel, UpscalingEmbeddingsVectorizer, DownscalingLanguageModelHead, AttentionMixer, SeqConvMixer
from mixers import MixerModel, AttentionMixer, NoNorm, ReZeroResidualBlock
import torch
from functools import partial


# Architecture under test; the commented-out line is a smaller configuration.
model_dim, num_heads, num_layers = 768, 12, 12
#model_dim, num_heads, num_layers = 256, 4, 4

model = MixerModel(
    model_size = model_dim,
    num_layers = num_layers,
    tokenizer = tokenizer,
    # Tied to the tokenization length instead of repeating the literal 512, so the
    # model's sequence limit cannot drift from the length the data is padded to.
    max_seq_len = max_tokens,
    seq_mixer = (AttentionMixer, {"num_heads": num_heads}),
    # This experiment swaps LayerNorm for NoNorm and uses the ReZero residual block
    # (presumably as a warmup-replacing scheme, per the notes in the first cell).
    #norm = torch.nn.LayerNorm,
    norm = NoNorm,
    residual_block = ReZeroResidualBlock,
)

note: tying weights


In [5]:
# Sanity check: show the `resweight` parameter of the first decoder layer's `seq_block`.
# The recorded output shows a trainable tensor initialised to 0.
model.decoder.layers[0].seq_block.norm.resweight

Parameter containing:
tensor([0.], requires_grad=True)

In [6]:
%load_ext autoreload
%autoreload 2

from train import Trainer, get_linear_schedule

trainer = Trainer(
    model,
    train_loader,
    tokenizer = tokenizer,
    device = "cuda",
    eval_loader = eval_loader,
    log_every = 1000,
    #schedule = get_linear_schedule(end_factor = 1.0), # you know what we didn't try?  using the normal scheduler.  we haven't actually tried constant warmup I don't think.
    gradient_accumulation_batch_size = 8,
)

Trainer.clean_up_gpu()
trainer.train(1) # alright, is this going to get down to 21 without warmup?

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Training for 1 epochs starting from epoch 1; 6801 steps per epoch.
Beginning epoch 1
{'mode': 'train', 'epoch': 1, 'step': 1000, 'seconds': 890.5087018013, 'loss': 6.19204906642437, 'ppl': 488.84674072265625}
{'mode': 'train', 'epoch': 1, 'step': 2000, 'seconds': 882.5054290294647, 'loss': 4.3682773955464365, 'ppl': 78.90760040283203}
{'mode': 'train', 'epoch': 1, 'step': 3000, 'seconds': 898.9369604587555, 'loss': 3.87336490136385, 'ppl': 48.103981018066406}
{'mode': 'train', 'epoch': 1, 'step': 4000, 'seconds': 867.2475390434265, 'loss': 3.5829718211591244, 'ppl': 35.980308532714844}


KeyboardInterrupt: 