In [1]:
# Alright, let's take a break from beating my head against the divergence wall and instead implement some things.  First off, warmup and scheduling.
# I think that's set, and it looks tantalizingly as if it's going to work.  It even seems like it might work for an unusually normal batch size.  Nope, it diverged.
# Then I think I added gradient clipping properly.  I'll give it a try with the GPT-2 size model just in case that fixes it.
# Regardless of whether that works, the next two things are initialization strategies and norm ordering.

In [2]:
from functools import partial # There are going to be some things we want to initialize lazily to economize on resources and reuse constructor calls.
# everything will use the same tokenizer
from transformers import AutoTokenizer
# Hub identifier of the checkpoint whose tokenizer is shared by everything below.
mistral = "mistralai/Mistral-7B-v0.1"
tokenizer = AutoTokenizer.from_pretrained(
    mistral,
    padding_side="right",
)
# Padding reuses the end-of-sequence token as the pad token.
tokenizer.pad_token = tokenizer.eos_token

In [3]:
## Everything will use the same dataset and dataloaders
import datasets

# Dataset repository; "20231101.simple" is the configuration name handed to
# load_dataset as its second positional argument.
repo = "wikimedia/wikipedia"
ds = datasets.load_dataset(
    repo,
    "20231101.simple",
)
def quick_estimate_tokens(ds, field="text", chars_per_token=2.7):
    """Estimate a token count for `ds` from character lengths alone.

    Args:
        ds: Iterable of rows; each row is indexed with `field` and the result
            is measured with `len`.
        field: Key of the text entry inside each row.
        chars_per_token: Divisor used to turn a character count into an
            approximate token count.

    Returns:
        A pair `(estimated_tokens, histogram)`, where `estimated_tokens` is the
        total character count divided by `chars_per_token` and truncated to an
        int, and `histogram` is a list of `(length, row_count)` tuples ordered
        from the longest length to the shortest.

    Also prints the total character count, the longest row length and that
    row's estimated token count.
    """
    char_counts = [len(record[field]) for record in ds]
    total_chars = sum(char_counts)
    # Lengths are never negative, so 0 is the right answer for an empty input.
    longest = max(char_counts, default=0)

    # How many rows share each distinct character length.
    histogram = {}
    for count in char_counts:
        histogram[count] = histogram.get(count, 0) + 1

    print(f'{int(total_chars):_}')
    print(f'Max length: {longest}, estimated tokens: {int(longest / chars_per_token):_}')
    return int(total_chars / chars_per_token), sorted(histogram.items(), reverse=True)

# Rough size check on the full, not-yet-split training set.  `length` receives
# the (character_length, row_count) pairs returned by quick_estimate_tokens.
total, length = quick_estimate_tokens(ds['train'], field="text")
# Hold out 10% of the rows for testing.  The seed is fixed so that the split,
# and therefore every subset selected from it later, is identical on each
# Restart & Run All instead of being reshuffled differently per run.
ds = ds["train"].train_test_split(test_size=0.1, seed=42)

# Every example is padded or truncated to exactly this many tokens.
max_tokens = 512


def batch_tokenize(batch):
    """Tokenize the "text" entries of `batch` to a fixed length.

    Uses the module-level `tokenizer` with max-length padding and truncation at
    `max_tokens`.  Only the token ids are returned, under "input_ids"; nothing
    else produced by the tokenizer is kept.
    """
    encoded = tokenizer(
        batch["text"],
        padding="max_length",
        truncation=True,
        max_length=max_tokens,
    )
    return {"input_ids": encoded.input_ids}

# Tokenize both splits, handing batch_tokenize 1000 rows at a time.
tokenized = ds.map(
    batch_tokenize,
    batched=True,
    batch_size=1000,
)
# Return torch tensors and expose only the input_ids column.
tokenized.set_format(type='torch', columns=['input_ids'])
# Keep the first 32000 rows of the training split as a smaller working set.
train_subset = tokenized["train"].select(range(32000))



267_477_061
Max length: 236695, estimated tokens: 87_664


Map:   0%|          | 0/217608 [00:00<?, ? examples/s]

Map:   0%|          | 0/24179 [00:00<?, ? examples/s]

In [17]:
%load_ext autoreload
%autoreload 2
import sys
import os

# Directory put on sys.path so the project-local import that follows can be
# resolved.  The original machine-specific location stays the default; set the
# AIMODELS_TOOLING_PATH environment variable to use a different checkout
# without editing the notebook.
path = os.environ.get(
    "AIMODELS_TOOLING_PATH",
    r'C:\Users\infin\OneDrive\Documents\GitHub\aimodels\projects\tooling',
)
# Re-running this cell must not pile up duplicate sys.path entries, so remove
# any earlier copy before placing the directory at the front (highest priority).
while path in sys.path:
    sys.path.remove(path)
sys.path.insert(0, path)
from mixer_norm_resid_order import MixerModel, UpscalingEmbeddingsVectorizer, DownscalingLanguageModelHead, AttentionMixer, SeqConvMixer, CyclingDecoderBackbone
import torch
from functools import partial

# Small configuration: model width, number of attention heads, number of layers.
model_dim, num_heads, num_layers = 128, 2, 2

model = MixerModel(
    model_size = model_dim,
    num_layers = num_layers,
    tokenizer = tokenizer,
    # Tied to the tokenization length rather than a second hard-coded 512, so
    # changing max_tokens cannot leave the model with a shorter sequence limit
    # than the padded inputs it is fed.
    max_seq_len = max_tokens,
    # The sequence mixer is passed as a (class, constructor-kwargs) pair.
    seq_mixer = (AttentionMixer, {"num_heads": num_heads}),
    norm = torch.nn.LayerNorm,
    decoder_backbone = CyclingDecoderBackbone,
)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
note: tying weights


In [18]:
# So I'm liking this quite a bit.  The question now is how to parameterize block ordering.  Then there's probably a related question of how to parameterize block reuse or cycling.
# So starting with the former, one obvious way is to simply assume that pre-Norm and post-Norm are the only two possibilities, which is a more or less accurate assumption.  In that case I would probably default to something like "pre_norm = True".
# My only real problem with that is that I suspect that generalizing the solution might solve other problems as well.
# Let's come up with at least one alternative.  So, you would pass the entire residual block architecture as an argument.  Actually, a much better alternative is to have two subclasses and pass one of those.
# Okay I'm pretty happy with that.  Now how about cycling and alternation?  So, let's be completely clear about what we're dealing with.
# The most general way of looking at this, if we ignore the arbitrary distinction of "layer", is that we have alternating sequences of blocks that are not reused.
# In ALBERT, we have alternating sequences of blocks that are reused.
# In theory, we could have, say, four pairs of blocks that then get repeated three times.  We could also in theory have the opposite, three pairs of repeated blocks, followed by a different block repeated three times, and so on.
model

MixerModel(
  (vectorizer): EmbeddingAndPositionalVectorizer(
    (embedding): Embedding(32000, 128)
    (positional_embedding): Embedding(512, 128)
  )
  (embed_dropout): Dropout(p=0.0, inplace=False)
  (embed_norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
  (decoder): CyclingDecoderBackbone(
    (Number of repeating layers: 2)
    (seq_block): PreNormResidualBlock(
      (norm): LayerNorm((128,), eps=1e-05, elementwise_affine=True)
      (mixer): AttentionMixer(
        (k_proj): Linear(in_features=128, out_features=128, bias=False)
        (q_proj): Linear(in_features=128, out_features=128, bias=False)
        (softmax): Softmax(dim=-1)
        (v_proj): Linear(in_features=128, out_features=128, bias=False)
        (out_proj): Linear(in_features=128, out_features=128, bias=False)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (ff_block): PreNormResidualBlock(
      (norm): LayerNorm((128,), eps=1e-05, e

In [19]:

import torch
# Fetch a pretrained ResNet-18 through torch.hub from the torchvision v0.10.0 tag.
# NOTE(review): this rebinds `model`, replacing the MixerModel built in an
# earlier cell — confirm that is intended.
model = torch.hub.load(
    'pytorch/vision:v0.10.0',
    'resnet18',
    pretrained=True,
)

Downloading: "https://github.com/pytorch/vision/zipball/v0.10.0" to C:\Users\infin/.cache\torch\hub\v0.10.0.zip
Downloading: "https://download.pytorch.org/models/resnet18-f37072fd.pth" to C:\Users\infin/.cache\torch\hub\checkpoints\resnet18-f37072fd.pth
100%|██████████| 44.7M/44.7M [00:00<00:00, 77.5MB/s]


In [20]:
model

ResNet(
  (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
  (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (relu): ReLU(inplace=True)
  (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (layer1): Sequential(
    (0): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
      (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
    (1): BasicBlock(
      (conv1): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (relu): ReLU(inplace=True)
  