In [1]:
# Enable IPython's autoreload extension; mode 2 re-imports modules before each
# cell runs, so edits to the project's .py files are picked up without a restart.
%load_ext autoreload
%autoreload 2
import sys
# Add the parent directory to the import path (presumably where the project
# modules imported in the next cell live — verify against the repo layout).
sys.path.append('../')

In [5]:
from datasets import load_dataset
import torch as t

from nnsight import LanguageModel
from buffer import MultiModelActivationBuffer
from trainers.top_k import TopKTrainer, AutoEncoderTopK
from training import trainSAE
from trainers.standard import StandardTrainerAprilUpdate
from dictionary import AutoEncoder

# Shared precision setting: passed as torch_dtype when the language models are
# loaded and as autocast_dtype to trainSAE.
dtype = t.bfloat16

In [6]:
#%%
# ---- Experiment configuration ----
layer = 7                 # index into model.model.layers of the block that is captured
expansion = 2*32          # dictionary width as a multiple of activation_dim
num_tokens = int(500e6)   # budget that is later divided by out_batch_size to get the step count
out_batch_size = 8192     # batch size requested from the activation buffer

# Alternative model pair kept for reference:
# model_name_list = ["unsloth/Qwen2.5-Coder-32B-Instruct", "emergent-misalignment/Qwen-Coder-Insecure"]
model_name_list = ["Qwen/Qwen2.5-0.5B", "Qwen/Qwen2.5-0.5B-Instruct"]

# ---- Load every model on its own GPU and pick out the layer to read from ----
model_list, submodule_list = [], []
for gpu_index, model_name in enumerate(model_name_list):
    # The n-th model in the list is placed on device cuda:n.
    model = LanguageModel(
        model_name,
        dispatch=True,
        device_map=f"cuda:{gpu_index}",
        torch_dtype=dtype,
        trust_remote_code=False,
    )
    # The language models are only read from, never trained: turn off gradients.
    for param in model.parameters():
        param.requires_grad = False
    model_list.append(model)
    submodule_list.append(model.model.layers[layer])

# NOTE(review): 896 is hard-coded; presumably the hidden size of the
# Qwen2.5-0.5B checkpoints — confirm and update when swapping model_name_list.
activation_dim = 896
dictionary_size = expansion * activation_dim

# Fixed seed for the dataset shuffle so the order of training text is the same
# after every "Restart Kernel -> Run All" (the original call was unseeded, which
# made each run see the data in a different order).
shuffle_seed = 0

# OpenWebText training split, opened in streaming mode.
dataset = load_dataset(
    'Skylion007/openwebtext',
    split='train',
    streaming=True,
    # NOTE(review): trust_remote_code=True allows code shipped with the dataset
    # repository to run locally; keep only while this source is trusted.
    trust_remote_code=True
    )

# NOTE(review): for streaming datasets the shuffle is presumably buffer-based
# (approximate) rather than a full permutation — see the datasets docs.
dataset = dataset.shuffle(seed=shuffle_seed)

class CustomData:
    """Iterator adapter that yields only the 'text' field of each record.

    `dataset` may be any iterable whose items support ``item['text']``.
    """

    def __init__(self, dataset):
        # Build the underlying iterator once; iteration state lives on the instance.
        self.data = iter(dataset)

    def __iter__(self):
        # The object acts as its own iterator.
        return self

    def __next__(self):
        # StopIteration raised by the underlying iterator propagates unchanged.
        record = next(self.data)
        return record['text']

# Wrap the dataset so that iterating it yields plain text strings.
data = CustomData(dataset)

# Activation buffer over all loaded models; it is passed to trainSAE as the
# training data source. The original note on this call says it yields batches
# of tensors whose dimension equals the submodule's output dimension.
buffer = MultiModelActivationBuffer(
    # -- what to read from --
    data=data,
    model_list=model_list,
    submodule_list=submodule_list,
    d_submodule=activation_dim,  # output dimension of the model component
    # -- where the buffer lives: a third GPU, separate from the models on cuda:0 / cuda:1 --
    device="cuda:2",
    # -- context handling --
    ctx_len=256,
    remove_bos=True,
    # -- sizing --
    n_ctxs=512,  # can be set higher or lower depending on available memory
    refresh_batch_size=512,
    out_batch_size=out_batch_size,
)


In [None]:
#%%
# Number of training steps: token budget divided by the buffer's output batch
# size. Used both in the trainer config and for the training loop itself.
total_steps = num_tokens // out_batch_size

trainer_cfg = {
    "trainer": StandardTrainerAprilUpdate,
    "dict_class": AutoEncoder,
    # Input width scales with the number of models (presumably because the
    # buffer concatenates per-model activations — TODO confirm in buffer.py).
    "activation_dim": activation_dim * len(model_list),
    "dict_size": dictionary_size,
    "device": "cuda:2",  # same device the activation buffer was created on
    "steps": total_steps,
    "layer": layer,
    "lm_name": "blah",  # NOTE(review): placeholder value — replace with a real model identifier
    "warmup_steps": 0,
    "l1_penalty": 1e-2,
    "lr": 1e-6,
    "sparsity_warmup_steps": 0,
    "frac_features_shared": 0.1,
    "shared_l1_penalty": 1e-3,
}

# Train the sparse autoencoder (SAE) on batches drawn from the buffer.
# NOTE(review): save_dir is an absolute path under /root and hf_repo_out /
# wandb_project are hard-coded — adjust before running in another environment.
ae = trainSAE(
    data=buffer,  # another iterable of batches (e.g. a PyTorch dataloader) could be used instead
    trainer_configs=[trainer_cfg],
    steps=total_steps,
    autocast_dtype=dtype,
    normalize_activations=True,
    log_steps=20,
    use_wandb=True,
    wandb_project="insecure diffing",
    save_dir="/root/pretraining_diffing/checkpoints/",
    hf_repo_out="jacobcd52/insecure_diffing",
)

  0%|          | 0/61035 [00:00<?, ?it/s]

Error processing model 0: list indices must be integers or slices, not NoneType
Error processing model 0: Accessing value before it's been set.
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 1/61035 [00:05<100:13:13,  5.91s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 2/61035 [00:06<46:16:02,  2.73s/it] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 3/61035 [00:06<29:01:29,  1.71s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 4/61035 [00:07<20:56:10,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 5/61035 [00:08<18:12:34,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 6/61035 [00:08<16:10:15,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 7/61035 [00:09<14:55:36,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 8/61035 [00:10<14:06:56,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 9/61035 [00:10<12:44:38,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 10/61035 [00:13<21:52:27,  1.29s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 11/61035 [00:14<19:03:02,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 12/61035 [00:14<16:59:57,  1.00s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 13/61035 [00:15<15:35:06,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 14/61035 [00:16<13:42:08,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 15/61035 [00:16<11:54:13,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 16/61035 [00:17<10:38:42,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 17/61035 [00:17<9:43:50,  1.74it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 18/61035 [00:18<10:30:25,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 19/61035 [00:20<17:56:30,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 20/61035 [00:20<15:08:18,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 21/61035 [00:21<13:01:13,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 22/61035 [00:22<12:49:56,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 23/61035 [00:22<12:40:08,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 24/61035 [00:23<12:34:09,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 25/61035 [00:24<12:31:51,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 26/61035 [00:25<12:28:50,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 27/61035 [00:25<11:05:58,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 28/61035 [00:27<19:44:04,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 29/61035 [00:28<17:31:13,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 30/61035 [00:29<16:00:01,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 31/61035 [00:29<14:29:33,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 32/61035 [00:30<12:26:17,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 33/61035 [00:30<10:58:04,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 34/61035 [00:31<10:01:16,  1.69it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 35/61035 [00:31<9:48:55,  1.73it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 36/61035 [00:32<10:32:17,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 37/61035 [00:34<17:39:44,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 38/61035 [00:35<14:54:43,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 39/61035 [00:35<12:47:58,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 40/61035 [00:36<12:37:35,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 41/61035 [00:37<12:30:55,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 42/61035 [00:37<12:28:24,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 43/61035 [00:38<12:26:02,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 44/61035 [00:39<12:21:26,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 45/61035 [00:39<10:55:05,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 46/61035 [00:42<20:27:27,  1.21s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 47/61035 [00:42<18:02:23,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 48/61035 [00:43<15:47:47,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 49/61035 [00:44<13:37:04,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 50/61035 [00:44<12:04:41,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 51/61035 [00:45<11:52:52,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 52/61035 [00:45<12:18:32,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 53/61035 [00:46<12:15:56,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 54/61035 [00:47<12:18:16,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 55/61035 [00:49<21:05:10,  1.24s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 56/61035 [00:50<18:44:05,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 57/61035 [00:51<17:02:42,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 58/61035 [00:52<15:53:17,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 59/61035 [00:52<14:51:09,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 60/61035 [00:53<12:43:32,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 61/61035 [00:53<11:11:16,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 62/61035 [00:54<10:06:39,  1.68it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 63/61035 [00:54<10:12:56,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 64/61035 [00:57<18:18:38,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 65/61035 [00:57<15:05:50,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 66/61035 [00:58<12:51:14,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 67/61035 [00:58<11:17:18,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 68/61035 [00:59<11:34:05,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 69/61035 [00:59<11:47:15,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 70/61035 [01:00<11:54:27,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 71/61035 [01:01<12:00:26,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 72/61035 [01:02<11:43:44,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 73/61035 [01:05<25:14:16,  1.49s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 74/61035 [01:06<21:19:06,  1.26s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 75/61035 [01:06<18:36:09,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 76/61035 [01:07<16:40:53,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 77/61035 [01:08<15:20:32,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 78/61035 [01:08<13:33:51,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 79/61035 [01:09<11:45:00,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 80/61035 [01:09<10:29:09,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 81/61035 [01:10<9:37:52,  1.76it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 82/61035 [01:12<17:36:06,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 83/61035 [01:12<14:36:55,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 84/61035 [01:13<12:31:10,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 85/61035 [01:13<11:20:11,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 86/61035 [01:14<11:52:16,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 87/61035 [01:15<12:04:09,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 88/61035 [01:15<12:09:31,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 89/61035 [01:16<12:12:09,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 90/61035 [01:17<12:13:38,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 91/61035 [01:20<23:08:52,  1.37s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 92/61035 [01:20<19:53:25,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 93/61035 [01:21<17:36:42,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 94/61035 [01:22<16:08:21,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 95/61035 [01:23<14:20:05,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 96/61035 [01:23<12:17:52,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 97/61035 [01:23<10:55:31,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 98/61035 [01:24<9:58:04,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 99/61035 [01:25<10:40:57,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 100/61035 [01:27<18:10:48,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 101/61035 [01:27<15:15:27,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 102/61035 [01:28<13:15:28,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 103/61035 [01:28<13:00:26,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 104/61035 [01:29<12:51:02,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 105/61035 [01:30<12:42:48,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 106/61035 [01:31<12:33:32,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 107/61035 [01:31<11:41:30,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 108/61035 [01:32<10:28:22,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 109/61035 [01:34<19:47:27,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 110/61035 [01:35<17:48:17,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 111/61035 [01:36<16:23:16,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 112/61035 [01:36<14:02:27,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 113/61035 [01:37<12:13:26,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 114/61035 [01:37<10:51:13,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 115/61035 [01:38<10:57:30,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 116/61035 [01:39<11:20:52,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 117/61035 [01:39<11:36:33,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 118/61035 [01:41<18:51:12,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 119/61035 [01:42<15:43:57,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 120/61035 [01:43<14:56:08,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 121/61035 [01:43<14:23:57,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 122/61035 [01:44<13:47:57,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 123/61035 [01:45<13:18:28,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 124/61035 [01:45<12:07:21,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 125/61035 [01:46<10:44:54,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 126/61035 [01:46<9:46:33,  1.73it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 127/61035 [01:49<18:22:58,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 128/61035 [01:49<16:30:43,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 129/61035 [01:50<13:51:01,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 130/61035 [01:50<11:57:37,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 131/61035 [01:51<10:42:17,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 132/61035 [01:51<10:30:56,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 133/61035 [01:52<11:04:53,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 134/61035 [01:53<11:25:25,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 135/61035 [01:53<11:42:14,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 136/61035 [01:56<20:38:11,  1.22s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 137/61035 [01:57<18:24:02,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 138/61035 [01:57<16:49:43,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 139/61035 [01:58<15:44:21,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 140/61035 [01:59<14:40:11,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 141/61035 [02:00<13:29:30,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 142/61035 [02:00<11:42:55,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 143/61035 [02:00<10:29:31,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 144/61035 [02:01<9:37:51,  1.76it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 145/61035 [02:03<17:44:17,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 146/61035 [02:04<14:41:23,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 147/61035 [02:04<12:34:27,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 148/61035 [02:04<11:04:56,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 149/61035 [02:05<11:26:54,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 150/61035 [02:06<11:43:04,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 151/61035 [02:07<11:50:47,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 152/61035 [02:07<11:58:26,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 153/61035 [02:08<12:02:14,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 154/61035 [02:11<24:18:56,  1.44s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 155/61035 [02:12<20:56:15,  1.24s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 156/61035 [02:13<18:32:11,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 157/61035 [02:13<16:42:08,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 158/61035 [02:14<14:06:46,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 159/61035 [02:14<12:08:40,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 160/61035 [02:15<10:51:10,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 161/61035 [02:16<10:54:35,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 162/61035 [02:16<11:19:47,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 163/61035 [02:18<18:48:26,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 164/61035 [02:19<15:41:15,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 165/61035 [02:20<14:58:33,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 166/61035 [02:20<14:28:26,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 167/61035 [02:21<14:01:38,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 168/61035 [02:22<13:28:58,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 169/61035 [02:23<13:05:26,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 170/61035 [02:23<11:27:40,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 171/61035 [02:24<10:18:06,  1.64it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 172/61035 [02:26<18:38:00,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 173/61035 [02:27<16:43:59,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 174/61035 [02:27<14:01:55,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 175/61035 [02:27<12:04:56,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 176/61035 [02:28<10:46:12,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 177/61035 [02:28<10:30:24,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 178/61035 [02:29<11:14:57,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 179/61035 [02:30<11:31:22,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 180/61035 [02:31<11:46:27,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 181/61035 [02:33<18:43:24,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 182/61035 [02:33<16:33:42,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 183/61035 [02:34<15:30:13,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 184/61035 [02:35<14:45:26,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 185/61035 [02:36<14:06:07,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 186/61035 [02:36<13:21:24,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 187/61035 [02:37<11:41:42,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 188/61035 [02:37<10:29:44,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 189/61035 [02:38<9:39:17,  1.75it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 190/61035 [02:40<18:30:26,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 191/61035 [02:41<15:34:26,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 192/61035 [02:41<13:25:21,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 193/61035 [02:42<11:57:20,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 194/61035 [02:42<12:17:09,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 195/61035 [02:43<12:18:34,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 196/61035 [02:44<12:21:22,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 197/61035 [02:45<12:20:02,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 198/61035 [02:45<11:43:08,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 199/61035 [02:48<21:36:58,  1.28s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 200/61035 [02:49<19:04:10,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 201/61035 [02:49<17:17:13,  1.02s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 202/61035 [02:50<16:02:20,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 203/61035 [02:51<13:47:30,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 204/61035 [02:51<12:14:48,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 205/61035 [02:52<11:26:53,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 206/61035 [02:53<11:54:13,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 207/61035 [02:53<12:08:14,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 208/61035 [02:55<18:42:28,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 209/61035 [02:56<15:24:42,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 210/61035 [02:57<14:32:05,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 211/61035 [02:57<13:52:50,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 212/61035 [02:58<13:25:07,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 213/61035 [02:59<13:05:29,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 214/61035 [02:59<12:51:12,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 215/61035 [03:00<11:28:21,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 216/61035 [03:00<10:19:48,  1.64it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 217/61035 [03:03<20:42:12,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 218/61035 [03:04<18:27:17,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 219/61035 [03:04<16:00:22,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 220/61035 [03:05<13:45:59,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 221/61035 [03:05<12:09:49,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 222/61035 [03:06<12:02:32,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 223/61035 [03:07<12:22:40,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 224/61035 [03:08<12:36:33,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 225/61035 [03:08<12:35:41,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 226/61035 [03:11<22:50:08,  1.35s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 227/61035 [03:12<20:01:47,  1.19s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 228/61035 [03:13<17:59:16,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 229/61035 [03:14<16:30:18,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 230/61035 [03:14<15:28:08,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 231/61035 [03:15<13:23:38,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 232/61035 [03:15<11:54:38,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 233/61035 [03:16<10:57:57,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 234/61035 [03:17<11:37:24,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 235/61035 [03:19<19:05:31,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 236/61035 [03:19<15:58:22,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 237/61035 [03:20<14:29:45,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 238/61035 [03:21<14:10:36,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 239/61035 [03:22<13:56:10,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 240/61035 [03:22<13:45:38,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 241/61035 [03:23<13:33:23,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 242/61035 [03:24<12:02:50,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 243/61035 [03:24<10:59:06,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 244/61035 [03:27<19:53:21,  1.18s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 245/61035 [03:27<17:23:29,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 246/61035 [03:28<14:44:05,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 247/61035 [03:28<12:53:34,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 248/61035 [03:29<12:02:14,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 249/61035 [03:30<12:25:13,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 250/61035 [03:30<12:36:52,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 251/61035 [03:31<12:53:12,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 252/61035 [03:32<12:43:45,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 253/61035 [03:35<23:53:18,  1.41s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 254/61035 [03:36<20:25:25,  1.21s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 255/61035 [03:36<18:03:49,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 256/61035 [03:37<16:20:25,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 257/61035 [03:38<15:06:51,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 258/61035 [03:38<12:56:22,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 259/61035 [03:39<11:21:19,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 260/61035 [03:39<10:18:21,  1.64it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 261/61035 [03:40<10:43:51,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 262/61035 [03:42<18:07:29,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 263/61035 [03:42<14:59:09,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 264/61035 [03:43<12:48:02,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 265/61035 [03:43<11:20:31,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 266/61035 [03:44<11:36:04,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 267/61035 [03:45<11:48:33,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 268/61035 [03:46<11:57:19,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 269/61035 [03:46<12:03:03,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 270/61035 [03:47<11:10:32,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 271/61035 [03:49<20:20:19,  1.20s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 272/61035 [03:50<17:54:02,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 273/61035 [03:51<16:15:54,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 274/61035 [03:51<14:08:24,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 275/61035 [03:52<12:11:26,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 276/61035 [03:52<10:51:41,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 277/61035 [03:53<9:54:16,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 278/61035 [03:53<10:39:10,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 279/61035 [03:54<11:07:36,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 280/61035 [03:56<18:23:00,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 281/61035 [03:57<15:23:24,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 282/61035 [03:57<14:29:18,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 283/61035 [03:58<13:48:53,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 284/61035 [03:59<13:20:40,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 285/61035 [04:00<13:00:19,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 286/61035 [04:00<12:46:05,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 287/61035 [04:01<11:14:09,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 288/61035 [04:01<10:08:18,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 289/61035 [04:04<19:12:36,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 290/61035 [04:04<17:07:11,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 291/61035 [04:05<15:38:09,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 292/61035 [04:06<13:31:34,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 293/61035 [04:06<11:44:45,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 294/61035 [04:06<10:31:19,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 295/61035 [04:07<9:44:05,  1.73it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 296/61035 [04:08<10:32:11,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 297/61035 [04:08<11:02:53,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 298/61035 [04:10<17:53:12,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 299/61035 [04:11<14:48:30,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 300/61035 [04:12<14:01:26,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 301/61035 [04:12<13:26:53,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 302/61035 [04:13<13:04:24,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 303/61035 [04:14<12:48:32,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 304/61035 [04:14<12:41:04,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  0%|          | 305/61035 [04:15<11:28:18,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 306/61035 [04:15<10:19:05,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 307/61035 [04:18<19:07:46,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 308/61035 [04:19<17:06:36,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 309/61035 [04:19<14:36:56,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 310/61035 [04:20<12:31:06,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 311/61035 [04:20<11:06:59,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 312/61035 [04:21<10:22:38,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 313/61035 [04:21<11:03:14,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 314/61035 [04:22<11:28:10,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 315/61035 [04:23<11:43:55,  1.44it/s]

Error processing model 0: list indices must be integers or slices, not NoneType
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 316/61035 [04:26<26:22:14,  1.56s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 317/61035 [04:27<22:25:19,  1.33s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 318/61035 [04:28<19:35:48,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 319/61035 [04:28<16:14:51,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 320/61035 [04:29<13:53:42,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 321/61035 [04:29<12:16:09,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 322/61035 [04:30<12:31:11,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 323/61035 [04:31<12:41:48,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 324/61035 [04:32<12:33:57,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 325/61035 [04:34<19:28:53,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 326/61035 [04:34<15:57:19,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 327/61035 [04:35<14:54:53,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 328/61035 [04:36<14:09:43,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 329/61035 [04:36<13:34:38,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 330/61035 [04:37<13:12:30,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 331/61035 [04:38<12:34:59,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 332/61035 [04:38<11:05:49,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 333/61035 [04:39<10:05:04,  1.67it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 334/61035 [04:41<19:33:54,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 335/61035 [04:42<16:20:10,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 336/61035 [04:42<14:03:28,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 337/61035 [04:43<12:29:50,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 338/61035 [04:43<11:48:46,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 339/61035 [04:44<12:23:53,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 340/61035 [04:45<12:22:49,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 341/61035 [04:46<12:19:54,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 342/61035 [04:46<12:22:23,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 343/61035 [04:49<24:09:44,  1.43s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 344/61035 [04:50<20:52:14,  1.24s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 345/61035 [04:51<18:17:04,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 346/61035 [04:52<16:30:06,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 347/61035 [04:52<14:59:28,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 348/61035 [04:53<12:45:46,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 349/61035 [04:53<11:12:40,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 350/61035 [04:54<10:08:45,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 351/61035 [04:54<10:47:21,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 352/61035 [04:57<17:56:38,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 353/61035 [04:57<14:52:05,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 354/61035 [04:57<12:42:43,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 355/61035 [04:58<12:22:02,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 356/61035 [04:59<12:20:48,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 357/61035 [05:00<12:18:41,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 358/61035 [05:00<12:17:26,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 359/61035 [05:01<12:17:30,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 360/61035 [05:01<10:57:07,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 361/61035 [05:04<21:11:06,  1.26s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 362/61035 [05:05<18:51:53,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 363/61035 [05:06<17:14:35,  1.02s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 364/61035 [05:06<15:46:33,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 365/61035 [05:07<13:23:15,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 366/61035 [05:07<11:39:18,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 367/61035 [05:08<10:30:57,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 368/61035 [05:09<10:56:16,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 369/61035 [05:09<11:28:01,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 370/61035 [05:11<18:28:21,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 371/61035 [05:12<15:30:33,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 372/61035 [05:13<14:49:40,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 373/61035 [05:13<14:18:29,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 374/61035 [05:14<13:41:22,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 375/61035 [05:15<13:14:58,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 376/61035 [05:16<13:05:31,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 377/61035 [05:16<11:30:43,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 378/61035 [05:17<10:20:30,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 379/61035 [05:19<19:12:38,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 380/61035 [05:20<16:56:04,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 381/61035 [05:20<14:09:03,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 382/61035 [05:21<12:15:17,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 383/61035 [05:21<10:53:47,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 384/61035 [05:22<11:18:16,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 385/61035 [05:22<11:35:43,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 386/61035 [05:23<11:51:41,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 387/61035 [05:24<12:00:50,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 388/61035 [05:27<25:40:37,  1.52s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 389/61035 [05:28<21:41:02,  1.29s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 390/61035 [05:29<19:09:45,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 391/61035 [05:30<17:20:14,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 392/61035 [05:30<16:06:16,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 393/61035 [05:31<13:55:01,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 394/61035 [05:31<12:18:50,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 395/61035 [05:32<11:17:33,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 396/61035 [05:33<10:28:14,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 397/61035 [05:35<18:24:45,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 398/61035 [05:35<15:30:54,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 399/61035 [05:36<13:29:45,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 400/61035 [05:36<13:11:17,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 401/61035 [05:37<13:10:47,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 402/61035 [05:38<12:58:50,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 403/61035 [05:39<12:45:19,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 404/61035 [05:39<12:35:40,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 405/61035 [05:40<11:13:44,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 406/61035 [05:42<20:15:57,  1.20s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 407/61035 [05:43<18:08:56,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 408/61035 [05:44<16:22:38,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 409/61035 [05:44<14:21:57,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 410/61035 [05:45<12:34:31,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 411/61035 [05:45<11:21:20,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 412/61035 [05:46<11:26:32,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 413/61035 [05:47<11:42:35,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 414/61035 [05:48<11:51:23,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 415/61035 [05:50<18:25:23,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 416/61035 [05:50<15:09:47,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 417/61035 [05:51<14:18:26,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 418/61035 [05:52<13:43:37,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 419/61035 [05:52<13:14:48,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 420/61035 [05:53<12:55:45,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 421/61035 [05:54<12:43:53,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 422/61035 [05:54<11:11:48,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 423/61035 [05:55<10:04:32,  1.67it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 424/61035 [05:57<18:13:58,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 425/61035 [05:58<16:27:17,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 426/61035 [05:58<15:05:45,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 427/61035 [05:59<12:51:06,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 428/61035 [05:59<11:15:46,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 429/61035 [06:00<10:29:36,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 430/61035 [06:01<11:22:46,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 431/61035 [06:01<11:37:54,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 432/61035 [06:02<11:47:55,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 433/61035 [06:04<17:53:01,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 434/61035 [06:04<14:47:56,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 435/61035 [06:05<14:01:31,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 436/61035 [06:06<13:26:25,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 437/61035 [06:06<13:03:39,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 438/61035 [06:07<12:50:06,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 439/61035 [06:08<12:38:29,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 440/61035 [06:08<11:39:40,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 441/61035 [06:09<10:25:09,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 442/61035 [06:11<19:15:22,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 443/61035 [06:12<17:22:57,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 444/61035 [06:13<14:45:51,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 445/61035 [06:13<12:54:04,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 446/61035 [06:14<11:35:03,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 447/61035 [06:14<11:53:02,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 448/61035 [06:15<12:01:28,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 449/61035 [06:16<12:07:52,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 450/61035 [06:17<12:07:49,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 451/61035 [06:19<22:41:55,  1.35s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 452/61035 [06:20<19:56:04,  1.18s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 453/61035 [06:21<17:59:52,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 454/61035 [06:22<16:38:39,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 455/61035 [06:22<15:18:14,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 456/61035 [06:23<13:00:44,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 457/61035 [06:23<11:22:08,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 458/61035 [06:24<10:15:16,  1.64it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 459/61035 [06:24<10:10:28,  1.65it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 460/61035 [06:26<17:12:18,  1.02s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 461/61035 [06:27<14:17:21,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 462/61035 [06:27<12:16:35,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 463/61035 [06:28<10:59:26,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 464/61035 [06:29<11:21:31,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 465/61035 [06:29<11:36:23,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 466/61035 [06:30<11:48:33,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 467/61035 [06:31<11:55:22,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 468/61035 [06:31<12:00:24,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 469/61035 [06:34<23:04:26,  1.37s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 470/61035 [06:35<20:13:25,  1.20s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 471/61035 [06:36<18:04:29,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 472/61035 [06:37<16:34:13,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 473/61035 [06:37<13:56:33,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 474/61035 [06:38<12:00:07,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 475/61035 [06:38<10:39:29,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 476/61035 [06:39<9:44:38,  1.73it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 477/61035 [06:39<10:28:44,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 478/61035 [06:41<17:19:03,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 479/61035 [06:42<14:24:01,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 480/61035 [06:42<12:20:00,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 481/61035 [06:43<11:04:54,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 482/61035 [06:43<11:24:45,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 483/61035 [06:44<11:38:25,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 484/61035 [06:45<11:47:00,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 485/61035 [06:45<11:53:30,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 486/61035 [06:46<11:08:49,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 487/61035 [06:48<20:02:22,  1.19s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 488/61035 [06:49<17:42:11,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 489/61035 [06:50<16:06:01,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 490/61035 [06:51<14:58:31,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 491/61035 [06:51<13:14:35,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 492/61035 [06:52<11:33:47,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 493/61035 [06:52<10:20:57,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 494/61035 [06:53<9:55:18,  1.69it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 495/61035 [06:53<10:41:06,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 496/61035 [06:56<18:45:48,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 497/61035 [06:56<15:23:58,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 498/61035 [06:57<13:51:17,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 499/61035 [06:57<13:25:17,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 500/61035 [06:58<13:04:02,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 501/61035 [06:59<12:48:06,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 502/61035 [07:00<12:38:50,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 503/61035 [07:00<11:11:36,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 504/61035 [07:01<10:07:09,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 505/61035 [07:03<19:26:18,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 506/61035 [07:04<17:33:51,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 507/61035 [07:04<15:07:12,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 508/61035 [07:05<13:09:10,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 509/61035 [07:05<11:44:39,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 510/61035 [07:06<10:38:29,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 511/61035 [07:07<11:10:01,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 512/61035 [07:07<11:33:47,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 513/61035 [07:08<11:44:55,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 514/61035 [07:10<19:29:51,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 515/61035 [07:11<17:42:24,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 516/61035 [07:12<16:29:36,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 517/61035 [07:13<15:35:16,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 518/61035 [07:13<14:54:15,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 519/61035 [07:14<14:00:09,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 520/61035 [07:15<12:04:53,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 521/61035 [07:15<10:47:09,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 522/61035 [07:16<9:50:45,  1.71it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 523/61035 [07:18<17:55:50,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 524/61035 [07:18<14:49:24,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 525/61035 [07:19<12:40:40,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 526/61035 [07:19<11:11:33,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 527/61035 [07:20<11:32:40,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 528/61035 [07:21<11:45:01,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 529/61035 [07:21<11:53:36,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 530/61035 [07:22<11:59:06,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 531/61035 [07:23<12:02:54,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 532/61035 [07:26<22:48:59,  1.36s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 533/61035 [07:26<19:36:59,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 534/61035 [07:27<17:27:42,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 535/61035 [07:28<15:52:19,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 536/61035 [07:28<14:00:43,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 537/61035 [07:29<12:06:14,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 538/61035 [07:29<10:45:10,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 539/61035 [07:30<9:52:03,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 540/61035 [07:30<10:33:39,  1.59it/s]

Error processing model 0: list indices must be integers or slices, not NoneType
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 541/61035 [07:34<23:50:37,  1.42s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 542/61035 [07:34<20:09:40,  1.20s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 543/61035 [07:35<18:03:23,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 544/61035 [07:36<16:33:10,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 545/61035 [07:37<15:30:54,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 546/61035 [07:37<14:14:02,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 547/61035 [07:38<12:14:11,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 548/61035 [07:38<10:49:09,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 549/61035 [07:39<9:50:54,  1.71it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 550/61035 [07:41<18:57:54,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 551/61035 [07:42<15:46:32,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 552/61035 [07:42<13:34:12,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 553/61035 [07:43<12:04:29,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 554/61035 [07:43<12:19:33,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 555/61035 [07:44<12:17:25,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 556/61035 [07:45<12:16:03,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 557/61035 [07:46<12:14:22,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 558/61035 [07:46<10:59:25,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 559/61035 [07:48<19:35:25,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 560/61035 [07:49<17:39:18,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 561/61035 [07:50<16:16:23,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 562/61035 [07:50<13:43:52,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 563/61035 [07:51<11:53:34,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 564/61035 [07:51<10:35:54,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 565/61035 [07:52<10:31:44,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 566/61035 [07:53<10:59:57,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 567/61035 [07:53<11:21:07,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 568/61035 [07:55<18:07:57,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 569/61035 [07:56<15:13:11,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 570/61035 [07:57<14:32:52,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 571/61035 [07:58<14:07:40,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 572/61035 [07:58<13:34:33,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 573/61035 [07:59<13:09:29,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 574/61035 [08:00<12:03:45,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 575/61035 [08:00<10:43:09,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 576/61035 [08:00<9:50:07,  1.71it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 577/61035 [08:03<18:22:05,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 578/61035 [08:03<16:37:21,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 579/61035 [08:04<13:54:03,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 580/61035 [08:04<11:58:11,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 581/61035 [08:05<10:38:15,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 582/61035 [08:05<10:11:54,  1.65it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 583/61035 [08:06<10:47:14,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 584/61035 [08:07<11:13:45,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 585/61035 [08:08<11:28:54,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 586/61035 [08:10<18:26:42,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 587/61035 [08:10<16:10:42,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 588/61035 [08:11<14:58:14,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 589/61035 [08:12<14:08:02,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 590/61035 [08:12<13:32:30,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 591/61035 [08:13<13:09:22,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 592/61035 [08:14<12:18:08,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 593/61035 [08:14<10:51:45,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 594/61035 [08:15<9:54:08,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 595/61035 [08:17<18:29:04,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 596/61035 [08:18<15:41:59,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 597/61035 [08:18<13:16:40,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 598/61035 [08:18<11:34:51,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 599/61035 [08:19<10:24:19,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 600/61035 [08:20<10:55:27,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 601/61035 [08:20<11:18:07,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 602/61035 [08:21<11:33:56,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 603/61035 [08:22<11:44:35,  1.43it/s]

Error processing model 1: list indices must be integers or slices, not NoneType
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 604/61035 [08:26<28:41:21,  1.71s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 605/61035 [08:27<23:43:30,  1.41s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 606/61035 [08:27<18:54:26,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 607/61035 [08:27<15:29:55,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 608/61035 [08:28<13:05:23,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 609/61035 [08:29<12:21:21,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 610/61035 [08:29<12:19:50,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 611/61035 [08:30<12:23:57,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 612/61035 [08:31<12:28:20,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 613/61035 [08:33<19:12:14,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 614/61035 [08:33<16:06:25,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 615/61035 [08:34<15:22:45,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 616/61035 [08:35<14:51:56,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 617/61035 [08:36<14:28:44,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 618/61035 [08:37<13:50:41,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 619/61035 [08:37<12:09:37,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 620/61035 [08:38<10:48:34,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 621/61035 [08:38<9:53:48,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 622/61035 [08:40<19:21:58,  1.15s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 623/61035 [08:41<16:14:14,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 624/61035 [08:42<14:02:33,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 625/61035 [08:42<12:24:53,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 626/61035 [08:43<12:40:40,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 627/61035 [08:44<12:59:27,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 628/61035 [08:44<13:13:30,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 629/61035 [08:45<13:25:24,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 630/61035 [08:46<13:10:49,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 631/61035 [08:49<23:59:32,  1.43s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 632/61035 [08:50<20:54:52,  1.25s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 633/61035 [08:51<18:48:29,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 634/61035 [08:51<16:17:59,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 635/61035 [08:52<14:05:30,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 636/61035 [08:52<12:33:33,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 637/61035 [08:53<11:30:18,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 638/61035 [08:54<12:07:09,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 639/61035 [08:54<12:32:37,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 640/61035 [08:57<21:36:40,  1.29s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 641/61035 [08:58<17:46:40,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 642/61035 [08:58<16:32:30,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 643/61035 [08:59<15:39:59,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 644/61035 [09:00<15:01:39,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 645/61035 [09:01<14:37:40,  1.15it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 646/61035 [09:01<12:59:05,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 647/61035 [09:02<11:45:44,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 648/61035 [09:02<10:57:33,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 649/61035 [09:05<20:54:49,  1.25s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 650/61035 [09:06<17:21:28,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 651/61035 [09:06<14:50:43,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 652/61035 [09:07<13:05:11,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 653/61035 [09:07<13:13:56,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 654/61035 [09:08<13:24:09,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 655/61035 [09:09<13:29:06,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 656/61035 [09:10<13:33:48,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 657/61035 [09:10<12:12:48,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 658/61035 [09:13<22:19:50,  1.33s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 659/61035 [09:14<19:39:17,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 660/61035 [09:15<17:48:06,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 661/61035 [09:15<15:06:30,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 662/61035 [09:16<13:12:19,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 663/61035 [09:16<11:54:25,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 664/61035 [09:17<12:24:39,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 665/61035 [09:18<12:43:08,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 666/61035 [09:19<12:58:40,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 667/61035 [09:21<21:38:28,  1.29s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 668/61035 [09:22<19:12:33,  1.15s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 669/61035 [09:23<17:31:57,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 670/61035 [09:24<16:21:29,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 671/61035 [09:25<15:29:55,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 672/61035 [09:25<13:32:26,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 673/61035 [09:26<11:47:19,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 674/61035 [09:26<10:35:33,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 675/61035 [09:27<10:24:48,  1.61it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 676/61035 [09:29<18:45:18,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 677/61035 [09:29<15:50:08,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 678/61035 [09:30<13:43:22,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 679/61035 [09:30<12:18:55,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 680/61035 [09:31<12:41:49,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 681/61035 [09:32<13:01:50,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 682/61035 [09:33<13:10:47,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 683/61035 [09:34<13:11:46,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 684/61035 [09:34<11:52:38,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 685/61035 [09:37<21:59:56,  1.31s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 686/61035 [09:38<19:29:40,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 687/61035 [09:39<17:47:14,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 688/61035 [09:39<15:23:44,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 689/61035 [09:40<13:25:13,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 690/61035 [09:40<12:07:45,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 691/61035 [09:41<11:08:37,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 692/61035 [09:42<11:33:39,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 693/61035 [09:42<11:46:49,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 694/61035 [09:44<18:56:01,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 695/61035 [09:45<15:53:07,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 696/61035 [09:46<15:10:17,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 697/61035 [09:47<14:42:35,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 698/61035 [09:47<14:27:18,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 699/61035 [09:48<14:07:27,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 700/61035 [09:49<13:46:13,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 701/61035 [09:49<12:17:29,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 702/61035 [09:50<11:16:40,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 703/61035 [09:52<20:01:46,  1.20s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 704/61035 [09:53<16:39:08,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 705/61035 [09:53<14:21:34,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 706/61035 [09:54<12:37:22,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 707/61035 [09:55<12:47:19,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 708/61035 [09:56<12:55:56,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 709/61035 [09:56<12:59:20,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 710/61035 [09:57<13:01:39,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 711/61035 [09:58<12:38:12,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 712/61035 [10:01<23:05:42,  1.38s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 713/61035 [10:01<20:14:28,  1.21s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 714/61035 [10:02<18:14:54,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 715/61035 [10:03<15:31:50,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 716/61035 [10:03<13:33:14,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 717/61035 [10:04<12:11:43,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 718/61035 [10:04<11:11:39,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 719/61035 [10:05<11:54:24,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 720/61035 [10:06<12:05:23,  1.39it/s]

Error processing model 0: list indices must be integers or slices, not NoneType
Error processing model 0: list indices must be integers or slices, not NoneType
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 721/61035 [10:11<34:22:41,  2.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 722/61035 [10:12<26:41:07,  1.59s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 723/61035 [10:12<21:24:28,  1.28s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 724/61035 [10:13<17:48:09,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 725/61035 [10:14<16:33:05,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 726/61035 [10:14<15:42:55,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 727/61035 [10:15<15:07:26,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 728/61035 [10:16<14:16:22,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 729/61035 [10:16<12:31:40,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 730/61035 [10:19<21:11:59,  1.27s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 731/61035 [10:20<18:47:48,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 732/61035 [10:20<17:02:40,  1.02s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 733/61035 [10:21<14:39:27,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 734/61035 [10:22<12:57:51,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 735/61035 [10:22<11:45:22,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 736/61035 [10:23<12:09:25,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 737/61035 [10:24<12:12:32,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 738/61035 [10:24<12:14:51,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 739/61035 [10:27<19:56:45,  1.19s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 740/61035 [10:27<17:40:22,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 741/61035 [10:28<16:03:22,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 742/61035 [10:29<14:55:25,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 743/61035 [10:30<14:08:12,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 744/61035 [10:30<13:37:40,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 745/61035 [10:31<11:54:55,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 746/61035 [10:31<10:40:52,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 747/61035 [10:32<10:10:41,  1.65it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 748/61035 [10:34<20:37:10,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 749/61035 [10:35<17:10:11,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 750/61035 [10:36<14:42:03,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 751/61035 [10:36<13:00:40,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 752/61035 [10:37<12:42:55,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 753/61035 [10:38<12:59:45,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 754/61035 [10:38<13:15:56,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 755/61035 [10:39<13:19:24,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 756/61035 [10:40<13:13:35,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 757/61035 [10:43<23:57:24,  1.43s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 758/61035 [10:44<20:50:53,  1.25s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 759/61035 [10:45<18:32:59,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 760/61035 [10:45<15:29:35,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 761/61035 [10:45<13:05:08,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|          | 762/61035 [10:46<11:27:38,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 763/61035 [10:47<11:30:28,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 764/61035 [10:47<11:44:04,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 765/61035 [10:48<11:50:28,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 766/61035 [10:50<18:43:33,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 767/61035 [10:51<15:36:56,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 768/61035 [10:51<14:49:44,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 769/61035 [10:52<14:15:34,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 770/61035 [10:53<13:38:08,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 771/61035 [10:54<13:14:11,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 772/61035 [10:54<12:00:08,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 773/61035 [10:55<10:39:29,  1.57it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 774/61035 [10:55<9:43:31,  1.72it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 775/61035 [10:57<18:39:19,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 776/61035 [10:58<15:35:36,  1.07it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 777/61035 [10:58<13:25:21,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 778/61035 [10:59<11:57:05,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 779/61035 [11:00<12:13:35,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 780/61035 [11:01<12:12:17,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 781/61035 [11:01<12:14:15,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 782/61035 [11:02<12:17:15,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 783/61035 [11:03<12:13:38,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 784/61035 [11:05<21:39:58,  1.29s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 785/61035 [11:06<19:07:43,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 786/61035 [11:07<17:16:40,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 787/61035 [11:08<15:58:47,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 788/61035 [11:08<15:03:31,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 789/61035 [11:09<12:47:51,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 790/61035 [11:09<11:12:46,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 791/61035 [11:10<10:06:38,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 792/61035 [11:10<10:43:06,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 793/61035 [11:12<17:24:19,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 794/61035 [11:13<14:25:47,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 795/61035 [11:13<12:21:21,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 796/61035 [11:14<11:45:19,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 797/61035 [11:15<11:53:53,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 798/61035 [11:15<11:57:46,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 799/61035 [11:16<11:59:47,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 800/61035 [11:17<12:01:15,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 801/61035 [11:17<10:41:35,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 802/61035 [11:20<19:39:16,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 803/61035 [11:21<17:40:38,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 804/61035 [11:21<16:20:11,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 805/61035 [11:22<13:59:27,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 806/61035 [11:22<12:07:29,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 807/61035 [11:23<10:45:32,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 808/61035 [11:23<11:00:42,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 809/61035 [11:24<11:20:49,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 810/61035 [11:25<11:37:00,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 811/61035 [11:27<18:39:05,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 812/61035 [11:28<16:06:23,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 813/61035 [11:28<14:59:46,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 814/61035 [11:29<14:13:55,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 815/61035 [11:30<13:40:19,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 816/61035 [11:31<13:16:27,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 817/61035 [11:31<12:20:03,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 818/61035 [11:32<10:53:21,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 819/61035 [11:32<9:51:59,  1.70it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 820/61035 [11:34<18:14:05,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 821/61035 [11:35<16:25:26,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 822/61035 [11:36<13:46:39,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 823/61035 [11:36<11:53:14,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 824/61035 [11:36<10:35:10,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 825/61035 [11:37<11:10:06,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 826/61035 [11:38<11:29:56,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 827/61035 [11:39<11:45:42,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 828/61035 [11:39<11:55:51,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 829/61035 [11:42<23:17:44,  1.39s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 830/61035 [11:43<20:11:51,  1.21s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 831/61035 [11:44<18:03:30,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 832/61035 [11:45<16:31:48,  1.01it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 833/61035 [11:45<15:25:57,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 834/61035 [11:46<13:04:20,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 835/61035 [11:46<11:25:25,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 836/61035 [11:47<10:15:40,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 837/61035 [11:47<10:29:03,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 838/61035 [11:50<18:11:51,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 839/61035 [11:50<15:22:23,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 840/61035 [11:51<13:16:43,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 841/61035 [11:51<13:08:01,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 842/61035 [11:52<12:49:40,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 843/61035 [11:53<12:37:03,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 844/61035 [11:54<12:26:48,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 845/61035 [11:54<12:19:17,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 846/61035 [11:55<10:54:49,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 847/61035 [11:57<19:18:37,  1.15s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 848/61035 [11:58<17:15:28,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 849/61035 [11:59<15:47:07,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 850/61035 [11:59<14:17:45,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 851/61035 [12:00<12:17:43,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 852/61035 [12:00<10:54:49,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 853/61035 [12:01<9:57:13,  1.68it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 854/61035 [12:01<10:36:07,  1.58it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 855/61035 [12:02<11:03:00,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 856/61035 [12:04<17:50:07,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 857/61035 [12:05<14:45:44,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 858/61035 [12:05<13:02:26,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 859/61035 [12:06<12:46:43,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 860/61035 [12:07<12:34:31,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 861/61035 [12:07<12:28:33,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 862/61035 [12:08<12:21:21,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 863/61035 [12:08<10:59:22,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 864/61035 [12:09<9:56:12,  1.68it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 865/61035 [12:11<18:47:24,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 866/61035 [12:12<16:48:14,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 867/61035 [12:13<15:20:08,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 868/61035 [12:13<13:00:06,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 869/61035 [12:14<11:20:50,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 870/61035 [12:14<10:12:56,  1.64it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 871/61035 [12:15<10:49:30,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 872/61035 [12:16<11:14:10,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 873/61035 [12:16<11:30:51,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 874/61035 [12:18<19:23:06,  1.16s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 875/61035 [12:19<17:31:46,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 876/61035 [12:20<16:08:02,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 877/61035 [12:21<14:55:40,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 878/61035 [12:22<14:04:16,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 879/61035 [12:22<13:28:46,  1.24it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 880/61035 [12:23<11:43:14,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 881/61035 [12:23<10:28:58,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 882/61035 [12:24<9:35:11,  1.74it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 883/61035 [12:26<17:57:31,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 884/61035 [12:26<14:50:16,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 885/61035 [12:27<12:36:56,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 886/61035 [12:27<11:04:41,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 887/61035 [12:28<11:04:37,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 888/61035 [12:29<11:22:07,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 889/61035 [12:29<11:35:21,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 890/61035 [12:30<11:45:23,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 891/61035 [12:31<11:51:21,  1.41it/s]

Error processing model 0: list indices must be integers or slices, not NoneType
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 892/61035 [12:35<31:01:03,  1.86s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 893/61035 [12:36<25:18:46,  1.52s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 894/61035 [12:37<20:20:31,  1.22s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 895/61035 [12:37<16:54:24,  1.01s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 896/61035 [12:38<14:22:24,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 897/61035 [12:38<13:39:48,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 898/61035 [12:39<13:14:07,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 899/61035 [12:40<12:56:47,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 900/61035 [12:40<12:46:00,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 901/61035 [12:43<23:03:06,  1.38s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 902/61035 [12:44<19:45:42,  1.18s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 903/61035 [12:45<17:27:29,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 904/61035 [12:45<15:51:19,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 905/61035 [12:46<14:42:50,  1.14it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 906/61035 [12:47<13:45:27,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 907/61035 [12:47<11:51:21,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 908/61035 [12:48<10:31:38,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 909/61035 [12:48<9:37:29,  1.74it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 910/61035 [12:50<17:48:26,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 911/61035 [12:51<14:44:54,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 912/61035 [12:51<12:33:09,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 913/61035 [12:52<11:01:53,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 914/61035 [12:52<11:22:19,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  1%|▏         | 915/61035 [12:53<11:33:49,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 916/61035 [12:54<11:43:39,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 917/61035 [12:55<11:49:08,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 918/61035 [12:55<11:52:53,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 919/61035 [12:59<24:08:18,  1.45s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 920/61035 [12:59<20:33:18,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 921/61035 [13:00<18:01:07,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 922/61035 [13:01<16:17:10,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 923/61035 [13:01<14:57:47,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 924/61035 [13:02<12:44:03,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 925/61035 [13:02<11:09:57,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 926/61035 [13:03<10:04:42,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 927/61035 [13:03<9:18:38,  1.79it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 928/61035 [13:06<18:20:16,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 929/61035 [13:06<15:25:47,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 930/61035 [13:07<13:16:25,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 931/61035 [13:07<11:35:29,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 932/61035 [13:08<11:46:59,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 933/61035 [13:09<11:50:30,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 934/61035 [13:09<11:55:57,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 935/61035 [13:10<11:59:16,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 936/61035 [13:11<12:05:02,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 937/61035 [13:13<22:06:06,  1.32s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 938/61035 [13:14<19:06:21,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 939/61035 [13:15<17:00:01,  1.02s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 940/61035 [13:16<15:31:11,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 941/61035 [13:16<13:09:22,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 942/61035 [13:17<11:27:37,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 943/61035 [13:17<10:17:00,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 944/61035 [13:18<10:29:02,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 945/61035 [13:18<10:58:25,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 946/61035 [13:20<18:21:03,  1.10s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 947/61035 [13:21<15:04:50,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 948/61035 [13:21<12:48:46,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 949/61035 [13:22<12:35:23,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 950/61035 [13:23<12:27:10,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 951/61035 [13:24<12:20:06,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 952/61035 [13:24<12:16:28,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 953/61035 [13:25<11:33:15,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 954/61035 [13:25<10:19:43,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 955/61035 [13:28<19:42:33,  1.18s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 956/61035 [13:29<17:43:45,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 957/61035 [13:29<15:57:22,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 958/61035 [13:30<13:41:05,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 959/61035 [13:30<11:58:56,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 960/61035 [13:31<11:22:08,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 961/61035 [13:32<11:36:28,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 962/61035 [13:32<11:46:46,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 963/61035 [13:33<12:00:34,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 964/61035 [13:35<19:28:14,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 965/61035 [13:36<16:14:05,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 966/61035 [13:37<15:23:17,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 967/61035 [13:37<14:47:23,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 968/61035 [13:38<14:22:30,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 969/61035 [13:39<13:42:03,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 970/61035 [13:39<12:10:18,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 971/61035 [13:40<10:47:24,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 972/61035 [13:40<9:52:09,  1.69it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 973/61035 [13:43<18:54:23,  1.13s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 974/61035 [13:43<15:48:54,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 975/61035 [13:44<13:39:59,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 976/61035 [13:44<12:07:39,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 977/61035 [13:45<12:06:22,  1.38it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 978/61035 [13:46<12:08:47,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 979/61035 [13:47<12:12:31,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 980/61035 [13:47<12:11:28,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 981/61035 [13:48<12:10:22,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 982/61035 [13:51<22:56:42,  1.38s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 983/61035 [13:52<19:42:34,  1.18s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 984/61035 [13:52<17:26:19,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 985/61035 [13:53<15:52:29,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 986/61035 [13:54<14:22:12,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 987/61035 [13:54<12:17:53,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 988/61035 [13:55<10:51:47,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 989/61035 [13:55<9:52:55,  1.69it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 990/61035 [13:56<10:32:58,  1.58it/s]

Error processing model 0: Accessing value before it's been set.
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 991/61035 [14:00<28:18:01,  1.70s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 992/61035 [14:01<23:40:57,  1.42s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 993/61035 [14:02<20:21:21,  1.22s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 994/61035 [14:02<17:51:33,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 995/61035 [14:03<15:49:59,  1.05it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 996/61035 [14:03<13:19:26,  1.25it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 997/61035 [14:04<11:34:08,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 998/61035 [14:04<10:25:06,  1.60it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 999/61035 [14:05<9:41:19,  1.72it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1000/61035 [14:07<17:17:50,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1001/61035 [14:07<14:20:39,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1002/61035 [14:08<12:16:22,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1003/61035 [14:08<10:52:46,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1004/61035 [14:09<11:14:08,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1005/61035 [14:10<11:28:39,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1006/61035 [14:10<11:39:06,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1007/61035 [14:11<11:46:09,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1008/61035 [14:12<11:51:38,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1009/61035 [14:15<24:08:17,  1.45s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1010/61035 [14:16<20:31:47,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1011/61035 [14:16<18:00:14,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1012/61035 [14:17<16:15:02,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1013/61035 [14:18<14:47:54,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1014/61035 [14:18<12:37:45,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1015/61035 [14:19<11:04:47,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1016/61035 [14:19<10:02:10,  1.66it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1017/61035 [14:20<10:42:28,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1018/61035 [14:22<18:14:05,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1019/61035 [14:23<15:00:40,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1020/61035 [14:23<12:47:39,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1021/61035 [14:24<12:37:45,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1022/61035 [14:24<12:29:59,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1023/61035 [14:25<12:21:08,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1024/61035 [14:26<12:17:25,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1025/61035 [14:27<12:15:30,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1026/61035 [14:27<10:49:36,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1027/61035 [14:29<19:26:58,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1028/61035 [14:30<17:36:30,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1029/61035 [14:31<16:11:46,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1030/61035 [14:32<13:55:37,  1.20it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1031/61035 [14:32<11:59:34,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1032/61035 [14:32<10:39:30,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1033/61035 [14:33<9:43:24,  1.71it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1034/61035 [14:34<10:29:01,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1035/61035 [14:34<10:57:47,  1.52it/s]

Error processing model 0: Accessing value before it's been set.
Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1036/61035 [14:38<28:14:41,  1.69s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1037/61035 [14:39<23:48:21,  1.43s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1038/61035 [14:40<20:32:51,  1.23s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1039/61035 [14:41<18:14:53,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1040/61035 [14:41<15:03:54,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1041/61035 [14:42<12:48:09,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1042/61035 [14:42<11:29:04,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1043/61035 [14:43<11:36:53,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1044/61035 [14:44<11:47:10,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1045/61035 [14:46<18:11:35,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1046/61035 [14:46<14:59:43,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1047/61035 [14:47<13:48:19,  1.21it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1048/61035 [14:48<13:16:14,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1049/61035 [14:48<12:53:37,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1050/61035 [14:49<12:37:54,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1051/61035 [14:50<12:27:16,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1052/61035 [14:50<10:58:55,  1.52it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1053/61035 [14:51<9:55:41,  1.68it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1054/61035 [14:53<18:04:57,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1055/61035 [14:54<16:20:14,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1056/61035 [14:54<15:03:29,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1057/61035 [14:55<12:50:04,  1.30it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1058/61035 [14:55<11:13:08,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1059/61035 [14:56<10:06:30,  1.65it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1060/61035 [14:56<10:41:55,  1.56it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1061/61035 [14:57<11:05:41,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1062/61035 [14:58<11:24:46,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1063/61035 [15:00<19:32:14,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1064/61035 [15:01<17:40:38,  1.06s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1065/61035 [15:02<16:23:04,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1066/61035 [15:02<15:09:33,  1.10it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1067/61035 [15:03<14:12:36,  1.17it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1068/61035 [15:04<13:40:29,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1069/61035 [15:04<11:49:44,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1070/61035 [15:05<10:30:11,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1071/61035 [15:05<9:34:42,  1.74it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1072/61035 [15:07<17:45:51,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1073/61035 [15:08<16:05:15,  1.04it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1074/61035 [15:09<13:32:20,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1075/61035 [15:09<11:43:06,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1076/61035 [15:10<10:27:26,  1.59it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1077/61035 [15:10<10:52:48,  1.53it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1078/61035 [15:11<11:13:28,  1.48it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1079/61035 [15:12<11:28:13,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1080/61035 [15:12<11:45:01,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1081/61035 [15:15<18:33:46,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1082/61035 [15:15<16:18:49,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1083/61035 [15:16<15:01:11,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1084/61035 [15:17<14:07:20,  1.18it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1085/61035 [15:17<13:30:23,  1.23it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1086/61035 [15:18<13:05:27,  1.27it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1087/61035 [15:19<11:27:47,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1088/61035 [15:19<10:16:46,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1089/61035 [15:19<9:26:23,  1.76it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1090/61035 [15:22<18:26:19,  1.11s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1091/61035 [15:22<15:41:01,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1092/61035 [15:23<13:35:45,  1.22it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1093/61035 [15:23<12:08:22,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1094/61035 [15:24<11:06:34,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1095/61035 [15:25<11:26:32,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1096/61035 [15:25<11:38:10,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1097/61035 [15:26<11:45:04,  1.42it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1098/61035 [15:27<11:51:17,  1.40it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1099/61035 [15:30<23:59:59,  1.44s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1100/61035 [15:31<20:50:04,  1.25s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1101/61035 [15:32<18:37:23,  1.12s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1102/61035 [15:32<16:39:44,  1.00s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1103/61035 [15:33<15:18:01,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1104/61035 [15:34<13:00:11,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1105/61035 [15:34<11:23:14,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1106/61035 [15:34<10:11:39,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1107/61035 [15:35<9:23:41,  1.77it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1108/61035 [15:37<18:12:14,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1109/61035 [15:38<15:23:42,  1.08it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1110/61035 [15:38<13:15:07,  1.26it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1111/61035 [15:39<11:32:19,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1112/61035 [15:39<10:47:22,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1113/61035 [15:40<11:11:01,  1.49it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1114/61035 [15:41<11:26:28,  1.45it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1115/61035 [15:41<11:40:09,  1.43it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1116/61035 [15:42<11:47:07,  1.41it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1117/61035 [15:45<24:10:47,  1.45s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1118/61035 [15:46<20:35:00,  1.24s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1119/61035 [15:47<18:03:40,  1.09s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1120/61035 [15:48<16:15:06,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1121/61035 [15:48<15:01:03,  1.11it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1122/61035 [15:49<13:00:55,  1.28it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1123/61035 [15:49<11:21:07,  1.47it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1124/61035 [15:50<10:12:24,  1.63it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1125/61035 [15:50<9:58:26,  1.67it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1126/61035 [15:52<17:28:29,  1.05s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1127/61035 [15:53<14:49:00,  1.12it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1128/61035 [15:53<12:52:54,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1129/61035 [15:54<12:37:43,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1130/61035 [15:55<12:26:56,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1131/61035 [15:56<12:21:23,  1.35it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1132/61035 [15:56<12:15:59,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1133/61035 [15:57<12:13:13,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1134/61035 [15:57<11:01:14,  1.51it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1135/61035 [16:00<19:24:07,  1.17s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1136/61035 [16:01<17:10:57,  1.03s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1137/61035 [16:01<15:39:08,  1.06it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1138/61035 [16:02<14:17:26,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1139/61035 [16:02<12:14:43,  1.36it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1140/61035 [16:03<10:48:37,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1141/61035 [16:03<9:51:01,  1.69it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1142/61035 [16:04<9:32:36,  1.74it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1143/61035 [16:05<10:17:04,  1.62it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1144/61035 [16:07<17:20:36,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1145/61035 [16:07<14:45:38,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1146/61035 [16:08<12:54:27,  1.29it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1147/61035 [16:08<12:39:11,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1148/61035 [16:09<12:29:32,  1.33it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1149/61035 [16:10<12:25:22,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1150/61035 [16:11<12:25:15,  1.34it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1151/61035 [16:11<12:10:21,  1.37it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1152/61035 [16:12<10:49:27,  1.54it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1153/61035 [16:14<19:47:19,  1.19s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1154/61035 [16:15<17:46:06,  1.07s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1155/61035 [16:16<16:19:55,  1.02it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1156/61035 [16:16<13:55:31,  1.19it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1157/61035 [16:17<11:59:39,  1.39it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1158/61035 [16:17<10:42:54,  1.55it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1159/61035 [16:18<11:07:02,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1160/61035 [16:19<11:23:08,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1161/61035 [16:19<11:35:17,  1.44it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1162/61035 [16:21<18:58:29,  1.14s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1163/61035 [16:22<17:17:26,  1.04s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1164/61035 [16:23<16:06:34,  1.03it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1165/61035 [16:24<15:17:49,  1.09it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1166/61035 [16:25<14:19:58,  1.16it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1167/61035 [16:25<12:40:02,  1.31it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1168/61035 [16:26<11:06:55,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1169/61035 [16:26<10:05:19,  1.65it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1170/61035 [16:27<9:58:05,  1.67it/s] 

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1171/61035 [16:29<17:53:17,  1.08s/it]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1172/61035 [16:29<14:46:27,  1.13it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1173/61035 [16:30<12:35:04,  1.32it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1174/61035 [16:30<11:04:32,  1.50it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1175/61035 [16:31<11:21:49,  1.46it/s]

Model weight norms:  torch.Size([57344])
f torch.Size([8192, 57344])


  2%|▏         | 1176/61035 [16:32<14:01:40,  1.19it/s]


KeyboardInterrupt: 