In [1]:
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

In [3]:
gpu = torch.device('cuda:0')
model_id = "google/codegemma-1.1-2b"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map='auto', torch_dtype=torch.bfloat16)

`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.


in oss file


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
# create sample input

In [12]:
import requests
sample = requests.get('https://raw.githubusercontent.com/pallets/flask/main/tests/test_basic.py').content.decode('utf-8')

In [48]:
def infer_batch(prompt, prompt_tokens=-1, max_new_tokens=128, top_p=0.9, temperature=0.1, num_return_sequences=1):
    inputs = tokenizer(prompt, return_tensors="pt").to(device)
    if prompt_tokens < 0:
        prompt_tokens = len(inputs.input_ids[0])
    with torch.no_grad():
        generated_ids = model.generate(input_ids=inputs.input_ids[:,-prompt_tokens:],
                                       attention_mask=inputs.attention_mask[:,-prompt_tokens:],
                                       max_new_tokens=max_new_tokens, 
                                       #stopping_criteria=stopper, 
                                       top_p=top_p,
                                       temperature=temperature,
                                       do_sample=True,
                                       num_return_sequences=num_return_sequences)
    output = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    del generated_ids
    del inputs
    return output

In [52]:
import time
from tqdm import tqdm
import random


def last_tokens(prompt, tokens):
    return tokenizer.decode(tokenizer(prompt, return_tensors='pt').input_ids[:,-tokens:][0])

def benchmark(prompt, tokens_in, tokens_out, batch_sizes, n_iter):
    experiments = []
    results = []
    for num_in in tokens_in:
        for num_out in tokens_out:
            for batch_size in batch_sizes:
                for n in range(n_iter):
                    if batch_size * (num_in + num_out) <= LIMIT:
                        experiments.append((num_in, num_out, batch_size, n))
    print(f'{len(experiments)=}')
    
    random.shuffle(experiments)
    
    for num_in, num_out, batch_size, n in tqdm(experiments):
        print(f'{num_in=}, {num_out=}, {batch_size=}')
        input = last_tokens(prompt, num_in)
        torch.cuda.empty_cache()
        t = time.time()
        infer_batch(input, 
                    max_new_tokens=num_out, 
                    num_return_sequences=batch_size)
        t = time.time() - t
        results.append((num_in, num_out, batch_size, n, t))
    return results

In [62]:
LIMIT = 2048
device = gpu
bench_tokens_in = range(128, 1025, 128)
bench_tokens_out = range(64, 513, 64)
bench_batch_size = [1, 4, 16]
n_iter = 10
b = benchmark(sample, bench_tokens_in, bench_tokens_out, bench_batch_size, n_iter)

len(experiments)=760


  0%|          | 0/760 [00:00<?, ?it/s]

num_in=896, num_out=256, batch_size=1


  0%|          | 1/760 [00:02<31:27,  2.49s/it]

num_in=896, num_out=512, batch_size=1


  0%|          | 2/760 [00:07<48:17,  3.82s/it]

num_in=1024, num_out=256, batch_size=1


  0%|          | 3/760 [00:09<40:08,  3.18s/it]

num_in=256, num_out=128, batch_size=1
num_in=1024, num_out=64, batch_size=1


  1%|          | 5/760 [00:10<19:53,  1.58s/it]

num_in=640, num_out=192, batch_size=1


  1%|          | 6/760 [00:12<20:43,  1.65s/it]

num_in=128, num_out=128, batch_size=1
num_in=384, num_out=192, batch_size=1


  1%|▏         | 11/760 [00:14<08:39,  1.44it/s]

num_in=128, num_out=128, batch_size=1
num_in=128, num_out=192, batch_size=4
num_in=256, num_out=448, batch_size=1
num_in=384, num_out=384, batch_size=1


  2%|▏         | 15/760 [00:17<08:43,  1.42it/s]

num_in=128, num_out=256, batch_size=1
num_in=128, num_out=128, batch_size=4
num_in=128, num_out=192, batch_size=4
num_in=640, num_out=128, batch_size=1


  2%|▏         | 16/760 [00:19<09:50,  1.26it/s]

num_in=896, num_out=256, batch_size=1


  2%|▏         | 17/760 [00:21<13:50,  1.12s/it]

num_in=512, num_out=192, batch_size=1


  2%|▏         | 18/760 [00:23<15:44,  1.27s/it]

num_in=1024, num_out=64, batch_size=1


  2%|▎         | 19/760 [00:23<13:51,  1.12s/it]

num_in=512, num_out=64, batch_size=1


  3%|▎         | 20/760 [00:24<12:19,  1.00it/s]

num_in=512, num_out=448, batch_size=1


  3%|▎         | 21/760 [00:28<22:42,  1.84s/it]

num_in=512, num_out=512, batch_size=1


  3%|▎         | 22/760 [00:33<32:31,  2.64s/it]

num_in=128, num_out=448, batch_size=1
num_in=384, num_out=384, batch_size=1


  3%|▎         | 24/760 [00:37<27:49,  2.27s/it]

num_in=256, num_out=448, batch_size=1
num_in=256, num_out=384, batch_size=1
num_in=512, num_out=128, batch_size=1


  4%|▍         | 29/760 [00:38<11:41,  1.04it/s]

num_in=128, num_out=64, batch_size=1
num_in=1024, num_out=192, batch_size=1
num_in=896, num_out=128, batch_size=1


  4%|▍         | 30/760 [00:39<12:18,  1.01s/it]

num_in=512, num_out=64, batch_size=1


  4%|▍         | 31/760 [00:40<11:21,  1.07it/s]

num_in=896, num_out=192, batch_size=1


  4%|▍         | 32/760 [00:42<13:47,  1.14s/it]

num_in=512, num_out=512, batch_size=1


  4%|▍         | 33/760 [00:46<24:31,  2.02s/it]

num_in=256, num_out=512, batch_size=1
num_in=768, num_out=256, batch_size=1


  5%|▍         | 37/760 [00:49<13:02,  1.08s/it]

num_in=256, num_out=64, batch_size=1
num_in=256, num_out=128, batch_size=1
num_in=512, num_out=192, batch_size=1


  5%|▌         | 38/760 [00:51<14:49,  1.23s/it]

num_in=1024, num_out=320, batch_size=1
num_in=768, num_out=448, batch_size=1


  5%|▌         | 40/760 [00:55<18:49,  1.57s/it]

num_in=128, num_out=192, batch_size=1
num_in=128, num_out=384, batch_size=1
num_in=1024, num_out=256, batch_size=1


  6%|▌         | 43/760 [00:57<14:48,  1.24s/it]

num_in=768, num_out=320, batch_size=1


  6%|▌         | 46/760 [01:01<12:29,  1.05s/it]

num_in=256, num_out=512, batch_size=1
num_in=256, num_out=192, batch_size=1
num_in=384, num_out=128, batch_size=1


  6%|▌         | 47/760 [01:02<12:49,  1.08s/it]

num_in=384, num_out=448, batch_size=1


  6%|▋         | 48/760 [01:06<20:39,  1.74s/it]

num_in=896, num_out=512, batch_size=1


  6%|▋         | 49/760 [01:11<29:03,  2.45s/it]

num_in=640, num_out=64, batch_size=1


  7%|▋         | 50/760 [01:11<23:38,  2.00s/it]

num_in=1024, num_out=64, batch_size=1


  7%|▋         | 51/760 [01:12<19:24,  1.64s/it]

num_in=768, num_out=448, batch_size=1


  7%|▋         | 52/760 [01:16<27:37,  2.34s/it]

num_in=1024, num_out=384, batch_size=1


  7%|▋         | 53/760 [01:20<31:46,  2.70s/it]

num_in=1024, num_out=448, batch_size=1


  7%|▋         | 54/760 [01:24<36:49,  3.13s/it]

num_in=384, num_out=64, batch_size=1


  7%|▋         | 55/760 [01:25<28:17,  2.41s/it]

num_in=128, num_out=64, batch_size=4
num_in=640, num_out=512, batch_size=1


  8%|▊         | 59/760 [01:30<17:20,  1.48s/it]

num_in=256, num_out=192, batch_size=1
num_in=128, num_out=384, batch_size=4
num_in=256, num_out=384, batch_size=1
num_in=768, num_out=64, batch_size=1


  8%|▊         | 61/760 [01:30<12:30,  1.07s/it]

num_in=1024, num_out=448, batch_size=1


  8%|▊         | 62/760 [01:34<19:55,  1.71s/it]

num_in=896, num_out=320, batch_size=1


  9%|▊         | 65/760 [01:38<14:29,  1.25s/it]

num_in=128, num_out=128, batch_size=4
num_in=1024, num_out=64, batch_size=1
num_in=512, num_out=512, batch_size=1


  9%|▊         | 66/760 [01:42<23:24,  2.02s/it]

num_in=384, num_out=448, batch_size=1


  9%|▉         | 67/760 [01:47<29:17,  2.54s/it]

num_in=128, num_out=320, batch_size=4
num_in=384, num_out=320, batch_size=1


  9%|▉         | 69/760 [01:50<24:29,  2.13s/it]

num_in=768, num_out=512, batch_size=1


  9%|▉         | 70/760 [01:54<31:17,  2.72s/it]

num_in=128, num_out=192, batch_size=4
num_in=384, num_out=448, batch_size=1


  9%|▉         | 72/760 [01:59<28:30,  2.49s/it]

num_in=384, num_out=192, batch_size=1


 10%|▉         | 73/760 [02:00<26:45,  2.34s/it]

num_in=768, num_out=512, batch_size=1


 10%|▉         | 74/760 [02:05<33:20,  2.92s/it]

num_in=512, num_out=512, batch_size=1


 10%|▉         | 75/760 [02:10<38:40,  3.39s/it]

num_in=256, num_out=256, batch_size=4
num_in=896, num_out=512, batch_size=1


 10%|█         | 77/760 [02:15<34:03,  2.99s/it]

num_in=128, num_out=192, batch_size=1
num_in=640, num_out=448, batch_size=1


 10%|█         | 79/760 [02:19<30:12,  2.66s/it]

num_in=128, num_out=320, batch_size=4
num_in=512, num_out=384, batch_size=1


 11%|█         | 81/760 [02:23<26:49,  2.37s/it]

num_in=128, num_out=64, batch_size=4
num_in=384, num_out=128, batch_size=1


 11%|█         | 83/760 [02:24<20:15,  1.80s/it]

num_in=128, num_out=256, batch_size=4
num_in=896, num_out=512, batch_size=1


 11%|█         | 85/760 [02:29<22:29,  2.00s/it]

num_in=256, num_out=256, batch_size=4
num_in=384, num_out=192, batch_size=1


 11%|█▏        | 87/760 [02:31<18:44,  1.67s/it]

num_in=1024, num_out=320, batch_size=1


 12%|█▏        | 88/760 [02:34<21:29,  1.92s/it]

num_in=640, num_out=256, batch_size=1


 12%|█▏        | 89/760 [02:36<22:38,  2.02s/it]

num_in=768, num_out=448, batch_size=1


 12%|█▏        | 90/760 [02:41<28:10,  2.52s/it]

num_in=128, num_out=320, batch_size=1
num_in=384, num_out=256, batch_size=1


 12%|█▏        | 92/760 [02:43<22:21,  2.01s/it]

num_in=896, num_out=320, batch_size=1


 12%|█▏        | 93/760 [02:45<23:30,  2.12s/it]

num_in=1024, num_out=192, batch_size=1


 12%|█▏        | 94/760 [02:47<22:46,  2.05s/it]

num_in=128, num_out=64, batch_size=1
num_in=640, num_out=192, batch_size=1


 13%|█▎        | 96/760 [02:49<17:41,  1.60s/it]

num_in=512, num_out=320, batch_size=1


 13%|█▎        | 97/760 [02:52<21:09,  1.92s/it]

num_in=896, num_out=448, batch_size=1


 13%|█▎        | 98/760 [02:56<27:15,  2.47s/it]

num_in=384, num_out=128, batch_size=1


 13%|█▎        | 99/760 [02:58<23:41,  2.15s/it]

num_in=896, num_out=128, batch_size=1


 13%|█▎        | 102/760 [02:59<12:06,  1.10s/it]

num_in=128, num_out=384, batch_size=1
num_in=256, num_out=192, batch_size=4
num_in=128, num_out=512, batch_size=1
num_in=768, num_out=384, batch_size=1


 14%|█▎        | 104/760 [03:03<15:05,  1.38s/it]

num_in=384, num_out=128, batch_size=4


 14%|█▍        | 105/760 [03:04<15:04,  1.38s/it]

num_in=384, num_out=384, batch_size=1


 14%|█▍        | 106/760 [03:08<20:45,  1.90s/it]

num_in=640, num_out=320, batch_size=1


 14%|█▍        | 107/760 [03:11<23:44,  2.18s/it]

num_in=1024, num_out=320, batch_size=1
num_in=896, num_out=512, batch_size=1


 14%|█▍        | 109/760 [03:16<24:55,  2.30s/it]

num_in=384, num_out=320, batch_size=1


 14%|█▍        | 110/760 [03:19<26:39,  2.46s/it]

num_in=640, num_out=384, batch_size=1


 15%|█▍        | 111/760 [03:22<29:39,  2.74s/it]

num_in=128, num_out=256, batch_size=1
num_in=768, num_out=128, batch_size=1


 15%|█▍        | 113/760 [03:23<20:11,  1.87s/it]

num_in=256, num_out=64, batch_size=4
num_in=384, num_out=512, batch_size=1


 15%|█▌        | 115/760 [03:28<22:24,  2.08s/it]

num_in=1024, num_out=128, batch_size=1
num_in=384, num_out=128, batch_size=4


 15%|█▌        | 117/760 [03:30<17:19,  1.62s/it]

num_in=640, num_out=128, batch_size=1


 16%|█▌        | 118/760 [03:31<16:32,  1.55s/it]

num_in=256, num_out=128, batch_size=1
num_in=384, num_out=384, batch_size=1


 16%|█▌        | 120/760 [03:35<17:33,  1.65s/it]

num_in=640, num_out=320, batch_size=1


 16%|█▌        | 121/760 [03:38<20:31,  1.93s/it]

num_in=128, num_out=192, batch_size=4
num_in=384, num_out=320, batch_size=1


 16%|█▌        | 123/760 [03:41<18:56,  1.78s/it]

num_in=512, num_out=256, batch_size=1


 16%|█▋        | 124/760 [03:43<20:18,  1.92s/it]

num_in=640, num_out=512, batch_size=1


 16%|█▋        | 125/760 [03:48<27:17,  2.58s/it]

num_in=768, num_out=128, batch_size=1


 17%|█▋        | 126/760 [03:49<23:45,  2.25s/it]

num_in=128, num_out=320, batch_size=4
num_in=384, num_out=128, batch_size=4


 17%|█▋        | 128/760 [03:51<17:02,  1.62s/it]

num_in=640, num_out=384, batch_size=1


 17%|█▋        | 129/760 [03:54<21:52,  2.08s/it]

num_in=256, num_out=64, batch_size=1
num_in=384, num_out=64, batch_size=1


 18%|█▊        | 133/760 [03:55<09:34,  1.09it/s]

num_in=128, num_out=128, batch_size=4
num_in=256, num_out=64, batch_size=4
num_in=1024, num_out=512, batch_size=1


 18%|█▊        | 134/760 [04:00<17:56,  1.72s/it]

num_in=640, num_out=192, batch_size=1


 18%|█▊        | 135/760 [04:02<18:13,  1.75s/it]

num_in=128, num_out=384, batch_size=4
num_in=768, num_out=384, batch_size=1


 18%|█▊        | 137/760 [04:06<18:32,  1.79s/it]

num_in=128, num_out=448, batch_size=1
num_in=896, num_out=320, batch_size=1


 19%|█▊        | 141/760 [04:09<11:51,  1.15s/it]

num_in=1024, num_out=384, batch_size=1
num_in=128, num_out=192, batch_size=4
num_in=512, num_out=320, batch_size=1


 19%|█▊        | 142/760 [04:12<15:35,  1.51s/it]

num_in=896, num_out=256, batch_size=1


 19%|█▉        | 143/760 [04:14<17:38,  1.71s/it]

num_in=896, num_out=448, batch_size=1


 19%|█▉        | 144/760 [04:18<23:44,  2.31s/it]

num_in=896, num_out=384, batch_size=1


 19%|█▉        | 145/760 [04:22<27:08,  2.65s/it]

num_in=896, num_out=64, batch_size=1


 19%|█▉        | 146/760 [04:23<21:42,  2.12s/it]

num_in=512, num_out=320, batch_size=1


 19%|█▉        | 147/760 [04:26<24:11,  2.37s/it]

num_in=256, num_out=256, batch_size=4
num_in=896, num_out=512, batch_size=1


 20%|█▉        | 151/760 [04:29<12:05,  1.19s/it]

num_in=128, num_out=384, batch_size=1
num_in=256, num_out=64, batch_size=4
num_in=128, num_out=512, batch_size=1
num_in=768, num_out=512, batch_size=1


 20%|██        | 153/760 [04:33<16:31,  1.63s/it]

num_in=128, num_out=384, batch_size=1
num_in=768, num_out=384, batch_size=1


 20%|██        | 155/760 [04:37<17:08,  1.70s/it]

num_in=128, num_out=256, batch_size=1
num_in=256, num_out=128, batch_size=1
num_in=512, num_out=256, batch_size=1


 21%|██        | 160/760 [04:40<09:42,  1.03it/s]

num_in=256, num_out=512, batch_size=1
num_in=128, num_out=384, batch_size=4
num_in=384, num_out=384, batch_size=1


 21%|██        | 161/760 [04:43<14:09,  1.42s/it]

num_in=1024, num_out=64, batch_size=1
num_in=384, num_out=64, batch_size=1


 21%|██▏       | 163/760 [04:44<10:40,  1.07s/it]

num_in=128, num_out=64, batch_size=4
num_in=1024, num_out=384, batch_size=1


 22%|██▏       | 165/760 [04:48<13:04,  1.32s/it]

num_in=256, num_out=384, batch_size=1
num_in=1024, num_out=384, batch_size=1


 22%|██▏       | 167/760 [04:51<14:39,  1.48s/it]

num_in=1024, num_out=384, batch_size=1


 22%|██▏       | 168/760 [04:55<18:31,  1.88s/it]

num_in=128, num_out=256, batch_size=4
num_in=640, num_out=128, batch_size=1


 22%|██▏       | 170/760 [04:56<14:21,  1.46s/it]

num_in=1024, num_out=128, batch_size=1


 22%|██▎       | 171/760 [04:58<13:56,  1.42s/it]

num_in=512, num_out=64, batch_size=1


 23%|██▎       | 172/760 [04:58<12:16,  1.25s/it]

num_in=384, num_out=448, batch_size=1


 23%|██▎       | 173/760 [05:02<19:08,  1.96s/it]

num_in=896, num_out=128, batch_size=1


 23%|██▎       | 174/760 [05:04<17:20,  1.78s/it]

num_in=384, num_out=256, batch_size=1


 23%|██▎       | 175/760 [05:06<19:00,  1.95s/it]

num_in=256, num_out=384, batch_size=1
num_in=512, num_out=64, batch_size=1


 23%|██▎       | 177/760 [05:07<12:10,  1.25s/it]

num_in=640, num_out=256, batch_size=1


 23%|██▎       | 178/760 [05:09<14:49,  1.53s/it]

num_in=512, num_out=192, batch_size=1


 24%|██▍       | 181/760 [05:11<09:16,  1.04it/s]

num_in=256, num_out=64, batch_size=4
num_in=128, num_out=320, batch_size=1
num_in=512, num_out=192, batch_size=1


 24%|██▍       | 182/760 [05:13<11:08,  1.16s/it]

num_in=128, num_out=192, batch_size=4
num_in=384, num_out=448, batch_size=1


 24%|██▍       | 184/760 [05:17<14:48,  1.54s/it]

num_in=256, num_out=64, batch_size=1
num_in=256, num_out=448, batch_size=1
num_in=896, num_out=384, batch_size=1


 25%|██▍       | 189/760 [05:21<09:30,  1.00it/s]

num_in=1024, num_out=192, batch_size=1
num_in=128, num_out=256, batch_size=1
num_in=512, num_out=256, batch_size=1


 25%|██▌       | 190/760 [05:24<11:52,  1.25s/it]

num_in=384, num_out=128, batch_size=1


 25%|██▌       | 193/760 [05:25<07:44,  1.22it/s]

num_in=256, num_out=128, batch_size=4
num_in=1024, num_out=320, batch_size=1
num_in=512, num_out=128, batch_size=1


 26%|██▌       | 194/760 [05:26<08:34,  1.10it/s]

num_in=128, num_out=320, batch_size=4
num_in=640, num_out=64, batch_size=1


 26%|██▌       | 196/760 [05:27<06:35,  1.43it/s]

num_in=128, num_out=192, batch_size=1
num_in=768, num_out=320, batch_size=1


 26%|██▌       | 198/760 [05:30<09:15,  1.01it/s]

num_in=640, num_out=448, batch_size=1


 26%|██▋       | 201/760 [05:34<09:52,  1.06s/it]

num_in=256, num_out=192, batch_size=1
num_in=128, num_out=320, batch_size=4
num_in=896, num_out=192, batch_size=1


 27%|██▋       | 202/760 [05:36<11:20,  1.22s/it]

num_in=896, num_out=512, batch_size=1


 27%|██▋       | 203/760 [05:41<18:55,  2.04s/it]

num_in=512, num_out=512, batch_size=1


 27%|██▋       | 204/760 [05:46<25:11,  2.72s/it]

num_in=1024, num_out=448, batch_size=1


 27%|██▋       | 207/760 [05:50<16:40,  1.81s/it]

num_in=256, num_out=384, batch_size=1
num_in=128, num_out=384, batch_size=1
num_in=768, num_out=64, batch_size=1


 28%|██▊       | 210/760 [05:51<08:40,  1.06it/s]

num_in=128, num_out=256, batch_size=4
num_in=256, num_out=512, batch_size=1
num_in=640, num_out=384, batch_size=1


 28%|██▊       | 211/760 [05:54<14:03,  1.54s/it]

num_in=896, num_out=128, batch_size=1


 28%|██▊       | 212/760 [05:56<13:24,  1.47s/it]

num_in=128, num_out=320, batch_size=1
num_in=384, num_out=512, batch_size=1


 28%|██▊       | 214/760 [06:01<16:52,  1.85s/it]

num_in=896, num_out=448, batch_size=1


 28%|██▊       | 215/760 [06:05<21:40,  2.39s/it]

num_in=256, num_out=256, batch_size=4
num_in=384, num_out=64, batch_size=4


 29%|██▊       | 217/760 [06:06<14:37,  1.62s/it]

num_in=896, num_out=192, batch_size=1


 29%|██▊       | 218/760 [06:07<15:06,  1.67s/it]

num_in=256, num_out=128, batch_size=4
num_in=640, num_out=448, batch_size=1


 29%|██▉       | 220/760 [06:12<16:43,  1.86s/it]

num_in=640, num_out=320, batch_size=1


 29%|██▉       | 221/760 [06:15<18:57,  2.11s/it]

num_in=1024, num_out=128, batch_size=1


 29%|██▉       | 222/760 [06:16<17:07,  1.91s/it]

num_in=896, num_out=64, batch_size=1


 29%|██▉       | 223/760 [06:17<14:17,  1.60s/it]

num_in=256, num_out=192, batch_size=1
num_in=640, num_out=320, batch_size=1


 30%|██▉       | 225/760 [06:20<14:01,  1.57s/it]

num_in=256, num_out=256, batch_size=4
num_in=384, num_out=128, batch_size=4


 30%|██▉       | 227/760 [06:21<11:12,  1.26s/it]

num_in=768, num_out=192, batch_size=1


 30%|███       | 228/760 [06:23<12:18,  1.39s/it]

num_in=128, num_out=256, batch_size=1
num_in=384, num_out=64, batch_size=4


 30%|███       | 230/760 [06:24<08:56,  1.01s/it]

num_in=896, num_out=448, batch_size=1


 30%|███       | 231/760 [06:28<14:58,  1.70s/it]

num_in=768, num_out=512, batch_size=1


 31%|███       | 232/760 [06:33<21:20,  2.42s/it]

num_in=896, num_out=64, batch_size=1


 31%|███       | 235/760 [06:34<10:22,  1.18s/it]

num_in=128, num_out=320, batch_size=4
num_in=256, num_out=320, batch_size=1
num_in=128, num_out=128, batch_size=1
num_in=768, num_out=256, batch_size=1


 31%|███       | 237/760 [06:36<10:28,  1.20s/it]

num_in=896, num_out=64, batch_size=1


 31%|███▏      | 238/760 [06:37<09:28,  1.09s/it]

num_in=896, num_out=448, batch_size=1


 31%|███▏      | 239/760 [06:41<15:46,  1.82s/it]

num_in=896, num_out=256, batch_size=1


 32%|███▏      | 240/760 [06:44<17:07,  1.98s/it]

num_in=128, num_out=192, batch_size=1
num_in=640, num_out=64, batch_size=1


 32%|███▏      | 244/760 [06:44<07:14,  1.19it/s]

num_in=1024, num_out=512, batch_size=1
num_in=128, num_out=448, batch_size=1
num_in=128, num_out=192, batch_size=4
num_in=128, num_out=128, batch_size=4


 32%|███▏      | 246/760 [06:45<04:52,  1.75it/s]

num_in=640, num_out=256, batch_size=1


 32%|███▎      | 247/760 [06:47<08:02,  1.06it/s]

num_in=256, num_out=320, batch_size=1
num_in=640, num_out=384, batch_size=1


 33%|███▎      | 251/760 [06:51<07:17,  1.16it/s]

num_in=128, num_out=64, batch_size=4
num_in=256, num_out=64, batch_size=4
num_in=384, num_out=512, batch_size=1


 33%|███▎      | 252/760 [06:56<13:50,  1.63s/it]

num_in=768, num_out=192, batch_size=1


 33%|███▎      | 253/760 [06:57<14:11,  1.68s/it]

num_in=768, num_out=384, batch_size=1


 33%|███▎      | 254/760 [07:01<18:05,  2.15s/it]

num_in=768, num_out=384, batch_size=1


 34%|███▎      | 255/760 [07:05<21:14,  2.52s/it]

num_in=640, num_out=64, batch_size=1


 34%|███▍      | 258/760 [07:06<09:53,  1.18s/it]

num_in=1024, num_out=64, batch_size=1
num_in=128, num_out=512, batch_size=1
num_in=512, num_out=128, batch_size=1


 34%|███▍      | 261/760 [07:07<06:06,  1.36it/s]

num_in=128, num_out=384, batch_size=4
num_in=128, num_out=256, batch_size=1
num_in=768, num_out=128, batch_size=1


 35%|███▍      | 264/760 [07:08<04:30,  1.83it/s]

num_in=256, num_out=256, batch_size=4
num_in=128, num_out=320, batch_size=1
num_in=256, num_out=64, batch_size=4
num_in=896, num_out=320, batch_size=1


 35%|███▌      | 268/760 [07:12<05:10,  1.59it/s]

num_in=1024, num_out=320, batch_size=1
num_in=128, num_out=64, batch_size=4
num_in=896, num_out=448, batch_size=1


 35%|███▌      | 269/760 [07:16<10:53,  1.33s/it]

num_in=1024, num_out=128, batch_size=1
num_in=384, num_out=320, batch_size=1


 36%|███▌      | 271/760 [07:19<11:30,  1.41s/it]

num_in=640, num_out=192, batch_size=1


 36%|███▌      | 272/760 [07:21<12:11,  1.50s/it]

num_in=384, num_out=512, batch_size=1


 36%|███▌      | 273/760 [07:26<18:12,  2.24s/it]

num_in=896, num_out=128, batch_size=1


 36%|███▌      | 274/760 [07:27<16:12,  2.00s/it]

num_in=768, num_out=256, batch_size=1


 36%|███▌      | 275/760 [07:29<17:06,  2.12s/it]

num_in=384, num_out=128, batch_size=1


 36%|███▋      | 276/760 [07:30<15:09,  1.88s/it]

num_in=256, num_out=256, batch_size=1
num_in=640, num_out=128, batch_size=1


 37%|███▋      | 278/760 [07:32<10:45,  1.34s/it]

num_in=640, num_out=512, batch_size=1


 37%|███▋      | 279/760 [07:37<17:21,  2.16s/it]

num_in=768, num_out=384, batch_size=1


 37%|███▋      | 280/760 [07:40<20:17,  2.54s/it]

num_in=896, num_out=256, batch_size=1


 37%|███▋      | 281/760 [07:43<20:03,  2.51s/it]

num_in=1024, num_out=128, batch_size=1
num_in=768, num_out=448, batch_size=1


 37%|███▋      | 283/760 [07:47<18:46,  2.36s/it]

num_in=512, num_out=512, batch_size=1


 37%|███▋      | 284/760 [07:52<23:19,  2.94s/it]

num_in=256, num_out=192, batch_size=4
num_in=512, num_out=512, batch_size=1


 38%|███▊      | 286/760 [07:57<21:40,  2.74s/it]

num_in=128, num_out=128, batch_size=1
num_in=640, num_out=320, batch_size=1


 38%|███▊      | 288/760 [08:00<18:05,  2.30s/it]

num_in=512, num_out=256, batch_size=1


 38%|███▊      | 289/760 [08:02<18:15,  2.33s/it]

num_in=384, num_out=64, batch_size=1


 38%|███▊      | 290/760 [08:03<15:11,  1.94s/it]

num_in=768, num_out=64, batch_size=1


 38%|███▊      | 291/760 [08:03<12:40,  1.62s/it]

num_in=1024, num_out=192, batch_size=1


 38%|███▊      | 292/760 [08:05<13:07,  1.68s/it]

num_in=896, num_out=384, batch_size=1


 39%|███▊      | 293/760 [08:09<17:14,  2.22s/it]

num_in=768, num_out=384, batch_size=1


 39%|███▉      | 296/760 [08:13<11:26,  1.48s/it]

num_in=256, num_out=384, batch_size=1
num_in=128, num_out=256, batch_size=4
num_in=512, num_out=384, batch_size=1


 39%|███▉      | 297/760 [08:16<15:22,  1.99s/it]

num_in=768, num_out=384, batch_size=1


 39%|███▉      | 298/760 [08:20<18:33,  2.41s/it]

num_in=1024, num_out=192, batch_size=1
num_in=512, num_out=256, batch_size=1


 39%|███▉      | 300/760 [08:22<14:46,  1.93s/it]

num_in=512, num_out=128, batch_size=1


 40%|███▉      | 301/760 [08:24<13:30,  1.77s/it]

num_in=256, num_out=384, batch_size=1
num_in=768, num_out=256, batch_size=1


 40%|███▉      | 303/760 [08:26<11:51,  1.56s/it]

num_in=768, num_out=512, batch_size=1


 40%|████      | 304/760 [08:31<17:19,  2.28s/it]

num_in=640, num_out=320, batch_size=1


 40%|████      | 305/760 [08:34<18:39,  2.46s/it]

num_in=256, num_out=64, batch_size=1
num_in=1024, num_out=448, batch_size=1


 40%|████      | 307/760 [08:38<17:36,  2.33s/it]

num_in=256, num_out=64, batch_size=1
num_in=896, num_out=384, batch_size=1


 41%|████      | 309/760 [08:42<16:11,  2.16s/it]

num_in=896, num_out=256, batch_size=1


 41%|████      | 310/760 [08:44<16:37,  2.22s/it]

num_in=1024, num_out=448, batch_size=1


 41%|████      | 311/760 [08:49<20:05,  2.68s/it]

num_in=384, num_out=128, batch_size=1


 41%|████▏     | 314/760 [08:50<10:23,  1.40s/it]

num_in=256, num_out=128, batch_size=4
num_in=128, num_out=128, batch_size=4
num_in=128, num_out=256, batch_size=4


 42%|████▏     | 316/760 [08:50<06:39,  1.11it/s]

num_in=128, num_out=64, batch_size=1
num_in=384, num_out=448, batch_size=1


 42%|████▏     | 317/760 [08:54<11:44,  1.59s/it]

num_in=640, num_out=128, batch_size=1


 42%|████▏     | 318/760 [08:56<11:07,  1.51s/it]

num_in=128, num_out=256, batch_size=4
num_in=384, num_out=192, batch_size=1


 42%|████▏     | 320/760 [08:58<09:25,  1.29s/it]

num_in=768, num_out=320, batch_size=1


 42%|████▏     | 321/760 [09:01<12:14,  1.67s/it]

num_in=128, num_out=320, batch_size=1
num_in=384, num_out=512, batch_size=1


 42%|████▎     | 323/760 [09:05<14:18,  1.96s/it]

num_in=640, num_out=384, batch_size=1


 43%|████▎     | 324/760 [09:09<16:53,  2.32s/it]

num_in=512, num_out=64, batch_size=1


 43%|████▎     | 325/760 [09:10<13:58,  1.93s/it]

num_in=1024, num_out=320, batch_size=1
num_in=640, num_out=64, batch_size=1


 43%|████▎     | 327/760 [09:10<09:23,  1.30s/it]

num_in=128, num_out=512, batch_size=1
num_in=512, num_out=384, batch_size=1


 43%|████▎     | 329/760 [09:14<10:44,  1.49s/it]

num_in=384, num_out=320, batch_size=1


 43%|████▎     | 330/760 [09:17<12:58,  1.81s/it]

num_in=128, num_out=384, batch_size=1
num_in=768, num_out=320, batch_size=1


 44%|████▍     | 334/760 [09:20<08:02,  1.13s/it]

num_in=256, num_out=128, batch_size=1
num_in=256, num_out=384, batch_size=1
num_in=896, num_out=256, batch_size=1


 44%|████▍     | 335/760 [09:23<09:51,  1.39s/it]

num_in=1024, num_out=384, batch_size=1


 44%|████▍     | 336/760 [09:26<13:21,  1.89s/it]

num_in=1024, num_out=256, batch_size=1
num_in=384, num_out=192, batch_size=1


 44%|████▍     | 338/760 [09:28<10:46,  1.53s/it]

num_in=384, num_out=256, batch_size=1


 45%|████▍     | 339/760 [09:31<12:05,  1.72s/it]

num_in=128, num_out=256, batch_size=1
num_in=512, num_out=64, batch_size=1


 45%|████▍     | 341/760 [09:31<08:19,  1.19s/it]

num_in=256, num_out=192, batch_size=1
num_in=384, num_out=512, batch_size=1


 45%|████▌     | 343/760 [09:36<11:18,  1.63s/it]

num_in=256, num_out=384, batch_size=1
num_in=512, num_out=320, batch_size=1


 45%|████▌     | 345/760 [09:39<11:02,  1.60s/it]

num_in=768, num_out=448, batch_size=1


 46%|████▌     | 346/760 [09:44<14:30,  2.10s/it]

num_in=384, num_out=320, batch_size=1


 46%|████▌     | 347/760 [09:47<15:51,  2.30s/it]

num_in=128, num_out=512, batch_size=1
num_in=384, num_out=128, batch_size=4


 46%|████▌     | 349/760 [09:48<11:35,  1.69s/it]

num_in=896, num_out=448, batch_size=1


 46%|████▋     | 352/760 [09:52<09:39,  1.42s/it]

num_in=128, num_out=320, batch_size=4
num_in=256, num_out=192, batch_size=4
num_in=256, num_out=448, batch_size=1


 47%|████▋     | 354/760 [09:53<06:21,  1.07it/s]

num_in=128, num_out=448, batch_size=1
num_in=512, num_out=192, batch_size=1


 47%|████▋     | 358/760 [09:54<04:09,  1.61it/s]

num_in=256, num_out=320, batch_size=1
num_in=256, num_out=256, batch_size=1
num_in=256, num_out=64, batch_size=1
num_in=768, num_out=128, batch_size=1


 47%|████▋     | 360/760 [09:56<04:13,  1.58it/s]

num_in=1024, num_out=128, batch_size=1
num_in=512, num_out=448, batch_size=1


 48%|████▊     | 361/760 [10:00<08:30,  1.28s/it]

num_in=1024, num_out=64, batch_size=1


 48%|████▊     | 364/760 [10:01<05:02,  1.31it/s]

num_in=1024, num_out=320, batch_size=1
num_in=1024, num_out=256, batch_size=1
num_in=640, num_out=64, batch_size=1


 48%|████▊     | 365/760 [10:02<04:53,  1.35it/s]

num_in=1024, num_out=384, batch_size=1
num_in=512, num_out=384, batch_size=1


 48%|████▊     | 367/760 [10:05<07:35,  1.16s/it]

num_in=512, num_out=320, batch_size=1


 48%|████▊     | 368/760 [10:08<10:07,  1.55s/it]

num_in=128, num_out=320, batch_size=1
num_in=384, num_out=448, batch_size=1


 49%|████▉     | 372/760 [10:13<07:32,  1.17s/it]

num_in=128, num_out=192, batch_size=4
num_in=128, num_out=128, batch_size=1
num_in=384, num_out=128, batch_size=4


 49%|████▉     | 373/760 [10:14<07:47,  1.21s/it]

num_in=896, num_out=320, batch_size=1


 49%|████▉     | 374/760 [10:17<10:24,  1.62s/it]

num_in=768, num_out=512, batch_size=1


 49%|████▉     | 375/760 [10:22<15:21,  2.39s/it]

num_in=1024, num_out=512, batch_size=1


 49%|████▉     | 376/760 [10:27<19:19,  3.02s/it]

num_in=640, num_out=256, batch_size=1


 50%|████▉     | 377/760 [10:29<18:17,  2.87s/it]

num_in=768, num_out=384, batch_size=1


 50%|████▉     | 378/760 [10:33<19:36,  3.08s/it]

num_in=512, num_out=384, batch_size=1


 50%|████▉     | 379/760 [10:36<20:32,  3.24s/it]

num_in=768, num_out=448, batch_size=1


 50%|█████     | 380/760 [10:41<22:19,  3.53s/it]

num_in=384, num_out=320, batch_size=1


 50%|█████     | 381/760 [10:44<21:20,  3.38s/it]

num_in=384, num_out=64, batch_size=1


 50%|█████     | 382/760 [10:44<16:11,  2.57s/it]

num_in=128, num_out=192, batch_size=1
num_in=512, num_out=448, batch_size=1


 51%|█████     | 384/760 [10:49<14:49,  2.37s/it]

num_in=384, num_out=384, batch_size=1


 51%|█████     | 385/760 [10:52<16:41,  2.67s/it]

num_in=768, num_out=512, batch_size=1


 51%|█████     | 386/760 [10:57<20:08,  3.23s/it]

num_in=896, num_out=384, batch_size=1


 51%|█████     | 387/760 [11:01<20:47,  3.35s/it]

num_in=896, num_out=192, batch_size=1


 51%|█████▏    | 390/760 [11:03<10:14,  1.66s/it]

num_in=128, num_out=448, batch_size=1
num_in=256, num_out=192, batch_size=4
num_in=512, num_out=128, batch_size=1


 51%|█████▏    | 391/760 [11:04<09:35,  1.56s/it]

num_in=256, num_out=128, batch_size=4
num_in=896, num_out=512, batch_size=1


 52%|█████▏    | 393/760 [11:09<11:48,  1.93s/it]

num_in=256, num_out=128, batch_size=1
num_in=384, num_out=64, batch_size=1


 52%|█████▏    | 395/760 [11:10<08:11,  1.35s/it]

num_in=768, num_out=256, batch_size=1


 52%|█████▏    | 396/760 [11:12<09:33,  1.58s/it]

num_in=640, num_out=320, batch_size=1


 52%|█████▏    | 397/760 [11:15<11:33,  1.91s/it]

num_in=512, num_out=128, batch_size=1


 52%|█████▏    | 398/760 [11:16<10:32,  1.75s/it]

num_in=768, num_out=128, batch_size=1


 52%|█████▎    | 399/760 [11:18<09:43,  1.62s/it]

num_in=768, num_out=256, batch_size=1


 53%|█████▎    | 400/760 [11:20<11:03,  1.84s/it]

num_in=128, num_out=256, batch_size=4
num_in=768, num_out=320, batch_size=1


 53%|█████▎    | 402/760 [11:23<10:14,  1.72s/it]

num_in=768, num_out=192, batch_size=1


 53%|█████▎    | 405/760 [11:25<06:18,  1.07s/it]

num_in=256, num_out=256, batch_size=1
num_in=256, num_out=128, batch_size=4
num_in=384, num_out=384, batch_size=1


 53%|█████▎    | 406/760 [11:29<09:40,  1.64s/it]

num_in=768, num_out=256, batch_size=1


 54%|█████▎    | 407/760 [11:31<10:46,  1.83s/it]

num_in=768, num_out=512, batch_size=1


 54%|█████▎    | 408/760 [11:36<15:16,  2.60s/it]

num_in=1024, num_out=128, batch_size=1


 54%|█████▍    | 409/760 [11:37<13:07,  2.24s/it]

num_in=640, num_out=384, batch_size=1


 54%|█████▍    | 410/760 [11:41<15:22,  2.63s/it]

num_in=640, num_out=448, batch_size=1


 54%|█████▍    | 411/760 [11:45<17:57,  3.09s/it]

num_in=384, num_out=128, batch_size=1


 54%|█████▍    | 412/760 [11:46<14:48,  2.55s/it]

num_in=896, num_out=320, batch_size=1


 54%|█████▍    | 413/760 [11:49<15:35,  2.70s/it]

num_in=256, num_out=128, batch_size=4
num_in=1024, num_out=448, batch_size=1


 55%|█████▍    | 415/760 [11:54<14:09,  2.46s/it]

num_in=768, num_out=320, batch_size=1


 55%|█████▍    | 416/760 [11:57<14:57,  2.61s/it]

num_in=128, num_out=512, batch_size=1
num_in=896, num_out=384, batch_size=1


 55%|█████▌    | 418/760 [12:00<13:04,  2.29s/it]

num_in=768, num_out=448, batch_size=1


 55%|█████▌    | 419/760 [12:05<15:33,  2.74s/it]

num_in=512, num_out=192, batch_size=1


 55%|█████▌    | 420/760 [12:07<14:16,  2.52s/it]

num_in=1024, num_out=384, batch_size=1
num_in=768, num_out=128, batch_size=1


 56%|█████▌    | 422/760 [12:08<09:54,  1.76s/it]

num_in=640, num_out=256, batch_size=1


 56%|█████▌    | 423/760 [12:10<10:44,  1.91s/it]

num_in=512, num_out=448, batch_size=1


 56%|█████▌    | 424/760 [12:15<13:53,  2.48s/it]

num_in=128, num_out=448, batch_size=1
num_in=384, num_out=64, batch_size=4


 56%|█████▌    | 426/760 [12:15<09:00,  1.62s/it]

num_in=640, num_out=128, batch_size=1


 56%|█████▋    | 429/760 [12:17<05:17,  1.04it/s]

num_in=256, num_out=256, batch_size=4
num_in=128, num_out=512, batch_size=1
num_in=1024, num_out=384, batch_size=1


 57%|█████▋    | 430/760 [12:20<08:30,  1.55s/it]

num_in=512, num_out=128, batch_size=1


 57%|█████▋    | 431/760 [12:22<08:06,  1.48s/it]

num_in=384, num_out=192, batch_size=1


 57%|█████▋    | 432/760 [12:23<08:34,  1.57s/it]

num_in=384, num_out=256, batch_size=1


 57%|█████▋    | 433/760 [12:26<09:46,  1.79s/it]

num_in=896, num_out=384, batch_size=1


 57%|█████▋    | 436/760 [12:30<07:06,  1.32s/it]

num_in=128, num_out=64, batch_size=1
num_in=256, num_out=192, batch_size=4
num_in=256, num_out=256, batch_size=1
num_in=640, num_out=512, batch_size=1


 58%|█████▊    | 438/760 [12:35<09:23,  1.75s/it]

num_in=384, num_out=256, batch_size=1


 58%|█████▊    | 439/760 [12:37<10:08,  1.90s/it]

num_in=128, num_out=448, batch_size=1
num_in=768, num_out=320, batch_size=1


 58%|█████▊    | 441/760 [12:40<09:20,  1.76s/it]

num_in=896, num_out=448, batch_size=1


 58%|█████▊    | 444/760 [12:44<07:35,  1.44s/it]

num_in=256, num_out=128, batch_size=1
num_in=256, num_out=512, batch_size=1
num_in=896, num_out=320, batch_size=1


 59%|█████▉    | 447/760 [12:47<05:31,  1.06s/it]

num_in=256, num_out=512, batch_size=1
num_in=128, num_out=320, batch_size=4
num_in=256, num_out=384, batch_size=1
num_in=640, num_out=192, batch_size=1


 59%|█████▉    | 449/760 [12:49<05:16,  1.02s/it]

num_in=512, num_out=192, batch_size=1


 59%|█████▉    | 450/760 [12:51<06:06,  1.18s/it]

num_in=896, num_out=320, batch_size=1


 59%|█████▉    | 451/760 [12:54<08:13,  1.60s/it]

num_in=256, num_out=128, batch_size=4
num_in=512, num_out=64, batch_size=1


 60%|█████▉    | 453/760 [12:55<05:44,  1.12s/it]

num_in=256, num_out=320, batch_size=1
num_in=640, num_out=64, batch_size=1


 60%|█████▉    | 455/760 [12:55<04:17,  1.19it/s]

num_in=896, num_out=448, batch_size=1


 60%|██████    | 456/760 [12:58<06:00,  1.19s/it]

num_in=128, num_out=320, batch_size=4
num_in=384, num_out=128, batch_size=1


 60%|██████    | 458/760 [12:59<04:58,  1.01it/s]

num_in=128, num_out=64, batch_size=4
num_in=1024, num_out=512, batch_size=1


 61%|██████    | 460/760 [13:04<07:27,  1.49s/it]

num_in=256, num_out=256, batch_size=1
num_in=1024, num_out=256, batch_size=1


 61%|██████    | 464/760 [13:07<04:49,  1.02it/s]

num_in=128, num_out=192, batch_size=1
num_in=256, num_out=320, batch_size=1
num_in=256, num_out=256, batch_size=4
num_in=384, num_out=448, batch_size=1


 62%|██████▏   | 468/760 [13:11<04:35,  1.06it/s]

num_in=256, num_out=256, batch_size=1
num_in=128, num_out=192, batch_size=4
num_in=128, num_out=384, batch_size=1
num_in=896, num_out=384, batch_size=1


 62%|██████▏   | 470/760 [13:15<05:53,  1.22s/it]

num_in=512, num_out=320, batch_size=1


 62%|██████▏   | 471/760 [13:18<07:26,  1.54s/it]

num_in=640, num_out=128, batch_size=1


 62%|██████▏   | 472/760 [13:19<07:06,  1.48s/it]

num_in=256, num_out=192, batch_size=4
num_in=512, num_out=384, batch_size=1


 62%|██████▏   | 474/760 [13:23<07:43,  1.62s/it]

num_in=640, num_out=448, batch_size=1


 62%|██████▎   | 475/760 [13:27<10:16,  2.16s/it]

num_in=768, num_out=256, batch_size=1


 63%|██████▎   | 476/760 [13:29<10:32,  2.23s/it]

num_in=128, num_out=320, batch_size=1
num_in=640, num_out=512, batch_size=1


 63%|██████▎   | 478/760 [13:34<10:51,  2.31s/it]

num_in=512, num_out=384, batch_size=1


 63%|██████▎   | 479/760 [13:38<12:10,  2.60s/it]

num_in=1024, num_out=512, batch_size=1
num_in=896, num_out=192, batch_size=1


 63%|██████▎   | 481/760 [13:40<09:09,  1.97s/it]

num_in=768, num_out=256, batch_size=1


 63%|██████▎   | 482/760 [13:42<09:36,  2.07s/it]

num_in=640, num_out=512, batch_size=1


 64%|██████▎   | 483/760 [13:47<12:34,  2.72s/it]

num_in=896, num_out=512, batch_size=1


 64%|██████▍   | 486/760 [13:52<08:46,  1.92s/it]

num_in=256, num_out=448, batch_size=1
num_in=256, num_out=192, batch_size=4
num_in=256, num_out=128, batch_size=4


 64%|██████▍   | 488/760 [13:52<05:34,  1.23s/it]

num_in=128, num_out=448, batch_size=1
num_in=384, num_out=384, batch_size=1


 64%|██████▍   | 489/760 [13:56<07:49,  1.73s/it]

num_in=640, num_out=384, batch_size=1


 64%|██████▍   | 490/760 [13:59<09:46,  2.17s/it]

num_in=512, num_out=128, batch_size=1


 65%|██████▍   | 491/760 [14:01<08:41,  1.94s/it]

num_in=384, num_out=512, batch_size=1


 65%|██████▍   | 492/760 [14:05<12:01,  2.69s/it]

num_in=256, num_out=64, batch_size=4
num_in=640, num_out=256, batch_size=1


 65%|██████▌   | 494/760 [14:08<09:13,  2.08s/it]

num_in=640, num_out=64, batch_size=1


 65%|██████▌   | 495/760 [14:09<07:42,  1.74s/it]

num_in=896, num_out=448, batch_size=1


 65%|██████▌   | 496/760 [14:13<10:25,  2.37s/it]

num_in=128, num_out=256, batch_size=1
num_in=256, num_out=64, batch_size=1
num_in=384, num_out=192, batch_size=1


 66%|██████▌   | 499/760 [14:15<06:24,  1.47s/it]

num_in=1024, num_out=64, batch_size=1


 66%|██████▌   | 502/760 [14:16<03:40,  1.17it/s]

num_in=128, num_out=256, batch_size=1
num_in=256, num_out=512, batch_size=1
num_in=640, num_out=64, batch_size=1


 66%|██████▌   | 503/760 [14:16<03:28,  1.23it/s]

num_in=640, num_out=128, batch_size=1


 67%|██████▋   | 506/760 [14:18<02:26,  1.74it/s]

num_in=128, num_out=512, batch_size=1
num_in=256, num_out=128, batch_size=4
num_in=512, num_out=320, batch_size=1


 67%|██████▋   | 507/760 [14:21<04:42,  1.12s/it]

num_in=896, num_out=64, batch_size=1


 67%|██████▋   | 508/760 [14:21<04:13,  1.01s/it]

num_in=256, num_out=192, batch_size=4
num_in=640, num_out=384, batch_size=1


 67%|██████▋   | 510/760 [14:25<05:38,  1.36s/it]

num_in=256, num_out=128, batch_size=1
num_in=384, num_out=256, batch_size=1


 67%|██████▋   | 512/760 [14:27<05:25,  1.31s/it]

num_in=768, num_out=128, batch_size=1


 68%|██████▊   | 513/760 [14:29<05:20,  1.30s/it]

num_in=384, num_out=128, batch_size=4


 68%|██████▊   | 514/760 [14:30<05:24,  1.32s/it]

num_in=384, num_out=64, batch_size=4


 68%|██████▊   | 515/760 [14:31<04:49,  1.18s/it]

num_in=128, num_out=64, batch_size=4
num_in=896, num_out=64, batch_size=1


 68%|██████▊   | 517/760 [14:32<03:25,  1.18it/s]

num_in=384, num_out=64, batch_size=4


 68%|██████▊   | 519/760 [14:32<02:36,  1.54it/s]

num_in=1024, num_out=256, batch_size=1
num_in=256, num_out=256, batch_size=4


 69%|██████▊   | 521/760 [14:33<01:40,  2.38it/s]

num_in=1024, num_out=192, batch_size=1
num_in=128, num_out=128, batch_size=1
num_in=1024, num_out=192, batch_size=1


 69%|██████▉   | 523/760 [14:33<01:08,  3.46it/s]

num_in=512, num_out=512, batch_size=1


 69%|██████▉   | 526/760 [14:38<03:06,  1.26it/s]

num_in=256, num_out=320, batch_size=1
num_in=128, num_out=128, batch_size=4
num_in=512, num_out=128, batch_size=1


 69%|██████▉   | 527/760 [14:39<03:26,  1.13it/s]

num_in=256, num_out=448, batch_size=1
num_in=384, num_out=128, batch_size=1


 70%|██████▉   | 529/760 [14:40<03:03,  1.26it/s]

num_in=128, num_out=384, batch_size=4
num_in=896, num_out=128, batch_size=1


 70%|███████   | 533/760 [14:42<01:57,  1.93it/s]

num_in=128, num_out=256, batch_size=4
num_in=256, num_out=448, batch_size=1
num_in=896, num_out=384, batch_size=1


 70%|███████   | 534/760 [14:45<04:12,  1.12s/it]

num_in=640, num_out=320, batch_size=1


 70%|███████   | 535/760 [14:48<05:44,  1.53s/it]

num_in=256, num_out=128, batch_size=4
num_in=512, num_out=448, batch_size=1


 71%|███████   | 537/760 [14:53<06:34,  1.77s/it]

num_in=768, num_out=192, batch_size=1


 71%|███████   | 538/760 [14:54<06:36,  1.79s/it]

num_in=1024, num_out=512, batch_size=1
num_in=384, num_out=448, batch_size=1


 71%|███████▏  | 542/760 [14:59<04:36,  1.27s/it]

num_in=128, num_out=64, batch_size=4
num_in=256, num_out=320, batch_size=1
num_in=896, num_out=128, batch_size=1


 71%|███████▏  | 543/760 [15:00<04:34,  1.26s/it]

num_in=128, num_out=448, batch_size=1
num_in=256, num_out=192, batch_size=1
num_in=640, num_out=448, batch_size=1


 72%|███████▏  | 546/760 [15:04<04:47,  1.35s/it]

num_in=128, num_out=128, batch_size=1
num_in=640, num_out=192, batch_size=1


 72%|███████▏  | 548/760 [15:06<04:18,  1.22s/it]

num_in=1024, num_out=128, batch_size=1


 72%|███████▏  | 549/760 [15:08<04:19,  1.23s/it]

num_in=512, num_out=256, batch_size=1


 72%|███████▏  | 550/760 [15:10<05:10,  1.48s/it]

num_in=384, num_out=256, batch_size=1


 72%|███████▎  | 551/760 [15:12<05:53,  1.69s/it]

num_in=768, num_out=448, batch_size=1


 73%|███████▎  | 552/760 [15:17<08:00,  2.31s/it]

num_in=768, num_out=320, batch_size=1


 73%|███████▎  | 553/760 [15:20<08:37,  2.50s/it]

num_in=512, num_out=448, batch_size=1


 73%|███████▎  | 556/760 [15:24<05:47,  1.70s/it]

num_in=128, num_out=192, batch_size=1
num_in=256, num_out=192, batch_size=4
num_in=384, num_out=192, batch_size=1


 73%|███████▎  | 557/760 [15:26<05:52,  1.74s/it]

num_in=384, num_out=448, batch_size=1


 73%|███████▎  | 558/760 [15:30<07:57,  2.36s/it]

num_in=640, num_out=192, batch_size=1


 74%|███████▎  | 559/760 [15:32<07:26,  2.22s/it]

num_in=512, num_out=64, batch_size=1


 74%|███████▎  | 560/760 [15:33<05:57,  1.79s/it]

num_in=896, num_out=384, batch_size=1


 74%|███████▍  | 563/760 [15:36<04:18,  1.31s/it]

num_in=256, num_out=128, batch_size=1
num_in=1024, num_out=448, batch_size=1
num_in=384, num_out=192, batch_size=1


 75%|███████▍  | 567/760 [15:38<02:21,  1.37it/s]

num_in=128, num_out=128, batch_size=1
num_in=256, num_out=448, batch_size=1
num_in=128, num_out=192, batch_size=4
num_in=896, num_out=256, batch_size=1


 75%|███████▍  | 568/760 [15:41<03:24,  1.07s/it]

num_in=384, num_out=256, batch_size=1


 75%|███████▌  | 571/760 [15:43<02:44,  1.15it/s]

num_in=1024, num_out=384, batch_size=1
num_in=128, num_out=320, batch_size=4
num_in=512, num_out=256, batch_size=1


 75%|███████▌  | 572/760 [15:46<03:46,  1.21s/it]

num_in=128, num_out=64, batch_size=1
num_in=512, num_out=192, batch_size=1


 76%|███████▌  | 574/760 [15:48<03:25,  1.10s/it]

num_in=128, num_out=64, batch_size=1
num_in=512, num_out=384, batch_size=1


 76%|███████▌  | 576/760 [15:51<04:09,  1.36s/it]

num_in=128, num_out=384, batch_size=4
num_in=384, num_out=512, batch_size=1


 76%|███████▌  | 578/760 [15:56<05:13,  1.72s/it]

num_in=640, num_out=192, batch_size=1


 76%|███████▌  | 579/760 [15:58<05:15,  1.74s/it]

num_in=896, num_out=64, batch_size=1


 76%|███████▋  | 580/760 [15:59<04:31,  1.51s/it]

num_in=768, num_out=128, batch_size=1


 76%|███████▋  | 581/760 [16:00<04:19,  1.45s/it]

num_in=768, num_out=448, batch_size=1


 77%|███████▋  | 582/760 [16:04<06:23,  2.15s/it]

num_in=128, num_out=192, batch_size=1
num_in=640, num_out=448, batch_size=1


 77%|███████▋  | 584/760 [16:09<06:17,  2.15s/it]

num_in=896, num_out=128, batch_size=1


 77%|███████▋  | 585/760 [16:10<05:39,  1.94s/it]

num_in=384, num_out=64, batch_size=4


 77%|███████▋  | 586/760 [16:11<04:46,  1.65s/it]

num_in=128, num_out=384, batch_size=4
num_in=384, num_out=384, batch_size=1


 77%|███████▋  | 588/760 [16:14<04:57,  1.73s/it]

num_in=768, num_out=320, batch_size=1


 78%|███████▊  | 589/760 [16:17<05:46,  2.03s/it]

num_in=1024, num_out=512, batch_size=1
num_in=896, num_out=192, batch_size=1


 78%|███████▊  | 591/760 [16:19<04:32,  1.61s/it]

num_in=256, num_out=192, batch_size=4
num_in=768, num_out=64, batch_size=1


 78%|███████▊  | 593/760 [16:20<03:15,  1.17s/it]

num_in=128, num_out=64, batch_size=1
num_in=768, num_out=256, batch_size=1


 78%|███████▊  | 595/760 [16:23<03:17,  1.20s/it]

num_in=640, num_out=448, batch_size=1


 78%|███████▊  | 596/760 [16:27<04:54,  1.79s/it]

num_in=768, num_out=512, batch_size=1


 79%|███████▊  | 597/760 [16:32<06:42,  2.47s/it]

num_in=512, num_out=256, batch_size=1


 79%|███████▊  | 598/760 [16:34<06:39,  2.46s/it]

num_in=256, num_out=512, batch_size=1
num_in=512, num_out=192, batch_size=1


 79%|███████▉  | 600/760 [16:36<04:55,  1.85s/it]

num_in=128, num_out=64, batch_size=1
num_in=768, num_out=512, batch_size=1


 79%|███████▉  | 602/760 [16:41<05:27,  2.07s/it]

num_in=640, num_out=128, batch_size=1


 79%|███████▉  | 603/760 [16:42<04:58,  1.90s/it]

num_in=128, num_out=128, batch_size=4
num_in=384, num_out=64, batch_size=4


 80%|███████▉  | 605/760 [16:43<03:28,  1.34s/it]

num_in=768, num_out=192, batch_size=1


 80%|███████▉  | 606/760 [16:45<03:44,  1.46s/it]

num_in=512, num_out=320, batch_size=1


 80%|███████▉  | 607/760 [16:48<04:39,  1.83s/it]

num_in=128, num_out=448, batch_size=1
num_in=896, num_out=256, batch_size=1


 80%|████████  | 609/760 [16:50<04:01,  1.60s/it]

num_in=128, num_out=384, batch_size=1
num_in=768, num_out=64, batch_size=1


 80%|████████  | 611/760 [16:51<02:50,  1.15s/it]

num_in=256, num_out=192, batch_size=1
num_in=768, num_out=64, batch_size=1


 81%|████████  | 613/760 [16:52<02:08,  1.14it/s]

num_in=640, num_out=128, batch_size=1


 81%|████████  | 614/760 [16:53<02:18,  1.05it/s]

num_in=896, num_out=256, batch_size=1


 81%|████████  | 615/760 [16:56<03:05,  1.28s/it]

num_in=512, num_out=320, batch_size=1


 81%|████████▏ | 619/760 [16:59<02:02,  1.15it/s]

num_in=256, num_out=512, batch_size=1
num_in=128, num_out=384, batch_size=1
num_in=256, num_out=128, batch_size=1
num_in=768, num_out=64, batch_size=1


 82%|████████▏ | 620/760 [16:59<01:55,  1.21it/s]

num_in=384, num_out=64, batch_size=4


 82%|████████▏ | 621/760 [17:00<01:52,  1.24it/s]

num_in=1024, num_out=64, batch_size=1


 82%|████████▏ | 622/760 [17:01<01:47,  1.28it/s]

num_in=768, num_out=192, batch_size=1


 82%|████████▏ | 625/760 [17:03<01:27,  1.55it/s]

num_in=1024, num_out=512, batch_size=1
num_in=128, num_out=192, batch_size=1
num_in=512, num_out=192, batch_size=1


 82%|████████▏ | 626/760 [17:05<02:03,  1.09it/s]

num_in=768, num_out=128, batch_size=1


 82%|████████▎ | 627/760 [17:06<02:13,  1.00s/it]

num_in=896, num_out=192, batch_size=1


 83%|████████▎ | 628/760 [17:08<02:42,  1.23s/it]

num_in=1024, num_out=256, batch_size=1
num_in=1024, num_out=64, batch_size=1


 83%|████████▎ | 630/760 [17:09<01:52,  1.15it/s]

num_in=1024, num_out=320, batch_size=1


 83%|████████▎ | 631/760 [17:12<02:58,  1.39s/it]

num_in=384, num_out=384, batch_size=1


 84%|████████▎ | 635/760 [17:15<02:01,  1.03it/s]

num_in=256, num_out=192, batch_size=1
num_in=256, num_out=256, batch_size=1
num_in=1024, num_out=320, batch_size=1
num_in=128, num_out=64, batch_size=4


 84%|████████▍ | 637/760 [17:16<01:23,  1.48it/s]

num_in=1024, num_out=192, batch_size=1
num_in=1024, num_out=192, batch_size=1


 84%|████████▍ | 638/760 [17:18<01:50,  1.10it/s]

num_in=256, num_out=320, batch_size=1
num_in=1024, num_out=512, batch_size=1


 84%|████████▍ | 640/760 [17:22<02:54,  1.46s/it]

num_in=640, num_out=512, batch_size=1


 84%|████████▍ | 641/760 [17:27<04:15,  2.14s/it]

num_in=384, num_out=320, batch_size=1


 84%|████████▍ | 642/760 [17:30<04:36,  2.35s/it]

num_in=768, num_out=64, batch_size=1


 85%|████████▍ | 643/760 [17:31<03:46,  1.94s/it]

num_in=256, num_out=64, batch_size=4
num_in=512, num_out=64, batch_size=1


 85%|████████▍ | 645/760 [17:32<02:29,  1.30s/it]

num_in=1024, num_out=256, batch_size=1
num_in=896, num_out=64, batch_size=1


 85%|████████▌ | 647/760 [17:33<01:48,  1.04it/s]

num_in=128, num_out=320, batch_size=1
num_in=512, num_out=320, batch_size=1


 86%|████████▌ | 651/760 [17:36<01:27,  1.25it/s]

num_in=1024, num_out=320, batch_size=1
num_in=256, num_out=320, batch_size=1
num_in=640, num_out=512, batch_size=1


 86%|████████▌ | 652/760 [17:41<02:50,  1.57s/it]

num_in=1024, num_out=256, batch_size=1


 86%|████████▌ | 653/760 [17:43<03:09,  1.77s/it]

num_in=512, num_out=384, batch_size=1


 86%|████████▌ | 654/760 [17:47<03:54,  2.21s/it]

num_in=256, num_out=64, batch_size=1
num_in=512, num_out=256, batch_size=1


 86%|████████▋ | 656/760 [17:49<03:09,  1.82s/it]

num_in=768, num_out=128, batch_size=1


 86%|████████▋ | 657/760 [17:50<02:54,  1.70s/it]

num_in=384, num_out=192, batch_size=1


 87%|████████▋ | 658/760 [17:52<02:56,  1.73s/it]

num_in=896, num_out=128, batch_size=1


 87%|████████▋ | 659/760 [17:54<02:42,  1.61s/it]

num_in=384, num_out=128, batch_size=4


 87%|████████▋ | 660/760 [17:55<02:35,  1.55s/it]

num_in=384, num_out=256, batch_size=1


 87%|████████▋ | 661/760 [17:57<02:58,  1.80s/it]

num_in=768, num_out=192, batch_size=1


 87%|████████▋ | 662/760 [17:59<02:58,  1.82s/it]

num_in=512, num_out=448, batch_size=1


 87%|████████▋ | 663/760 [18:04<04:04,  2.52s/it]

num_in=896, num_out=512, batch_size=1


 87%|████████▋ | 664/760 [18:08<05:07,  3.20s/it]

num_in=128, num_out=128, batch_size=1
num_in=384, num_out=128, batch_size=4


 88%|████████▊ | 666/760 [18:10<03:15,  2.07s/it]

num_in=768, num_out=64, batch_size=1


 88%|████████▊ | 669/760 [18:11<01:35,  1.05s/it]

num_in=128, num_out=256, batch_size=4
num_in=128, num_out=256, batch_size=4
num_in=128, num_out=384, batch_size=4
num_in=128, num_out=512, batch_size=1


 89%|████████▊ | 673/760 [18:11<00:40,  2.14it/s]

num_in=128, num_out=128, batch_size=4
num_in=256, num_out=256, batch_size=4
num_in=640, num_out=448, batch_size=1
num_in=768, num_out=448, batch_size=1


 89%|████████▉ | 675/760 [18:19<02:26,  1.72s/it]

num_in=640, num_out=256, batch_size=1


 89%|████████▉ | 676/760 [18:22<02:36,  1.86s/it]

num_in=512, num_out=448, batch_size=1


 89%|████████▉ | 677/760 [18:26<03:17,  2.38s/it]

num_in=384, num_out=512, batch_size=1


 89%|████████▉ | 678/760 [18:31<04:02,  2.95s/it]

num_in=896, num_out=192, batch_size=1


 89%|████████▉ | 679/760 [18:33<03:36,  2.68s/it]

num_in=768, num_out=192, batch_size=1


 89%|████████▉ | 680/760 [18:35<03:17,  2.46s/it]

num_in=640, num_out=64, batch_size=1


 90%|████████▉ | 681/760 [18:35<02:35,  1.97s/it]

num_in=512, num_out=448, batch_size=1


 90%|████████▉ | 682/760 [18:39<03:21,  2.58s/it]

num_in=896, num_out=192, batch_size=1


 90%|████████▉ | 683/760 [18:41<03:02,  2.37s/it]

num_in=896, num_out=320, batch_size=1


 90%|█████████ | 684/760 [18:44<03:15,  2.58s/it]

num_in=640, num_out=320, batch_size=1


 90%|█████████ | 685/760 [18:47<03:23,  2.72s/it]

num_in=640, num_out=192, batch_size=1


 90%|█████████ | 686/760 [18:49<03:02,  2.46s/it]

num_in=640, num_out=384, batch_size=1


 91%|█████████ | 689/760 [18:53<01:50,  1.55s/it]

num_in=128, num_out=128, batch_size=4
num_in=128, num_out=320, batch_size=1
num_in=256, num_out=256, batch_size=1
num_in=1024, num_out=448, batch_size=1


 91%|█████████ | 691/760 [18:57<02:03,  1.79s/it]

num_in=640, num_out=448, batch_size=1


 91%|█████████ | 692/760 [19:02<02:38,  2.34s/it]

num_in=256, num_out=64, batch_size=1
num_in=512, num_out=64, batch_size=1


 91%|█████████▏| 694/760 [19:02<01:43,  1.57s/it]

num_in=128, num_out=192, batch_size=1
num_in=384, num_out=64, batch_size=1


 92%|█████████▏| 699/760 [19:03<00:40,  1.52it/s]

num_in=128, num_out=512, batch_size=1
num_in=128, num_out=64, batch_size=1
num_in=128, num_out=320, batch_size=1
num_in=1024, num_out=448, batch_size=1


 92%|█████████▏| 701/760 [19:08<01:04,  1.10s/it]

num_in=256, num_out=256, batch_size=1
num_in=384, num_out=64, batch_size=4


 92%|█████████▏| 702/760 [19:08<00:59,  1.03s/it]

num_in=512, num_out=384, batch_size=1


 92%|█████████▎| 703/760 [19:12<01:28,  1.55s/it]

num_in=512, num_out=512, batch_size=1


 93%|█████████▎| 704/760 [19:17<02:07,  2.28s/it]

num_in=640, num_out=384, batch_size=1


 93%|█████████▎| 705/760 [19:20<02:23,  2.61s/it]

num_in=1024, num_out=192, batch_size=1


 93%|█████████▎| 706/760 [19:22<02:10,  2.42s/it]

num_in=384, num_out=128, batch_size=4


 93%|█████████▎| 707/760 [19:24<01:53,  2.15s/it]

num_in=640, num_out=256, batch_size=1


 93%|█████████▎| 708/760 [19:26<01:56,  2.24s/it]

num_in=1024, num_out=512, batch_size=1


 93%|█████████▎| 709/760 [19:31<02:32,  2.98s/it]

num_in=1024, num_out=256, batch_size=1
num_in=640, num_out=256, batch_size=1


 94%|█████████▎| 711/760 [19:34<01:48,  2.22s/it]

num_in=512, num_out=256, batch_size=1


 94%|█████████▎| 712/760 [19:36<01:49,  2.28s/it]

num_in=896, num_out=64, batch_size=1


 94%|█████████▍| 713/760 [19:37<01:27,  1.86s/it]

num_in=128, num_out=256, batch_size=1
num_in=640, num_out=512, batch_size=1


 94%|█████████▍| 715/760 [19:42<01:34,  2.11s/it]

num_in=768, num_out=192, batch_size=1


 94%|█████████▍| 716/760 [19:43<01:30,  2.05s/it]

num_in=896, num_out=128, batch_size=1


 94%|█████████▍| 717/760 [19:45<01:19,  1.86s/it]

num_in=256, num_out=320, batch_size=1
num_in=1024, num_out=128, batch_size=1


 95%|█████████▍| 719/760 [19:46<00:55,  1.36s/it]

num_in=1024, num_out=448, batch_size=1
num_in=640, num_out=320, batch_size=1


 95%|█████████▍| 721/760 [19:49<00:56,  1.44s/it]

num_in=128, num_out=128, batch_size=4
num_in=384, num_out=64, batch_size=1


 95%|█████████▌| 723/760 [19:50<00:39,  1.06s/it]

num_in=640, num_out=512, batch_size=1


 96%|█████████▌| 727/760 [19:55<00:33,  1.01s/it]

num_in=128, num_out=128, batch_size=1
num_in=128, num_out=384, batch_size=1
num_in=256, num_out=64, batch_size=4
num_in=128, num_out=64, batch_size=1
num_in=640, num_out=256, batch_size=1


 96%|█████████▌| 729/760 [19:57<00:33,  1.08s/it]

num_in=512, num_out=448, batch_size=1


 96%|█████████▌| 730/760 [20:02<00:49,  1.65s/it]

num_in=256, num_out=512, batch_size=1
num_in=896, num_out=320, batch_size=1


 96%|█████████▋| 732/760 [20:05<00:45,  1.61s/it]

num_in=768, num_out=320, batch_size=1


 96%|█████████▋| 733/760 [20:08<00:51,  1.90s/it]

num_in=384, num_out=64, batch_size=4


 97%|█████████▋| 734/760 [20:09<00:42,  1.65s/it]

num_in=768, num_out=384, batch_size=1


 97%|█████████▋| 735/760 [20:12<00:53,  2.13s/it]

num_in=128, num_out=384, batch_size=4
num_in=384, num_out=320, batch_size=1


 97%|█████████▋| 737/760 [20:15<00:43,  1.90s/it]

num_in=128, num_out=384, batch_size=4
num_in=384, num_out=64, batch_size=1


 97%|█████████▋| 739/760 [20:16<00:27,  1.33s/it]

num_in=256, num_out=448, batch_size=1
num_in=512, num_out=512, batch_size=1


 98%|█████████▊| 741/760 [20:21<00:32,  1.71s/it]

num_in=384, num_out=64, batch_size=1


 98%|█████████▊| 742/760 [20:22<00:27,  1.50s/it]

num_in=1024, num_out=128, batch_size=1


 98%|█████████▊| 743/760 [20:23<00:24,  1.45s/it]

num_in=384, num_out=128, batch_size=1


 98%|█████████▊| 744/760 [20:24<00:22,  1.40s/it]

num_in=256, num_out=192, batch_size=1
num_in=256, num_out=64, batch_size=1
num_in=896, num_out=64, batch_size=1


 98%|█████████▊| 747/760 [20:25<00:10,  1.22it/s]

num_in=512, num_out=128, batch_size=1


 99%|█████████▊| 750/760 [20:26<00:06,  1.66it/s]

num_in=128, num_out=64, batch_size=4
num_in=256, num_out=448, batch_size=1
num_in=768, num_out=64, batch_size=1


 99%|█████████▉| 751/760 [20:27<00:05,  1.64it/s]

num_in=256, num_out=256, batch_size=1
num_in=896, num_out=192, batch_size=1


 99%|█████████▉| 753/760 [20:29<00:05,  1.35it/s]

num_in=640, num_out=192, batch_size=1


 99%|█████████▉| 756/760 [20:31<00:02,  1.57it/s]

num_in=256, num_out=64, batch_size=4
num_in=256, num_out=192, batch_size=1
num_in=384, num_out=320, batch_size=1


100%|█████████▉| 757/760 [20:34<00:03,  1.14s/it]

num_in=384, num_out=512, batch_size=1


100%|█████████▉| 758/760 [20:39<00:04,  2.00s/it]

num_in=384, num_out=256, batch_size=1


100%|█████████▉| 759/760 [20:41<00:02,  2.11s/it]

num_in=768, num_out=192, batch_size=1


100%|██████████| 760/760 [20:43<00:00,  1.64s/it]


In [73]:
from collections import defaultdict
import numpy as np

def tps(n_in, n_out, samples):
    return round(np.mean([n_out/s for s in samples]), 1)

def avg(n_in, n_out, samples):
    return round(np.mean(samples), 2)

def print_bench(b, bench_batch_size=bench_batch_size, bench_tokens_in=bench_tokens_in, bench_tokens_out=bench_tokens_out, aggregate=avg):
    b.sort()
    
    for current_size in bench_batch_size:
        
        print('-'*60)
        print(f'parallel generations: {current_size}')
    
        data = defaultdict(list)
        for n_in, n_out, b_size, _, timing in b:
            if b_size == current_size:
                data[(n_in, n_out)].append(timing)
                
        print('in\tout')
        line = '\t'
        for n_out in bench_tokens_out:
            line += str(n_out) + '\t'
        print(line)
        for n_in in bench_tokens_in:
            line = str(n_in) + '\t'
            for n_out in bench_tokens_out:
                samples = data.get((n_in, n_out), None)
                if samples:
                    line += str(aggregate(n_in, n_out, samples)) + '\t'
                else:
                    line += '--\t'
            print(line)



In [76]:
# generation times
print_bench(b, aggregate=avg)

------------------------------------------------------------
parallel generations: 1
in	out
	64	128	192	256	320	384	448	512	
128	0.02	0.02	0.02	0.02	0.02	0.02	0.02	0.02	
256	0.03	0.03	0.03	0.03	0.03	0.03	0.03	0.03	
384	0.61	1.2	1.8	2.4	3.0	3.57	4.17	4.79	
512	0.61	1.21	1.8	2.41	3.0	3.6	4.19	4.77	
640	0.62	1.22	1.81	2.41	3.0	3.61	4.2	4.8	
768	0.63	1.22	1.83	2.41	3.01	3.6	4.2	4.8	
896	0.63	1.23	1.83	2.42	2.91	3.61	4.03	4.56	
1024	0.46	0.76	0.77	1.0	0.65	2.19	3.38	2.44	
------------------------------------------------------------
parallel generations: 4
in	out
	64	128	192	256	320	384	448	512	
128	0.04	0.04	0.04	0.04	0.04	0.04	--	--	
256	0.06	0.06	0.06	0.06	--	--	--	--	
384	0.71	1.36	--	--	--	--	--	--	
512	--	--	--	--	--	--	--	--	
640	--	--	--	--	--	--	--	--	
768	--	--	--	--	--	--	--	--	
896	--	--	--	--	--	--	--	--	
1024	--	--	--	--	--	--	--	--	
------------------------------------------------------------
parallel generations: 16
in	out
	64	128	192	256	320	384	448	512	
128	--	--	--	--	--	-

In [77]:
# tokens per second
print_bench(b, aggregate=tps)

------------------------------------------------------------
parallel generations: 1
in	out
	64	128	192	256	320	384	448	512	
128	2889.7	5972.4	8747.7	11417.7	14215.9	17496.7	20387.6	22624.9	
256	2397.8	4712.5	7023.9	9402.5	11482.6	14220.7	16288.4	18979.4	
384	105.0	106.5	106.8	106.8	106.8	107.5	107.4	106.9	
512	104.1	105.8	106.6	106.4	106.5	106.7	107.0	107.4	
640	103.1	105.2	105.9	106.1	106.5	106.5	106.6	106.6	
768	101.8	104.6	105.1	106.2	106.3	106.6	106.8	106.7	
896	101.3	104.5	105.1	105.7	110.6	106.3	113.8	116.8	
1024	399.3	932.1	2000.3	2633.6	4386.1	2715.4	1606.2	4457.5	
------------------------------------------------------------
parallel generations: 4
in	out
	64	128	192	256	320	384	448	512	
128	1662.0	3326.5	5035.2	6537.5	8244.2	10053.3	--	--	
256	1092.6	2185.4	3289.4	4365.3	--	--	--	--	
384	89.8	94.0	--	--	--	--	--	--	
512	--	--	--	--	--	--	--	--	
640	--	--	--	--	--	--	--	--	
768	--	--	--	--	--	--	--	--	
896	--	--	--	--	--	--	--	--	
1024	--	--	--	--	--	--	--	--	
----------------

In [80]:
# median tokens per second
np.median([tps(n_in, n_out, [s]) for n_in, n_out, _, _, s in b])

106.95

In [81]:
# store results
import pickle
with open('llm-benchmark.pickle', 'wb') as fp:
    pickle.dump(b, fp)

In [69]:
# reload results
with open('llm-benchmark.pickle', 'rb') as fp:
    b = pickle.load(fp)

[(128, 64, 1, 0, 0.023736238479614258),
 (128, 64, 1, 1, 0.0209963321685791),
 (128, 64, 1, 2, 0.023659467697143555),
 (128, 64, 1, 3, 0.02104806900024414),
 (128, 64, 1, 4, 0.023738861083984375),
 (128, 64, 1, 5, 0.02096724510192871),
 (128, 64, 1, 6, 0.02392578125),
 (128, 64, 1, 7, 0.022480487823486328),
 (128, 64, 1, 8, 0.020773649215698242),
 (128, 64, 1, 9, 0.020944595336914062),
 (128, 64, 4, 0, 0.041047096252441406),
 (128, 64, 4, 1, 0.03752493858337402),
 (128, 64, 4, 2, 0.03747844696044922),
 (128, 64, 4, 3, 0.04098820686340332),
 (128, 64, 4, 4, 0.03745436668395996),
 (128, 64, 4, 5, 0.03761100769042969),
 (128, 64, 4, 6, 0.03762030601501465),
 (128, 64, 4, 7, 0.04076075553894043),
 (128, 64, 4, 8, 0.03754830360412598),
 (128, 64, 4, 9, 0.037644147872924805),
 (128, 128, 1, 0, 0.02371382713317871),
 (128, 128, 1, 1, 0.020860671997070312),
 (128, 128, 1, 2, 0.020993947982788086),
 (128, 128, 1, 3, 0.020980358123779297),
 (128, 128, 1, 4, 0.021054744720458984),
 (128, 128, 1, 