In [1]:
import os, sys
import ipdb # debugging
from tqdm import tqdm # dat
from datetime import datetime # may use for checkpoint files for the state of training
import platform, shutil # detect platform type
import requests, zipfile, io


# Pytorch
import torch # library for deep learning
import torch.nn as nn # neuron network
from torch.nn import functional as F

# Tokenizer
import sentencepiece as spm # translate human language to tokens for computer to understand

# A-series GPU will benefit from this setting
#torch.backends.cuda.matmul.allow_tf32 = True
#torch.backends.cudnn.allow_tf32 = True

torch.cuda.empty_cache() # empty GPU cache
files_url = "https://ideami.com/llm_train"
print("Downloading files using Python")
response = requests.get(files_url) # GET request to fetch stuff from the website
zipfile.ZipFile(io.BytesIO(response.content)).extractall(".") # . indicate current folder





Downloading files using Python


In [4]:

# ARCHITECTURE PARAMETERS
batch_size = 32 # 8 to 128 and beyond, if only 4GB available for your GPU then make this 8 -- GPU for parallel computing
context = 512 # Sequence length aka focus window
embed_size = 384 # dimensions of vectors, this means 384 numbers to explain a token
n_layers = 7
n_heads = 7
BIAS = True

# Hyper parameters are the ones impacting the training process
lr = 3e-4
dropout = 0.05
weight_decay = 0.01
grad_clip = 1.0

# Training parameters
load_pretrained = True
train_iters = 100000
eval_interval = 50 # Every 50 times of iteration we are going to evaluate our data
eval_iters = 10 
compile = True
checkpoint_dir = "models/"
# checkpoint_fn = "lastest.pt"
checkpoint_fn = 'latest.pt' # File name for saving a checkpoint
# checkpoint_load_fn = "latest.pt"
checkpoint_load_fn = 'latest.pt' # File name for loading a checkpoint
dtype = torch.bfloat16

# Mode : False for training, True for getting output for use 
inference = True

# Device
device = "cuda" if torch.cuda.is_available() else "cpu"
print("device: You will be using: ", device)





device: You will be using:  cuda


In [5]:
# Logging

wandb_log = True
wandb_project = "llm9"
# wandb_run_name = "llm1-" + datetime.now().strftime("%Y_%m_%d_%H_%M_%S")
wandb_run_name = 'llm9run'

if wandb_log:
    import wandb # For deep-learning metrics visualizaiton
    wandb.init(project = wandb_project, name = wandb_run_name)

#wandb.finish() # This will finish the project






wandb: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.
wandb: Currently logged in as: jwzero8 (jwzero8-the-university-of-hong-kong). Use `wandb login --relogin` to force relogin


In [6]:
# Load Data
with open('wiki.txt', 'r', encoding = 'utf-8') as f: #
    text = f.read()
print(text[30000:30300])









terms.
For example, there are objects in two groups (as shown on the right). The objects are various shapes, where one group has 3 of them while the other has 2. When the two groups combine into one, the overall amount (sum) of the shapes become 5.

Vertical Addition

The animation above demonstrate


In [7]:
# Initialize Tokenizer

sp = spm.SentencePieceProcessor(model_file = 'wiki_tokenizer.model')

vocab_size = sp.get_piece_size()
print(f"Tokenizer vocab size: {vocab_size}")




Tokenizer vocab size: 4096


In [8]:
# Set up tokenizing function
encode = lambda s : sp.Encode(s)
decode = lambda l : sp.Decode(l)

print(encode('Once upon a time'))
print(decode(encode('Once upon a time')))
      

[612, 370, 698, 265, 261, 684]
Once upon a time


In [9]:
# Tokenize our data or load tokenizer if pre-trained tokenizers are available
if os.path.exists('encoded_data.pt'):
    print('Loading Encoding')
    data = torch.load('encoded_data.pt')
else:
    data = torch.tensor(encode(text), dtype = torch.long) # dtype can be more accurate for better performacne yet more memory and computation power is required
    torch.save(data, 'encoded_data.pt')




Loading Encoding


  data = torch.load('encoded_data.pt')


In [10]:
# Split data between traiing and evaluation datasets
data_size = len(data)
spl = int(0.9 * data_size)
train_data = data[:spl]
val_data = data[spl:]
print(f'Total size: {data_size/1e6:.2f} million | Training {len(train_data)/1e6:.2f} Million | Validation {len(val_data)/1e6:.2f} Million')




Total size: 59.21 million | Training 53.29 Million | Validation 5.92 Million


In [11]:
def get_batch(split):
    # BS = Batch Size (e.g. 32) / SL = Sequence or Context Length (e.g. 512)
    data = train_data if split == 'train' else val_data
    inds = torch.randint(len(data) - context, (batch_size,))
    x = torch.stack([data[i: i + context] for i in inds]) # (BS, SL) e.g. (32, 512)
    y = torch.stack([data[i + 1: i + context + 1] for i in inds])

    x,y = x.to(device), y.to(device)
    return x,y

x,y = get_batch("train")
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])
    





torch.Size([32, 512]) torch.Size([32, 512])
tensor([ 410, 4051,  310,  353,  944,  836,  381, 4035,  307, 4031],
       device='cuda:0')
tensor([4051,  310,  353,  944,  836,  381, 4035,  307, 4031, 4056],
       device='cuda:0')


In [12]:
class GPT(nn.Module):
    def __init__(self):
        super().__init__()
        self.embeddings = nn.Embedding(vocab_size, embed_size) # In our case will be from 4096 x 384
        self.positions = nn.Embedding(context, embed_size) # e.g. 512 x 384
        # Number of heads with multi-attention mechanism in each layer, which is 7 in this case
        self.blocks = nn.Sequential(*[Block(n_heads) for _ in range(n_layers)]) 
        self.ln = nn.LayerNorm(embed_size)
        self.final_linear = nn.Linear(embed_size, vocab_size, bias=BIAS) # e.g. 384 x 4906, {embeddings x vocab tokens/options}
        self.apply(self._init_weights)
    
    # parameter initialization
    def _init_weights(self, module):
        if isinstance(module, nn.Linear): # If it is linear layer
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)
            if module.bias is not None:
                torch.nn.init.zeros_(module.bias)

        elif isinstance(module, nn.Embedding):
            torch.nn.init.normal_(module.weight, mean = 0.0, std = 0.02)


    def forward(self, input, targets = None):
        # BS = batch size, SL = Sequence or Context Length
        loss = None
        BS, SL = input.shape # BS x SL
        emb = self.embeddings(input) # BS x SL x Embed_size(384)
        pos = self.positions(torch.arange(SL, device = device)) # SL x 384
        x = emb + pos # BS x SL x 384
        x = self.blocks(x) # Pass our input to blocks, traverl through blocks and layers, BS x SL x 384
        x = self.ln(x) # BS x SL x 384 (Embed Size)
        logits = self.final_linear(x) # BS x SL x 4096 (vocab size)



        if targets is not None:
            BS, SL, VS = logits.shape # BS x SL x 4906 (Vocabulary Size)
            logits = logits.view(BS*SL, VS)
            targets = targets.view(BS*SL)
            loss = F.cross_entropy(logits, targets)     


            # Manual calculation 
            counts = logits.exp()
            prob = counts / counts.sum(-1, keepdim = True) # Keep dim = keeping the dimension for dividing each single dimension
            loss2 = prob[torch.arange(BS*SL), targets].log().mean()
            # E.g., Target[3] = 329 prob[3][329] = 0.014
            # Cross entropy = - Log p(x)

            # if (not torch.allclose(loss, loss2)): # This is to show Pytorch calculate entropy is a bit different from manual way, on a time-to-time basis
            #    print(f"[Loss Diff] Pytorch:{loss.item()} Manual:{loss2.item()}")
                
        return logits, loss

        # Generate a new sample
    def generate(self, input, max = 500):
        for _ in range(max):
            input = input[:, -context:] # (1, input length until max of SL)
            logits, _ = self(input) # (1, input length, 4096)
            logits = logits[:, -1,:] # Pick last logits/probability (1, 4096) as we are extracting the last token
            probs = F.softmax(logits, dim = -1) # (1, 4096)
            next = torch.multinomial(probs, num_samples = 1)
            input = torch.cat((input, next), dim = 1)
        return input
                



        
        
            








In [13]:
class Block(nn.Module):
    def __init__(self, n_heads):
        super().__init__()
        head_size = embed_size // n_heads
        self.ma = Multihead(n_heads, head_size)
        self.feed_forward = ForwardLayer(embed_size)
        self.ln1 = nn.LayerNorm(embed_size)
        self.ln2 = nn.LayerNorm(embed_size)


    def forward(self, x):
        x = x + self.ma(self.ln1(x))
        x = x + self.feed_forward(self.ln2(x))
        return x






In [14]:
class ForwardLayer(nn.Module):
    def __init__(self, embed_size):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(embed_size, 6 * embed_size, bias = BIAS),
        nn.GELU(),
        nn.Linear(6 * embed_size, embed_size, bias = BIAS),
        nn.Dropout(dropout)
    )

    def forward(self, x):
        x = self.network(x)
        return x 



In [15]:
class Multihead(nn.Module):
    def __init__(self, n_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(n_heads)])
        self.combine = nn.Linear(head_size * n_heads, embed_size, bias = BIAS) # 378, 384
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = torch.cat([head(x) for head in self.heads], dim = -1)
        # Each head output (BS, SL, head_size)
        x = self.combine(x)
        x = self.dropout(x)
        return x

In [16]:
class Head(nn.Module):
    def __init__(self, head_size):
        super().__init__()
        self.queries = nn.Linear(embed_size, head_size, bias = BIAS)
        self.keys = nn.Linear(embed_size, head_size, bias = BIAS)
        self.values = nn.Linear(embed_size, head_size, bias = BIAS)

        self.register_buffer('tril', torch.tril(torch.ones(context, context)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        BS, SL, VS = x.shape
        q = self.queries(x) # BS, SL, 54
        k = self.keys(x) # BS, SL, 54
        v = self.values(x) # BS, SL, 54

        attn_w = q @ k.transpose(-2, -1) * k.shape[-1]**-0.5 # BS, SL, SL
        attn_w = attn_w.masked_fill(self.tril[:SL, :SL] == 0, float('-inf'))
        attn_w = F.softmax(attn_w, dim = -1) # BS, SL, SL

        x = attn_w @ v # BS, SL, 54

        return x
        

In [17]:
head_size = embed_size // n_heads
print(f"embed: {embed_size} n_heads: {n_heads} head_size: {head_size}")

embed: 384 n_heads: 7 head_size: 54


In [52]:
# Understand the attention calculations
# Commented out as it is for learning purpose 

'''
x, y = get_batch('train')
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])
x = x.to(device)
y = y.to(device)

embeddings = nn.Embedding(vocab_size, embed_size).to(device)
positions = nn.Embedding(context, embed_size).to(device)
queries = nn.Linear(embed_size, head_size, bias = BIAS).to(device)
keys = nn.Linear(embed_size, head_size, bias = BIAS).to(device)
values = nn.Linear(embed_size, head_size, bias = BIAS).to(device)
tril = torch.tril(torch.ones(context, context)).to(device)

emb = embeddings(x)
pos = positions(torch.arange(context, device = device))
x = emb + pos

q = queries(x)
k = keys(x)
v = values(x)
print(q.shape, k.shape, v.shape)
torch.set_printoptions(precision = 2, sci_mode = False)
print(q[0][0][:5])

attn_w = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5
attn_w = attn_w.masked_fill(tril[:context, :context] == 0, float('-inf'))
attn_w = F.softmax(attn_w, dim = -1)
x = attn_w @ v
'''

"\nx, y = get_batch('train')\nprint(x.shape, y.shape)\nprint(x[0][:10])\nprint(y[0][:10])\nx = x.to(device)\ny = y.to(device)\n\nembeddings = nn.Embedding(vocab_size, embed_size).to(device)\npositions = nn.Embedding(context, embed_size).to(device)\nqueries = nn.Linear(embed_size, head_size, bias = BIAS).to(device)\nkeys = nn.Linear(embed_size, head_size, bias = BIAS).to(device)\nvalues = nn.Linear(embed_size, head_size, bias = BIAS).to(device)\ntril = torch.tril(torch.ones(context, context)).to(device)\n\nemb = embeddings(x)\npos = positions(torch.arange(context, device = device))\nx = emb + pos\n\nq = queries(x)\nk = keys(x)\nv = values(x)\nprint(q.shape, k.shape, v.shape)\ntorch.set_printoptions(precision = 2, sci_mode = False)\nprint(q[0][0][:5])\n\nattn_w = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5\nattn_w = attn_w.masked_fill(tril[:context, :context] == 0, float('-inf'))\nattn_w = F.softmax(attn_w, dim = -1)\nx = attn_w @ v\n"

In [53]:
# Understand Attention Matrix
# Commented out as it is for learning purpose 
'''
full = q @ k.transpose(-2, -1) # 512*54 @ 54 x 512 = 512 x 512

a = q[0][5]
b = k.transpose(-2, -1)[0, :, 3]
print(a, b)
c = torch.dot(a, b)
print(c)
print(full[0][5][3])
'''

'\nfull = q @ k.transpose(-2, -1) # 512*54 @ 54 x 512 = 512 x 512\n\na = q[0][5]\nb = k.transpose(-2, -1)[0, :, 3]\nprint(a, b)\nc = torch.dot(a, b)\nprint(c)\nprint(full[0][5][3])\n'

In [54]:
# Understand the updating of the V content
# Commented out as it is for learning purpose 
'''
print(attn_w.shape, v.shape)

print(x[0][7])

attn_scores2 = attn_w[0, 7, :] # Shape [512]

# Initialize a tensor to sotre the result
result = torch.zeros(54)
# Compute the dot product for each column in V for the first tokens in the first batch
for i in range(54):
    result[i] = torch.dot(attn_scores2, v[0, :, i])

print(result)
'''


'\nprint(attn_w.shape, v.shape)\n\nprint(x[0][7])\n\nattn_scores2 = attn_w[0, 7, :] # Shape [512]\n\n# Initialize a tensor to sotre the result\nresult = torch.zeros(54)\n# Compute the dot product for each column in V for the first tokens in the first batch\nfor i in range(54):\n    result[i] = torch.dot(attn_scores2, v[0, :, i])\n\nprint(result)\n'

In [18]:
# Optional (after doing it, take out the loss2 from the output of the model)
x, y = get_batch('train')
print(x.shape, y.shape)
print(x[0][:10])
print(y[0][:10])

model = GPT()
model = model.to(dtype) # Denote the precision level
model = model.to(device)


# logit, loss, loss2 = model(x, y)
# print(loss.item(), loss2.item())

torch.Size([32, 512]) torch.Size([32, 512])
tensor([4055,  307,  590,  337,  430, 1248, 4042, 2087,  437, 1675],
       device='cuda:0')
tensor([ 307,  590,  337,  430, 1248, 4042, 2087,  437, 1675,  441],
       device='cuda:0')


In [19]:
@torch.no_grad()
def generate_sample(input):
    t1 = torch.tensor(encode(input), dtype = torch.long, device = device) # take input, encode it, transform to tensor
    t1 = t1[None, :] # (1, [size of the ids]  - BS = 1 as we are inferencing
    newgen = model.generate(t1, max = 64)[0].tolist()
    result = decode(newgen)
    print(f"{result}")

# generate_sample('I am a person.')




In [20]:
# Training Setup


model = GPT()
model = model.to(dtype)
model = model.to(device)

if compile:
    print('Torch: Compiling model')
    model = torch.compile (model)

print(sum(p.numel() for p in model.parameters()) / 1e6, 'Million parameters')








Torch: Compiling model
19.837954 Million parameters


In [21]:
# Calculate loss averages
import torch._dynamo
torch._dynamo.config.suppress_errors = True

@torch.no_grad()
def calculate_loss():
# WE are evaluating but not training so this decorator
    out = {}
    model.eval()
    for split in ['train', 'eval']:
        l = torch.zeros(eval_iters)
        # calculation the MEAN of the loss
        for i in range(eval_iters):
            x, y = get_batch(split)
            _, loss = model(x, y)
            l[i] = loss
        out[split] = l.mean().item()
    model.train()
    return out

l = calculate_loss()
print(l)
            









W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125] WON'T CONVERT forward C:\Users\ZGMF-X42S\AppData\Local\Temp\ipykernel_2036\3703986496.py line 23 
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125] due to: 
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125] Traceback (most recent call last):
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125]   File "C:\Users\ZGMF-X42S\miniconda3\envs\test2\Lib\site-packages\torch\_dynamo\output_graph.py", line 1446, in _call_user_compiler
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125]     compiled_fn = compiler_fn(gm, self.example_inputs())
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125]                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
W1029 11:03:09.496000 2036 site-packages\torch\_dynamo\convert_frame.py:1125]   File "C:\Users\ZGMF-X42S\miniconda3\envs\test2\Lib\site-package

{'train': 8.375, 'eval': 8.375}


In [22]:
# Optimizer for minimizing the cross-entropy loss 

p_dict = {p_name: p for p_name, p in model.named_parameters() if p.requires_grad}

# Only those with multiple dimensions will benefit from the weight decay
weight_decay_p = [p for n, p in p_dict.items() if p.dim() >= 2]

no_weight_decay_p = [p for n, p in p_dict.items() if p.dim() <2]


optimizer_groups = [
    {'params': weight_decay_p, 'weight_decay': weight_decay},
    {'params': no_weight_decay_p, 'weight_decay': 0.0}
]

optimizer = torch.optim.AdamW(optimizer_groups, lr = lr, betas = (0.9, 0.99))

scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, train_iters, eta_min = lr / 10) 

start_iteration = 0
best_val_loss = float('inf') # Track the best validation loss value











In [23]:
# Loading Checkpoints


# def load_checkpoint(path):
#     print('LLM - Loading Model')
#     checkpoint = torch.load(path)
#     model.load_state_dict(checkpoint['model_state_dict'])
#     optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#     iteration = checkpoint['iteration']
#     loss = checkpoint['loss']
#     print(f"Loaded iter {iteration} with loss {loss}")
#     return iteration, loss


# if os.path.exists(f"{checkpoint_dir}/{checkpoint_fn}") and load_pretrained:
#     checkpoint_path = f"{checkpoint_dir}/{checkpoint_load_fn}"
#     start_iteration, loss = load_checkpoint(checkpoint_path)
#     best_val_loss = loss


def load_checkpoint(path):
    print('LLM - Loading Model')
    try:
        checkpoint = torch.load(path)
        print("\nCheckpoint contents:")
        for key in checkpoint.keys():
            print(f"- {key}")
            
        if 'model_state_dict' in checkpoint:
            checkpoint_state = checkpoint['model_state_dict']
            
            # Try to handle OptimizedModule wrapper
            if hasattr(model, '_orig_mod'):
                print("Detected OptimizedModule wrapper, attempting to load into _orig_mod")
                try:
                    # Try loading into the wrapped module
                    model._orig_mod.load_state_dict(checkpoint_state)
                    print("Successfully loaded state into _orig_mod")
                except Exception as e:
                    print(f"Failed to load into _orig_mod: {str(e)}")
                    # If that fails, try to modify the state dict keys
                    modified_state = {"_orig_mod." + k: v for k, v in checkpoint_state.items()}
                    try:
                        model.load_state_dict(modified_state)
                        print("Successfully loaded state with modified keys")
                    except Exception as e2:
                        print(f"Failed to load with modified keys: {str(e2)}")
                        raise
            else:
                # Regular loading if no wrapper
                model.load_state_dict(checkpoint_state)
                
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        iteration = checkpoint['iteration']
        loss = checkpoint['loss']
        print(f"Loaded iter {iteration} with loss {loss}")
        return iteration, loss
        
    except Exception as e:
        print(f"\nDetailed error information:")
        print(f"Error type: {type(e).__name__}")
        print(f"Error message: {str(e)}")
        
        # Print some debugging information
        if 'model_state_dict' in checkpoint:
            print("\nFirst few checkpoint keys:")
            for k in list(checkpoint['model_state_dict'].keys())[:5]:
                print(f"- {k}")
            
            print("\nFirst few model state_dict keys:")
            for k in list(model.state_dict().keys())[:5]:
                print(f"- {k}")
        raise

if os.path.exists(f"{checkpoint_dir}/{checkpoint_load_fn}") and load_pretrained:
    checkpoint_path = f"{checkpoint_dir}/{checkpoint_load_fn}"
    print(f"Loading checkpoint from: {checkpoint_path}")
    start_iteration, loss = load_checkpoint(checkpoint_path)
    best_val_loss = loss

# DEBUGGING MODULE
# def load_checkpoint(path):
#     print('LLM - Loading Model')
#     try:
#         # Check if path exists
#         if not os.path.exists(path):
#             print(f"Checkpoint path not found: {path}")
#             return 0, float('inf')
            
#         checkpoint = torch.load(path)
#         model.load_state_dict(checkpoint['model_state_dict'])
#         optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
#         iteration = checkpoint['iteration']
#         loss = checkpoint['loss']
#         print(f"Loaded iter {iteration} with loss {loss}")
#         return iteration, loss
#     except Exception as e:
#         print(f"Error loading checkpoint: {e}")
#         return 0, float('inf')

# # Fix path construction
# checkpoint_path = os.path.join(checkpoint_dir, checkpoint_fn)
# # or
# # checkpoint_path = f"{checkpoint_dir}/{checkpoint_fn}"

# print(f"Checking path: {checkpoint_path}")
# print(f"Path exists: {os.path.exists(checkpoint_path)}")
# print(f"load_pretrained value: {load_pretrained}")

# if os.path.exists(checkpoint_path) and load_pretrained:
#     print("Loading checkpoint...")
#     start_iteration, loss = load_checkpoint(checkpoint_path)
#     best_val_loss = loss
# else:
#     print("No checkpoint loaded because:")
#     if not os.path.exists(checkpoint_path):
#         print(f"- Checkpoint path does not exist: {checkpoint_path}")
#     if not load_pretrained:
#         print("- load_pretrained is False")



Loading checkpoint from: models//latest.pt
LLM - Loading Model


  checkpoint = torch.load(path)



Checkpoint contents:
- model_state_dict
- optimizer_state_dict
- loss
- iteration
Detected OptimizedModule wrapper, attempting to load into _orig_mod
Failed to load into _orig_mod: Error(s) in loading state_dict for GPT:
	Missing key(s) in state_dict: "embeddings.weight", "positions.weight", "blocks.0.ma.heads.0.tril", "blocks.0.ma.heads.0.queries.weight", "blocks.0.ma.heads.0.queries.bias", "blocks.0.ma.heads.0.keys.weight", "blocks.0.ma.heads.0.keys.bias", "blocks.0.ma.heads.0.values.weight", "blocks.0.ma.heads.0.values.bias", "blocks.0.ma.heads.1.tril", "blocks.0.ma.heads.1.queries.weight", "blocks.0.ma.heads.1.queries.bias", "blocks.0.ma.heads.1.keys.weight", "blocks.0.ma.heads.1.keys.bias", "blocks.0.ma.heads.1.values.weight", "blocks.0.ma.heads.1.values.bias", "blocks.0.ma.heads.2.tril", "blocks.0.ma.heads.2.queries.weight", "blocks.0.ma.heads.2.queries.bias", "blocks.0.ma.heads.2.keys.weight", "blocks.0.ma.heads.2.keys.bias", "blocks.0.ma.heads.2.values.weight", "blocks.0.ma.he

RuntimeError: Error(s) in loading state_dict for OptimizedModule:
	Missing key(s) in state_dict: "_orig_mod.embeddings.weight", "_orig_mod.positions.weight", "_orig_mod.blocks.0.ma.heads.0.tril", "_orig_mod.blocks.0.ma.heads.0.queries.weight", "_orig_mod.blocks.0.ma.heads.0.queries.bias", "_orig_mod.blocks.0.ma.heads.0.keys.weight", "_orig_mod.blocks.0.ma.heads.0.keys.bias", "_orig_mod.blocks.0.ma.heads.0.values.weight", "_orig_mod.blocks.0.ma.heads.0.values.bias", "_orig_mod.blocks.0.ma.heads.1.tril", "_orig_mod.blocks.0.ma.heads.1.queries.weight", "_orig_mod.blocks.0.ma.heads.1.queries.bias", "_orig_mod.blocks.0.ma.heads.1.keys.weight", "_orig_mod.blocks.0.ma.heads.1.keys.bias", "_orig_mod.blocks.0.ma.heads.1.values.weight", "_orig_mod.blocks.0.ma.heads.1.values.bias", "_orig_mod.blocks.0.ma.heads.2.tril", "_orig_mod.blocks.0.ma.heads.2.queries.weight", "_orig_mod.blocks.0.ma.heads.2.queries.bias", "_orig_mod.blocks.0.ma.heads.2.keys.weight", "_orig_mod.blocks.0.ma.heads.2.keys.bias", "_orig_mod.blocks.0.ma.heads.2.values.weight", "_orig_mod.blocks.0.ma.heads.2.values.bias", "_orig_mod.blocks.0.ma.heads.3.tril", "_orig_mod.blocks.0.ma.heads.3.queries.weight", "_orig_mod.blocks.0.ma.heads.3.queries.bias", "_orig_mod.blocks.0.ma.heads.3.keys.weight", "_orig_mod.blocks.0.ma.heads.3.keys.bias", "_orig_mod.blocks.0.ma.heads.3.values.weight", "_orig_mod.blocks.0.ma.heads.3.values.bias", "_orig_mod.blocks.0.ma.heads.4.tril", "_orig_mod.blocks.0.ma.heads.4.queries.weight", "_orig_mod.blocks.0.ma.heads.4.queries.bias", "_orig_mod.blocks.0.ma.heads.4.keys.weight", "_orig_mod.blocks.0.ma.heads.4.keys.bias", "_orig_mod.blocks.0.ma.heads.4.values.weight", "_orig_mod.blocks.0.ma.heads.4.values.bias", "_orig_mod.blocks.0.ma.heads.5.tril", "_orig_mod.blocks.0.ma.heads.5.queries.weight", "_orig_mod.blocks.0.ma.heads.5.queries.bias", "_orig_mod.blocks.0.ma.heads.5.keys.weight", "_orig_mod.blocks.0.ma.heads.5.keys.bias", "_orig_mod.blocks.0.ma.heads.5.values.weight", "_orig_mod.blocks.0.ma.heads.5.values.bias", "_orig_mod.blocks.0.ma.heads.6.tril", "_orig_mod.blocks.0.ma.heads.6.queries.weight", "_orig_mod.blocks.0.ma.heads.6.queries.bias", "_orig_mod.blocks.0.ma.heads.6.keys.weight", "_orig_mod.blocks.0.ma.heads.6.keys.bias", "_orig_mod.blocks.0.ma.heads.6.values.weight", "_orig_mod.blocks.0.ma.heads.6.values.bias", "_orig_mod.blocks.0.ma.combine.weight", "_orig_mod.blocks.0.ma.combine.bias", "_orig_mod.blocks.0.feed_forward.network.0.weight", "_orig_mod.blocks.0.feed_forward.network.0.bias", "_orig_mod.blocks.0.feed_forward.network.2.weight", "_orig_mod.blocks.0.feed_forward.network.2.bias", "_orig_mod.blocks.0.ln1.weight", "_orig_mod.blocks.0.ln1.bias", "_orig_mod.blocks.0.ln2.weight", "_orig_mod.blocks.0.ln2.bias", "_orig_mod.blocks.1.ma.heads.0.tril", "_orig_mod.blocks.1.ma.heads.0.queries.weight", "_orig_mod.blocks.1.ma.heads.0.queries.bias", "_orig_mod.blocks.1.ma.heads.0.keys.weight", "_orig_mod.blocks.1.ma.heads.0.keys.bias", "_orig_mod.blocks.1.ma.heads.0.values.weight", "_orig_mod.blocks.1.ma.heads.0.values.bias", "_orig_mod.blocks.1.ma.heads.1.tril", "_orig_mod.blocks.1.ma.heads.1.queries.weight", "_orig_mod.blocks.1.ma.heads.1.queries.bias", "_orig_mod.blocks.1.ma.heads.1.keys.weight", "_orig_mod.blocks.1.ma.heads.1.keys.bias", "_orig_mod.blocks.1.ma.heads.1.values.weight", "_orig_mod.blocks.1.ma.heads.1.values.bias", "_orig_mod.blocks.1.ma.heads.2.tril", "_orig_mod.blocks.1.ma.heads.2.queries.weight", "_orig_mod.blocks.1.ma.heads.2.queries.bias", "_orig_mod.blocks.1.ma.heads.2.keys.weight", "_orig_mod.blocks.1.ma.heads.2.keys.bias", "_orig_mod.blocks.1.ma.heads.2.values.weight", "_orig_mod.blocks.1.ma.heads.2.values.bias", "_orig_mod.blocks.1.ma.heads.3.tril", "_orig_mod.blocks.1.ma.heads.3.queries.weight", "_orig_mod.blocks.1.ma.heads.3.queries.bias", "_orig_mod.blocks.1.ma.heads.3.keys.weight", "_orig_mod.blocks.1.ma.heads.3.keys.bias", "_orig_mod.blocks.1.ma.heads.3.values.weight", "_orig_mod.blocks.1.ma.heads.3.values.bias", "_orig_mod.blocks.1.ma.heads.4.tril", "_orig_mod.blocks.1.ma.heads.4.queries.weight", "_orig_mod.blocks.1.ma.heads.4.queries.bias", "_orig_mod.blocks.1.ma.heads.4.keys.weight", "_orig_mod.blocks.1.ma.heads.4.keys.bias", "_orig_mod.blocks.1.ma.heads.4.values.weight", "_orig_mod.blocks.1.ma.heads.4.values.bias", "_orig_mod.blocks.1.ma.heads.5.tril", "_orig_mod.blocks.1.ma.heads.5.queries.weight", "_orig_mod.blocks.1.ma.heads.5.queries.bias", "_orig_mod.blocks.1.ma.heads.5.keys.weight", "_orig_mod.blocks.1.ma.heads.5.keys.bias", "_orig_mod.blocks.1.ma.heads.5.values.weight", "_orig_mod.blocks.1.ma.heads.5.values.bias", "_orig_mod.blocks.1.ma.heads.6.tril", "_orig_mod.blocks.1.ma.heads.6.queries.weight", "_orig_mod.blocks.1.ma.heads.6.queries.bias", "_orig_mod.blocks.1.ma.heads.6.keys.weight", "_orig_mod.blocks.1.ma.heads.6.keys.bias", "_orig_mod.blocks.1.ma.heads.6.values.weight", "_orig_mod.blocks.1.ma.heads.6.values.bias", "_orig_mod.blocks.1.ma.combine.weight", "_orig_mod.blocks.1.ma.combine.bias", "_orig_mod.blocks.1.feed_forward.network.0.weight", "_orig_mod.blocks.1.feed_forward.network.0.bias", "_orig_mod.blocks.1.feed_forward.network.2.weight", "_orig_mod.blocks.1.feed_forward.network.2.bias", "_orig_mod.blocks.1.ln1.weight", "_orig_mod.blocks.1.ln1.bias", "_orig_mod.blocks.1.ln2.weight", "_orig_mod.blocks.1.ln2.bias", "_orig_mod.blocks.2.ma.heads.0.tril", "_orig_mod.blocks.2.ma.heads.0.queries.weight", "_orig_mod.blocks.2.ma.heads.0.queries.bias", "_orig_mod.blocks.2.ma.heads.0.keys.weight", "_orig_mod.blocks.2.ma.heads.0.keys.bias", "_orig_mod.blocks.2.ma.heads.0.values.weight", "_orig_mod.blocks.2.ma.heads.0.values.bias", "_orig_mod.blocks.2.ma.heads.1.tril", "_orig_mod.blocks.2.ma.heads.1.queries.weight", "_orig_mod.blocks.2.ma.heads.1.queries.bias", "_orig_mod.blocks.2.ma.heads.1.keys.weight", "_orig_mod.blocks.2.ma.heads.1.keys.bias", "_orig_mod.blocks.2.ma.heads.1.values.weight", "_orig_mod.blocks.2.ma.heads.1.values.bias", "_orig_mod.blocks.2.ma.heads.2.tril", "_orig_mod.blocks.2.ma.heads.2.queries.weight", "_orig_mod.blocks.2.ma.heads.2.queries.bias", "_orig_mod.blocks.2.ma.heads.2.keys.weight", "_orig_mod.blocks.2.ma.heads.2.keys.bias", "_orig_mod.blocks.2.ma.heads.2.values.weight", "_orig_mod.blocks.2.ma.heads.2.values.bias", "_orig_mod.blocks.2.ma.heads.3.tril", "_orig_mod.blocks.2.ma.heads.3.queries.weight", "_orig_mod.blocks.2.ma.heads.3.queries.bias", "_orig_mod.blocks.2.ma.heads.3.keys.weight", "_orig_mod.blocks.2.ma.heads.3.keys.bias", "_orig_mod.blocks.2.ma.heads.3.values.weight", "_orig_mod.blocks.2.ma.heads.3.values.bias", "_orig_mod.blocks.2.ma.heads.4.tril", "_orig_mod.blocks.2.ma.heads.4.queries.weight", "_orig_mod.blocks.2.ma.heads.4.queries.bias", "_orig_mod.blocks.2.ma.heads.4.keys.weight", "_orig_mod.blocks.2.ma.heads.4.keys.bias", "_orig_mod.blocks.2.ma.heads.4.values.weight", "_orig_mod.blocks.2.ma.heads.4.values.bias", "_orig_mod.blocks.2.ma.heads.5.tril", "_orig_mod.blocks.2.ma.heads.5.queries.weight", "_orig_mod.blocks.2.ma.heads.5.queries.bias", "_orig_mod.blocks.2.ma.heads.5.keys.weight", "_orig_mod.blocks.2.ma.heads.5.keys.bias", "_orig_mod.blocks.2.ma.heads.5.values.weight", "_orig_mod.blocks.2.ma.heads.5.values.bias", "_orig_mod.blocks.2.ma.heads.6.tril", "_orig_mod.blocks.2.ma.heads.6.queries.weight", "_orig_mod.blocks.2.ma.heads.6.queries.bias", "_orig_mod.blocks.2.ma.heads.6.keys.weight", "_orig_mod.blocks.2.ma.heads.6.keys.bias", "_orig_mod.blocks.2.ma.heads.6.values.weight", "_orig_mod.blocks.2.ma.heads.6.values.bias", "_orig_mod.blocks.2.ma.combine.weight", "_orig_mod.blocks.2.ma.combine.bias", "_orig_mod.blocks.2.feed_forward.network.0.weight", "_orig_mod.blocks.2.feed_forward.network.0.bias", "_orig_mod.blocks.2.feed_forward.network.2.weight", "_orig_mod.blocks.2.feed_forward.network.2.bias", "_orig_mod.blocks.2.ln1.weight", "_orig_mod.blocks.2.ln1.bias", "_orig_mod.blocks.2.ln2.weight", "_orig_mod.blocks.2.ln2.bias", "_orig_mod.blocks.3.ma.heads.0.tril", "_orig_mod.blocks.3.ma.heads.0.queries.weight", "_orig_mod.blocks.3.ma.heads.0.queries.bias", "_orig_mod.blocks.3.ma.heads.0.keys.weight", "_orig_mod.blocks.3.ma.heads.0.keys.bias", "_orig_mod.blocks.3.ma.heads.0.values.weight", "_orig_mod.blocks.3.ma.heads.0.values.bias", "_orig_mod.blocks.3.ma.heads.1.tril", "_orig_mod.blocks.3.ma.heads.1.queries.weight", "_orig_mod.blocks.3.ma.heads.1.queries.bias", "_orig_mod.blocks.3.ma.heads.1.keys.weight", "_orig_mod.blocks.3.ma.heads.1.keys.bias", "_orig_mod.blocks.3.ma.heads.1.values.weight", "_orig_mod.blocks.3.ma.heads.1.values.bias", "_orig_mod.blocks.3.ma.heads.2.tril", "_orig_mod.blocks.3.ma.heads.2.queries.weight", "_orig_mod.blocks.3.ma.heads.2.queries.bias", "_orig_mod.blocks.3.ma.heads.2.keys.weight", "_orig_mod.blocks.3.ma.heads.2.keys.bias", "_orig_mod.blocks.3.ma.heads.2.values.weight", "_orig_mod.blocks.3.ma.heads.2.values.bias", "_orig_mod.blocks.3.ma.heads.3.tril", "_orig_mod.blocks.3.ma.heads.3.queries.weight", "_orig_mod.blocks.3.ma.heads.3.queries.bias", "_orig_mod.blocks.3.ma.heads.3.keys.weight", "_orig_mod.blocks.3.ma.heads.3.keys.bias", "_orig_mod.blocks.3.ma.heads.3.values.weight", "_orig_mod.blocks.3.ma.heads.3.values.bias", "_orig_mod.blocks.3.ma.heads.4.tril", "_orig_mod.blocks.3.ma.heads.4.queries.weight", "_orig_mod.blocks.3.ma.heads.4.queries.bias", "_orig_mod.blocks.3.ma.heads.4.keys.weight", "_orig_mod.blocks.3.ma.heads.4.keys.bias", "_orig_mod.blocks.3.ma.heads.4.values.weight", "_orig_mod.blocks.3.ma.heads.4.values.bias", "_orig_mod.blocks.3.ma.heads.5.tril", "_orig_mod.blocks.3.ma.heads.5.queries.weight", "_orig_mod.blocks.3.ma.heads.5.queries.bias", "_orig_mod.blocks.3.ma.heads.5.keys.weight", "_orig_mod.blocks.3.ma.heads.5.keys.bias", "_orig_mod.blocks.3.ma.heads.5.values.weight", "_orig_mod.blocks.3.ma.heads.5.values.bias", "_orig_mod.blocks.3.ma.heads.6.tril", "_orig_mod.blocks.3.ma.heads.6.queries.weight", "_orig_mod.blocks.3.ma.heads.6.queries.bias", "_orig_mod.blocks.3.ma.heads.6.keys.weight", "_orig_mod.blocks.3.ma.heads.6.keys.bias", "_orig_mod.blocks.3.ma.heads.6.values.weight", "_orig_mod.blocks.3.ma.heads.6.values.bias", "_orig_mod.blocks.3.ma.combine.weight", "_orig_mod.blocks.3.ma.combine.bias", "_orig_mod.blocks.3.feed_forward.network.0.weight", "_orig_mod.blocks.3.feed_forward.network.0.bias", "_orig_mod.blocks.3.feed_forward.network.2.weight", "_orig_mod.blocks.3.feed_forward.network.2.bias", "_orig_mod.blocks.3.ln1.weight", "_orig_mod.blocks.3.ln1.bias", "_orig_mod.blocks.3.ln2.weight", "_orig_mod.blocks.3.ln2.bias", "_orig_mod.blocks.4.ma.heads.0.tril", "_orig_mod.blocks.4.ma.heads.0.queries.weight", "_orig_mod.blocks.4.ma.heads.0.queries.bias", "_orig_mod.blocks.4.ma.heads.0.keys.weight", "_orig_mod.blocks.4.ma.heads.0.keys.bias", "_orig_mod.blocks.4.ma.heads.0.values.weight", "_orig_mod.blocks.4.ma.heads.0.values.bias", "_orig_mod.blocks.4.ma.heads.1.tril", "_orig_mod.blocks.4.ma.heads.1.queries.weight", "_orig_mod.blocks.4.ma.heads.1.queries.bias", "_orig_mod.blocks.4.ma.heads.1.keys.weight", "_orig_mod.blocks.4.ma.heads.1.keys.bias", "_orig_mod.blocks.4.ma.heads.1.values.weight", "_orig_mod.blocks.4.ma.heads.1.values.bias", "_orig_mod.blocks.4.ma.heads.2.tril", "_orig_mod.blocks.4.ma.heads.2.queries.weight", "_orig_mod.blocks.4.ma.heads.2.queries.bias", "_orig_mod.blocks.4.ma.heads.2.keys.weight", "_orig_mod.blocks.4.ma.heads.2.keys.bias", "_orig_mod.blocks.4.ma.heads.2.values.weight", "_orig_mod.blocks.4.ma.heads.2.values.bias", "_orig_mod.blocks.4.ma.heads.3.tril", "_orig_mod.blocks.4.ma.heads.3.queries.weight", "_orig_mod.blocks.4.ma.heads.3.queries.bias", "_orig_mod.blocks.4.ma.heads.3.keys.weight", "_orig_mod.blocks.4.ma.heads.3.keys.bias", "_orig_mod.blocks.4.ma.heads.3.values.weight", "_orig_mod.blocks.4.ma.heads.3.values.bias", "_orig_mod.blocks.4.ma.heads.4.tril", "_orig_mod.blocks.4.ma.heads.4.queries.weight", "_orig_mod.blocks.4.ma.heads.4.queries.bias", "_orig_mod.blocks.4.ma.heads.4.keys.weight", "_orig_mod.blocks.4.ma.heads.4.keys.bias", "_orig_mod.blocks.4.ma.heads.4.values.weight", "_orig_mod.blocks.4.ma.heads.4.values.bias", "_orig_mod.blocks.4.ma.heads.5.tril", "_orig_mod.blocks.4.ma.heads.5.queries.weight", "_orig_mod.blocks.4.ma.heads.5.queries.bias", "_orig_mod.blocks.4.ma.heads.5.keys.weight", "_orig_mod.blocks.4.ma.heads.5.keys.bias", "_orig_mod.blocks.4.ma.heads.5.values.weight", "_orig_mod.blocks.4.ma.heads.5.values.bias", "_orig_mod.blocks.4.ma.heads.6.tril", "_orig_mod.blocks.4.ma.heads.6.queries.weight", "_orig_mod.blocks.4.ma.heads.6.queries.bias", "_orig_mod.blocks.4.ma.heads.6.keys.weight", "_orig_mod.blocks.4.ma.heads.6.keys.bias", "_orig_mod.blocks.4.ma.heads.6.values.weight", "_orig_mod.blocks.4.ma.heads.6.values.bias", "_orig_mod.blocks.4.ma.combine.weight", "_orig_mod.blocks.4.ma.combine.bias", "_orig_mod.blocks.4.feed_forward.network.0.weight", "_orig_mod.blocks.4.feed_forward.network.0.bias", "_orig_mod.blocks.4.feed_forward.network.2.weight", "_orig_mod.blocks.4.feed_forward.network.2.bias", "_orig_mod.blocks.4.ln1.weight", "_orig_mod.blocks.4.ln1.bias", "_orig_mod.blocks.4.ln2.weight", "_orig_mod.blocks.4.ln2.bias", "_orig_mod.blocks.5.ma.heads.0.tril", "_orig_mod.blocks.5.ma.heads.0.queries.weight", "_orig_mod.blocks.5.ma.heads.0.queries.bias", "_orig_mod.blocks.5.ma.heads.0.keys.weight", "_orig_mod.blocks.5.ma.heads.0.keys.bias", "_orig_mod.blocks.5.ma.heads.0.values.weight", "_orig_mod.blocks.5.ma.heads.0.values.bias", "_orig_mod.blocks.5.ma.heads.1.tril", "_orig_mod.blocks.5.ma.heads.1.queries.weight", "_orig_mod.blocks.5.ma.heads.1.queries.bias", "_orig_mod.blocks.5.ma.heads.1.keys.weight", "_orig_mod.blocks.5.ma.heads.1.keys.bias", "_orig_mod.blocks.5.ma.heads.1.values.weight", "_orig_mod.blocks.5.ma.heads.1.values.bias", "_orig_mod.blocks.5.ma.heads.2.tril", "_orig_mod.blocks.5.ma.heads.2.queries.weight", "_orig_mod.blocks.5.ma.heads.2.queries.bias", "_orig_mod.blocks.5.ma.heads.2.keys.weight", "_orig_mod.blocks.5.ma.heads.2.keys.bias", "_orig_mod.blocks.5.ma.heads.2.values.weight", "_orig_mod.blocks.5.ma.heads.2.values.bias", "_orig_mod.blocks.5.ma.heads.3.tril", "_orig_mod.blocks.5.ma.heads.3.queries.weight", "_orig_mod.blocks.5.ma.heads.3.queries.bias", "_orig_mod.blocks.5.ma.heads.3.keys.weight", "_orig_mod.blocks.5.ma.heads.3.keys.bias", "_orig_mod.blocks.5.ma.heads.3.values.weight", "_orig_mod.blocks.5.ma.heads.3.values.bias", "_orig_mod.blocks.5.ma.heads.4.tril", "_orig_mod.blocks.5.ma.heads.4.queries.weight", "_orig_mod.blocks.5.ma.heads.4.queries.bias", "_orig_mod.blocks.5.ma.heads.4.keys.weight", "_orig_mod.blocks.5.ma.heads.4.keys.bias", "_orig_mod.blocks.5.ma.heads.4.values.weight", "_orig_mod.blocks.5.ma.heads.4.values.bias", "_orig_mod.blocks.5.ma.heads.5.tril", "_orig_mod.blocks.5.ma.heads.5.queries.weight", "_orig_mod.blocks.5.ma.heads.5.queries.bias", "_orig_mod.blocks.5.ma.heads.5.keys.weight", "_orig_mod.blocks.5.ma.heads.5.keys.bias", "_orig_mod.blocks.5.ma.heads.5.values.weight", "_orig_mod.blocks.5.ma.heads.5.values.bias", "_orig_mod.blocks.5.ma.heads.6.tril", "_orig_mod.blocks.5.ma.heads.6.queries.weight", "_orig_mod.blocks.5.ma.heads.6.queries.bias", "_orig_mod.blocks.5.ma.heads.6.keys.weight", "_orig_mod.blocks.5.ma.heads.6.keys.bias", "_orig_mod.blocks.5.ma.heads.6.values.weight", "_orig_mod.blocks.5.ma.heads.6.values.bias", "_orig_mod.blocks.5.ma.combine.weight", "_orig_mod.blocks.5.ma.combine.bias", "_orig_mod.blocks.5.feed_forward.network.0.weight", "_orig_mod.blocks.5.feed_forward.network.0.bias", "_orig_mod.blocks.5.feed_forward.network.2.weight", "_orig_mod.blocks.5.feed_forward.network.2.bias", "_orig_mod.blocks.5.ln1.weight", "_orig_mod.blocks.5.ln1.bias", "_orig_mod.blocks.5.ln2.weight", "_orig_mod.blocks.5.ln2.bias", "_orig_mod.blocks.6.ma.heads.0.tril", "_orig_mod.blocks.6.ma.heads.0.queries.weight", "_orig_mod.blocks.6.ma.heads.0.queries.bias", "_orig_mod.blocks.6.ma.heads.0.keys.weight", "_orig_mod.blocks.6.ma.heads.0.keys.bias", "_orig_mod.blocks.6.ma.heads.0.values.weight", "_orig_mod.blocks.6.ma.heads.0.values.bias", "_orig_mod.blocks.6.ma.heads.1.tril", "_orig_mod.blocks.6.ma.heads.1.queries.weight", "_orig_mod.blocks.6.ma.heads.1.queries.bias", "_orig_mod.blocks.6.ma.heads.1.keys.weight", "_orig_mod.blocks.6.ma.heads.1.keys.bias", "_orig_mod.blocks.6.ma.heads.1.values.weight", "_orig_mod.blocks.6.ma.heads.1.values.bias", "_orig_mod.blocks.6.ma.heads.2.tril", "_orig_mod.blocks.6.ma.heads.2.queries.weight", "_orig_mod.blocks.6.ma.heads.2.queries.bias", "_orig_mod.blocks.6.ma.heads.2.keys.weight", "_orig_mod.blocks.6.ma.heads.2.keys.bias", "_orig_mod.blocks.6.ma.heads.2.values.weight", "_orig_mod.blocks.6.ma.heads.2.values.bias", "_orig_mod.blocks.6.ma.heads.3.tril", "_orig_mod.blocks.6.ma.heads.3.queries.weight", "_orig_mod.blocks.6.ma.heads.3.queries.bias", "_orig_mod.blocks.6.ma.heads.3.keys.weight", "_orig_mod.blocks.6.ma.heads.3.keys.bias", "_orig_mod.blocks.6.ma.heads.3.values.weight", "_orig_mod.blocks.6.ma.heads.3.values.bias", "_orig_mod.blocks.6.ma.heads.4.tril", "_orig_mod.blocks.6.ma.heads.4.queries.weight", "_orig_mod.blocks.6.ma.heads.4.queries.bias", "_orig_mod.blocks.6.ma.heads.4.keys.weight", "_orig_mod.blocks.6.ma.heads.4.keys.bias", "_orig_mod.blocks.6.ma.heads.4.values.weight", "_orig_mod.blocks.6.ma.heads.4.values.bias", "_orig_mod.blocks.6.ma.heads.5.tril", "_orig_mod.blocks.6.ma.heads.5.queries.weight", "_orig_mod.blocks.6.ma.heads.5.queries.bias", "_orig_mod.blocks.6.ma.heads.5.keys.weight", "_orig_mod.blocks.6.ma.heads.5.keys.bias", "_orig_mod.blocks.6.ma.heads.5.values.weight", "_orig_mod.blocks.6.ma.heads.5.values.bias", "_orig_mod.blocks.6.ma.heads.6.tril", "_orig_mod.blocks.6.ma.heads.6.queries.weight", "_orig_mod.blocks.6.ma.heads.6.queries.bias", "_orig_mod.blocks.6.ma.heads.6.keys.weight", "_orig_mod.blocks.6.ma.heads.6.keys.bias", "_orig_mod.blocks.6.ma.heads.6.values.weight", "_orig_mod.blocks.6.ma.heads.6.values.bias", "_orig_mod.blocks.6.ma.combine.weight", "_orig_mod.blocks.6.ma.combine.bias", "_orig_mod.blocks.6.feed_forward.network.0.weight", "_orig_mod.blocks.6.feed_forward.network.0.bias", "_orig_mod.blocks.6.feed_forward.network.2.weight", "_orig_mod.blocks.6.feed_forward.network.2.bias", "_orig_mod.blocks.6.ln1.weight", "_orig_mod.blocks.6.ln1.bias", "_orig_mod.blocks.6.ln2.weight", "_orig_mod.blocks.6.ln2.bias", "_orig_mod.ln.weight", "_orig_mod.ln.bias", "_orig_mod.final_linear.weight", "_orig_mod.final_linear.bias". 
	Unexpected key(s) in state_dict: "_orig_mod._orig_mod.embeddings.weight", "_orig_mod._orig_mod.positions.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.0.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.0.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.0.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.0.ma.combine.weight", "_orig_mod._orig_mod.blocks.0.ma.combine.bias", "_orig_mod._orig_mod.blocks.0.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.0.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.0.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.0.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.0.ln1.weight", "_orig_mod._orig_mod.blocks.0.ln1.bias", "_orig_mod._orig_mod.blocks.0.ln2.weight", "_orig_mod._orig_mod.blocks.0.ln2.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.1.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.1.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.1.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.1.ma.combine.weight", "_orig_mod._orig_mod.blocks.1.ma.combine.bias", "_orig_mod._orig_mod.blocks.1.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.1.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.1.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.1.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.1.ln1.weight", "_orig_mod._orig_mod.blocks.1.ln1.bias", "_orig_mod._orig_mod.blocks.1.ln2.weight", "_orig_mod._orig_mod.blocks.1.ln2.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.2.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.2.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.2.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.2.ma.combine.weight", "_orig_mod._orig_mod.blocks.2.ma.combine.bias", "_orig_mod._orig_mod.blocks.2.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.2.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.2.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.2.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.2.ln1.weight", "_orig_mod._orig_mod.blocks.2.ln1.bias", "_orig_mod._orig_mod.blocks.2.ln2.weight", "_orig_mod._orig_mod.blocks.2.ln2.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.3.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.3.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.3.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.3.ma.combine.weight", "_orig_mod._orig_mod.blocks.3.ma.combine.bias", "_orig_mod._orig_mod.blocks.3.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.3.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.3.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.3.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.3.ln1.weight", "_orig_mod._orig_mod.blocks.3.ln1.bias", "_orig_mod._orig_mod.blocks.3.ln2.weight", "_orig_mod._orig_mod.blocks.3.ln2.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.4.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.4.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.4.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.4.ma.combine.weight", "_orig_mod._orig_mod.blocks.4.ma.combine.bias", "_orig_mod._orig_mod.blocks.4.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.4.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.4.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.4.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.4.ln1.weight", "_orig_mod._orig_mod.blocks.4.ln1.bias", "_orig_mod._orig_mod.blocks.4.ln2.weight", "_orig_mod._orig_mod.blocks.4.ln2.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.5.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.5.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.5.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.5.ma.combine.weight", "_orig_mod._orig_mod.blocks.5.ma.combine.bias", "_orig_mod._orig_mod.blocks.5.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.5.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.5.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.5.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.5.ln1.weight", "_orig_mod._orig_mod.blocks.5.ln1.bias", "_orig_mod._orig_mod.blocks.5.ln2.weight", "_orig_mod._orig_mod.blocks.5.ln2.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.0.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.0.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.0.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.0.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.0.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.0.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.0.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.1.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.1.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.1.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.1.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.1.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.1.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.1.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.2.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.2.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.2.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.2.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.2.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.2.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.2.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.3.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.3.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.3.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.3.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.3.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.3.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.3.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.4.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.4.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.4.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.4.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.4.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.4.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.4.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.5.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.5.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.5.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.5.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.5.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.5.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.5.values.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.6.tril", "_orig_mod._orig_mod.blocks.6.ma.heads.6.queries.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.6.queries.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.6.keys.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.6.keys.bias", "_orig_mod._orig_mod.blocks.6.ma.heads.6.values.weight", "_orig_mod._orig_mod.blocks.6.ma.heads.6.values.bias", "_orig_mod._orig_mod.blocks.6.ma.combine.weight", "_orig_mod._orig_mod.blocks.6.ma.combine.bias", "_orig_mod._orig_mod.blocks.6.feed_forward.network.0.weight", "_orig_mod._orig_mod.blocks.6.feed_forward.network.0.bias", "_orig_mod._orig_mod.blocks.6.feed_forward.network.2.weight", "_orig_mod._orig_mod.blocks.6.feed_forward.network.2.bias", "_orig_mod._orig_mod.blocks.6.ln1.weight", "_orig_mod._orig_mod.blocks.6.ln1.bias", "_orig_mod._orig_mod.blocks.6.ln2.weight", "_orig_mod._orig_mod.blocks.6.ln2.bias", "_orig_mod._orig_mod.ln.weight", "_orig_mod._orig_mod.ln.bias", "_orig_mod._orig_mod.final_linear.weight", "_orig_mod._orig_mod.final_linear.bias". 

In [24]:
# Inference 

if inference == True: 
    model.eval()
    while True:
        qs = input('Enter text (q to quit): ')
        if qs == "":
            continue
        if qs == "q":
            break
        generate_sample(qs)


Enter text (q to quit):  I go to school by bus


I go to school by bus books colon~ waysexev Chic joined chart estungient market Danielcience Jes UKiol enginealf Khem suggestyr Hel clos being Wal agre phrey Hist Blackended Bang tooilesston Persight sing Walpleming Khatithingimbround heldylesp near Angelesired medicalanks politician Japangest Os COVID building computer


Enter text (q to quit):  q


In [62]:
# Training Loop

try:
    for i in tqdm(range(start_iteration, train_iters)):
        print(f"Iteration: {i}")
        xb, yb = get_batch('train')
        logits, loss = model(xb, yb)
    
        # Evaluating loss
        if (i % eval_interval == 0 or i == train_iters-1):
            l = calculate_loss()
            print(f"\n{i}: train loss: {l['train']} / val loss: {l['eval']}")
            # generate_sample('Once upon a time')
    
        if l['eval'] < best_val_loss:
            best_val_loss = l['eval']
            print('[CHECKPOINT]: Saving with loss: ', best_val_loss)
            torch.save({
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': best_val_loss,
                'iteration': i,
            }, checkpoint_dir + checkpoint_fn)
    
        if wandb_log:
            wandb.log({
                'loss/train': l['train'],
                'loss/val': l['eval'],
                'lr': scheduler.get_last_lr()[0],
            },
            step=i)

        optimizer.zero_grad(set_to_none=True)
        loss.backward()

        nn.utils.clip_grad_norm(model.parameters(), max_norm=grad_clip)

        optimizer.step()
        scheduler.step()

    if wandb_log:
        wandb.finish()

except KeyboardInterrupt:
    print('Training interrupted. Cleaning up......')

finally:
    # Release GPU memory
    print('GPU memory released')
    # sys.exit(0)




# try:
#     for i in tqdm(range(start_iteration, train_iters)):
#         xb, yb = get_batch('train')
#         logits, loss = model(xb, yb)
    
#         # Evaluating loss
#         if (i % eval_interval == 0 or i == train_iters-1):
#             l = calculate_loss()
#             print(f"\n{i}: train loss: {l['train']} / val loss: {l['eval']}")
#             # generate_sample('Once upon a time')
    
#         if l['eval'] < best_val_loss:
#             best_val_loss = l['eval']
#             print('[CHECKPOINT]: Saving with loss: ', best_val_loss)
#             torch.save({
#                 'model_state_dict': model.state_dict(),
#                 'optimizer_state_dict': optimizer.state_dict(),
#                 'loss': best_val_loss,
#                 'iteration': i,
#             }, checkpoint_dir + checkpoint_fn)
    
    
#         if wandb_log:
#             wandb_log({
#                 'loss/train': l['train'],
#                 'loss/val': l['eval'],
#                 'lr': scheduler.get_last_lr()[0],
#             },
#             step = i)

#         optimizer.zero_grad(set_to_none = True)
#         loss.backward()

#         nn.utils.clip_grad_norm(model.parameters(), max_norm = grad_clip)

#         optimizer.step()
#         scheduler.step()

#     if wandb_log:
#         wandb.finish()

# except KeyboardInterrupt:
#     print('Training interrupted. Cleaning up......')

# finally:
#     # Release GPU memory
#     print('GPU memory released')
#     # sys.exit(0)

# # torch.cuda.empty_cache()






  0%|                                                                                                 | 0/100000 [00:00<?, ?it/s]

Iteration: 0

0: train loss: 8.375 / val loss: 8.375
[CHECKPOINT]: Saving with loss:  8.375


  nn.utils.clip_grad_norm(model.parameters(), max_norm=grad_clip)
  0%|                                                                                      | 1/100000 [00:02<65:50:08,  2.37s/it]

Iteration: 1


  0%|                                                                                      | 2/100000 [00:02<30:48:40,  1.11s/it]

Iteration: 2


  0%|                                                                                      | 3/100000 [00:02<20:28:38,  1.36it/s]

Iteration: 3


  0%|                                                                                      | 4/100000 [00:03<15:39:44,  1.77it/s]

Iteration: 4


  0%|                                                                                      | 5/100000 [00:03<12:48:31,  2.17it/s]

Iteration: 5


  0%|                                                                                      | 6/100000 [00:03<11:08:11,  2.49it/s]

Iteration: 6


  0%|                                                                                      | 7/100000 [00:04<10:07:41,  2.74it/s]

Iteration: 7


  0%|                                                                                       | 8/100000 [00:04<9:24:37,  2.95it/s]

Iteration: 8


  0%|                                                                                       | 9/100000 [00:04<9:01:08,  3.08it/s]

Iteration: 9


  0%|                                                                                      | 10/100000 [00:04<8:39:32,  3.21it/s]

Iteration: 10


  0%|                                                                                      | 11/100000 [00:05<8:23:24,  3.31it/s]

Iteration: 11


  0%|                                                                                      | 12/100000 [00:05<8:14:19,  3.37it/s]

Iteration: 12


  0%|                                                                                      | 13/100000 [00:05<8:08:04,  3.41it/s]

Iteration: 13


  0%|                                                                                      | 14/100000 [00:06<8:07:50,  3.42it/s]

Iteration: 14


  0%|                                                                                      | 15/100000 [00:06<8:12:02,  3.39it/s]

Iteration: 15


  0%|                                                                                      | 16/100000 [00:06<8:10:55,  3.39it/s]

Iteration: 16


  0%|                                                                                      | 17/100000 [00:06<8:09:30,  3.40it/s]

Iteration: 17


  0%|                                                                                      | 18/100000 [00:07<8:04:45,  3.44it/s]

Iteration: 18


  0%|                                                                                      | 19/100000 [00:07<8:00:55,  3.46it/s]

Iteration: 19


  0%|                                                                                      | 20/100000 [00:07<7:55:21,  3.51it/s]

Iteration: 20


  0%|                                                                                      | 21/100000 [00:08<8:04:44,  3.44it/s]

Iteration: 21


  0%|                                                                                      | 22/100000 [00:08<7:55:00,  3.51it/s]

Iteration: 22


  0%|                                                                                      | 23/100000 [00:08<7:53:25,  3.52it/s]

Iteration: 23


  0%|                                                                                      | 24/100000 [00:08<7:50:48,  3.54it/s]

Iteration: 24


  0%|                                                                                      | 25/100000 [00:09<8:09:26,  3.40it/s]

Iteration: 25


  0%|                                                                                      | 26/100000 [00:09<8:09:29,  3.40it/s]

Iteration: 26


  0%|                                                                                      | 27/100000 [00:09<8:03:47,  3.44it/s]

Iteration: 27


  0%|                                                                                      | 28/100000 [00:10<8:03:26,  3.45it/s]

Iteration: 28


  0%|                                                                                      | 29/100000 [00:10<7:59:12,  3.48it/s]

Iteration: 29


  0%|                                                                                      | 30/100000 [00:10<7:58:58,  3.48it/s]

Iteration: 30


  0%|                                                                                      | 31/100000 [00:10<8:08:20,  3.41it/s]

Iteration: 31


  0%|                                                                                      | 32/100000 [00:11<8:03:48,  3.44it/s]

Iteration: 32


  0%|                                                                                      | 33/100000 [00:11<7:58:01,  3.49it/s]

Iteration: 33


  0%|                                                                                      | 34/100000 [00:11<8:00:49,  3.47it/s]

Iteration: 34


  0%|                                                                                      | 35/100000 [00:12<7:58:35,  3.48it/s]

Iteration: 35


  0%|                                                                                      | 36/100000 [00:12<7:56:24,  3.50it/s]

Iteration: 36


  0%|                                                                                      | 37/100000 [00:12<8:01:08,  3.46it/s]

Iteration: 37


  0%|                                                                                      | 38/100000 [00:12<7:56:08,  3.50it/s]

Iteration: 38


  0%|                                                                                      | 39/100000 [00:13<7:55:19,  3.50it/s]

Iteration: 39


  0%|                                                                                      | 40/100000 [00:13<7:54:20,  3.51it/s]

Iteration: 40


  0%|                                                                                      | 41/100000 [00:13<8:02:22,  3.45it/s]

Iteration: 41


  0%|                                                                                      | 42/100000 [00:14<8:07:17,  3.42it/s]

Iteration: 42


  0%|                                                                                      | 43/100000 [00:14<8:13:44,  3.37it/s]

Iteration: 43


  0%|                                                                                      | 44/100000 [00:14<8:05:03,  3.43it/s]

Iteration: 44


  0%|                                                                                      | 45/100000 [00:15<8:15:55,  3.36it/s]

Iteration: 45


  0%|                                                                                      | 46/100000 [00:15<8:16:45,  3.35it/s]

Iteration: 46


  0%|                                                                                      | 47/100000 [00:15<8:16:19,  3.36it/s]

Iteration: 47


  0%|                                                                                      | 48/100000 [00:15<8:22:07,  3.32it/s]

Iteration: 48


  0%|                                                                                      | 49/100000 [00:16<8:18:31,  3.34it/s]

Iteration: 49


  0%|                                                                                      | 50/100000 [00:16<8:09:26,  3.40it/s]

Iteration: 50

50: train loss: 5.840624809265137 / val loss: 5.853125095367432
[CHECKPOINT]: Saving with loss:  5.853125095367432


  0%|                                                                                     | 51/100000 [00:19<26:42:47,  1.04it/s]

Iteration: 51


  0%|                                                                                     | 52/100000 [00:19<21:01:33,  1.32it/s]

Iteration: 52


  0%|                                                                                     | 53/100000 [00:19<17:20:20,  1.60it/s]

Iteration: 53


  0%|                                                                                     | 54/100000 [00:19<14:48:52,  1.87it/s]

Iteration: 54


  0%|                                                                                     | 55/100000 [00:20<12:52:34,  2.16it/s]

Iteration: 55


  0%|                                                                                     | 56/100000 [00:20<11:28:53,  2.42it/s]

Iteration: 56


  0%|                                                                                     | 57/100000 [00:20<10:40:49,  2.60it/s]

Iteration: 57


  0%|                                                                                     | 58/100000 [00:21<10:02:08,  2.77it/s]

Iteration: 58


  0%|                                                                                      | 59/100000 [00:21<9:32:51,  2.91it/s]

Iteration: 59


  0%|                                                                                      | 60/100000 [00:21<9:06:58,  3.05it/s]

Iteration: 60


  0%|                                                                                      | 61/100000 [00:22<8:51:35,  3.13it/s]

Iteration: 61


  0%|                                                                                      | 62/100000 [00:22<8:41:08,  3.20it/s]

Iteration: 62


  0%|                                                                                      | 63/100000 [00:22<8:26:19,  3.29it/s]

Iteration: 63


  0%|                                                                                      | 64/100000 [00:22<8:29:30,  3.27it/s]

Iteration: 64


  0%|                                                                                      | 65/100000 [00:23<8:25:31,  3.29it/s]

Iteration: 65


  0%|                                                                                      | 66/100000 [00:23<8:20:24,  3.33it/s]

Iteration: 66


  0%|                                                                                      | 67/100000 [00:23<8:26:40,  3.29it/s]

Iteration: 67


  0%|                                                                                      | 68/100000 [00:24<8:22:21,  3.32it/s]

Iteration: 68


  0%|                                                                                      | 69/100000 [00:24<8:20:13,  3.33it/s]

Iteration: 69


  0%|                                                                                      | 70/100000 [00:24<8:11:26,  3.39it/s]

Iteration: 70


  0%|                                                                                      | 71/100000 [00:25<8:09:58,  3.40it/s]

Iteration: 71


  0%|                                                                                      | 72/100000 [00:25<8:06:26,  3.42it/s]

Iteration: 72


  0%|                                                                                      | 73/100000 [00:25<8:08:59,  3.41it/s]

Iteration: 73


  0%|                                                                                      | 74/100000 [00:25<8:02:00,  3.46it/s]

Iteration: 74


  0%|                                                                                      | 75/100000 [00:26<8:04:32,  3.44it/s]

Iteration: 75


  0%|                                                                                      | 76/100000 [00:26<8:14:33,  3.37it/s]

Iteration: 76


  0%|                                                                                      | 77/100000 [00:26<8:18:32,  3.34it/s]

Iteration: 77


  0%|                                                                                      | 78/100000 [00:27<8:14:32,  3.37it/s]

Iteration: 78


  0%|                                                                                      | 79/100000 [00:27<8:11:50,  3.39it/s]

Iteration: 79


  0%|                                                                                      | 80/100000 [00:27<8:12:04,  3.38it/s]

Iteration: 80


  0%|                                                                                      | 81/100000 [00:27<8:05:54,  3.43it/s]

Iteration: 81


  0%|                                                                                      | 82/100000 [00:28<8:07:55,  3.41it/s]

Iteration: 82


  0%|                                                                                      | 83/100000 [00:28<8:09:02,  3.41it/s]

Iteration: 83


  0%|                                                                                      | 84/100000 [00:28<8:11:31,  3.39it/s]

Iteration: 84


  0%|                                                                                      | 85/100000 [00:29<8:06:46,  3.42it/s]

Iteration: 85


  0%|                                                                                      | 86/100000 [00:29<8:08:26,  3.41it/s]

Iteration: 86


  0%|                                                                                      | 87/100000 [00:29<8:15:03,  3.36it/s]

Iteration: 87


  0%|                                                                                      | 88/100000 [00:30<8:08:48,  3.41it/s]

Iteration: 88


  0%|                                                                                      | 89/100000 [00:30<8:15:37,  3.36it/s]

Iteration: 89


  0%|                                                                                      | 90/100000 [00:30<8:15:12,  3.36it/s]

Iteration: 90


  0%|                                                                                      | 91/100000 [00:30<8:09:49,  3.40it/s]

Iteration: 91


  0%|                                                                                      | 92/100000 [00:31<8:09:04,  3.40it/s]

Iteration: 92


  0%|                                                                                      | 93/100000 [00:31<8:09:38,  3.40it/s]

Iteration: 93


  0%|                                                                                      | 94/100000 [00:31<8:02:09,  3.45it/s]

Iteration: 94


  0%|                                                                                      | 95/100000 [00:32<8:06:39,  3.42it/s]

Iteration: 95


  0%|                                                                                      | 96/100000 [00:32<8:07:48,  3.41it/s]

Iteration: 96


  0%|                                                                                      | 97/100000 [00:32<8:09:16,  3.40it/s]

Iteration: 97


  0%|                                                                                      | 98/100000 [00:32<8:03:54,  3.44it/s]

Iteration: 98


  0%|                                                                                      | 99/100000 [00:33<8:07:01,  3.42it/s]

Iteration: 99


  0%|                                                                                     | 100/100000 [00:33<8:07:48,  3.41it/s]

Iteration: 100

100: train loss: 5.318749904632568 / val loss: 5.371874809265137
[CHECKPOINT]: Saving with loss:  5.371874809265137


  0%|                                                                                    | 101/100000 [00:35<25:59:58,  1.07it/s]

Iteration: 101


  0%|                                                                                    | 102/100000 [00:36<20:32:02,  1.35it/s]

Iteration: 102


  0%|                                                                                    | 103/100000 [00:36<16:43:55,  1.66it/s]

Iteration: 103


  0%|                                                                                    | 104/100000 [00:36<14:24:46,  1.93it/s]

Iteration: 104


  0%|                                                                                    | 105/100000 [00:37<12:33:40,  2.21it/s]

Iteration: 105


  0%|                                                                                    | 106/100000 [00:37<11:07:41,  2.49it/s]

Iteration: 106


  0%|                                                                                    | 107/100000 [00:37<10:16:54,  2.70it/s]

Iteration: 107


  0%|                                                                                     | 108/100000 [00:38<9:39:09,  2.87it/s]

Iteration: 108


  0%|                                                                                     | 109/100000 [00:38<9:06:06,  3.05it/s]

Iteration: 109


  0%|                                                                                     | 110/100000 [00:38<8:50:23,  3.14it/s]

Iteration: 110


  0%|                                                                                     | 111/100000 [00:38<8:40:15,  3.20it/s]

Iteration: 111


  0%|                                                                                     | 112/100000 [00:39<8:24:07,  3.30it/s]

Iteration: 112


  0%|                                                                                     | 113/100000 [00:39<8:19:52,  3.33it/s]

Iteration: 113


  0%|                                                                                     | 114/100000 [00:39<8:19:04,  3.34it/s]

Iteration: 114


  0%|                                                                                     | 115/100000 [00:40<8:09:46,  3.40it/s]

Iteration: 115


  0%|                                                                                     | 116/100000 [00:40<8:10:01,  3.40it/s]

Iteration: 116


  0%|                                                                                     | 117/100000 [00:40<8:10:15,  3.40it/s]

Iteration: 117


  0%|                                                                                     | 118/100000 [00:40<8:05:05,  3.43it/s]

Iteration: 118


  0%|                                                                                     | 119/100000 [00:41<8:05:06,  3.43it/s]

Iteration: 119


  0%|                                                                                     | 120/100000 [00:41<8:02:19,  3.45it/s]

Iteration: 120


  0%|                                                                                     | 121/100000 [00:41<8:07:47,  3.41it/s]

Iteration: 121


  0%|                                                                                     | 122/100000 [00:42<8:05:53,  3.43it/s]

Iteration: 122


  0%|                                                                                     | 123/100000 [00:42<8:06:22,  3.42it/s]

Iteration: 123


  0%|                                                                                     | 124/100000 [00:42<8:02:10,  3.45it/s]

Iteration: 124


  0%|                                                                                     | 125/100000 [00:42<8:06:24,  3.42it/s]

Iteration: 125


  0%|                                                                                     | 126/100000 [00:43<8:08:01,  3.41it/s]

Iteration: 126


  0%|                                                                                     | 127/100000 [00:43<8:10:17,  3.40it/s]

Iteration: 127


  0%|                                                                                     | 128/100000 [00:43<8:05:54,  3.43it/s]

Iteration: 128


  0%|                                                                                     | 129/100000 [00:44<8:08:01,  3.41it/s]

Iteration: 129


  0%|                                                                                     | 130/100000 [00:44<8:11:46,  3.38it/s]

Iteration: 130


  0%|                                                                                     | 131/100000 [00:44<8:12:42,  3.38it/s]

Iteration: 131


  0%|                                                                                     | 132/100000 [00:45<8:13:08,  3.38it/s]

Iteration: 132


  0%|                                                                                     | 133/100000 [00:45<8:06:14,  3.42it/s]

Iteration: 133


  0%|                                                                                     | 134/100000 [00:45<8:08:03,  3.41it/s]

Iteration: 134


  0%|                                                                                     | 135/100000 [00:45<8:08:39,  3.41it/s]

Iteration: 135


  0%|                                                                                     | 136/100000 [00:46<8:12:59,  3.38it/s]

Iteration: 136


  0%|                                                                                     | 137/100000 [00:46<8:10:29,  3.39it/s]

Iteration: 137


  0%|                                                                                     | 138/100000 [00:46<8:10:54,  3.39it/s]

Iteration: 138


  0%|                                                                                     | 139/100000 [00:47<8:05:30,  3.43it/s]

Iteration: 139


  0%|                                                                                     | 140/100000 [00:47<8:07:40,  3.41it/s]

Iteration: 140


  0%|                                                                                     | 141/100000 [00:47<8:10:55,  3.39it/s]

Iteration: 141


  0%|                                                                                     | 142/100000 [00:47<8:09:06,  3.40it/s]

Iteration: 142


  0%|                                                                                     | 143/100000 [00:48<8:05:18,  3.43it/s]

Iteration: 143


  0%|                                                                                     | 144/100000 [00:48<8:06:04,  3.42it/s]

Iteration: 144


  0%|                                                                                     | 145/100000 [00:48<8:07:26,  3.41it/s]

Iteration: 145


  0%|                                                                                     | 146/100000 [00:49<8:03:45,  3.44it/s]

Iteration: 146


  0%|                                                                                     | 147/100000 [00:49<8:04:53,  3.43it/s]

Iteration: 147


  0%|▏                                                                                    | 148/100000 [00:49<8:09:34,  3.40it/s]

Iteration: 148


  0%|▏                                                                                    | 149/100000 [00:50<8:09:14,  3.40it/s]

Iteration: 149


  0%|▏                                                                                    | 150/100000 [00:50<8:04:17,  3.44it/s]

Iteration: 150

150: train loss: 5.121874809265137 / val loss: 5.190625190734863
[CHECKPOINT]: Saving with loss:  5.190625190734863


  0%|▏                                                                                   | 151/100000 [00:52<26:27:33,  1.05it/s]

Iteration: 151


  0%|▏                                                                                   | 152/100000 [00:53<21:00:53,  1.32it/s]

Iteration: 152


  0%|▏                                                                                   | 153/100000 [00:53<17:03:04,  1.63it/s]

Iteration: 153


  0%|▏                                                                                   | 154/100000 [00:53<14:36:48,  1.90it/s]

Iteration: 154


  0%|▏                                                                                   | 155/100000 [00:54<12:35:35,  2.20it/s]

Iteration: 155


  0%|▏                                                                                   | 156/100000 [00:54<11:17:00,  2.46it/s]

Iteration: 156


  0%|▏                                                                                   | 157/100000 [00:54<10:12:32,  2.72it/s]

Iteration: 157


  0%|▏                                                                                    | 158/100000 [00:54<9:38:43,  2.88it/s]

Iteration: 158


  0%|▏                                                                                    | 159/100000 [00:55<9:13:24,  3.01it/s]

Iteration: 159


  0%|▏                                                                                    | 160/100000 [00:55<8:48:24,  3.15it/s]

Iteration: 160


  0%|▏                                                                                    | 161/100000 [00:55<8:45:04,  3.17it/s]

Iteration: 161


  0%|▏                                                                                    | 162/100000 [00:56<8:27:09,  3.28it/s]

Iteration: 162


  0%|▏                                                                                    | 163/100000 [00:56<8:24:31,  3.30it/s]

Iteration: 163


  0%|▏                                                                                    | 164/100000 [00:56<8:21:11,  3.32it/s]

Iteration: 164


  0%|▏                                                                                    | 165/100000 [00:56<8:17:57,  3.34it/s]

Iteration: 165


  0%|▏                                                                                    | 166/100000 [00:57<8:17:21,  3.35it/s]

Iteration: 166


  0%|▏                                                                                    | 167/100000 [00:57<8:14:35,  3.36it/s]

Iteration: 167


  0%|▏                                                                                    | 168/100000 [00:57<8:13:05,  3.37it/s]

Iteration: 168


  0%|▏                                                                                    | 169/100000 [00:58<8:07:17,  3.41it/s]

Iteration: 169


  0%|▏                                                                                    | 170/100000 [00:58<8:08:04,  3.41it/s]

Iteration: 170


  0%|▏                                                                                    | 171/100000 [00:58<8:10:30,  3.39it/s]

Iteration: 171


  0%|▏                                                                                    | 172/100000 [00:59<8:12:55,  3.38it/s]

Iteration: 172


  0%|▏                                                                                    | 173/100000 [00:59<8:05:57,  3.42it/s]

Iteration: 173


  0%|▏                                                                                    | 174/100000 [00:59<8:10:47,  3.39it/s]

Iteration: 174


  0%|▏                                                                                    | 175/100000 [00:59<8:13:59,  3.37it/s]

Iteration: 175


  0%|▏                                                                                    | 176/100000 [01:00<8:12:13,  3.38it/s]

Iteration: 176


  0%|▏                                                                                    | 177/100000 [01:00<8:07:28,  3.41it/s]

Iteration: 177


  0%|▏                                                                                    | 178/100000 [01:00<8:14:19,  3.37it/s]

Iteration: 178


  0%|▏                                                                                    | 179/100000 [01:01<8:08:21,  3.41it/s]

Iteration: 179


  0%|▏                                                                                    | 180/100000 [01:01<8:09:56,  3.40it/s]

Iteration: 180


  0%|▏                                                                                    | 181/100000 [01:01<8:12:19,  3.38it/s]

Iteration: 181


  0%|▏                                                                                    | 182/100000 [01:01<8:05:34,  3.43it/s]

Iteration: 182


  0%|▏                                                                                    | 183/100000 [01:02<8:10:28,  3.39it/s]

Iteration: 183


  0%|▏                                                                                    | 184/100000 [01:02<8:09:28,  3.40it/s]

Iteration: 184


  0%|▏                                                                                    | 185/100000 [01:02<8:12:45,  3.38it/s]

Iteration: 185


  0%|▏                                                                                    | 186/100000 [01:03<8:11:06,  3.39it/s]

Iteration: 186


  0%|▏                                                                                    | 187/100000 [01:03<8:11:32,  3.38it/s]

Iteration: 187


  0%|▏                                                                                    | 188/100000 [01:03<8:11:54,  3.38it/s]

Iteration: 188


  0%|▏                                                                                    | 189/100000 [01:04<8:05:55,  3.42it/s]

Iteration: 189


  0%|▏                                                                                    | 190/100000 [01:04<8:08:04,  3.41it/s]

Iteration: 190


  0%|▏                                                                                    | 191/100000 [01:04<8:09:58,  3.40it/s]

Iteration: 191


  0%|▏                                                                                    | 192/100000 [01:04<8:03:59,  3.44it/s]

Iteration: 192


  0%|▏                                                                                    | 193/100000 [01:05<8:07:34,  3.41it/s]

Iteration: 193


  0%|▏                                                                                    | 194/100000 [01:05<8:10:37,  3.39it/s]

Iteration: 194


  0%|▏                                                                                    | 195/100000 [01:05<8:07:20,  3.41it/s]

Iteration: 195


  0%|▏                                                                                    | 196/100000 [01:06<8:14:06,  3.37it/s]

Iteration: 196


  0%|▏                                                                                    | 197/100000 [01:06<8:21:03,  3.32it/s]

Iteration: 197


  0%|▏                                                                                    | 198/100000 [01:06<8:29:10,  3.27it/s]

Iteration: 198


  0%|▏                                                                                    | 199/100000 [01:07<8:26:59,  3.28it/s]

Iteration: 199


  0%|▏                                                                                    | 200/100000 [01:07<8:32:15,  3.25it/s]

Iteration: 200

200: train loss: 4.96875 / val loss: 4.965624809265137
[CHECKPOINT]: Saving with loss:  4.965624809265137


  0%|▏                                                                                   | 201/100000 [01:09<26:32:33,  1.04it/s]

Iteration: 201


  0%|▏                                                                                   | 202/100000 [01:10<21:01:31,  1.32it/s]

Iteration: 202


  0%|▏                                                                                   | 203/100000 [01:10<17:13:17,  1.61it/s]

Iteration: 203


  0%|▏                                                                                   | 204/100000 [01:10<14:53:31,  1.86it/s]

Iteration: 204


  0%|▏                                                                                   | 205/100000 [01:11<12:53:22,  2.15it/s]

Iteration: 205


  0%|▏                                                                                   | 206/100000 [01:11<11:29:27,  2.41it/s]

Iteration: 206


  0%|▏                                                                                   | 207/100000 [01:11<10:31:37,  2.63it/s]

Iteration: 207


  0%|▏                                                                                    | 208/100000 [01:11<9:57:31,  2.78it/s]

Iteration: 208


  0%|▏                                                                                    | 209/100000 [01:12<9:24:25,  2.95it/s]

Iteration: 209


  0%|▏                                                                                    | 210/100000 [01:12<9:10:18,  3.02it/s]

Iteration: 210


  0%|▏                                                                                    | 211/100000 [01:12<8:53:31,  3.12it/s]

Iteration: 211


  0%|▏                                                                                    | 212/100000 [01:13<8:46:20,  3.16it/s]

Iteration: 212


  0%|▏                                                                                    | 213/100000 [01:13<8:37:09,  3.22it/s]

Iteration: 213


  0%|▏                                                                                    | 214/100000 [01:13<8:38:40,  3.21it/s]

Iteration: 214


  0%|▏                                                                                    | 215/100000 [01:14<8:38:19,  3.21it/s]

Iteration: 215


  0%|▏                                                                                    | 216/100000 [01:14<8:40:32,  3.19it/s]

Iteration: 216


  0%|▏                                                                                    | 217/100000 [01:14<8:45:57,  3.16it/s]

Iteration: 217


  0%|▏                                                                                    | 218/100000 [01:15<8:36:42,  3.22it/s]

Iteration: 218


  0%|▏                                                                                    | 219/100000 [01:15<8:36:46,  3.22it/s]

Iteration: 219


  0%|▏                                                                                    | 220/100000 [01:15<8:38:56,  3.20it/s]

Iteration: 220


  0%|▏                                                                                    | 221/100000 [01:15<8:36:49,  3.22it/s]

Iteration: 221


  0%|▏                                                                                    | 222/100000 [01:16<8:29:54,  3.26it/s]

Iteration: 222


  0%|▏                                                                                    | 223/100000 [01:16<8:32:18,  3.25it/s]

Iteration: 223


  0%|▏                                                                                    | 224/100000 [01:16<8:33:56,  3.24it/s]

Iteration: 224


  0%|▏                                                                                    | 225/100000 [01:17<8:28:33,  3.27it/s]

Iteration: 225


  0%|▏                                                                                    | 226/100000 [01:17<8:31:44,  3.25it/s]

Iteration: 226


  0%|▏                                                                                    | 227/100000 [01:17<8:28:14,  3.27it/s]

Iteration: 227


  0%|▏                                                                                    | 228/100000 [01:18<8:29:45,  3.26it/s]

Iteration: 228


  0%|▏                                                                                    | 229/100000 [01:18<8:30:49,  3.26it/s]

Iteration: 229


  0%|▏                                                                                    | 230/100000 [01:18<8:28:06,  3.27it/s]

Iteration: 230


  0%|▏                                                                                    | 231/100000 [01:19<8:37:48,  3.21it/s]

Iteration: 231


  0%|▏                                                                                    | 232/100000 [01:19<8:37:05,  3.22it/s]

Iteration: 232


  0%|▏                                                                                    | 233/100000 [01:19<8:41:30,  3.19it/s]

Iteration: 233


  0%|▏                                                                                    | 234/100000 [01:19<8:41:29,  3.19it/s]

Iteration: 234


  0%|▏                                                                                    | 235/100000 [01:20<8:39:09,  3.20it/s]

Iteration: 235


  0%|▏                                                                                    | 236/100000 [01:20<8:36:18,  3.22it/s]

Iteration: 236


  0%|▏                                                                                    | 237/100000 [01:20<8:31:05,  3.25it/s]

Iteration: 237


  0%|▏                                                                                    | 238/100000 [01:21<8:33:56,  3.24it/s]

Iteration: 238


  0%|▏                                                                                    | 239/100000 [01:21<8:35:13,  3.23it/s]

Iteration: 239


  0%|▏                                                                                    | 240/100000 [01:21<8:29:53,  3.26it/s]

Iteration: 240


  0%|▏                                                                                    | 241/100000 [01:22<8:30:34,  3.26it/s]

Iteration: 241


  0%|▏                                                                                    | 242/100000 [01:22<8:31:55,  3.25it/s]

Iteration: 242


  0%|▏                                                                                    | 243/100000 [01:22<8:32:37,  3.24it/s]

Iteration: 243


  0%|▏                                                                                    | 244/100000 [01:23<8:35:42,  3.22it/s]

Iteration: 244


  0%|▏                                                                                    | 245/100000 [01:23<8:29:20,  3.26it/s]

Iteration: 245


  0%|▏                                                                                    | 246/100000 [01:23<8:31:03,  3.25it/s]

Iteration: 246


  0%|▏                                                                                    | 247/100000 [01:23<8:33:55,  3.23it/s]

Iteration: 247


  0%|▏                                                                                    | 248/100000 [01:24<8:35:32,  3.22it/s]

Iteration: 248


  0%|▏                                                                                    | 249/100000 [01:24<8:35:14,  3.23it/s]

Iteration: 249


  0%|▏                                                                                    | 250/100000 [01:24<8:29:48,  3.26it/s]

Iteration: 250

250: train loss: 4.868750095367432 / val loss: 4.893750190734863
[CHECKPOINT]: Saving with loss:  4.893750190734863


  0%|▏                                                                                   | 251/100000 [01:27<27:46:49,  1.00s/it]

Iteration: 251


  0%|▏                                                                                   | 252/100000 [01:27<22:02:45,  1.26it/s]

Iteration: 252


  0%|▏                                                                                   | 253/100000 [01:28<17:52:23,  1.55it/s]

Iteration: 253


  0%|▏                                                                                   | 254/100000 [01:28<15:14:36,  1.82it/s]

Iteration: 254


  0%|▏                                                                                   | 255/100000 [01:28<13:09:09,  2.11it/s]

Iteration: 255


  0%|▏                                                                                   | 256/100000 [01:29<11:50:11,  2.34it/s]

Iteration: 256


  0%|▏                                                                                   | 257/100000 [01:29<11:00:28,  2.52it/s]

Iteration: 257


  0%|▏                                                                                   | 258/100000 [01:29<10:21:36,  2.67it/s]

Iteration: 258


  0%|▏                                                                                    | 259/100000 [01:30<9:55:43,  2.79it/s]

Iteration: 259


  0%|▏                                                                                    | 260/100000 [01:30<9:41:25,  2.86it/s]

Iteration: 260


  0%|▏                                                                                    | 261/100000 [01:30<9:26:32,  2.93it/s]

Iteration: 261


  0%|▏                                                                                    | 262/100000 [01:31<9:20:20,  2.97it/s]

Iteration: 262


  0%|▏                                                                                    | 263/100000 [01:31<9:09:43,  3.02it/s]

Iteration: 263


  0%|▏                                                                                    | 264/100000 [01:31<9:04:16,  3.05it/s]

Iteration: 264


  0%|▏                                                                                    | 265/100000 [01:31<8:57:53,  3.09it/s]

Iteration: 265


  0%|▏                                                                                    | 266/100000 [01:32<8:47:10,  3.15it/s]

Iteration: 266


  0%|▏                                                                                    | 267/100000 [01:32<8:44:53,  3.17it/s]

Iteration: 267


  0%|▏                                                                                    | 268/100000 [01:32<8:47:32,  3.15it/s]

Iteration: 268


  0%|▏                                                                                    | 269/100000 [01:33<8:55:06,  3.11it/s]

Iteration: 269


  0%|▏                                                                                    | 270/100000 [01:33<8:49:59,  3.14it/s]

Iteration: 270


  0%|▏                                                                                    | 271/100000 [01:33<8:47:47,  3.15it/s]

Iteration: 271


  0%|▏                                                                                    | 272/100000 [01:34<8:42:17,  3.18it/s]

Iteration: 272


  0%|▏                                                                                    | 273/100000 [01:34<8:36:10,  3.22it/s]

Iteration: 273


  0%|▏                                                                                    | 274/100000 [01:34<8:27:15,  3.28it/s]

Iteration: 274


  0%|▏                                                                                    | 275/100000 [01:35<8:23:21,  3.30it/s]

Iteration: 275


  0%|▏                                                                                    | 276/100000 [01:35<8:26:04,  3.28it/s]

Iteration: 276


  0%|▏                                                                                    | 277/100000 [01:35<8:21:44,  3.31it/s]

Iteration: 277


  0%|▏                                                                                    | 278/100000 [01:35<8:18:28,  3.33it/s]

Iteration: 278


  0%|▏                                                                                    | 279/100000 [01:36<8:18:28,  3.33it/s]

Iteration: 279


  0%|▏                                                                                    | 280/100000 [01:36<8:23:33,  3.30it/s]

Iteration: 280


  0%|▏                                                                                    | 281/100000 [01:36<8:35:15,  3.23it/s]

Iteration: 281


  0%|▏                                                                                    | 282/100000 [01:37<8:28:37,  3.27it/s]

Iteration: 282


  0%|▏                                                                                    | 283/100000 [01:37<8:19:28,  3.33it/s]

Iteration: 283


  0%|▏                                                                                    | 284/100000 [01:37<8:24:08,  3.30it/s]

Iteration: 284


  0%|▏                                                                                    | 285/100000 [01:38<8:19:26,  3.33it/s]

Iteration: 285


  0%|▏                                                                                    | 286/100000 [01:38<8:18:18,  3.34it/s]

Iteration: 286


  0%|▏                                                                                    | 287/100000 [01:38<8:17:51,  3.34it/s]

Iteration: 287


  0%|▏                                                                                    | 288/100000 [01:38<8:16:03,  3.35it/s]

Iteration: 288


  0%|▏                                                                                    | 289/100000 [01:39<8:15:00,  3.36it/s]

Iteration: 289


  0%|▏                                                                                    | 290/100000 [01:39<8:23:45,  3.30it/s]

Iteration: 290


  0%|▏                                                                                    | 291/100000 [01:39<8:25:50,  3.29it/s]

Iteration: 291


  0%|▏                                                                                    | 292/100000 [01:40<8:29:54,  3.26it/s]

Iteration: 292


  0%|▏                                                                                    | 293/100000 [01:40<8:39:58,  3.20it/s]

Iteration: 293


  0%|▏                                                                                    | 294/100000 [01:40<8:31:08,  3.25it/s]

Iteration: 294


  0%|▎                                                                                    | 295/100000 [01:41<8:35:53,  3.22it/s]

Iteration: 295


  0%|▎                                                                                    | 296/100000 [01:41<8:33:54,  3.23it/s]

Iteration: 296


  0%|▎                                                                                    | 297/100000 [01:41<8:36:43,  3.22it/s]

Iteration: 297


  0%|▎                                                                                    | 298/100000 [01:42<8:30:33,  3.25it/s]

Iteration: 298


  0%|▎                                                                                    | 299/100000 [01:42<8:25:08,  3.29it/s]

Iteration: 299


  0%|▎                                                                                    | 300/100000 [01:42<8:23:04,  3.30it/s]

Iteration: 300

300: train loss: 4.737500190734863 / val loss: 4.809374809265137
[CHECKPOINT]: Saving with loss:  4.809374809265137


  0%|▎                                                                                   | 301/100000 [01:45<27:28:55,  1.01it/s]

Iteration: 301


  0%|▎                                                                                   | 302/100000 [01:45<21:41:49,  1.28it/s]

Iteration: 302


  0%|▎                                                                                   | 303/100000 [01:45<17:37:56,  1.57it/s]

Iteration: 303


  0%|▎                                                                                   | 304/100000 [01:46<15:09:27,  1.83it/s]

Iteration: 304


  0%|▎                                                                                   | 305/100000 [01:46<13:15:26,  2.09it/s]

Iteration: 305


  0%|▎                                                                                   | 306/100000 [01:46<11:44:49,  2.36it/s]

Iteration: 306


  0%|▎                                                                                   | 307/100000 [01:47<10:59:21,  2.52it/s]

Iteration: 307


  0%|▎                                                                                   | 308/100000 [01:47<10:13:49,  2.71it/s]

Iteration: 308


  0%|▎                                                                                    | 309/100000 [01:47<9:46:15,  2.83it/s]

Iteration: 309


  0%|▎                                                                                    | 310/100000 [01:48<9:27:01,  2.93it/s]

Iteration: 310


  0%|▎                                                                                    | 311/100000 [01:48<9:09:02,  3.03it/s]

Iteration: 311


  0%|▎                                                                                    | 312/100000 [01:48<8:46:19,  3.16it/s]

Iteration: 312


  0%|▎                                                                                    | 313/100000 [01:48<8:38:25,  3.20it/s]

Iteration: 313


  0%|▎                                                                                    | 314/100000 [01:49<8:29:22,  3.26it/s]

Iteration: 314


  0%|▎                                                                                    | 315/100000 [01:49<8:30:44,  3.25it/s]

Iteration: 315


  0%|▎                                                                                    | 316/100000 [01:49<8:26:37,  3.28it/s]

Iteration: 316


  0%|▎                                                                                    | 317/100000 [01:50<8:32:49,  3.24it/s]

Iteration: 317


  0%|▎                                                                                    | 318/100000 [01:50<8:34:58,  3.23it/s]

Iteration: 318


  0%|▎                                                                                    | 319/100000 [01:50<8:41:59,  3.18it/s]

Iteration: 319


  0%|▎                                                                                    | 320/100000 [01:51<8:39:48,  3.20it/s]

Iteration: 320


  0%|▎                                                                                    | 321/100000 [01:51<8:32:26,  3.24it/s]

Iteration: 321


  0%|▎                                                                                    | 322/100000 [01:51<8:26:21,  3.28it/s]

Iteration: 322


  0%|▎                                                                                    | 323/100000 [01:52<8:38:09,  3.21it/s]

Iteration: 323


  0%|▎                                                                                    | 324/100000 [01:52<8:32:13,  3.24it/s]

Iteration: 324


  0%|▎                                                                                    | 325/100000 [01:52<8:43:16,  3.17it/s]

Iteration: 325


  0%|▎                                                                                    | 326/100000 [01:52<8:38:09,  3.21it/s]

Iteration: 326


  0%|▎                                                                                    | 327/100000 [01:53<8:42:38,  3.18it/s]

Iteration: 327


  0%|▎                                                                                    | 328/100000 [01:53<8:45:18,  3.16it/s]

Iteration: 328


  0%|▎                                                                                    | 329/100000 [01:53<8:37:19,  3.21it/s]

Iteration: 329


  0%|▎                                                                                    | 330/100000 [01:54<8:36:55,  3.21it/s]

Iteration: 330


  0%|▎                                                                                    | 331/100000 [01:54<8:40:27,  3.19it/s]

Iteration: 331


  0%|▎                                                                                    | 332/100000 [01:54<8:28:21,  3.27it/s]

Iteration: 332


  0%|▎                                                                                    | 333/100000 [01:55<8:23:57,  3.30it/s]

Iteration: 333


  0%|▎                                                                                    | 334/100000 [01:55<8:21:09,  3.31it/s]

Iteration: 334


  0%|▎                                                                                    | 335/100000 [01:55<8:18:23,  3.33it/s]

Iteration: 335


  0%|▎                                                                                    | 336/100000 [01:56<8:15:01,  3.36it/s]

Iteration: 336


  0%|▎                                                                                    | 337/100000 [01:56<8:16:44,  3.34it/s]

Iteration: 337


  0%|▎                                                                                    | 338/100000 [01:56<8:14:27,  3.36it/s]

Iteration: 338


  0%|▎                                                                                    | 339/100000 [01:56<8:14:23,  3.36it/s]

Iteration: 339


  0%|▎                                                                                    | 340/100000 [01:57<8:14:09,  3.36it/s]

Iteration: 340


  0%|▎                                                                                    | 341/100000 [01:57<8:14:27,  3.36it/s]

Iteration: 341


  0%|▎                                                                                    | 342/100000 [01:57<8:14:09,  3.36it/s]

Iteration: 342


  0%|▎                                                                                    | 343/100000 [01:58<8:13:40,  3.36it/s]

Iteration: 343


  0%|▎                                                                                    | 344/100000 [01:58<8:27:55,  3.27it/s]

Iteration: 344


  0%|▎                                                                                    | 345/100000 [01:58<8:29:18,  3.26it/s]

Iteration: 345


  0%|▎                                                                                    | 346/100000 [01:59<8:33:11,  3.24it/s]

Iteration: 346


  0%|▎                                                                                    | 347/100000 [01:59<8:35:40,  3.22it/s]

Iteration: 347


  0%|▎                                                                                    | 348/100000 [01:59<8:34:51,  3.23it/s]

Iteration: 348


  0%|▎                                                                                    | 349/100000 [01:59<8:24:42,  3.29it/s]

Iteration: 349


  0%|▎                                                                                    | 350/100000 [02:00<8:33:34,  3.23it/s]

Iteration: 350

350: train loss: 4.693749904632568 / val loss: 4.731249809265137
[CHECKPOINT]: Saving with loss:  4.731249809265137


  0%|▎                                                                                   | 351/100000 [02:02<27:28:20,  1.01it/s]

Iteration: 351


  0%|▎                                                                                   | 352/100000 [02:03<21:48:55,  1.27it/s]

Iteration: 352


  0%|▎                                                                                   | 353/100000 [02:03<17:43:52,  1.56it/s]

Iteration: 353


  0%|▎                                                                                   | 354/100000 [02:03<15:07:54,  1.83it/s]

Iteration: 354


  0%|▎                                                                                   | 355/100000 [02:04<13:03:57,  2.12it/s]

Iteration: 355


  0%|▎                                                                                   | 356/100000 [02:04<11:35:49,  2.39it/s]

Iteration: 356


  0%|▎                                                                                   | 357/100000 [02:04<10:41:53,  2.59it/s]

Iteration: 357


  0%|▎                                                                                   | 358/100000 [02:05<10:03:34,  2.75it/s]

Iteration: 358


  0%|▎                                                                                    | 359/100000 [02:05<9:30:54,  2.91it/s]

Iteration: 359


  0%|▎                                                                                    | 360/100000 [02:05<9:07:38,  3.03it/s]

Iteration: 360


  0%|▎                                                                                    | 361/100000 [02:05<9:00:12,  3.07it/s]

Iteration: 361


  0%|▎                                                                                    | 362/100000 [02:06<8:46:07,  3.16it/s]

Iteration: 362


  0%|▎                                                                                    | 363/100000 [02:06<8:37:49,  3.21it/s]

Iteration: 363


  0%|▎                                                                                    | 364/100000 [02:06<8:44:59,  3.16it/s]

Iteration: 364


  0%|▎                                                                                    | 365/100000 [02:07<8:50:53,  3.13it/s]

Iteration: 365


  0%|▎                                                                                    | 366/100000 [02:07<8:38:54,  3.20it/s]

Iteration: 366


  0%|▎                                                                                    | 367/100000 [02:07<8:52:31,  3.12it/s]

Iteration: 367


  0%|▎                                                                                    | 368/100000 [02:08<8:57:41,  3.09it/s]

Iteration: 368


  0%|▎                                                                                    | 369/100000 [02:08<8:43:04,  3.17it/s]

Iteration: 369


  0%|▎                                                                                    | 370/100000 [02:08<8:36:41,  3.21it/s]

Iteration: 370


  0%|▎                                                                                    | 371/100000 [02:09<8:27:35,  3.27it/s]

Iteration: 371


  0%|▎                                                                                    | 372/100000 [02:09<8:37:14,  3.21it/s]

Iteration: 372


  0%|▎                                                                                    | 373/100000 [02:09<8:34:57,  3.22it/s]

Iteration: 373


  0%|▎                                                                                    | 374/100000 [02:09<8:37:24,  3.21it/s]

Iteration: 374


  0%|▎                                                                                    | 375/100000 [02:10<8:30:35,  3.25it/s]

Iteration: 375


  0%|▎                                                                                    | 376/100000 [02:10<8:25:45,  3.28it/s]

Iteration: 376


  0%|▎                                                                                    | 377/100000 [02:10<8:31:04,  3.25it/s]

Iteration: 377


  0%|▎                                                                                    | 378/100000 [02:11<8:41:44,  3.18it/s]

Iteration: 378


  0%|▎                                                                                    | 379/100000 [02:11<8:39:21,  3.20it/s]

Iteration: 379


  0%|▎                                                                                    | 380/100000 [02:11<8:40:30,  3.19it/s]

Iteration: 380


  0%|▎                                                                                    | 381/100000 [02:12<8:40:28,  3.19it/s]

Iteration: 381


  0%|▎                                                                                    | 382/100000 [02:12<8:30:29,  3.25it/s]

Iteration: 382


  0%|▎                                                                                    | 383/100000 [02:12<8:35:31,  3.22it/s]

Iteration: 383


  0%|▎                                                                                    | 384/100000 [02:13<8:36:46,  3.21it/s]

Iteration: 384


  0%|▎                                                                                    | 385/100000 [02:13<8:39:11,  3.20it/s]

Iteration: 385


  0%|▎                                                                                    | 386/100000 [02:13<8:38:53,  3.20it/s]

Iteration: 386


  0%|▎                                                                                    | 387/100000 [02:14<8:34:27,  3.23it/s]

Iteration: 387


  0%|▎                                                                                    | 388/100000 [02:14<8:28:21,  3.27it/s]

Iteration: 388


  0%|▎                                                                                    | 389/100000 [02:14<8:31:31,  3.25it/s]

Iteration: 389


  0%|▎                                                                                    | 390/100000 [02:14<8:36:23,  3.21it/s]

Iteration: 390


  0%|▎                                                                                    | 391/100000 [02:15<8:37:40,  3.21it/s]

Iteration: 391


  0%|▎                                                                                    | 392/100000 [02:15<8:29:22,  3.26it/s]

Iteration: 392


  0%|▎                                                                                    | 393/100000 [02:15<8:30:45,  3.25it/s]

Iteration: 393


  0%|▎                                                                                    | 394/100000 [02:16<8:24:14,  3.29it/s]

Iteration: 394


  0%|▎                                                                                    | 395/100000 [02:16<8:16:05,  3.35it/s]

Iteration: 395


  0%|▎                                                                                    | 396/100000 [02:16<8:22:18,  3.30it/s]

Iteration: 396


  0%|▎                                                                                    | 397/100000 [02:17<8:19:11,  3.33it/s]

Iteration: 397


  0%|▎                                                                                    | 398/100000 [02:17<8:19:04,  3.33it/s]

Iteration: 398


  0%|▎                                                                                    | 399/100000 [02:17<8:17:58,  3.33it/s]

Iteration: 399


  0%|▎                                                                                    | 400/100000 [02:17<8:14:54,  3.35it/s]

Iteration: 400

400: train loss: 4.631249904632568 / val loss: 4.628125190734863
[CHECKPOINT]: Saving with loss:  4.628125190734863


  0%|▎                                                                                   | 401/100000 [02:20<27:16:29,  1.01it/s]

Iteration: 401


  0%|▎                                                                                   | 402/100000 [02:20<21:33:51,  1.28it/s]

Iteration: 402


  0%|▎                                                                                   | 403/100000 [02:21<17:34:10,  1.57it/s]

Iteration: 403


  0%|▎                                                                                   | 404/100000 [02:21<15:03:23,  1.84it/s]

Iteration: 404


  0%|▎                                                                                   | 405/100000 [02:21<13:05:55,  2.11it/s]

Iteration: 405


  0%|▎                                                                                   | 406/100000 [02:22<11:35:01,  2.39it/s]

Iteration: 406


  0%|▎                                                                                   | 407/100000 [02:22<10:35:22,  2.61it/s]

Iteration: 407


  0%|▎                                                                                    | 408/100000 [02:22<9:59:54,  2.77it/s]

Iteration: 408


  0%|▎                                                                                    | 409/100000 [02:23<9:36:16,  2.88it/s]

Iteration: 409


  0%|▎                                                                                    | 410/100000 [02:23<9:17:43,  2.98it/s]

Iteration: 410


  0%|▎                                                                                    | 411/100000 [02:23<9:09:53,  3.02it/s]

Iteration: 411


  0%|▎                                                                                    | 412/100000 [02:23<9:05:03,  3.05it/s]

Iteration: 412


  0%|▎                                                                                    | 413/100000 [02:24<8:49:25,  3.14it/s]

Iteration: 413


  0%|▎                                                                                    | 414/100000 [02:24<8:40:13,  3.19it/s]

Iteration: 414


  0%|▎                                                                                    | 415/100000 [02:24<8:41:56,  3.18it/s]

Iteration: 415


  0%|▎                                                                                    | 416/100000 [02:25<8:32:41,  3.24it/s]

Iteration: 416


  0%|▎                                                                                    | 417/100000 [02:25<8:31:20,  3.25it/s]

Iteration: 417


  0%|▎                                                                                    | 418/100000 [02:25<8:27:28,  3.27it/s]

Iteration: 418


  0%|▎                                                                                    | 419/100000 [02:26<8:32:42,  3.24it/s]

Iteration: 419


  0%|▎                                                                                    | 420/100000 [02:26<8:31:28,  3.24it/s]

Iteration: 420


  0%|▎                                                                                    | 421/100000 [02:26<8:27:03,  3.27it/s]

Iteration: 421


  0%|▎                                                                                    | 422/100000 [02:27<8:37:08,  3.21it/s]

Iteration: 422


  0%|▎                                                                                    | 423/100000 [02:27<8:38:13,  3.20it/s]

Iteration: 423


  0%|▎                                                                                    | 424/100000 [02:27<8:31:54,  3.24it/s]

Iteration: 424


  0%|▎                                                                                    | 425/100000 [02:27<8:39:07,  3.20it/s]

Iteration: 425


  0%|▎                                                                                    | 426/100000 [02:28<8:38:20,  3.20it/s]

Iteration: 426


  0%|▎                                                                                    | 427/100000 [02:28<8:30:06,  3.25it/s]

Iteration: 427


  0%|▎                                                                                    | 428/100000 [02:28<8:32:57,  3.24it/s]

Iteration: 428


  0%|▎                                                                                    | 429/100000 [02:29<8:29:23,  3.26it/s]

Iteration: 429


  0%|▎                                                                                    | 430/100000 [02:29<8:33:18,  3.23it/s]

Iteration: 430


  0%|▎                                                                                    | 431/100000 [02:29<8:25:24,  3.28it/s]

Iteration: 431


  0%|▎                                                                                    | 432/100000 [02:30<8:28:18,  3.26it/s]

Iteration: 432


  0%|▎                                                                                    | 433/100000 [02:30<8:22:47,  3.30it/s]

Iteration: 433


  0%|▎                                                                                    | 434/100000 [02:30<8:20:35,  3.31it/s]

Iteration: 434


  0%|▎                                                                                    | 435/100000 [02:30<8:18:06,  3.33it/s]

Iteration: 435


  0%|▎                                                                                    | 436/100000 [02:31<8:17:53,  3.33it/s]

Iteration: 436


  0%|▎                                                                                    | 437/100000 [02:31<8:24:08,  3.29it/s]

Iteration: 437


  0%|▎                                                                                    | 438/100000 [02:31<8:21:20,  3.31it/s]

Iteration: 438


  0%|▎                                                                                    | 439/100000 [02:32<8:19:44,  3.32it/s]

Iteration: 439


  0%|▎                                                                                    | 440/100000 [02:32<8:21:02,  3.31it/s]

Iteration: 440


  0%|▎                                                                                    | 441/100000 [02:32<8:20:50,  3.31it/s]

Iteration: 441


  0%|▍                                                                                    | 442/100000 [02:33<8:19:25,  3.32it/s]

Iteration: 442


  0%|▍                                                                                    | 443/100000 [02:33<8:18:37,  3.33it/s]

Iteration: 443


  0%|▍                                                                                    | 444/100000 [02:33<8:30:50,  3.25it/s]

Iteration: 444


  0%|▍                                                                                    | 445/100000 [02:34<8:33:35,  3.23it/s]

Iteration: 445


  0%|▍                                                                                    | 446/100000 [02:34<8:34:39,  3.22it/s]

Iteration: 446


  0%|▍                                                                                    | 447/100000 [02:34<8:35:48,  3.22it/s]

Iteration: 447


  0%|▍                                                                                    | 448/100000 [02:34<8:36:07,  3.21it/s]

Iteration: 448


  0%|▍                                                                                    | 449/100000 [02:35<8:45:29,  3.16it/s]

Iteration: 449


  0%|▍                                                                                    | 450/100000 [02:35<8:34:20,  3.23it/s]

Iteration: 450

450: train loss: 4.5625 / val loss: 4.618750095367432
[CHECKPOINT]: Saving with loss:  4.618750095367432


  0%|▍                                                                                   | 451/100000 [02:38<27:51:57,  1.01s/it]

Iteration: 451


  0%|▍                                                                                   | 452/100000 [02:38<21:57:49,  1.26it/s]

Iteration: 452


  0%|▍                                                                                   | 453/100000 [02:38<18:00:27,  1.54it/s]

Iteration: 453


  0%|▍                                                                                   | 454/100000 [02:39<15:12:06,  1.82it/s]

Iteration: 454


  0%|▍                                                                                   | 455/100000 [02:39<13:11:54,  2.10it/s]

Iteration: 455


  0%|▍                                                                                   | 456/100000 [02:39<11:42:55,  2.36it/s]

Iteration: 456


  0%|▍                                                                                   | 457/100000 [02:40<10:47:11,  2.56it/s]

Iteration: 457


  0%|▍                                                                                    | 458/100000 [02:40<9:59:46,  2.77it/s]

Iteration: 458


  0%|▍                                                                                    | 459/100000 [02:40<9:29:53,  2.91it/s]

Iteration: 459


  0%|▍                                                                                    | 460/100000 [02:40<9:08:14,  3.03it/s]

Iteration: 460


  0%|▍                                                                                    | 461/100000 [02:41<9:05:50,  3.04it/s]

Iteration: 461


  0%|▍                                                                                    | 462/100000 [02:41<8:56:42,  3.09it/s]

Iteration: 462


  0%|▍                                                                                    | 463/100000 [02:41<8:57:36,  3.09it/s]

Iteration: 463


  0%|▍                                                                                    | 464/100000 [02:42<8:51:12,  3.12it/s]

Iteration: 464


  0%|▍                                                                                    | 465/100000 [02:42<8:48:35,  3.14it/s]

Iteration: 465


  0%|▍                                                                                    | 466/100000 [02:42<8:51:39,  3.12it/s]

Iteration: 466


  0%|▍                                                                                    | 467/100000 [02:43<8:50:29,  3.13it/s]

Iteration: 467


  0%|▍                                                                                    | 468/100000 [02:43<8:52:33,  3.11it/s]

Iteration: 468


  0%|▍                                                                                    | 469/100000 [02:43<8:55:09,  3.10it/s]

Iteration: 469


  0%|▍                                                                                    | 470/100000 [02:44<8:49:36,  3.13it/s]

Iteration: 470


  0%|▍                                                                                    | 471/100000 [02:44<8:46:02,  3.15it/s]

Iteration: 471


  0%|▍                                                                                    | 472/100000 [02:44<8:44:27,  3.16it/s]

Iteration: 472


  0%|▍                                                                                    | 473/100000 [02:45<8:40:33,  3.19it/s]

Iteration: 473


  0%|▍                                                                                    | 474/100000 [02:45<8:47:32,  3.14it/s]

Iteration: 474


  0%|▍                                                                                    | 475/100000 [02:45<8:44:59,  3.16it/s]

Iteration: 475


  0%|▍                                                                                    | 476/100000 [02:46<8:42:24,  3.18it/s]

Iteration: 476


  0%|▍                                                                                    | 477/100000 [02:46<8:34:13,  3.23it/s]

Iteration: 477


  0%|▍                                                                                    | 478/100000 [02:46<8:33:12,  3.23it/s]

Iteration: 478


  0%|▍                                                                                    | 479/100000 [02:46<8:36:52,  3.21it/s]

Iteration: 479


  0%|▍                                                                                    | 480/100000 [02:47<8:52:05,  3.12it/s]

Iteration: 480


  0%|▍                                                                                    | 481/100000 [02:47<8:55:04,  3.10it/s]

Iteration: 481


  0%|▍                                                                                    | 482/100000 [02:47<8:49:12,  3.13it/s]

Iteration: 482


  0%|▍                                                                                    | 483/100000 [02:48<8:37:44,  3.20it/s]

Iteration: 483


  0%|▍                                                                                    | 484/100000 [02:48<8:36:32,  3.21it/s]

Iteration: 484


  0%|▍                                                                                    | 485/100000 [02:48<8:23:03,  3.30it/s]

Iteration: 485


  0%|▍                                                                                    | 486/100000 [02:49<8:27:15,  3.27it/s]

Iteration: 486


  0%|▍                                                                                    | 487/100000 [02:49<8:22:39,  3.30it/s]

Iteration: 487


  0%|▍                                                                                    | 488/100000 [02:49<8:18:38,  3.33it/s]

Iteration: 488


  0%|▍                                                                                    | 489/100000 [02:50<8:17:43,  3.33it/s]

Iteration: 489


  0%|▍                                                                                    | 490/100000 [02:50<8:15:30,  3.35it/s]

Iteration: 490


  0%|▍                                                                                    | 491/100000 [02:50<8:15:00,  3.35it/s]

Iteration: 491


  0%|▍                                                                                    | 492/100000 [02:50<8:16:47,  3.34it/s]

Iteration: 492


  0%|▍                                                                                    | 493/100000 [02:51<8:22:06,  3.30it/s]

Iteration: 493


  0%|▍                                                                                    | 494/100000 [02:51<8:19:06,  3.32it/s]

Iteration: 494


  0%|▍                                                                                    | 495/100000 [02:51<8:16:57,  3.34it/s]

Iteration: 495


  0%|▍                                                                                    | 496/100000 [02:52<8:15:39,  3.35it/s]

Iteration: 496


  0%|▍                                                                                    | 497/100000 [02:52<8:17:30,  3.33it/s]

Iteration: 497


  0%|▍                                                                                    | 498/100000 [02:52<8:14:44,  3.35it/s]

Iteration: 498


  0%|▍                                                                                    | 499/100000 [02:53<8:16:18,  3.34it/s]

Iteration: 499


  0%|▍                                                                                    | 500/100000 [02:53<8:21:14,  3.31it/s]

Iteration: 500

500: train loss: 4.484375 / val loss: 4.525000095367432
[CHECKPOINT]: Saving with loss:  4.525000095367432


  1%|▍                                                                                   | 501/100000 [02:55<26:44:27,  1.03it/s]

Iteration: 501


  1%|▍                                                                                   | 502/100000 [02:56<21:10:19,  1.31it/s]

Iteration: 502


  1%|▍                                                                                   | 503/100000 [02:56<17:17:36,  1.60it/s]

Iteration: 503


  1%|▍                                                                                   | 504/100000 [02:56<14:51:57,  1.86it/s]

Iteration: 504


  1%|▍                                                                                   | 505/100000 [02:57<12:51:40,  2.15it/s]

Iteration: 505


  1%|▍                                                                                   | 506/100000 [02:57<11:30:46,  2.40it/s]

Iteration: 506


  1%|▍                                                                                   | 507/100000 [02:57<10:37:42,  2.60it/s]

Iteration: 507


  1%|▍                                                                                    | 508/100000 [02:58<9:54:49,  2.79it/s]

Iteration: 508


  1%|▍                                                                                    | 509/100000 [02:58<9:24:17,  2.94it/s]

Iteration: 509


  1%|▍                                                                                    | 510/100000 [02:58<9:02:23,  3.06it/s]

Iteration: 510


  1%|▍                                                                                    | 511/100000 [02:58<8:54:44,  3.10it/s]

Iteration: 511


  1%|▍                                                                                    | 512/100000 [02:59<8:42:20,  3.17it/s]

Iteration: 512


  1%|▍                                                                                    | 513/100000 [02:59<8:32:53,  3.23it/s]

Iteration: 513


  1%|▍                                                                                    | 514/100000 [02:59<8:34:42,  3.22it/s]

Iteration: 514


  1%|▍                                                                                    | 515/100000 [03:00<8:26:40,  3.27it/s]

Iteration: 515


  1%|▍                                                                                    | 516/100000 [03:00<8:22:04,  3.30it/s]

Iteration: 516


  1%|▍                                                                                    | 517/100000 [03:00<8:25:23,  3.28it/s]

Iteration: 517


  1%|▍                                                                                    | 518/100000 [03:01<8:20:55,  3.31it/s]

Iteration: 518


  1%|▍                                                                                    | 519/100000 [03:01<8:19:41,  3.32it/s]

Iteration: 519


  1%|▍                                                                                    | 520/100000 [03:01<8:19:33,  3.32it/s]

Iteration: 520


  1%|▍                                                                                    | 521/100000 [03:01<8:15:19,  3.35it/s]

Iteration: 521


  1%|▍                                                                                    | 522/100000 [03:02<8:14:03,  3.36it/s]

Iteration: 522


  1%|▍                                                                                    | 523/100000 [03:02<8:21:46,  3.30it/s]

Iteration: 523


  1%|▍                                                                                    | 524/100000 [03:02<8:18:49,  3.32it/s]

Iteration: 524


  1%|▍                                                                                    | 525/100000 [03:03<8:18:25,  3.33it/s]

Iteration: 525


  1%|▍                                                                                    | 526/100000 [03:03<8:17:25,  3.33it/s]

Iteration: 526


  1%|▍                                                                                    | 527/100000 [03:03<8:17:09,  3.33it/s]

Iteration: 527


  1%|▍                                                                                    | 528/100000 [03:04<8:14:44,  3.35it/s]

Iteration: 528


  1%|▍                                                                                    | 529/100000 [03:04<8:13:30,  3.36it/s]

Iteration: 529


  1%|▍                                                                                    | 530/100000 [03:04<8:21:15,  3.31it/s]

Iteration: 530


  1%|▍                                                                                    | 531/100000 [03:04<8:18:32,  3.33it/s]

Iteration: 531


  1%|▍                                                                                    | 532/100000 [03:05<8:21:41,  3.30it/s]

Iteration: 532


  1%|▍                                                                                    | 533/100000 [03:05<8:19:12,  3.32it/s]

Iteration: 533


  1%|▍                                                                                    | 534/100000 [03:05<8:17:08,  3.33it/s]

Iteration: 534


  1%|▍                                                                                    | 535/100000 [03:06<8:17:36,  3.33it/s]

Iteration: 535


  1%|▍                                                                                    | 536/100000 [03:06<8:15:21,  3.35it/s]

Iteration: 536


  1%|▍                                                                                    | 537/100000 [03:06<8:21:48,  3.30it/s]

Iteration: 537


  1%|▍                                                                                    | 538/100000 [03:07<8:18:08,  3.33it/s]

Iteration: 538


  1%|▍                                                                                    | 539/100000 [03:07<8:17:18,  3.33it/s]

Iteration: 539


  1%|▍                                                                                    | 540/100000 [03:07<8:14:38,  3.35it/s]

Iteration: 540


  1%|▍                                                                                    | 541/100000 [03:07<8:15:02,  3.35it/s]

Iteration: 541


  1%|▍                                                                                    | 542/100000 [03:08<8:14:20,  3.35it/s]

Iteration: 542


  1%|▍                                                                                    | 543/100000 [03:08<8:20:41,  3.31it/s]

Iteration: 543


  1%|▍                                                                                    | 544/100000 [03:08<8:19:05,  3.32it/s]

Iteration: 544


  1%|▍                                                                                    | 545/100000 [03:09<8:16:32,  3.34it/s]

Iteration: 545


  1%|▍                                                                                    | 546/100000 [03:09<8:14:13,  3.35it/s]

Iteration: 546


  1%|▍                                                                                    | 547/100000 [03:09<8:20:48,  3.31it/s]

Iteration: 547


  1%|▍                                                                                    | 548/100000 [03:10<8:18:17,  3.33it/s]

Iteration: 548


  1%|▍                                                                                    | 549/100000 [03:10<8:15:58,  3.34it/s]

Iteration: 549


  1%|▍                                                                                    | 550/100000 [03:10<8:20:39,  3.31it/s]

Iteration: 550

550: train loss: 4.431250095367432 / val loss: 4.481249809265137
[CHECKPOINT]: Saving with loss:  4.481249809265137


  1%|▍                                                                                   | 551/100000 [03:13<26:48:55,  1.03it/s]

Iteration: 551


  1%|▍                                                                                   | 552/100000 [03:13<21:12:47,  1.30it/s]

Iteration: 552


  1%|▍                                                                                   | 553/100000 [03:13<17:20:03,  1.59it/s]

Iteration: 553


  1%|▍                                                                                   | 554/100000 [03:14<14:52:49,  1.86it/s]

Iteration: 554


  1%|▍                                                                                   | 555/100000 [03:14<12:55:09,  2.14it/s]

Iteration: 555


  1%|▍                                                                                   | 556/100000 [03:14<11:29:25,  2.40it/s]

Iteration: 556


  1%|▍                                                                                   | 557/100000 [03:14<10:37:32,  2.60it/s]

Iteration: 557


  1%|▍                                                                                    | 558/100000 [03:15<9:55:13,  2.78it/s]

Iteration: 558


  1%|▍                                                                                    | 559/100000 [03:15<9:23:19,  2.94it/s]

Iteration: 559


  1%|▍                                                                                    | 560/100000 [03:15<9:10:36,  3.01it/s]

Iteration: 560


  1%|▍                                                                                    | 561/100000 [03:16<8:51:33,  3.12it/s]

Iteration: 561


  1%|▍                                                                                    | 562/100000 [03:16<8:43:25,  3.17it/s]

Iteration: 562


  1%|▍                                                                                    | 563/100000 [03:16<8:32:06,  3.24it/s]

Iteration: 563


  1%|▍                                                                                    | 564/100000 [03:17<8:27:12,  3.27it/s]

Iteration: 564


  1%|▍                                                                                    | 565/100000 [03:17<8:22:59,  3.29it/s]

Iteration: 565


  1%|▍                                                                                    | 566/100000 [03:17<8:22:48,  3.30it/s]

Iteration: 566


  1%|▍                                                                                    | 567/100000 [03:17<8:18:49,  3.32it/s]

Iteration: 567


  1%|▍                                                                                    | 568/100000 [03:18<8:25:56,  3.28it/s]

Iteration: 568


  1%|▍                                                                                    | 569/100000 [03:18<8:22:08,  3.30it/s]

Iteration: 569


  1%|▍                                                                                    | 570/100000 [03:18<8:19:44,  3.32it/s]

Iteration: 570


  1%|▍                                                                                    | 571/100000 [03:19<8:18:38,  3.32it/s]

Iteration: 571


  1%|▍                                                                                    | 572/100000 [03:19<8:18:53,  3.32it/s]

Iteration: 572


  1%|▍                                                                                    | 573/100000 [03:19<8:20:09,  3.31it/s]

Iteration: 573


  1%|▍                                                                                    | 574/100000 [03:20<8:20:47,  3.31it/s]

Iteration: 574


  1%|▍                                                                                    | 575/100000 [03:20<8:20:47,  3.31it/s]

Iteration: 575


  1%|▍                                                                                    | 576/100000 [03:20<8:17:41,  3.33it/s]

Iteration: 576


  1%|▍                                                                                    | 577/100000 [03:21<8:18:39,  3.32it/s]

Iteration: 577


  1%|▍                                                                                    | 578/100000 [03:21<8:21:24,  3.30it/s]

Iteration: 578


  1%|▍                                                                                    | 579/100000 [03:21<8:20:58,  3.31it/s]

Iteration: 579


  1%|▍                                                                                    | 580/100000 [03:21<8:17:43,  3.33it/s]

Iteration: 580


  1%|▍                                                                                    | 581/100000 [03:22<8:21:03,  3.31it/s]

Iteration: 581


  1%|▍                                                                                    | 582/100000 [03:22<8:18:50,  3.32it/s]

Iteration: 582


  1%|▍                                                                                    | 583/100000 [03:22<8:17:24,  3.33it/s]

Iteration: 583


  1%|▍                                                                                    | 584/100000 [03:23<8:16:34,  3.34it/s]

Iteration: 584


  1%|▍                                                                                    | 585/100000 [03:23<8:15:00,  3.35it/s]

Iteration: 585


  1%|▍                                                                                    | 586/100000 [03:23<8:18:50,  3.32it/s]

Iteration: 586


  1%|▍                                                                                    | 587/100000 [03:24<8:18:35,  3.32it/s]

Iteration: 587


  1%|▍                                                                                    | 588/100000 [03:24<8:16:32,  3.34it/s]

Iteration: 588


  1%|▌                                                                                    | 589/100000 [03:24<8:22:59,  3.29it/s]

Iteration: 589


  1%|▌                                                                                    | 590/100000 [03:24<8:20:15,  3.31it/s]

Iteration: 590


  1%|▌                                                                                    | 591/100000 [03:25<8:17:08,  3.33it/s]

Iteration: 591


  1%|▌                                                                                    | 592/100000 [03:25<8:17:03,  3.33it/s]

Iteration: 592


  1%|▌                                                                                    | 593/100000 [03:25<8:23:53,  3.29it/s]

Iteration: 593


  1%|▌                                                                                    | 594/100000 [03:26<8:23:40,  3.29it/s]

Iteration: 594


  1%|▌                                                                                    | 595/100000 [03:26<8:20:07,  3.31it/s]

Iteration: 595


  1%|▌                                                                                    | 596/100000 [03:26<8:18:57,  3.32it/s]

Iteration: 596


  1%|▌                                                                                    | 597/100000 [03:27<8:19:52,  3.31it/s]

Iteration: 597


  1%|▌                                                                                    | 598/100000 [03:27<8:19:12,  3.32it/s]

Iteration: 598


  1%|▌                                                                                    | 599/100000 [03:27<8:21:10,  3.31it/s]

Iteration: 599


  1%|▌                                                                                    | 600/100000 [03:27<8:17:05,  3.33it/s]

Iteration: 600

600: train loss: 4.349999904632568 / val loss: 4.418749809265137
[CHECKPOINT]: Saving with loss:  4.418749809265137


  1%|▌                                                                                   | 601/100000 [03:30<26:33:05,  1.04it/s]

Iteration: 601


  1%|▌                                                                                   | 602/100000 [03:30<21:10:07,  1.30it/s]

Iteration: 602


  1%|▌                                                                                   | 603/100000 [03:31<17:16:16,  1.60it/s]

Iteration: 603


  1%|▌                                                                                   | 604/100000 [03:31<15:10:59,  1.82it/s]

Iteration: 604


  1%|▌                                                                                   | 605/100000 [03:31<13:05:15,  2.11it/s]

Iteration: 605


  1%|▌                                                                                   | 606/100000 [03:32<11:37:26,  2.38it/s]

Iteration: 606


  1%|▌                                                                                   | 607/100000 [03:32<10:49:42,  2.55it/s]

Iteration: 607


  1%|▌                                                                                   | 608/100000 [03:32<10:02:17,  2.75it/s]

Iteration: 608


  1%|▌                                                                                    | 609/100000 [03:32<9:29:23,  2.91it/s]

Iteration: 609


  1%|▌                                                                                    | 610/100000 [03:33<9:08:04,  3.02it/s]

Iteration: 610


  1%|▌                                                                                    | 611/100000 [03:33<8:56:50,  3.09it/s]

Iteration: 611


  1%|▌                                                                                    | 612/100000 [03:33<8:45:25,  3.15it/s]

Iteration: 612


  1%|▌                                                                                    | 613/100000 [03:34<8:37:11,  3.20it/s]

Iteration: 613


  1%|▌                                                                                    | 614/100000 [03:34<8:36:28,  3.21it/s]

Iteration: 614


  1%|▌                                                                                    | 615/100000 [03:34<8:27:59,  3.26it/s]

Iteration: 615


  1%|▌                                                                                    | 616/100000 [03:35<8:23:21,  3.29it/s]

Iteration: 616


  1%|▌                                                                                    | 617/100000 [03:35<8:26:57,  3.27it/s]

Iteration: 617


  1%|▌                                                                                    | 618/100000 [03:35<8:21:41,  3.30it/s]

Iteration: 618


  1%|▌                                                                                    | 619/100000 [03:35<8:19:20,  3.32it/s]

Iteration: 619


  1%|▌                                                                                    | 620/100000 [03:36<8:15:51,  3.34it/s]

Iteration: 620


  1%|▌                                                                                    | 621/100000 [03:36<8:22:29,  3.30it/s]

Iteration: 621


  1%|▌                                                                                    | 622/100000 [03:36<8:18:17,  3.32it/s]

Iteration: 622


  1%|▌                                                                                    | 623/100000 [03:37<8:18:23,  3.32it/s]

Iteration: 623


  1%|▌                                                                                    | 624/100000 [03:37<8:21:19,  3.30it/s]

Iteration: 624


  1%|▌                                                                                    | 625/100000 [03:37<8:19:42,  3.31it/s]

Iteration: 625


  1%|▌                                                                                    | 626/100000 [03:38<8:16:56,  3.33it/s]

Iteration: 626


  1%|▌                                                                                    | 627/100000 [03:38<8:22:42,  3.29it/s]

Iteration: 627


  1%|▌                                                                                    | 628/100000 [03:38<8:19:48,  3.31it/s]

Iteration: 628


  1%|▌                                                                                    | 629/100000 [03:38<8:17:36,  3.33it/s]

Iteration: 629


  1%|▌                                                                                    | 630/100000 [03:39<8:20:55,  3.31it/s]

Iteration: 630


  1%|▌                                                                                    | 631/100000 [03:39<8:19:06,  3.32it/s]

Iteration: 631


  1%|▌                                                                                    | 632/100000 [03:39<8:22:16,  3.30it/s]

Iteration: 632


  1%|▌                                                                                    | 633/100000 [03:40<8:19:13,  3.32it/s]

Iteration: 633


  1%|▌                                                                                    | 634/100000 [03:40<8:21:42,  3.30it/s]

Iteration: 634


  1%|▌                                                                                    | 635/100000 [03:40<8:18:53,  3.32it/s]

Iteration: 635


  1%|▌                                                                                    | 636/100000 [03:41<8:23:03,  3.29it/s]

Iteration: 636


  1%|▌                                                                                    | 637/100000 [03:41<8:20:49,  3.31it/s]

Iteration: 637


  1%|▌                                                                                    | 638/100000 [03:41<8:24:01,  3.29it/s]

Iteration: 638


  1%|▌                                                                                    | 639/100000 [03:41<8:20:28,  3.31it/s]

Iteration: 639


  1%|▌                                                                                    | 640/100000 [03:42<8:16:36,  3.33it/s]

Iteration: 640


  1%|▌                                                                                    | 641/100000 [03:42<8:16:14,  3.34it/s]

Iteration: 641


  1%|▌                                                                                    | 642/100000 [03:42<8:15:32,  3.34it/s]

Iteration: 642


  1%|▌                                                                                    | 643/100000 [03:43<8:22:04,  3.30it/s]

Iteration: 643


  1%|▌                                                                                    | 644/100000 [03:43<8:19:14,  3.32it/s]

Iteration: 644


  1%|▌                                                                                    | 645/100000 [03:43<8:17:26,  3.33it/s]

Iteration: 645


  1%|▌                                                                                    | 646/100000 [03:44<8:17:18,  3.33it/s]

Iteration: 646


  1%|▌                                                                                    | 647/100000 [03:44<8:22:46,  3.29it/s]

Iteration: 647


  1%|▌                                                                                    | 648/100000 [03:44<8:17:42,  3.33it/s]

Iteration: 648


  1%|▌                                                                                    | 649/100000 [03:44<8:16:27,  3.34it/s]

Iteration: 649


  1%|▌                                                                                    | 650/100000 [03:45<8:20:41,  3.31it/s]

Iteration: 650

650: train loss: 4.328125 / val loss: 4.421875


  1%|▌                                                                                   | 651/100000 [03:47<24:39:14,  1.12it/s]

Iteration: 651


  1%|▌                                                                                   | 652/100000 [03:47<19:44:13,  1.40it/s]

Iteration: 652


  1%|▌                                                                                   | 653/100000 [03:48<16:17:16,  1.69it/s]

Iteration: 653


  1%|▌                                                                                   | 654/100000 [03:48<13:58:27,  1.97it/s]

Iteration: 654


  1%|▌                                                                                   | 655/100000 [03:48<12:14:39,  2.25it/s]

Iteration: 655


  1%|▌                                                                                   | 656/100000 [03:49<11:07:40,  2.48it/s]

Iteration: 656


  1%|▌                                                                                   | 657/100000 [03:49<10:15:45,  2.69it/s]

Iteration: 657


  1%|▌                                                                                    | 658/100000 [03:49<9:40:36,  2.85it/s]

Iteration: 658


  1%|▌                                                                                    | 659/100000 [03:49<9:13:49,  2.99it/s]

Iteration: 659


  1%|▌                                                                                    | 660/100000 [03:50<8:57:01,  3.08it/s]

Iteration: 660


  1%|▌                                                                                    | 661/100000 [03:50<8:44:12,  3.16it/s]

Iteration: 661


  1%|▌                                                                                    | 662/100000 [03:50<8:42:07,  3.17it/s]

Iteration: 662


  1%|▌                                                                                    | 663/100000 [03:51<8:33:13,  3.23it/s]

Iteration: 663


  1%|▌                                                                                    | 664/100000 [03:51<8:27:53,  3.26it/s]

Iteration: 664


  1%|▌                                                                                    | 665/100000 [03:51<8:22:36,  3.29it/s]

Iteration: 665


  1%|▌                                                                                    | 666/100000 [03:52<8:25:54,  3.27it/s]

Iteration: 666


  1%|▌                                                                                    | 667/100000 [03:52<8:24:32,  3.28it/s]

Iteration: 667


  1%|▌                                                                                    | 668/100000 [03:52<8:20:42,  3.31it/s]

Iteration: 668


  1%|▌                                                                                    | 669/100000 [03:53<8:18:39,  3.32it/s]

Iteration: 669


  1%|▌                                                                                    | 670/100000 [03:53<8:24:39,  3.28it/s]

Iteration: 670


  1%|▌                                                                                    | 671/100000 [03:53<8:22:20,  3.30it/s]

Iteration: 671


  1%|▌                                                                                    | 672/100000 [03:53<8:18:21,  3.32it/s]

Iteration: 672


  1%|▌                                                                                    | 673/100000 [03:54<8:22:44,  3.29it/s]

Iteration: 673


  1%|▌                                                                                    | 674/100000 [03:54<8:20:05,  3.31it/s]

Iteration: 674


  1%|▌                                                                                    | 675/100000 [03:54<8:22:41,  3.29it/s]

Iteration: 675


  1%|▌                                                                                    | 676/100000 [03:55<8:20:13,  3.31it/s]

Iteration: 676


  1%|▌                                                                                    | 677/100000 [03:55<8:19:30,  3.31it/s]

Iteration: 677


  1%|▌                                                                                    | 678/100000 [03:55<8:22:23,  3.30it/s]

Iteration: 678


  1%|▌                                                                                    | 679/100000 [03:56<8:20:07,  3.31it/s]

Iteration: 679


  1%|▌                                                                                    | 680/100000 [03:56<8:17:41,  3.33it/s]

Iteration: 680


  1%|▌                                                                                    | 681/100000 [03:56<8:17:35,  3.33it/s]

Iteration: 681


  1%|▌                                                                                    | 682/100000 [03:56<8:15:30,  3.34it/s]

Iteration: 682


  1%|▌                                                                                    | 683/100000 [03:57<8:21:29,  3.30it/s]

Iteration: 683


  1%|▌                                                                                    | 684/100000 [03:57<8:21:26,  3.30it/s]

Iteration: 684


  1%|▌                                                                                    | 685/100000 [03:57<8:18:13,  3.32it/s]

Iteration: 685


  1%|▌                                                                                    | 686/100000 [03:58<8:15:50,  3.34it/s]

Iteration: 686


  1%|▌                                                                                    | 687/100000 [03:58<8:21:33,  3.30it/s]

Iteration: 687


  1%|▌                                                                                    | 688/100000 [03:58<8:17:46,  3.33it/s]

Iteration: 688


  1%|▌                                                                                    | 689/100000 [03:59<8:23:11,  3.29it/s]

Iteration: 689


  1%|▌                                                                                    | 690/100000 [03:59<8:19:28,  3.31it/s]

Iteration: 690


  1%|▌                                                                                    | 691/100000 [03:59<8:23:50,  3.29it/s]

Iteration: 691


  1%|▌                                                                                    | 692/100000 [03:59<8:21:05,  3.30it/s]

Iteration: 692


  1%|▌                                                                                    | 693/100000 [04:00<8:18:45,  3.32it/s]

Iteration: 693


  1%|▌                                                                                    | 694/100000 [04:00<8:23:26,  3.29it/s]

Iteration: 694


  1%|▌                                                                                    | 695/100000 [04:00<8:21:29,  3.30it/s]

Iteration: 695


  1%|▌                                                                                    | 696/100000 [04:01<8:24:10,  3.28it/s]

Iteration: 696


  1%|▌                                                                                    | 697/100000 [04:01<8:20:59,  3.30it/s]

Iteration: 697


  1%|▌                                                                                    | 698/100000 [04:01<8:17:40,  3.33it/s]

Iteration: 698


  1%|▌                                                                                    | 699/100000 [04:02<8:24:23,  3.28it/s]

Iteration: 699


  1%|▌                                                                                    | 700/100000 [04:02<8:19:45,  3.31it/s]

Iteration: 700

700: train loss: 4.296875 / val loss: 4.346875190734863
[CHECKPOINT]: Saving with loss:  4.346875190734863


  1%|▌                                                                                   | 701/100000 [04:04<26:52:35,  1.03it/s]

Iteration: 701


  1%|▌                                                                                   | 702/100000 [04:05<21:17:02,  1.30it/s]

Iteration: 702


  1%|▌                                                                                   | 703/100000 [04:05<17:28:23,  1.58it/s]

Iteration: 703


  1%|▌                                                                                   | 704/100000 [04:05<14:56:25,  1.85it/s]

Iteration: 704


  1%|▌                                                                                   | 705/100000 [04:06<12:56:14,  2.13it/s]

Iteration: 705


  1%|▌                                                                                   | 706/100000 [04:06<11:36:46,  2.38it/s]

Iteration: 706


  1%|▌                                                                                   | 707/100000 [04:06<10:44:39,  2.57it/s]

Iteration: 707


  1%|▌                                                                                   | 708/100000 [04:07<10:04:21,  2.74it/s]

Iteration: 708


  1%|▌                                                                                    | 709/100000 [04:07<9:23:24,  2.94it/s]

Iteration: 709


  1%|▌                                                                                    | 710/100000 [04:07<9:08:31,  3.02it/s]

Iteration: 710


  1%|▌                                                                                    | 711/100000 [04:07<8:58:45,  3.07it/s]

Iteration: 711


  1%|▌                                                                                    | 712/100000 [04:08<8:44:55,  3.15it/s]

Iteration: 712


  1%|▌                                                                                    | 713/100000 [04:08<8:36:05,  3.21it/s]

Iteration: 713


  1%|▌                                                                                    | 714/100000 [04:08<8:35:59,  3.21it/s]

Iteration: 714


  1%|▌                                                                                    | 715/100000 [04:09<8:35:06,  3.21it/s]

Iteration: 715


  1%|▌                                                                                    | 716/100000 [04:09<8:29:11,  3.25it/s]

Iteration: 716


  1%|▌                                                                                    | 717/100000 [04:09<8:29:51,  3.25it/s]

Iteration: 717


  1%|▌                                                                                    | 718/100000 [04:10<8:29:25,  3.25it/s]

Iteration: 718


  1%|▌                                                                                    | 719/100000 [04:10<8:25:40,  3.27it/s]

Iteration: 719


  1%|▌                                                                                    | 720/100000 [04:10<8:28:29,  3.25it/s]

Iteration: 720


  1%|▌                                                                                    | 721/100000 [04:11<8:23:20,  3.29it/s]

Iteration: 721


  1%|▌                                                                                    | 722/100000 [04:11<8:26:09,  3.27it/s]

Iteration: 722


  1%|▌                                                                                    | 723/100000 [04:11<8:24:07,  3.28it/s]

Iteration: 723


  1%|▌                                                                                    | 724/100000 [04:11<8:26:52,  3.26it/s]

Iteration: 724


  1%|▌                                                                                    | 725/100000 [04:12<8:22:49,  3.29it/s]

Iteration: 725


  1%|▌                                                                                    | 726/100000 [04:12<8:25:45,  3.27it/s]

Iteration: 726


  1%|▌                                                                                    | 727/100000 [04:12<8:21:16,  3.30it/s]

Iteration: 727


  1%|▌                                                                                    | 728/100000 [04:13<8:26:23,  3.27it/s]

Iteration: 728


  1%|▌                                                                                    | 729/100000 [04:13<8:24:04,  3.28it/s]

Iteration: 729


  1%|▌                                                                                    | 730/100000 [04:13<8:25:57,  3.27it/s]

Iteration: 730


  1%|▌                                                                                    | 731/100000 [04:14<8:28:11,  3.26it/s]

Iteration: 731


  1%|▌                                                                                    | 732/100000 [04:14<8:23:52,  3.28it/s]

Iteration: 732


  1%|▌                                                                                    | 733/100000 [04:14<8:26:35,  3.27it/s]

Iteration: 733


  1%|▌                                                                                    | 734/100000 [04:15<8:21:30,  3.30it/s]

Iteration: 734


  1%|▌                                                                                    | 735/100000 [04:15<8:23:41,  3.28it/s]

Iteration: 735


  1%|▋                                                                                    | 736/100000 [04:15<8:21:52,  3.30it/s]

Iteration: 736


  1%|▋                                                                                    | 737/100000 [04:15<8:25:34,  3.27it/s]

Iteration: 737


  1%|▋                                                                                    | 738/100000 [04:16<8:20:51,  3.30it/s]

Iteration: 738


  1%|▋                                                                                    | 739/100000 [04:16<8:17:22,  3.33it/s]

Iteration: 739


  1%|▋                                                                                    | 740/100000 [04:16<8:22:23,  3.29it/s]

Iteration: 740


  1%|▋                                                                                    | 741/100000 [04:17<8:19:38,  3.31it/s]

Iteration: 741


  1%|▋                                                                                    | 742/100000 [04:17<8:16:52,  3.33it/s]

Iteration: 742


  1%|▋                                                                                    | 743/100000 [04:17<8:22:20,  3.29it/s]

Iteration: 743


  1%|▋                                                                                    | 744/100000 [04:18<8:19:24,  3.31it/s]

Iteration: 744


  1%|▋                                                                                    | 745/100000 [04:18<8:17:26,  3.33it/s]

Iteration: 745


  1%|▋                                                                                    | 746/100000 [04:18<8:23:38,  3.28it/s]

Iteration: 746


  1%|▋                                                                                    | 747/100000 [04:18<8:20:32,  3.30it/s]

Iteration: 747


  1%|▋                                                                                    | 748/100000 [04:19<8:19:30,  3.31it/s]

Iteration: 748


  1%|▋                                                                                    | 749/100000 [04:19<8:25:52,  3.27it/s]

Iteration: 749


  1%|▋                                                                                    | 750/100000 [04:19<8:19:54,  3.31it/s]

Iteration: 750

750: train loss: 4.268750190734863 / val loss: 4.240624904632568
[CHECKPOINT]: Saving with loss:  4.240624904632568


  1%|▋                                                                                   | 751/100000 [04:22<27:39:17,  1.00s/it]

Iteration: 751


  1%|▋                                                                                   | 752/100000 [04:22<22:08:20,  1.25it/s]

Iteration: 752


  1%|▋                                                                                   | 753/100000 [04:23<18:09:51,  1.52it/s]

Iteration: 753


  1%|▋                                                                                   | 754/100000 [04:23<15:29:56,  1.78it/s]

Iteration: 754


  1%|▋                                                                                   | 755/100000 [04:23<13:25:43,  2.05it/s]

Iteration: 755


  1%|▋                                                                                   | 756/100000 [04:24<11:53:27,  2.32it/s]

Iteration: 756


  1%|▋                                                                                   | 757/100000 [04:24<11:05:45,  2.48it/s]

Iteration: 757


  1%|▋                                                                                   | 758/100000 [04:24<10:17:55,  2.68it/s]

Iteration: 758


  1%|▋                                                                                    | 759/100000 [04:25<9:47:51,  2.81it/s]

Iteration: 759


  1%|▋                                                                                    | 760/100000 [04:25<9:26:34,  2.92it/s]

Iteration: 760


  1%|▋                                                                                    | 761/100000 [04:25<9:19:04,  2.96it/s]

Iteration: 761


  1%|▋                                                                                    | 762/100000 [04:26<9:12:39,  2.99it/s]

Iteration: 762


  1%|▋                                                                                    | 763/100000 [04:26<8:55:47,  3.09it/s]

Iteration: 763


  1%|▋                                                                                    | 764/100000 [04:26<8:57:54,  3.07it/s]

Iteration: 764


  1%|▋                                                                                    | 765/100000 [04:26<8:43:49,  3.16it/s]

Iteration: 765


  1%|▋                                                                                    | 766/100000 [04:27<8:39:58,  3.18it/s]

Iteration: 766


  1%|▋                                                                                    | 767/100000 [04:27<8:31:51,  3.23it/s]

Iteration: 767


  1%|▋                                                                                    | 768/100000 [04:27<8:33:43,  3.22it/s]

Iteration: 768


  1%|▋                                                                                    | 769/100000 [04:28<8:27:29,  3.26it/s]

Iteration: 769


  1%|▋                                                                                    | 770/100000 [04:28<8:24:11,  3.28it/s]

Iteration: 770


  1%|▋                                                                                    | 771/100000 [04:28<8:26:44,  3.26it/s]

Iteration: 771


  1%|▋                                                                                    | 772/100000 [04:29<8:21:07,  3.30it/s]

Iteration: 772


  1%|▋                                                                                    | 773/100000 [04:29<8:19:34,  3.31it/s]

Iteration: 773


  1%|▋                                                                                    | 774/100000 [04:29<8:23:35,  3.28it/s]

Iteration: 774


  1%|▋                                                                                    | 775/100000 [04:29<8:19:55,  3.31it/s]

Iteration: 775


  1%|▋                                                                                    | 776/100000 [04:30<8:20:15,  3.31it/s]

Iteration: 776


  1%|▋                                                                                    | 777/100000 [04:30<8:20:54,  3.30it/s]

Iteration: 777


  1%|▋                                                                                    | 778/100000 [04:30<8:19:26,  3.31it/s]

Iteration: 778


  1%|▋                                                                                    | 779/100000 [04:31<8:18:28,  3.32it/s]

Iteration: 779


  1%|▋                                                                                    | 780/100000 [04:31<8:23:28,  3.28it/s]

Iteration: 780


  1%|▋                                                                                    | 781/100000 [04:31<8:20:33,  3.30it/s]

Iteration: 781


  1%|▋                                                                                    | 782/100000 [04:32<8:19:53,  3.31it/s]

Iteration: 782


  1%|▋                                                                                    | 783/100000 [04:32<8:18:06,  3.32it/s]

Iteration: 783


  1%|▋                                                                                    | 784/100000 [04:32<8:22:01,  3.29it/s]

Iteration: 784


  1%|▋                                                                                    | 785/100000 [04:33<8:28:57,  3.25it/s]

Iteration: 785


  1%|▋                                                                                    | 786/100000 [04:33<8:29:59,  3.24it/s]

Iteration: 786


  1%|▋                                                                                    | 787/100000 [04:33<8:39:11,  3.18it/s]

Iteration: 787


  1%|▋                                                                                    | 788/100000 [04:33<8:37:51,  3.19it/s]

Iteration: 788


  1%|▋                                                                                    | 789/100000 [04:34<8:44:50,  3.15it/s]

Iteration: 789


  1%|▋                                                                                    | 790/100000 [04:34<8:42:53,  3.16it/s]

Iteration: 790


  1%|▋                                                                                    | 791/100000 [04:34<8:40:01,  3.18it/s]

Iteration: 791


  1%|▋                                                                                    | 792/100000 [04:35<8:45:20,  3.15it/s]

Iteration: 792


  1%|▋                                                                                    | 793/100000 [04:35<8:41:06,  3.17it/s]

Iteration: 793


  1%|▋                                                                                    | 794/100000 [04:35<8:31:49,  3.23it/s]

Iteration: 794


  1%|▋                                                                                    | 795/100000 [04:36<8:33:41,  3.22it/s]

Iteration: 795


  1%|▋                                                                                    | 796/100000 [04:36<8:27:37,  3.26it/s]

Iteration: 796


  1%|▋                                                                                    | 797/100000 [04:36<8:27:43,  3.26it/s]

Iteration: 797


  1%|▋                                                                                    | 798/100000 [04:37<8:33:30,  3.22it/s]

Iteration: 798


  1%|▋                                                                                    | 799/100000 [04:37<8:35:29,  3.21it/s]

Iteration: 799


  1%|▋                                                                                    | 800/100000 [04:37<8:35:15,  3.21it/s]

Iteration: 800

800: train loss: 4.246874809265137 / val loss: 4.243750095367432


  1%|▋                                                                                   | 801/100000 [04:40<25:31:18,  1.08it/s]

Iteration: 801


  1%|▋                                                                                   | 802/100000 [04:40<20:19:25,  1.36it/s]

Iteration: 802


  1%|▋                                                                                   | 803/100000 [04:40<16:40:54,  1.65it/s]

Iteration: 803


  1%|▋                                                                                   | 804/100000 [04:40<14:15:44,  1.93it/s]

Iteration: 804


  1%|▋                                                                                   | 805/100000 [04:41<12:27:25,  2.21it/s]

Iteration: 805


  1%|▋                                                                                   | 806/100000 [04:41<11:11:10,  2.46it/s]

Iteration: 806


  1%|▋                                                                                   | 807/100000 [04:41<10:23:55,  2.65it/s]

Iteration: 807


  1%|▋                                                                                    | 808/100000 [04:42<9:47:27,  2.81it/s]

Iteration: 808


  1%|▋                                                                                    | 809/100000 [04:42<9:19:32,  2.95it/s]

Iteration: 809


  1%|▋                                                                                    | 810/100000 [04:42<9:04:38,  3.04it/s]

Iteration: 810


  1%|▋                                                                                    | 811/100000 [04:43<8:48:28,  3.13it/s]

Iteration: 811


  1%|▋                                                                                    | 812/100000 [04:43<8:38:02,  3.19it/s]

Iteration: 812


  1%|▋                                                                                    | 813/100000 [04:43<8:37:48,  3.19it/s]

Iteration: 813


  1%|▋                                                                                    | 814/100000 [04:44<8:31:56,  3.23it/s]

Iteration: 814


  1%|▋                                                                                    | 815/100000 [04:44<8:26:41,  3.26it/s]

Iteration: 815


  1%|▋                                                                                    | 816/100000 [04:44<8:21:15,  3.30it/s]

Iteration: 816


  1%|▋                                                                                    | 817/100000 [04:44<8:26:08,  3.27it/s]

Iteration: 817


  1%|▋                                                                                    | 818/100000 [04:45<8:31:29,  3.23it/s]

Iteration: 818


  1%|▋                                                                                    | 819/100000 [04:45<8:33:49,  3.22it/s]

Iteration: 819


  1%|▋                                                                                    | 820/100000 [04:45<8:42:13,  3.17it/s]

Iteration: 820


  1%|▋                                                                                    | 821/100000 [04:46<8:40:19,  3.18it/s]

Iteration: 821


  1%|▋                                                                                    | 822/100000 [04:46<8:32:43,  3.22it/s]

Iteration: 822


  1%|▋                                                                                    | 823/100000 [04:46<8:40:16,  3.18it/s]

Iteration: 823


  1%|▋                                                                                    | 824/100000 [04:47<8:32:09,  3.23it/s]

Iteration: 824


  1%|▋                                                                                    | 825/100000 [04:47<8:41:54,  3.17it/s]

Iteration: 825


  1%|▋                                                                                    | 826/100000 [04:47<8:39:25,  3.18it/s]

Iteration: 826


  1%|▋                                                                                    | 827/100000 [04:48<8:37:21,  3.19it/s]

Iteration: 827


  1%|▋                                                                                    | 828/100000 [04:48<8:31:00,  3.23it/s]

Iteration: 828


  1%|▋                                                                                    | 829/100000 [04:48<8:31:18,  3.23it/s]

Iteration: 829


  1%|▋                                                                                    | 830/100000 [04:48<8:31:58,  3.23it/s]

Iteration: 830


  1%|▋                                                                                    | 831/100000 [04:49<8:27:49,  3.25it/s]

Iteration: 831


  1%|▋                                                                                    | 832/100000 [04:49<8:29:02,  3.25it/s]

Iteration: 832


  1%|▋                                                                                    | 833/100000 [04:49<8:32:01,  3.23it/s]

Iteration: 833


  1%|▋                                                                                    | 834/100000 [04:50<8:27:07,  3.26it/s]

Iteration: 834


  1%|▋                                                                                    | 835/100000 [04:50<8:23:19,  3.28it/s]

Iteration: 835


  1%|▋                                                                                    | 836/100000 [04:50<8:26:58,  3.26it/s]

Iteration: 836


  1%|▋                                                                                    | 837/100000 [04:51<8:29:59,  3.24it/s]

Iteration: 837


  1%|▋                                                                                    | 838/100000 [04:51<8:31:41,  3.23it/s]

Iteration: 838


  1%|▋                                                                                    | 839/100000 [04:51<8:41:43,  3.17it/s]

Iteration: 839


  1%|▋                                                                                    | 840/100000 [04:52<8:41:30,  3.17it/s]

Iteration: 840


  1%|▋                                                                                    | 841/100000 [04:52<8:44:35,  3.15it/s]

Iteration: 841


  1%|▋                                                                                    | 842/100000 [04:52<8:45:44,  3.14it/s]

Iteration: 842


  1%|▋                                                                                    | 843/100000 [04:53<8:41:42,  3.17it/s]

Iteration: 843


  1%|▋                                                                                    | 844/100000 [04:53<8:33:56,  3.22it/s]

Iteration: 844


  1%|▋                                                                                    | 845/100000 [04:53<8:34:15,  3.21it/s]

Iteration: 845


  1%|▋                                                                                    | 846/100000 [04:53<8:26:10,  3.26it/s]

Iteration: 846


  1%|▋                                                                                    | 847/100000 [04:54<8:28:11,  3.25it/s]

Iteration: 847


  1%|▋                                                                                    | 848/100000 [04:54<8:24:30,  3.28it/s]

Iteration: 848


  1%|▋                                                                                    | 849/100000 [04:54<8:27:13,  3.26it/s]

Iteration: 849


  1%|▋                                                                                    | 850/100000 [04:55<8:39:51,  3.18it/s]

Iteration: 850

850: train loss: 4.193749904632568 / val loss: 4.224999904632568
[CHECKPOINT]: Saving with loss:  4.224999904632568


  1%|▋                                                                                   | 851/100000 [04:57<27:56:21,  1.01s/it]

Iteration: 851


  1%|▋                                                                                   | 852/100000 [04:58<22:15:20,  1.24it/s]

Iteration: 852


  1%|▋                                                                                   | 853/100000 [04:58<18:11:56,  1.51it/s]

Iteration: 853


  1%|▋                                                                                   | 854/100000 [04:58<15:41:48,  1.75it/s]

Iteration: 854


  1%|▋                                                                                   | 855/100000 [04:59<13:33:55,  2.03it/s]

Iteration: 855


  1%|▋                                                                                   | 856/100000 [04:59<12:03:43,  2.28it/s]

Iteration: 856


  1%|▋                                                                                   | 857/100000 [04:59<11:03:44,  2.49it/s]

Iteration: 857


  1%|▋                                                                                   | 858/100000 [05:00<10:19:05,  2.67it/s]

Iteration: 858


  1%|▋                                                                                    | 859/100000 [05:00<9:47:28,  2.81it/s]

Iteration: 859


  1%|▋                                                                                    | 860/100000 [05:00<9:26:59,  2.91it/s]

Iteration: 860


  1%|▋                                                                                    | 861/100000 [05:01<9:11:47,  2.99it/s]

Iteration: 861


  1%|▋                                                                                    | 862/100000 [05:01<9:00:11,  3.06it/s]

Iteration: 862


  1%|▋                                                                                    | 863/100000 [05:01<8:52:15,  3.10it/s]

Iteration: 863


  1%|▋                                                                                    | 864/100000 [05:01<8:46:44,  3.14it/s]

Iteration: 864


  1%|▋                                                                                    | 865/100000 [05:02<8:54:30,  3.09it/s]

Iteration: 865


  1%|▋                                                                                    | 866/100000 [05:02<8:57:49,  3.07it/s]

Iteration: 866


  1%|▋                                                                                    | 867/100000 [05:02<9:15:09,  2.98it/s]

Iteration: 867


  1%|▋                                                                                    | 868/100000 [05:03<9:05:57,  3.03it/s]

Iteration: 868


  1%|▋                                                                                    | 869/100000 [05:03<9:02:57,  3.04it/s]

Iteration: 869


  1%|▋                                                                                    | 870/100000 [05:03<8:58:47,  3.07it/s]

Iteration: 870


  1%|▋                                                                                    | 871/100000 [05:04<8:45:20,  3.14it/s]

Iteration: 871


  1%|▋                                                                                    | 872/100000 [05:04<8:40:20,  3.18it/s]

Iteration: 872


  1%|▋                                                                                    | 873/100000 [05:04<8:37:34,  3.19it/s]

Iteration: 873


  1%|▋                                                                                    | 874/100000 [05:05<8:29:23,  3.24it/s]

Iteration: 874


  1%|▋                                                                                    | 875/100000 [05:05<8:31:00,  3.23it/s]

Iteration: 875


  1%|▋                                                                                    | 876/100000 [05:05<8:37:59,  3.19it/s]

Iteration: 876


  1%|▋                                                                                    | 877/100000 [05:06<8:48:57,  3.12it/s]

Iteration: 877


  1%|▋                                                                                    | 878/100000 [05:06<8:47:25,  3.13it/s]

Iteration: 878


  1%|▋                                                                                    | 879/100000 [05:06<8:44:54,  3.15it/s]

Iteration: 879


  1%|▋                                                                                    | 880/100000 [05:07<8:41:12,  3.17it/s]

Iteration: 880


  1%|▋                                                                                    | 881/100000 [05:07<8:48:35,  3.13it/s]

Iteration: 881


  1%|▋                                                                                    | 882/100000 [05:07<8:45:22,  3.14it/s]

Iteration: 882


  1%|▊                                                                                    | 883/100000 [05:08<8:54:47,  3.09it/s]

Iteration: 883


  1%|▊                                                                                    | 884/100000 [05:08<8:45:21,  3.14it/s]

Iteration: 884


  1%|▊                                                                                    | 885/100000 [05:08<8:41:40,  3.17it/s]

Iteration: 885


  1%|▊                                                                                    | 886/100000 [05:08<8:40:02,  3.18it/s]

Iteration: 886


  1%|▊                                                                                    | 887/100000 [05:09<8:31:10,  3.23it/s]

Iteration: 887


  1%|▊                                                                                    | 888/100000 [05:09<8:33:16,  3.22it/s]

Iteration: 888


  1%|▊                                                                                    | 889/100000 [05:09<8:41:34,  3.17it/s]

Iteration: 889


  1%|▊                                                                                    | 890/100000 [05:10<8:40:13,  3.18it/s]

Iteration: 890


  1%|▊                                                                                    | 891/100000 [05:10<8:32:10,  3.23it/s]

Iteration: 891


  1%|▊                                                                                    | 892/100000 [05:10<8:26:51,  3.26it/s]

Iteration: 892


  1%|▊                                                                                    | 893/100000 [05:11<8:28:44,  3.25it/s]

Iteration: 893


  1%|▊                                                                                    | 894/100000 [05:11<8:28:44,  3.25it/s]

Iteration: 894


  1%|▊                                                                                    | 895/100000 [05:11<8:41:13,  3.17it/s]

Iteration: 895


  1%|▊                                                                                    | 896/100000 [05:12<8:28:11,  3.25it/s]

Iteration: 896


  1%|▊                                                                                    | 897/100000 [05:12<8:24:54,  3.27it/s]

Iteration: 897


  1%|▊                                                                                    | 898/100000 [05:12<8:35:28,  3.20it/s]

Iteration: 898


  1%|▊                                                                                    | 899/100000 [05:13<8:33:48,  3.21it/s]

Iteration: 899


  1%|▊                                                                                    | 900/100000 [05:13<8:27:11,  3.26it/s]

Iteration: 900

900: train loss: 4.153124809265137 / val loss: 4.181250095367432
[CHECKPOINT]: Saving with loss:  4.181250095367432


  1%|▊                                                                                   | 901/100000 [05:16<28:33:46,  1.04s/it]

Iteration: 901


  1%|▊                                                                                   | 902/100000 [05:16<22:34:54,  1.22it/s]

Iteration: 902


  1%|▊                                                                                   | 903/100000 [05:16<18:21:18,  1.50it/s]

Iteration: 903


  1%|▊                                                                                   | 904/100000 [05:17<15:49:03,  1.74it/s]

Iteration: 904


  1%|▊                                                                                   | 905/100000 [05:17<13:41:06,  2.01it/s]

Iteration: 905


  1%|▊                                                                                   | 906/100000 [05:17<12:05:46,  2.28it/s]

Iteration: 906


  1%|▊                                                                                   | 907/100000 [05:17<11:03:19,  2.49it/s]

Iteration: 907


  1%|▊                                                                                   | 908/100000 [05:18<10:18:02,  2.67it/s]

Iteration: 908


  1%|▊                                                                                    | 909/100000 [05:18<9:40:57,  2.84it/s]

Iteration: 909


  1%|▊                                                                                    | 910/100000 [05:18<9:26:31,  2.92it/s]

Iteration: 910


  1%|▊                                                                                    | 911/100000 [05:19<9:04:55,  3.03it/s]

Iteration: 911


  1%|▊                                                                                    | 912/100000 [05:19<8:54:51,  3.09it/s]

Iteration: 912


  1%|▊                                                                                    | 913/100000 [05:19<8:50:19,  3.11it/s]

Iteration: 913


  1%|▊                                                                                    | 914/100000 [05:20<8:39:53,  3.18it/s]

Iteration: 914


  1%|▊                                                                                    | 915/100000 [05:20<8:35:15,  3.20it/s]

Iteration: 915


  1%|▊                                                                                    | 916/100000 [05:20<8:35:16,  3.20it/s]

Iteration: 916


  1%|▊                                                                                    | 917/100000 [05:21<8:45:04,  3.15it/s]

Iteration: 917


  1%|▊                                                                                    | 918/100000 [05:21<8:39:35,  3.18it/s]

Iteration: 918


  1%|▊                                                                                    | 919/100000 [05:21<8:38:44,  3.18it/s]

Iteration: 919


  1%|▊                                                                                    | 920/100000 [05:22<8:44:41,  3.15it/s]

Iteration: 920


  1%|▊                                                                                    | 921/100000 [05:22<8:43:22,  3.16it/s]

Iteration: 921


  1%|▊                                                                                    | 922/100000 [05:22<8:31:10,  3.23it/s]

Iteration: 922


  1%|▊                                                                                    | 923/100000 [05:22<8:33:31,  3.22it/s]

Iteration: 923


  1%|▊                                                                                    | 924/100000 [05:23<8:40:55,  3.17it/s]

Iteration: 924


  1%|▊                                                                                    | 925/100000 [05:23<8:40:14,  3.17it/s]

Iteration: 925


  1%|▊                                                                                    | 926/100000 [05:23<8:40:41,  3.17it/s]

Iteration: 926


  1%|▊                                                                                    | 927/100000 [05:24<8:39:19,  3.18it/s]

Iteration: 927


  1%|▊                                                                                    | 928/100000 [05:24<8:34:45,  3.21it/s]

Iteration: 928


  1%|▊                                                                                    | 929/100000 [05:24<8:42:13,  3.16it/s]

Iteration: 929


  1%|▊                                                                                    | 930/100000 [05:25<8:37:21,  3.19it/s]

Iteration: 930


  1%|▊                                                                                    | 931/100000 [05:25<8:35:49,  3.20it/s]

Iteration: 931


  1%|▊                                                                                    | 932/100000 [05:25<8:29:22,  3.24it/s]

Iteration: 932


  1%|▊                                                                                    | 933/100000 [05:26<8:31:10,  3.23it/s]

Iteration: 933


  1%|▊                                                                                    | 934/100000 [05:26<8:28:13,  3.25it/s]

Iteration: 934


  1%|▊                                                                                    | 935/100000 [05:26<8:36:32,  3.20it/s]

Iteration: 935


  1%|▊                                                                                    | 936/100000 [05:27<8:29:25,  3.24it/s]

Iteration: 936


  1%|▊                                                                                    | 937/100000 [05:27<8:33:42,  3.21it/s]

Iteration: 937


  1%|▊                                                                                    | 938/100000 [05:27<8:39:28,  3.18it/s]

Iteration: 938


  1%|▊                                                                                    | 939/100000 [05:27<8:46:04,  3.14it/s]

Iteration: 939


  1%|▊                                                                                    | 940/100000 [05:28<8:49:12,  3.12it/s]

Iteration: 940


  1%|▊                                                                                    | 941/100000 [05:28<8:49:27,  3.12it/s]

Iteration: 941


  1%|▊                                                                                    | 942/100000 [05:28<8:57:00,  3.07it/s]

Iteration: 942


  1%|▊                                                                                    | 943/100000 [05:29<8:49:51,  3.12it/s]

Iteration: 943


  1%|▊                                                                                    | 944/100000 [05:29<8:47:22,  3.13it/s]

Iteration: 944


  1%|▊                                                                                    | 945/100000 [05:29<8:44:47,  3.15it/s]

Iteration: 945


  1%|▊                                                                                    | 946/100000 [05:30<8:32:18,  3.22it/s]

Iteration: 946


  1%|▊                                                                                    | 947/100000 [05:30<8:33:36,  3.21it/s]

Iteration: 947


  1%|▊                                                                                    | 948/100000 [05:30<8:38:06,  3.19it/s]

Iteration: 948


  1%|▊                                                                                    | 949/100000 [05:31<8:46:22,  3.14it/s]

Iteration: 949


  1%|▊                                                                                    | 950/100000 [05:31<8:52:46,  3.10it/s]

Iteration: 950

950: train loss: 4.095312595367432 / val loss: 4.203125


  1%|▊                                                                                   | 951/100000 [05:33<25:16:15,  1.09it/s]

Iteration: 951


  1%|▊                                                                                   | 952/100000 [05:34<20:16:38,  1.36it/s]

Iteration: 952


  1%|▊                                                                                   | 953/100000 [05:34<17:01:01,  1.62it/s]

Iteration: 953


  1%|▊                                                                                   | 954/100000 [05:34<14:37:18,  1.88it/s]

Iteration: 954


  1%|▊                                                                                   | 955/100000 [05:35<12:50:40,  2.14it/s]

Iteration: 955


  1%|▊                                                                                   | 956/100000 [05:35<11:35:23,  2.37it/s]

Iteration: 956


  1%|▊                                                                                   | 957/100000 [05:35<10:46:49,  2.55it/s]

Iteration: 957


  1%|▊                                                                                   | 958/100000 [05:36<10:07:33,  2.72it/s]

Iteration: 958


  1%|▊                                                                                    | 959/100000 [05:36<9:31:58,  2.89it/s]

Iteration: 959


  1%|▊                                                                                    | 960/100000 [05:36<9:22:59,  2.93it/s]

Iteration: 960


  1%|▊                                                                                    | 961/100000 [05:36<9:16:15,  2.97it/s]

Iteration: 961


  1%|▊                                                                                    | 962/100000 [05:37<8:56:36,  3.08it/s]

Iteration: 962


  1%|▊                                                                                    | 963/100000 [05:37<8:54:20,  3.09it/s]

Iteration: 963


  1%|▊                                                                                    | 964/100000 [05:37<8:44:25,  3.15it/s]

Iteration: 964


  1%|▊                                                                                    | 965/100000 [05:38<8:40:17,  3.17it/s]

Iteration: 965


  1%|▊                                                                                    | 966/100000 [05:38<8:38:46,  3.18it/s]

Iteration: 966


  1%|▊                                                                                    | 967/100000 [05:38<8:36:49,  3.19it/s]

Iteration: 967


  1%|▊                                                                                    | 968/100000 [05:39<8:31:06,  3.23it/s]

Iteration: 968


  1%|▊                                                                                    | 969/100000 [05:39<8:33:25,  3.21it/s]

Iteration: 969


  1%|▊                                                                                    | 970/100000 [05:39<8:33:10,  3.22it/s]

Iteration: 970


  1%|▊                                                                                    | 971/100000 [05:40<8:43:27,  3.15it/s]

Iteration: 971


  1%|▊                                                                                    | 972/100000 [05:40<8:46:19,  3.14it/s]

Iteration: 972


  1%|▊                                                                                    | 973/100000 [05:40<8:48:24,  3.12it/s]

Iteration: 973


  1%|▊                                                                                    | 974/100000 [05:41<8:53:06,  3.10it/s]

Iteration: 974


  1%|▊                                                                                    | 975/100000 [05:41<8:54:35,  3.09it/s]

Iteration: 975


  1%|▊                                                                                    | 976/100000 [05:41<8:55:03,  3.08it/s]

Iteration: 976


  1%|▊                                                                                    | 977/100000 [05:42<8:55:19,  3.08it/s]

Iteration: 977


  1%|▊                                                                                    | 978/100000 [05:42<8:48:51,  3.12it/s]

Iteration: 978


  1%|▊                                                                                    | 979/100000 [05:42<8:52:15,  3.10it/s]

Iteration: 979


  1%|▊                                                                                    | 980/100000 [05:43<8:47:45,  3.13it/s]

Iteration: 980


  1%|▊                                                                                    | 981/100000 [05:43<8:40:08,  3.17it/s]

Iteration: 981


  1%|▊                                                                                    | 982/100000 [05:43<8:49:13,  3.12it/s]

Iteration: 982


  1%|▊                                                                                    | 983/100000 [05:43<8:46:18,  3.14it/s]

Iteration: 983


  1%|▊                                                                                    | 984/100000 [05:44<8:33:37,  3.21it/s]

Iteration: 984


  1%|▊                                                                                    | 985/100000 [05:44<8:34:38,  3.21it/s]

Iteration: 985


  1%|▊                                                                                    | 986/100000 [05:44<8:40:59,  3.17it/s]

Iteration: 986


  1%|▊                                                                                    | 987/100000 [05:45<8:32:04,  3.22it/s]

Iteration: 987


  1%|▊                                                                                    | 988/100000 [05:45<8:33:43,  3.21it/s]

Iteration: 988


  1%|▊                                                                                    | 989/100000 [05:45<8:35:55,  3.20it/s]

Iteration: 989


  1%|▊                                                                                    | 990/100000 [05:46<8:32:54,  3.22it/s]

Iteration: 990


  1%|▊                                                                                    | 991/100000 [05:46<8:32:39,  3.22it/s]

Iteration: 991


  1%|▊                                                                                    | 992/100000 [05:46<8:26:48,  3.26it/s]

Iteration: 992


  1%|▊                                                                                    | 993/100000 [05:47<8:28:07,  3.25it/s]

Iteration: 993


  1%|▊                                                                                    | 994/100000 [05:47<8:29:44,  3.24it/s]

Iteration: 994


  1%|▊                                                                                    | 995/100000 [05:47<8:24:59,  3.27it/s]

Iteration: 995


  1%|▊                                                                                    | 996/100000 [05:47<8:29:26,  3.24it/s]

Iteration: 996


  1%|▊                                                                                    | 997/100000 [05:48<8:29:29,  3.24it/s]

Iteration: 997


  1%|▊                                                                                    | 998/100000 [05:48<8:32:44,  3.22it/s]

Iteration: 998


  1%|▊                                                                                    | 999/100000 [05:48<8:32:09,  3.22it/s]

Iteration: 999


  1%|▊                                                                                   | 1000/100000 [05:49<8:32:25,  3.22it/s]

Iteration: 1000

1000: train loss: 4.092187404632568 / val loss: 4.079687595367432
[CHECKPOINT]: Saving with loss:  4.079687595367432


  1%|▊                                                                                  | 1001/100000 [05:51<27:43:19,  1.01s/it]

Iteration: 1001


  1%|▊                                                                                  | 1002/100000 [05:52<21:52:25,  1.26it/s]

Iteration: 1002


  1%|▊                                                                                  | 1003/100000 [05:52<17:55:01,  1.53it/s]

Iteration: 1003


  1%|▊                                                                                  | 1004/100000 [05:52<15:19:37,  1.79it/s]

Iteration: 1004


  1%|▊                                                                                  | 1005/100000 [05:53<13:10:13,  2.09it/s]

Iteration: 1005


  1%|▊                                                                                  | 1006/100000 [05:53<11:41:12,  2.35it/s]

Iteration: 1006


  1%|▊                                                                                  | 1007/100000 [05:53<10:53:13,  2.53it/s]

Iteration: 1007


  1%|▊                                                                                  | 1008/100000 [05:54<10:05:03,  2.73it/s]

Iteration: 1008


  1%|▊                                                                                   | 1009/100000 [05:54<9:37:34,  2.86it/s]

Iteration: 1009


  1%|▊                                                                                   | 1010/100000 [05:54<9:25:38,  2.92it/s]

Iteration: 1010


  1%|▊                                                                                   | 1011/100000 [05:54<9:10:11,  3.00it/s]

Iteration: 1011


  1%|▊                                                                                   | 1012/100000 [05:55<8:59:48,  3.06it/s]

Iteration: 1012


  1%|▊                                                                                   | 1013/100000 [05:55<8:52:42,  3.10it/s]

Iteration: 1013


  1%|▊                                                                                   | 1014/100000 [05:55<8:41:00,  3.17it/s]

Iteration: 1014


  1%|▊                                                                                   | 1015/100000 [05:56<8:39:08,  3.18it/s]

Iteration: 1015


  1%|▊                                                                                   | 1016/100000 [05:56<8:36:32,  3.19it/s]

Iteration: 1016


  1%|▊                                                                                   | 1017/100000 [05:56<8:35:48,  3.20it/s]

Iteration: 1017


  1%|▊                                                                                   | 1018/100000 [05:57<8:36:30,  3.19it/s]

Iteration: 1018


  1%|▊                                                                                   | 1019/100000 [05:57<8:36:01,  3.20it/s]

Iteration: 1019


  1%|▊                                                                                   | 1020/100000 [05:57<8:36:16,  3.20it/s]

Iteration: 1020


  1%|▊                                                                                   | 1021/100000 [05:58<8:35:31,  3.20it/s]

Iteration: 1021


  1%|▊                                                                                   | 1022/100000 [05:58<8:34:39,  3.21it/s]

Iteration: 1022


  1%|▊                                                                                   | 1023/100000 [05:58<8:34:32,  3.21it/s]

Iteration: 1023


  1%|▊                                                                                   | 1024/100000 [05:59<8:33:39,  3.21it/s]

Iteration: 1024


  1%|▊                                                                                   | 1025/100000 [05:59<8:27:29,  3.25it/s]

Iteration: 1025


  1%|▊                                                                                   | 1026/100000 [05:59<8:31:03,  3.23it/s]

Iteration: 1026


  1%|▊                                                                                   | 1027/100000 [05:59<8:37:44,  3.19it/s]

Iteration: 1027


  1%|▊                                                                                   | 1028/100000 [06:00<8:39:19,  3.18it/s]

Iteration: 1028


  1%|▊                                                                                   | 1029/100000 [06:00<8:31:36,  3.22it/s]

Iteration: 1029


  1%|▊                                                                                   | 1030/100000 [06:00<8:38:33,  3.18it/s]

Iteration: 1030


  1%|▊                                                                                   | 1031/100000 [06:01<8:37:44,  3.19it/s]

Iteration: 1031


  1%|▊                                                                                   | 1032/100000 [06:01<8:35:52,  3.20it/s]

Iteration: 1032


  1%|▊                                                                                  | 1033/100000 [06:04<31:25:55,  1.14s/it]

Iteration: 1033


  1%|▊                                                                                  | 1034/100000 [06:09<63:53:02,  2.32s/it]

Iteration: 1034


  1%|▊                                                                                  | 1035/100000 [06:15<91:44:37,  3.34s/it]

Iteration: 1035


  1%|▊                                                                                  | 1036/100000 [06:17<85:11:35,  3.10s/it]

Iteration: 1036


  1%|▊                                                                                  | 1037/100000 [06:18<62:29:22,  2.27s/it]

Iteration: 1037


  1%|▊                                                                                  | 1038/100000 [06:18<46:12:45,  1.68s/it]

Iteration: 1038


  1%|▊                                                                                  | 1039/100000 [06:18<34:45:45,  1.26s/it]

Iteration: 1039


  1%|▊                                                                                  | 1040/100000 [06:19<27:03:21,  1.02it/s]

Iteration: 1040


  1%|▊                                                                                  | 1041/100000 [06:19<21:23:24,  1.29it/s]

Iteration: 1041


  1%|▊                                                                                  | 1042/100000 [06:19<17:36:24,  1.56it/s]

Iteration: 1042


  1%|▊                                                                                  | 1043/100000 [06:20<14:53:06,  1.85it/s]

Iteration: 1043


  1%|▊                                                                                  | 1044/100000 [06:20<12:51:04,  2.14it/s]

Iteration: 1044


  1%|▊                                                                                  | 1045/100000 [06:20<11:30:32,  2.39it/s]

Iteration: 1045


  1%|▊                                                                                  | 1046/100000 [06:21<10:35:11,  2.60it/s]

Iteration: 1046


  1%|▊                                                                                  | 1047/100000 [06:21<10:06:17,  2.72it/s]

Iteration: 1047


  1%|▉                                                                                   | 1048/100000 [06:21<9:36:22,  2.86it/s]

Iteration: 1048


  1%|▉                                                                                   | 1049/100000 [06:21<9:17:58,  2.96it/s]

Iteration: 1049


  1%|▉                                                                                   | 1050/100000 [06:22<9:09:59,  3.00it/s]

Iteration: 1050

1050: train loss: 4.074999809265137 / val loss: 4.076562404632568
[CHECKPOINT]: Saving with loss:  4.076562404632568


  1%|▊                                                                                  | 1051/100000 [06:24<27:36:13,  1.00s/it]

Iteration: 1051


  1%|▊                                                                                  | 1052/100000 [06:25<21:55:24,  1.25it/s]

Iteration: 1052


  1%|▊                                                                                  | 1053/100000 [06:25<17:52:18,  1.54it/s]

Iteration: 1053


  1%|▊                                                                                  | 1054/100000 [06:25<15:14:58,  1.80it/s]

Iteration: 1054


  1%|▉                                                                                  | 1055/100000 [06:26<13:12:41,  2.08it/s]

Iteration: 1055


  1%|▉                                                                                  | 1056/100000 [06:26<11:42:29,  2.35it/s]

Iteration: 1056


  1%|▉                                                                                  | 1057/100000 [06:26<10:48:47,  2.54it/s]

Iteration: 1057


  1%|▉                                                                                  | 1058/100000 [06:27<10:06:26,  2.72it/s]

Iteration: 1058


  1%|▉                                                                                   | 1059/100000 [06:27<9:39:06,  2.85it/s]

Iteration: 1059


  1%|▉                                                                                   | 1060/100000 [06:27<9:21:43,  2.94it/s]

Iteration: 1060


  1%|▉                                                                                   | 1061/100000 [06:27<8:58:25,  3.06it/s]

Iteration: 1061


  1%|▉                                                                                   | 1062/100000 [06:28<8:51:53,  3.10it/s]

Iteration: 1062


  1%|▉                                                                                   | 1063/100000 [06:28<8:46:27,  3.13it/s]

Iteration: 1063


  1%|▉                                                                                   | 1064/100000 [06:28<8:42:22,  3.16it/s]

Iteration: 1064


  1%|▉                                                                                   | 1065/100000 [06:29<8:38:55,  3.18it/s]

Iteration: 1065


  1%|▉                                                                                   | 1066/100000 [06:29<8:30:53,  3.23it/s]

Iteration: 1066


  1%|▉                                                                                   | 1067/100000 [06:29<8:30:01,  3.23it/s]

Iteration: 1067


  1%|▉                                                                                   | 1068/100000 [06:30<8:24:27,  3.27it/s]

Iteration: 1068


  1%|▉                                                                                   | 1069/100000 [06:30<8:20:38,  3.29it/s]

Iteration: 1069


  1%|▉                                                                                   | 1070/100000 [06:30<8:24:18,  3.27it/s]

Iteration: 1070


  1%|▉                                                                                   | 1071/100000 [06:31<8:30:19,  3.23it/s]

Iteration: 1071


  1%|▉                                                                                   | 1072/100000 [06:31<8:21:37,  3.29it/s]

Iteration: 1072


  1%|▉                                                                                   | 1073/100000 [06:31<8:34:17,  3.21it/s]

Iteration: 1073


  1%|▉                                                                                   | 1074/100000 [06:31<8:25:31,  3.26it/s]

Iteration: 1074


  1%|▉                                                                                   | 1075/100000 [06:32<8:29:13,  3.24it/s]

Iteration: 1075


  1%|▉                                                                                   | 1076/100000 [06:32<8:29:03,  3.24it/s]

Iteration: 1076


  1%|▉                                                                                   | 1077/100000 [06:32<8:25:56,  3.26it/s]

Iteration: 1077


  1%|▉                                                                                   | 1078/100000 [06:33<8:27:39,  3.25it/s]

Iteration: 1078


  1%|▉                                                                                   | 1079/100000 [06:33<8:19:37,  3.30it/s]

Iteration: 1079


  1%|▉                                                                                   | 1080/100000 [06:33<8:24:50,  3.27it/s]

Iteration: 1080


  1%|▉                                                                                   | 1081/100000 [06:34<8:20:30,  3.29it/s]

Iteration: 1081


  1%|▉                                                                                   | 1082/100000 [06:34<8:21:52,  3.28it/s]

Iteration: 1082


  1%|▉                                                                                   | 1083/100000 [06:34<8:18:54,  3.30it/s]

Iteration: 1083


  1%|▉                                                                                   | 1084/100000 [06:35<8:18:20,  3.31it/s]

Iteration: 1084


  1%|▉                                                                                   | 1085/100000 [06:35<8:13:52,  3.34it/s]

Iteration: 1085


  1%|▉                                                                                   | 1086/100000 [06:35<8:21:41,  3.29it/s]

Iteration: 1086


  1%|▉                                                                                   | 1087/100000 [06:35<8:26:27,  3.26it/s]

Iteration: 1087


  1%|▉                                                                                   | 1088/100000 [06:36<8:28:02,  3.24it/s]

Iteration: 1088


  1%|▉                                                                                   | 1089/100000 [06:36<8:30:20,  3.23it/s]

Iteration: 1089


  1%|▉                                                                                   | 1090/100000 [06:36<8:30:49,  3.23it/s]

Iteration: 1090


  1%|▉                                                                                   | 1091/100000 [06:37<8:33:37,  3.21it/s]

Iteration: 1091


  1%|▉                                                                                   | 1092/100000 [06:37<8:32:08,  3.22it/s]

Iteration: 1092


  1%|▉                                                                                   | 1093/100000 [06:37<8:28:52,  3.24it/s]

Iteration: 1093


  1%|▉                                                                                   | 1094/100000 [06:38<8:35:10,  3.20it/s]

Iteration: 1094


  1%|▉                                                                                   | 1095/100000 [06:38<8:28:39,  3.24it/s]

Iteration: 1095


  1%|▉                                                                                   | 1096/100000 [06:38<8:32:52,  3.21it/s]

Iteration: 1096


  1%|▉                                                                                   | 1097/100000 [06:39<8:32:15,  3.22it/s]

Iteration: 1097


  1%|▉                                                                                   | 1098/100000 [06:39<8:34:21,  3.20it/s]

Iteration: 1098


  1%|▉                                                                                   | 1099/100000 [06:39<8:33:59,  3.21it/s]

Iteration: 1099


  1%|▉                                                                                   | 1100/100000 [06:39<8:32:05,  3.22it/s]

Iteration: 1100

1100: train loss: 4.028124809265137 / val loss: 4.089062690734863


  1%|▉                                                                                  | 1101/100000 [06:42<24:52:51,  1.10it/s]

Iteration: 1101


  1%|▉                                                                                  | 1102/100000 [06:42<19:58:07,  1.38it/s]

Iteration: 1102


  1%|▉                                                                                  | 1103/100000 [06:42<16:31:29,  1.66it/s]

Iteration: 1103


  1%|▉                                                                                  | 1104/100000 [06:43<14:02:20,  1.96it/s]

Iteration: 1104


  1%|▉                                                                                  | 1105/100000 [06:43<12:20:38,  2.23it/s]

Iteration: 1105


  1%|▉                                                                                  | 1106/100000 [06:43<11:08:44,  2.46it/s]

Iteration: 1106


  1%|▉                                                                                  | 1107/100000 [06:44<10:12:36,  2.69it/s]

Iteration: 1107


  1%|▉                                                                                   | 1108/100000 [06:44<9:42:08,  2.83it/s]

Iteration: 1108


  1%|▉                                                                                   | 1109/100000 [06:44<9:21:53,  2.93it/s]

Iteration: 1109


  1%|▉                                                                                   | 1110/100000 [06:45<9:10:28,  2.99it/s]

Iteration: 1110


  1%|▉                                                                                   | 1111/100000 [06:45<8:55:12,  3.08it/s]

Iteration: 1111


  1%|▉                                                                                   | 1112/100000 [06:45<8:50:43,  3.11it/s]

Iteration: 1112


  1%|▉                                                                                   | 1113/100000 [06:45<8:45:25,  3.14it/s]

Iteration: 1113


  1%|▉                                                                                   | 1114/100000 [06:46<8:42:21,  3.16it/s]

Iteration: 1114


  1%|▉                                                                                   | 1115/100000 [06:46<8:42:01,  3.16it/s]

Iteration: 1115


  1%|▉                                                                                   | 1116/100000 [06:46<8:39:01,  3.18it/s]

Iteration: 1116


  1%|▉                                                                                   | 1117/100000 [06:47<8:32:07,  3.22it/s]

Iteration: 1117


  1%|▉                                                                                   | 1118/100000 [06:47<8:38:26,  3.18it/s]

Iteration: 1118


  1%|▉                                                                                   | 1119/100000 [06:47<8:38:52,  3.18it/s]

Iteration: 1119


  1%|▉                                                                                   | 1120/100000 [06:48<8:35:26,  3.20it/s]

Iteration: 1120


  1%|▉                                                                                   | 1121/100000 [06:48<8:42:34,  3.15it/s]

Iteration: 1121


  1%|▉                                                                                   | 1122/100000 [06:48<8:40:31,  3.17it/s]

Iteration: 1122


  1%|▉                                                                                   | 1123/100000 [06:49<8:39:20,  3.17it/s]

Iteration: 1123


  1%|▉                                                                                   | 1124/100000 [06:49<8:52:36,  3.09it/s]

Iteration: 1124


  1%|▉                                                                                   | 1125/100000 [06:49<8:45:21,  3.14it/s]

Iteration: 1125


  1%|▉                                                                                   | 1126/100000 [06:50<8:44:27,  3.14it/s]

Iteration: 1126


  1%|▉                                                                                   | 1127/100000 [06:50<8:33:26,  3.21it/s]

Iteration: 1127


  1%|▉                                                                                   | 1128/100000 [06:50<8:32:35,  3.21it/s]

Iteration: 1128


  1%|▉                                                                                   | 1129/100000 [06:50<8:33:04,  3.21it/s]

Iteration: 1129


  1%|▉                                                                                   | 1130/100000 [06:51<8:31:40,  3.22it/s]

Iteration: 1130


  1%|▉                                                                                   | 1131/100000 [06:51<8:25:52,  3.26it/s]

Iteration: 1131


  1%|▉                                                                                   | 1132/100000 [06:51<8:28:55,  3.24it/s]

Iteration: 1132


  1%|▉                                                                                   | 1133/100000 [06:52<8:34:11,  3.20it/s]

Iteration: 1133


  1%|▉                                                                                   | 1134/100000 [06:52<8:48:39,  3.12it/s]

Iteration: 1134


  1%|▉                                                                                   | 1135/100000 [06:52<8:50:57,  3.10it/s]

Iteration: 1135


  1%|▉                                                                                   | 1136/100000 [06:53<8:51:59,  3.10it/s]

Iteration: 1136


  1%|▉                                                                                   | 1137/100000 [06:53<8:56:17,  3.07it/s]

Iteration: 1137


  1%|▉                                                                                   | 1138/100000 [06:53<8:48:12,  3.12it/s]

Iteration: 1138


  1%|▉                                                                                   | 1139/100000 [06:54<8:39:34,  3.17it/s]

Iteration: 1139


  1%|▉                                                                                   | 1140/100000 [06:54<8:37:30,  3.18it/s]

Iteration: 1140


  1%|▉                                                                                   | 1141/100000 [06:54<8:29:48,  3.23it/s]

Iteration: 1141


  1%|▉                                                                                   | 1142/100000 [06:55<8:25:15,  3.26it/s]

Iteration: 1142


  1%|▉                                                                                   | 1143/100000 [06:55<8:27:54,  3.24it/s]

Iteration: 1143


  1%|▉                                                                                   | 1144/100000 [06:55<8:22:17,  3.28it/s]

Iteration: 1144


  1%|▉                                                                                   | 1145/100000 [06:55<8:27:36,  3.25it/s]

Iteration: 1145


  1%|▉                                                                                   | 1146/100000 [06:56<8:37:14,  3.19it/s]

Iteration: 1146


  1%|▉                                                                                   | 1147/100000 [06:56<8:36:34,  3.19it/s]

Iteration: 1147


  1%|▉                                                                                   | 1148/100000 [06:56<8:40:40,  3.16it/s]

Iteration: 1148


  1%|▉                                                                                   | 1149/100000 [06:57<8:35:17,  3.20it/s]

Iteration: 1149


  1%|▉                                                                                   | 1150/100000 [06:57<8:32:47,  3.21it/s]

Iteration: 1150

1150: train loss: 3.9515624046325684 / val loss: 3.9921875
[CHECKPOINT]: Saving with loss:  3.9921875


  1%|▉                                                                                  | 1151/100000 [07:00<27:34:25,  1.00s/it]

Iteration: 1151


  1%|▉                                                                                  | 1152/100000 [07:00<21:59:17,  1.25it/s]

Iteration: 1152


  1%|▉                                                                                  | 1153/100000 [07:00<17:48:45,  1.54it/s]

Iteration: 1153


  1%|▉                                                                                  | 1154/100000 [07:01<15:20:59,  1.79it/s]

Iteration: 1154


  1%|▉                                                                                  | 1155/100000 [07:01<13:16:25,  2.07it/s]

Iteration: 1155


  1%|▉                                                                                  | 1156/100000 [07:01<11:50:42,  2.32it/s]

Iteration: 1156


  1%|▉                                                                                  | 1157/100000 [07:02<10:53:08,  2.52it/s]

Iteration: 1157


  1%|▉                                                                                  | 1158/100000 [07:02<10:10:27,  2.70it/s]

Iteration: 1158


  1%|▉                                                                                   | 1159/100000 [07:02<9:35:41,  2.86it/s]

Iteration: 1159


  1%|▉                                                                                   | 1160/100000 [07:03<9:13:58,  2.97it/s]

Iteration: 1160


  1%|▉                                                                                   | 1161/100000 [07:03<9:03:23,  3.03it/s]

Iteration: 1161


  1%|▉                                                                                   | 1162/100000 [07:03<8:54:31,  3.08it/s]

Iteration: 1162


  1%|▉                                                                                   | 1163/100000 [07:03<8:45:23,  3.14it/s]

Iteration: 1163


  1%|▉                                                                                   | 1164/100000 [07:04<8:43:50,  3.14it/s]

Iteration: 1164


  1%|▉                                                                                   | 1165/100000 [07:04<8:32:56,  3.21it/s]

Iteration: 1165


  1%|▉                                                                                   | 1166/100000 [07:04<8:33:14,  3.21it/s]

Iteration: 1166


  1%|▉                                                                                   | 1167/100000 [07:05<8:33:18,  3.21it/s]

Iteration: 1167


  1%|▉                                                                                   | 1168/100000 [07:05<8:34:20,  3.20it/s]

Iteration: 1168


  1%|▉                                                                                   | 1169/100000 [07:05<8:34:01,  3.20it/s]

Iteration: 1169


  1%|▉                                                                                   | 1170/100000 [07:06<8:28:02,  3.24it/s]

Iteration: 1170


  1%|▉                                                                                   | 1171/100000 [07:06<8:29:31,  3.23it/s]

Iteration: 1171


  1%|▉                                                                                   | 1172/100000 [07:06<8:28:10,  3.24it/s]

Iteration: 1172


  1%|▉                                                                                   | 1173/100000 [07:07<8:28:46,  3.24it/s]

Iteration: 1173


  1%|▉                                                                                   | 1174/100000 [07:07<8:25:32,  3.26it/s]

Iteration: 1174


  1%|▉                                                                                   | 1175/100000 [07:07<8:21:13,  3.29it/s]

Iteration: 1175


  1%|▉                                                                                   | 1176/100000 [07:07<8:26:31,  3.25it/s]

Iteration: 1176


  1%|▉                                                                                   | 1177/100000 [07:08<8:34:53,  3.20it/s]

Iteration: 1177


  1%|▉                                                                                   | 1178/100000 [07:08<8:42:55,  3.15it/s]

Iteration: 1178


  1%|▉                                                                                   | 1179/100000 [07:08<8:47:02,  3.13it/s]

Iteration: 1179


  1%|▉                                                                                   | 1180/100000 [07:09<8:35:26,  3.20it/s]

Iteration: 1180


  1%|▉                                                                                   | 1181/100000 [07:09<8:29:48,  3.23it/s]

Iteration: 1181


  1%|▉                                                                                   | 1182/100000 [07:09<8:24:46,  3.26it/s]

Iteration: 1182


  1%|▉                                                                                   | 1183/100000 [07:10<8:26:22,  3.25it/s]

Iteration: 1183


  1%|▉                                                                                   | 1184/100000 [07:10<8:22:13,  3.28it/s]

Iteration: 1184


  1%|▉                                                                                   | 1185/100000 [07:10<8:27:14,  3.25it/s]

Iteration: 1185


  1%|▉                                                                                   | 1186/100000 [07:11<8:29:14,  3.23it/s]

Iteration: 1186


  1%|▉                                                                                   | 1187/100000 [07:11<8:31:04,  3.22it/s]

Iteration: 1187


  1%|▉                                                                                   | 1188/100000 [07:11<8:30:52,  3.22it/s]

Iteration: 1188


  1%|▉                                                                                   | 1189/100000 [07:11<8:32:41,  3.21it/s]

Iteration: 1189


  1%|▉                                                                                   | 1190/100000 [07:12<8:25:16,  3.26it/s]

Iteration: 1190


  1%|█                                                                                   | 1191/100000 [07:12<8:27:31,  3.24it/s]

Iteration: 1191


  1%|█                                                                                   | 1192/100000 [07:12<8:30:34,  3.23it/s]

Iteration: 1192


  1%|█                                                                                   | 1193/100000 [07:13<8:31:13,  3.22it/s]

Iteration: 1193


  1%|█                                                                                   | 1194/100000 [07:13<8:31:48,  3.22it/s]

Iteration: 1194


  1%|█                                                                                   | 1195/100000 [07:13<8:24:10,  3.27it/s]

Iteration: 1195


  1%|█                                                                                   | 1196/100000 [07:14<8:27:35,  3.24it/s]

Iteration: 1196


  1%|█                                                                                   | 1197/100000 [07:14<8:24:26,  3.26it/s]

Iteration: 1197


  1%|█                                                                                   | 1198/100000 [07:14<8:25:37,  3.26it/s]

Iteration: 1198


  1%|█                                                                                   | 1199/100000 [07:15<8:42:09,  3.15it/s]

Iteration: 1199


  1%|█                                                                                   | 1200/100000 [07:15<8:49:26,  3.11it/s]

Iteration: 1200

1200: train loss: 3.8812499046325684 / val loss: 4.048437595367432


  1%|▉                                                                                  | 1201/100000 [07:17<25:23:40,  1.08it/s]

Iteration: 1201


  1%|▉                                                                                  | 1202/100000 [07:18<20:20:20,  1.35it/s]

Iteration: 1202


  1%|▉                                                                                  | 1203/100000 [07:18<16:56:31,  1.62it/s]

Iteration: 1203


  1%|▉                                                                                  | 1204/100000 [07:18<14:25:49,  1.90it/s]

Iteration: 1204


  1%|█                                                                                  | 1205/100000 [07:19<12:51:43,  2.13it/s]

Iteration: 1205


  1%|█                                                                                  | 1206/100000 [07:19<11:33:23,  2.37it/s]

Iteration: 1206


  1%|█                                                                                  | 1207/100000 [07:19<10:45:46,  2.55it/s]

Iteration: 1207


  1%|█                                                                                   | 1208/100000 [07:19<9:59:52,  2.74it/s]

Iteration: 1208


  1%|█                                                                                   | 1209/100000 [07:20<9:38:10,  2.85it/s]

Iteration: 1209


  1%|█                                                                                   | 1210/100000 [07:20<9:19:51,  2.94it/s]

Iteration: 1210


  1%|█                                                                                   | 1211/100000 [07:20<9:03:05,  3.03it/s]

Iteration: 1211


  1%|█                                                                                   | 1212/100000 [07:21<8:54:55,  3.08it/s]

Iteration: 1212


  1%|█                                                                                   | 1213/100000 [07:21<8:49:30,  3.11it/s]

Iteration: 1213


  1%|█                                                                                   | 1214/100000 [07:21<8:41:12,  3.16it/s]

Iteration: 1214


  1%|█                                                                                   | 1215/100000 [07:22<8:39:59,  3.17it/s]

Iteration: 1215


  1%|█                                                                                   | 1216/100000 [07:22<8:38:47,  3.17it/s]

Iteration: 1216


  1%|█                                                                                   | 1217/100000 [07:22<8:35:47,  3.19it/s]

Iteration: 1217


  1%|█                                                                                   | 1218/100000 [07:23<8:36:11,  3.19it/s]

Iteration: 1218


  1%|█                                                                                   | 1219/100000 [07:23<8:28:34,  3.24it/s]

Iteration: 1219


  1%|█                                                                                   | 1220/100000 [07:23<8:37:53,  3.18it/s]

Iteration: 1220


  1%|█                                                                                   | 1221/100000 [07:24<8:30:36,  3.22it/s]

Iteration: 1221


  1%|█                                                                                   | 1222/100000 [07:24<8:31:10,  3.22it/s]

Iteration: 1222


  1%|█                                                                                   | 1223/100000 [07:24<8:31:58,  3.22it/s]

Iteration: 1223


  1%|█                                                                                   | 1224/100000 [07:24<8:26:45,  3.25it/s]

Iteration: 1224


  1%|█                                                                                   | 1225/100000 [07:25<8:25:12,  3.26it/s]

Iteration: 1225


  1%|█                                                                                   | 1226/100000 [07:25<8:29:37,  3.23it/s]

Iteration: 1226


  1%|█                                                                                   | 1227/100000 [07:25<8:25:47,  3.25it/s]

Iteration: 1227


  1%|█                                                                                   | 1228/100000 [07:26<8:24:17,  3.26it/s]

Iteration: 1228


  1%|█                                                                                   | 1229/100000 [07:26<8:34:02,  3.20it/s]

Iteration: 1229


  1%|█                                                                                   | 1230/100000 [07:26<8:27:30,  3.24it/s]

Iteration: 1230


  1%|█                                                                                   | 1231/100000 [07:27<8:33:36,  3.21it/s]

Iteration: 1231


  1%|█                                                                                   | 1232/100000 [07:27<8:36:50,  3.19it/s]

Iteration: 1232


  1%|█                                                                                   | 1233/100000 [07:27<8:35:50,  3.19it/s]

Iteration: 1233


  1%|█                                                                                   | 1234/100000 [07:28<8:38:55,  3.17it/s]

Iteration: 1234


  1%|█                                                                                   | 1235/100000 [07:28<8:32:56,  3.21it/s]

Iteration: 1235


  1%|█                                                                                   | 1236/100000 [07:28<8:31:43,  3.22it/s]

Iteration: 1236


  1%|█                                                                                   | 1237/100000 [07:28<8:26:22,  3.25it/s]

Iteration: 1237


  1%|█                                                                                   | 1238/100000 [07:29<8:26:30,  3.25it/s]

Iteration: 1238


  1%|█                                                                                   | 1239/100000 [07:29<8:22:22,  3.28it/s]

Iteration: 1239


  1%|█                                                                                   | 1240/100000 [07:29<8:32:47,  3.21it/s]

Iteration: 1240


  1%|█                                                                                   | 1241/100000 [07:30<8:26:23,  3.25it/s]

Iteration: 1241


  1%|█                                                                                   | 1242/100000 [07:30<8:28:41,  3.24it/s]

Iteration: 1242


  1%|█                                                                                   | 1243/100000 [07:30<8:27:57,  3.24it/s]

Iteration: 1243


  1%|█                                                                                   | 1244/100000 [07:31<8:23:11,  3.27it/s]

Iteration: 1244


  1%|█                                                                                   | 1245/100000 [07:31<8:19:27,  3.30it/s]

Iteration: 1245


  1%|█                                                                                   | 1246/100000 [07:31<8:25:33,  3.26it/s]

Iteration: 1246


  1%|█                                                                                   | 1247/100000 [07:32<8:19:42,  3.29it/s]

Iteration: 1247


  1%|█                                                                                   | 1248/100000 [07:32<8:22:44,  3.27it/s]

Iteration: 1248


  1%|█                                                                                   | 1249/100000 [07:32<8:17:46,  3.31it/s]

Iteration: 1249


  1%|█                                                                                   | 1250/100000 [07:32<8:22:53,  3.27it/s]

Iteration: 1250

1250: train loss: 3.917187452316284 / val loss: 3.9937500953674316


  1%|█                                                                                  | 1251/100000 [07:35<25:06:34,  1.09it/s]

Iteration: 1251


  1%|█                                                                                  | 1252/100000 [07:35<20:09:24,  1.36it/s]

Iteration: 1252


  1%|█                                                                                  | 1253/100000 [07:35<16:47:34,  1.63it/s]

Iteration: 1253


  1%|█                                                                                  | 1254/100000 [07:36<14:15:44,  1.92it/s]

Iteration: 1254


  1%|█                                                                                  | 1255/100000 [07:36<12:34:57,  2.18it/s]

Iteration: 1255


  1%|█                                                                                  | 1256/100000 [07:36<11:28:20,  2.39it/s]

Iteration: 1256


  1%|█                                                                                  | 1257/100000 [07:37<10:47:59,  2.54it/s]

Iteration: 1257


  1%|█                                                                                  | 1258/100000 [07:37<10:05:47,  2.72it/s]

Iteration: 1258


  1%|█                                                                                   | 1259/100000 [07:37<9:53:02,  2.77it/s]

Iteration: 1259


  1%|█                                                                                   | 1260/100000 [07:38<9:39:53,  2.84it/s]

Iteration: 1260


  1%|█                                                                                   | 1261/100000 [07:38<9:23:44,  2.92it/s]

Iteration: 1261


  1%|█                                                                                   | 1262/100000 [07:38<9:01:58,  3.04it/s]

Iteration: 1262


  1%|█                                                                                   | 1263/100000 [07:39<9:00:41,  3.04it/s]

Iteration: 1263


  1%|█                                                                                   | 1264/100000 [07:39<8:52:27,  3.09it/s]

Iteration: 1264


  1%|█                                                                                   | 1265/100000 [07:39<8:39:04,  3.17it/s]

Iteration: 1265


  1%|█                                                                                   | 1266/100000 [07:40<8:38:33,  3.17it/s]

Iteration: 1266


  1%|█                                                                                   | 1267/100000 [07:40<8:38:37,  3.17it/s]

Iteration: 1267


  1%|█                                                                                   | 1268/100000 [07:40<8:41:13,  3.16it/s]

Iteration: 1268


  1%|█                                                                                   | 1269/100000 [07:41<8:42:58,  3.15it/s]

Iteration: 1269


  1%|█                                                                                   | 1270/100000 [07:41<8:49:34,  3.11it/s]

Iteration: 1270


  1%|█                                                                                   | 1271/100000 [07:41<8:52:13,  3.09it/s]

Iteration: 1271


  1%|█                                                                                   | 1272/100000 [07:42<8:49:59,  3.10it/s]

Iteration: 1272


  1%|█                                                                                   | 1273/100000 [07:42<8:47:37,  3.12it/s]

Iteration: 1273


  1%|█                                                                                   | 1274/100000 [07:42<8:41:20,  3.16it/s]

Iteration: 1274


  1%|█                                                                                   | 1275/100000 [07:42<8:47:52,  3.12it/s]

Iteration: 1275


  1%|█                                                                                   | 1276/100000 [07:43<8:45:50,  3.13it/s]

Iteration: 1276


  1%|█                                                                                   | 1277/100000 [07:43<8:37:17,  3.18it/s]

Iteration: 1277


  1%|█                                                                                   | 1278/100000 [07:43<8:28:36,  3.24it/s]

Iteration: 1278


  1%|█                                                                                   | 1279/100000 [07:44<8:30:09,  3.23it/s]

Iteration: 1279


  1%|█                                                                                   | 1280/100000 [07:44<8:29:15,  3.23it/s]

Iteration: 1280


  1%|█                                                                                   | 1281/100000 [07:44<8:23:39,  3.27it/s]

Iteration: 1281


  1%|█                                                                                   | 1282/100000 [07:45<8:27:05,  3.24it/s]

Iteration: 1282


  1%|█                                                                                   | 1283/100000 [07:45<8:29:46,  3.23it/s]

Iteration: 1283


  1%|█                                                                                   | 1284/100000 [07:45<8:28:39,  3.23it/s]

Iteration: 1284


  1%|█                                                                                   | 1285/100000 [07:46<8:22:17,  3.28it/s]

Iteration: 1285


  1%|█                                                                                   | 1286/100000 [07:46<8:25:47,  3.25it/s]

Iteration: 1286


  1%|█                                                                                   | 1287/100000 [07:46<8:35:42,  3.19it/s]

Iteration: 1287


  1%|█                                                                                   | 1288/100000 [07:46<8:37:15,  3.18it/s]

Iteration: 1288


  1%|█                                                                                   | 1289/100000 [07:47<8:36:47,  3.18it/s]

Iteration: 1289


  1%|█                                                                                   | 1290/100000 [07:47<8:41:46,  3.15it/s]

Iteration: 1290


  1%|█                                                                                   | 1291/100000 [07:47<8:47:19,  3.12it/s]

Iteration: 1291


  1%|█                                                                                   | 1292/100000 [07:48<8:49:20,  3.11it/s]

Iteration: 1292


  1%|█                                                                                   | 1293/100000 [07:48<8:44:32,  3.14it/s]

Iteration: 1293


  1%|█                                                                                   | 1294/100000 [07:48<8:43:22,  3.14it/s]

Iteration: 1294


  1%|█                                                                                   | 1295/100000 [07:49<8:36:43,  3.18it/s]

Iteration: 1295


  1%|█                                                                                   | 1296/100000 [07:49<8:30:51,  3.22it/s]

Iteration: 1296


  1%|█                                                                                   | 1297/100000 [07:49<8:24:13,  3.26it/s]

Iteration: 1297


  1%|█                                                                                   | 1298/100000 [07:50<8:26:00,  3.25it/s]

Iteration: 1298


  1%|█                                                                                   | 1299/100000 [07:50<8:22:11,  3.28it/s]

Iteration: 1299


  1%|█                                                                                   | 1300/100000 [07:50<8:26:20,  3.25it/s]

Iteration: 1300

1300: train loss: 3.862499952316284 / val loss: 3.957812547683716
[CHECKPOINT]: Saving with loss:  3.957812547683716


  1%|█                                                                                  | 1301/100000 [07:53<27:19:37,  1.00it/s]

Iteration: 1301


  1%|█                                                                                  | 1302/100000 [07:53<21:34:50,  1.27it/s]

Iteration: 1302


  1%|█                                                                                  | 1303/100000 [07:53<17:34:41,  1.56it/s]

Iteration: 1303


  1%|█                                                                                  | 1304/100000 [07:55<28:05:08,  1.02s/it]

Iteration: 1304


  1%|█                                                                                  | 1305/100000 [08:00<57:55:07,  2.11s/it]

Iteration: 1305


  1%|█                                                                                  | 1306/100000 [08:04<74:41:23,  2.72s/it]

Iteration: 1306


  1%|█                                                                                  | 1307/100000 [08:09<91:26:04,  3.34s/it]

Iteration: 1307


  1%|█                                                                                  | 1308/100000 [08:09<66:40:21,  2.43s/it]

Iteration: 1308


  1%|█                                                                                  | 1309/100000 [08:10<49:07:27,  1.79s/it]

Iteration: 1309


  1%|█                                                                                  | 1310/100000 [08:10<36:50:43,  1.34s/it]

Iteration: 1310


  1%|█                                                                                  | 1311/100000 [08:10<28:28:04,  1.04s/it]

Iteration: 1311


  1%|█                                                                                  | 1312/100000 [08:10<22:20:21,  1.23it/s]

Iteration: 1312


  1%|█                                                                                  | 1313/100000 [08:11<18:05:23,  1.52it/s]

Iteration: 1313


  1%|█                                                                                  | 1314/100000 [08:11<15:13:41,  1.80it/s]

Iteration: 1314


  1%|█                                                                                  | 1315/100000 [08:11<13:07:41,  2.09it/s]

Iteration: 1315


  1%|█                                                                                  | 1316/100000 [08:12<11:36:18,  2.36it/s]

Iteration: 1316


  1%|█                                                                                  | 1317/100000 [08:12<10:31:49,  2.60it/s]

Iteration: 1317


  1%|█                                                                                   | 1318/100000 [08:12<9:56:51,  2.76it/s]

Iteration: 1318


  1%|█                                                                                   | 1319/100000 [08:13<9:27:17,  2.90it/s]

Iteration: 1319


  1%|█                                                                                   | 1320/100000 [08:13<9:03:37,  3.03it/s]

Iteration: 1320


  1%|█                                                                                   | 1321/100000 [08:13<8:54:45,  3.08it/s]

Iteration: 1321


  1%|█                                                                                   | 1322/100000 [08:13<8:40:46,  3.16it/s]

Iteration: 1322


  1%|█                                                                                   | 1323/100000 [08:14<8:31:19,  3.22it/s]

Iteration: 1323


  1%|█                                                                                   | 1324/100000 [08:14<8:26:03,  3.25it/s]

Iteration: 1324


  1%|█                                                                                   | 1325/100000 [08:14<8:19:39,  3.29it/s]

Iteration: 1325


  1%|█                                                                                   | 1326/100000 [08:15<8:18:02,  3.30it/s]

Iteration: 1326


  1%|█                                                                                   | 1327/100000 [08:15<8:14:21,  3.33it/s]

Iteration: 1327


  1%|█                                                                                   | 1328/100000 [08:15<8:10:25,  3.35it/s]

Iteration: 1328


  1%|█                                                                                   | 1329/100000 [08:16<8:17:49,  3.30it/s]

Iteration: 1329


  1%|█                                                                                   | 1330/100000 [08:16<8:18:48,  3.30it/s]

Iteration: 1330


  1%|█                                                                                   | 1331/100000 [08:16<8:15:52,  3.32it/s]

Iteration: 1331


  1%|█                                                                                   | 1332/100000 [08:16<8:15:18,  3.32it/s]

Iteration: 1332


  1%|█                                                                                   | 1333/100000 [08:17<8:18:05,  3.30it/s]

Iteration: 1333


  1%|█                                                                                   | 1334/100000 [08:17<8:23:21,  3.27it/s]

Iteration: 1334


  1%|█                                                                                   | 1335/100000 [08:17<8:20:42,  3.28it/s]

Iteration: 1335


  1%|█                                                                                   | 1336/100000 [08:18<8:16:05,  3.31it/s]

Iteration: 1336


  1%|█                                                                                   | 1337/100000 [08:18<8:16:40,  3.31it/s]

Iteration: 1337


  1%|█                                                                                   | 1338/100000 [08:18<8:21:42,  3.28it/s]

Iteration: 1338


  1%|█                                                                                   | 1339/100000 [08:19<8:24:50,  3.26it/s]

Iteration: 1339


  1%|█▏                                                                                  | 1340/100000 [08:19<8:27:26,  3.24it/s]

Iteration: 1340


  1%|█▏                                                                                  | 1341/100000 [08:19<8:36:50,  3.18it/s]

Iteration: 1341


  1%|█▏                                                                                  | 1342/100000 [08:20<8:29:41,  3.23it/s]

Iteration: 1342


  1%|█▏                                                                                  | 1343/100000 [08:20<8:36:59,  3.18it/s]

Iteration: 1343


  1%|█▏                                                                                  | 1344/100000 [08:20<8:37:16,  3.18it/s]

Iteration: 1344


  1%|█▏                                                                                  | 1345/100000 [08:21<8:36:26,  3.18it/s]

Iteration: 1345


  1%|█▏                                                                                  | 1346/100000 [08:21<8:36:27,  3.18it/s]

Iteration: 1346


  1%|█▏                                                                                  | 1347/100000 [08:21<8:34:16,  3.20it/s]

Iteration: 1347


  1%|█▏                                                                                  | 1348/100000 [08:21<8:33:07,  3.20it/s]

Iteration: 1348


  1%|█▏                                                                                  | 1349/100000 [08:22<8:26:13,  3.25it/s]

Iteration: 1349


  1%|█▏                                                                                  | 1350/100000 [08:22<8:22:59,  3.27it/s]

Iteration: 1350

1350: train loss: 3.8375000953674316 / val loss: 3.903125047683716
[CHECKPOINT]: Saving with loss:  3.903125047683716


  1%|█                                                                                  | 1351/100000 [08:25<27:19:33,  1.00it/s]

Iteration: 1351


  1%|█                                                                                  | 1352/100000 [08:25<21:44:31,  1.26it/s]

Iteration: 1352


  1%|█                                                                                  | 1353/100000 [08:25<17:36:04,  1.56it/s]

Iteration: 1353


  1%|█                                                                                  | 1354/100000 [08:26<15:02:46,  1.82it/s]

Iteration: 1354


  1%|█                                                                                  | 1355/100000 [08:26<13:04:17,  2.10it/s]

Iteration: 1355


  1%|█▏                                                                                 | 1356/100000 [08:26<11:36:14,  2.36it/s]

Iteration: 1356


  1%|█▏                                                                                 | 1357/100000 [08:27<10:48:28,  2.54it/s]

Iteration: 1357


  1%|█▏                                                                                 | 1358/100000 [08:27<10:00:48,  2.74it/s]

Iteration: 1358


  1%|█▏                                                                                  | 1359/100000 [08:27<9:36:32,  2.85it/s]

Iteration: 1359


  1%|█▏                                                                                  | 1360/100000 [08:27<9:15:30,  2.96it/s]

Iteration: 1360


  1%|█▏                                                                                  | 1361/100000 [08:28<9:04:48,  3.02it/s]

Iteration: 1361


  1%|█▏                                                                                  | 1362/100000 [08:28<8:53:41,  3.08it/s]

Iteration: 1362


  1%|█▏                                                                                  | 1363/100000 [08:28<8:41:02,  3.16it/s]

Iteration: 1363


  1%|█▏                                                                                  | 1364/100000 [08:29<8:41:49,  3.15it/s]

Iteration: 1364


  1%|█▏                                                                                  | 1365/100000 [08:29<8:37:13,  3.18it/s]

Iteration: 1365


  1%|█▏                                                                                  | 1366/100000 [08:29<8:30:04,  3.22it/s]

Iteration: 1366


  1%|█▏                                                                                  | 1367/100000 [08:30<8:26:24,  3.25it/s]

Iteration: 1367


  1%|█▏                                                                                  | 1368/100000 [08:30<8:26:03,  3.25it/s]

Iteration: 1368


  1%|█▏                                                                                  | 1369/100000 [08:30<8:28:01,  3.24it/s]

Iteration: 1369


  1%|█▏                                                                                  | 1370/100000 [08:31<8:30:11,  3.22it/s]

Iteration: 1370


  1%|█▏                                                                                  | 1371/100000 [08:31<8:28:42,  3.23it/s]

Iteration: 1371


  1%|█▏                                                                                  | 1372/100000 [08:31<8:25:53,  3.25it/s]

Iteration: 1372


  1%|█▏                                                                                  | 1373/100000 [08:31<8:26:50,  3.24it/s]

Iteration: 1373


  1%|█▏                                                                                  | 1374/100000 [08:32<8:22:01,  3.27it/s]

Iteration: 1374


  1%|█▏                                                                                  | 1375/100000 [08:32<8:20:11,  3.29it/s]

Iteration: 1375


  1%|█▏                                                                                  | 1376/100000 [08:32<8:15:38,  3.32it/s]

Iteration: 1376


  1%|█▏                                                                                  | 1377/100000 [08:33<8:21:36,  3.28it/s]

Iteration: 1377


  1%|█▏                                                                                  | 1378/100000 [08:33<8:17:18,  3.31it/s]

Iteration: 1378


  1%|█▏                                                                                 | 1378/100000 [08:33<10:12:50,  2.68it/s]


Training interrupted. Cleaning up......
GPU memory released


In [63]:
!nvidia-smi


Mon Oct 28 23:31:03 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 546.80                 Driver Version: 546.80       CUDA Version: 12.3     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                     TCC/WDDM  | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  NVIDIA GeForce RTX 3060 ...  WDDM  | 00000000:01:00.0  On |                  N/A |
| N/A   81C    P0              79W /  80W |   4201MiB /  6144MiB |     98%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    