In [1]:
from toy_model import *
wandb.login()

  from .autonotebook import tqdm as notebook_tqdm
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


True

In [2]:
T0 = np.array([
    [0, 1, 0],
    [0, 0, 1],
    [0, 0, 0.5]
])
T1 = np.array([
    [0, 0, 0],
    [0, 0, 0],
    [0.5, 0, 0]
])
dataset = MarkovData(10000, 30, 3, 2, [T0, T1])

In [3]:
#get bigram stats
stat_data=MarkovData(1000, 300, 3, 2, [T0, T1],seed=40)
x=torch.stack(stat_data.data)
# Count bigrams
bigram_counts = {'00': 0, '01': 0, '10': 0, '11': 0}
total_bigrams = 0

for i in range(x.shape[0]):  # for each sequence
    for j in range(x.shape[1] - 1):  # for each position (except last)
        bigram = f"{x[i, j].item()}{x[i, j+1].item()}"
        if bigram in bigram_counts:
            bigram_counts[bigram] += 1
        total_bigrams += 1

print(f"\nBigram frequencies:")
for pattern, count in bigram_counts.items():
    freq = count / total_bigrams
    print(f"{pattern}: {count:,} occurrences ({freq:.4f} = {freq*100:.2f}%)")

print(f"\nTotal bigrams: {total_bigrams:,}")
print(f"Expected total: {x.shape[0] * (x.shape[1] - 1):,}")


Bigram frequencies:
00: 149,392 occurrences (0.4996 = 49.96%)
01: 74,847 occurrences (0.2503 = 25.03%)
10: 74,761 occurrences (0.2500 = 25.00%)
11: 0 occurrences (0.0000 = 0.00%)

Total bigrams: 299,000
Expected total: 299,000


In [4]:
def get_trigram_stats(x):
    """
    Compute trigram frequencies from sequence data.
    
    Args:
        x: Tensor of shape (batch_size, sequence_length) containing integer sequences
    
    Returns:
        dict: Trigram frequencies and counts
    """
    trigram_counts = {
        '000': 0, '001': 0, '010': 0, '011': 0,
        '100': 0, '101': 0, '110': 0, '111': 0
    }
    total_trigrams = 0
    
    for i in range(x.shape[0]):  # for each sequence
        for j in range(x.shape[1] - 2):  # for each position (except last 2)
            trigram = f"{x[i, j].item()}{x[i, j+1].item()}{x[i, j+2].item()}"
            if trigram in trigram_counts:
                trigram_counts[trigram] += 1
            total_trigrams += 1
    
    # Convert to frequencies
    trigram_freqs = {}
    for pattern, count in trigram_counts.items():
        freq = count / total_trigrams if total_trigrams > 0 else 0
        trigram_freqs[pattern] = freq
    
    print(f"\nTrigram frequencies:")
    for pattern, count in trigram_counts.items():
        freq = trigram_freqs[pattern]
        print(f"{pattern}: {count:,} occurrences ({freq:.4f} = {freq*100:.2f}%)")
    print(f"\nTotal trigrams: {total_trigrams:,}")
    print(f"Expected total: {x.shape[0] * (x.shape[1] - 2):,}")
    
    return trigram_freqs, trigram_counts, total_trigrams


In [5]:
stat_data=MarkovData(1000, 300, 3, 2, [T0, T1],seed=40)
x=torch.stack(stat_data.data)
trigram_freqs, trigram_counts, total_trigrams = get_trigram_stats(x)


Trigram frequencies:
000: 74,305 occurrences (0.2493 = 24.93%)
001: 74,584 occurrences (0.2503 = 25.03%)
010: 74,609 occurrences (0.2504 = 25.04%)
011: 0 occurrences (0.0000 = 0.00%)
100: 74,502 occurrences (0.2500 = 25.00%)
101: 0 occurrences (0.0000 = 0.00%)
110: 0 occurrences (0.0000 = 0.00%)
111: 0 occurrences (0.0000 = 0.00%)

Total trigrams: 298,000
Expected total: 298,000


In [None]:
# Metric Codes

def bigram_kl(model):
    T0 = np.array([
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0.5]
    ])
    T1 = np.array([
        [0, 0, 0],
        [0, 0, 1],
        [0.5, 0, 0]
    ])

    # Generate test data
    test_data = MarkovData(50, 30, 3, 2, [T0, T1], seed=42)
    x = torch.stack(test_data.data)  # shape: (50, 30)

    # Build dist1 manually based on x values
    dist1 = torch.zeros(x.size(0), x.size(1), 2)  # shape: (50, 30, 2)
    dist1[..., 0] = torch.where(x == 1, 1, 0.5)  # P(0)
    dist1[..., 1] = torch.where(x == 1, 0, 0.5)  # P(1)

    # Get predicted probabilities from model
    dist2 = model(x).softmax(dim=-1)  # shape: (50, 30, 2)

    # Avoid log(0) by clamping very small values
    eps = 1e-8
    dist1_clamped = dist1.clamp(min=eps)
    dist2_clamped = dist2.clamp(min=eps)

    # Compute KL divergence manually: sum_i P(i) * log(P(i)/Q(i))
    def kl(dist1_clamped, dist2_clamped):
        kl = dist1_clamped * (dist1_clamped.log() - dist2_clamped.log())  # shape: (4, 10, 2)
        kl_sum = kl.sum(dim=-1)  # sum over distribution axis → shape: (4, 10)
        return kl_sum.mean().item()  # scalar

    return kl(dist1_clamped, dist2_clamped), kl(dist2_clamped, dist1_clamped)

def trigram_kl(model):
    """
    Compute KL divergence between true Mealy process and model predictions
    based on trigram-level distributions.
    """
    T0 = np.array([
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0.5]
    ])
    T1 = np.array([
        [0, 0, 0],
        [0, 0, 0],
        [0.5, 0, 0]
    ])
    
    # Generate test data
    test_data = MarkovData(50, 32, 3, 2, [T0, T1], seed=42)
    x = torch.stack(test_data.data)  # shape: (50, 32)
    
    batch_size, seq_len = x.shape
    dist1 = torch.zeros(batch_size, seq_len, 2)  # shape: (50, 32, 2)
    
    # For trigram-based prediction, we need the last two tokens to predict the next
    for i in range(batch_size):
        for j in range(2, seq_len):  
            prev_bigram = f"{x[i, j-2].item()}{x[i, j-1].item()}"
            
            if prev_bigram == "00":
                dist1[i, j, 0] = 0.5    # P(0|00) 
                dist1[i, j, 1] = 0.5    # P(1|00)
            elif prev_bigram == "01":
                dist1[i, j, 0] = 1.0    # P(0|01)
                dist1[i, j, 1] = 0.0    # P(1|01)
            elif prev_bigram == "10":
                dist1[i, j, 0] = 1.0    # P(0|10)
                dist1[i, j, 1] = 0.0    # P(1|10)
            elif prev_bigram == "11":
                # This should never occur in the true process
                dist1[i, j, 0] = 0.5    # Fallback
                dist1[i, j, 1] = 0.5
    
    # Get predicted probabilities from model
    dist2 = model(x).softmax(dim=-1)  # shape: (50, 32, 2)
    
    # Only evaluate positions where we have trigram context (positions 2 and beyond)
    dist1_eval = dist1[:, 2:, :]
    dist2_eval = dist2[:, 2:, :]
    
    # Avoid log(0) by clamping very small values
    eps = 1e-8
    dist1_clamped = dist1_eval.clamp(min=eps)
    dist2_clamped = dist2_eval.clamp(min=eps)
    
    # Compute KL divergence manually: sum_i P(i) * log(P(i)/Q(i))
    def kl(dist1_clamped, dist2_clamped):
        kl_div = dist1_clamped * (dist1_clamped.log() - dist2_clamped.log())
        kl_sum = kl_div.sum(dim=-1)  # sum over distribution axis
        return kl_sum.mean().item()  # scalar
    
    return kl(dist1_clamped, dist2_clamped), kl(dist2_clamped, dist1_clamped)

def markov_kl(model):
    T0 = np.array([
        [0, 1, 0],
        [0, 0, 1],
        [0, 0, 0.5]
    ])
    T1 = np.array([
        [0, 0, 0],
        [0, 0, 1],
        [0.5, 0, 0]
    ])

    # Generate test data
    test_data = MarkovData(50, 30, 3, 2, [T0, T1], seed=42)
    x = torch.stack(test_data.data)  # shape: (4, 10)
    dist1 = []
    for etas in test_data.states:
        dist1.append([test_data.model.token_probabilities(eta) for eta in etas])
    dist1 = torch.tensor(np.array(dist1))[:, 1:, :]

    # Get predicted probabilities from model
    dist2 = model(x).softmax(dim=-1)  # shape: (4, 10, 2)

    # Avoid log(0) by clamping very small values
    eps = 1e-8
    dist1_clamped = dist1.clamp(min=eps)
    dist2_clamped = dist2.clamp(min=eps)

    # Compute KL divergence manually: sum_i P(i) * log(P(i)/Q(i))
    def kl(dist1_clamped, dist2_clamped):
        kl = dist1_clamped * (dist1_clamped.log() - dist2_clamped.log())  # shape: (4, 10, 2)
        kl_sum = kl.sum(dim=-1)  # sum over distribution axis → shape: (4, 10)
        return kl_sum.mean().item()  # scalar

    return kl(dist1_clamped, dist2_clamped), kl(dist2_clamped, dist1_clamped)

def test_on_all(model, gen_len, start=2):
    assert gen_len % 3 == 0, 'gen_len should be a multiple of 3'
    # Building the test data
    test_seq = [0, 1, None] * (gen_len // 3)
    all_seq = [test_seq]
    temp = []
    for i in range(2, len(test_seq), 3):
        for j in range(len(all_seq)):
            temp.append(all_seq[j].copy())
            temp[-1][i] = 0
            temp.append(all_seq[j].copy())
            temp[-1][i] = 1
        all_seq, temp = temp, []
    all_seq = [all_seq, [i[-1:] + i[:-1] for i in all_seq], [i[-2:] + i[:-2] for i in all_seq]]
    all_seq = [torch.tensor(seq, dtype=torch.int64) for seq in all_seq]
    
    # Testing the model
    errors = []
    acc = 0.0
    for i in range(3):
        preds = model(all_seq[i])[:,:-1,:].argmax(dim=-1)
        err = torch.where(all_seq[i][:,1:] != preds, 1, 0)
        for j in range(gen_len // 3):
            err[:, (3*j + i + 1) % (gen_len - 1)] = 0
        err[:,:start] = 0
        errors.append(err)
        acc += (1 - err.sum() / (err.shape[0] * err.shape[1])) * 100
    acc /= 3
    print(f'Accuracy: {acc:.2f} %')
    return torch.cat([all_seq[i][errors[i].sum(dim=-1).nonzero().flatten().tolist()] for i in range(3)])

# A1 - Base Model

In [7]:
model = train_model(
            dataset=dataset,
            n_epochs=50,
            n_layers=1,
            batch_size=64,
            d_model=16,
            attn_only=True,
            lr=0.1,
            wandb=True,
            wandb_project_name='ICL',
            save_dir='A1/proc1/',
            save_every=5
        )
wandb.finish()

Moving model to device:  cpu


  2%|▏         | 1/50 [00:00<00:21,  2.28it/s]

Epoch 1 Samples 8000 Step 124 Training Loss 0.4930770993232727
Epoch 1 Validation Loss 0.49595534801483154


  4%|▍         | 2/50 [00:00<00:18,  2.58it/s]

Epoch 2 Samples 8000 Step 124 Training Loss 0.4825283885002136
Epoch 2 Validation Loss 0.48329704999923706


  6%|▌         | 3/50 [00:01<00:17,  2.66it/s]

Epoch 3 Samples 8000 Step 124 Training Loss 0.4719834625720978
Epoch 3 Validation Loss 0.47519993782043457


  8%|▊         | 4/50 [00:01<00:17,  2.69it/s]

Epoch 4 Samples 8000 Step 124 Training Loss 0.4717867076396942
Epoch 4 Validation Loss 0.4697587490081787


 10%|█         | 5/50 [00:01<00:16,  2.74it/s]

Epoch 5 Samples 8000 Step 124 Training Loss 0.4640764892101288
Epoch 5 Validation Loss 0.4666685163974762


 12%|█▏        | 6/50 [00:02<00:17,  2.57it/s]

Epoch 6 Samples 8000 Step 124 Training Loss 0.4698444902896881
Epoch 6 Validation Loss 0.4651142954826355


 14%|█▍        | 7/50 [00:02<00:18,  2.26it/s]

Epoch 7 Samples 8000 Step 124 Training Loss 0.4586086571216583
Epoch 7 Validation Loss 0.46414029598236084


 16%|█▌        | 8/50 [00:03<00:18,  2.33it/s]

Epoch 8 Samples 8000 Step 124 Training Loss 0.45675894618034363
Epoch 8 Validation Loss 0.4634513854980469


 18%|█▊        | 9/50 [00:03<00:17,  2.35it/s]

Epoch 9 Samples 8000 Step 124 Training Loss 0.45971277356147766
Epoch 9 Validation Loss 0.46270138025283813


 20%|██        | 10/50 [00:04<00:16,  2.38it/s]

Epoch 10 Samples 8000 Step 124 Training Loss 0.46067723631858826
Epoch 10 Validation Loss 0.46203258633613586


 22%|██▏       | 11/50 [00:04<00:15,  2.50it/s]

Epoch 11 Samples 8000 Step 124 Training Loss 0.45584574341773987
Epoch 11 Validation Loss 0.4629868268966675


 24%|██▍       | 12/50 [00:04<00:14,  2.55it/s]

Epoch 12 Samples 8000 Step 124 Training Loss 0.46004295349121094
Epoch 12 Validation Loss 0.4608277678489685


 26%|██▌       | 13/50 [00:05<00:14,  2.50it/s]

Epoch 13 Samples 8000 Step 124 Training Loss 0.463871031999588
Epoch 13 Validation Loss 0.46042293310165405


 28%|██▊       | 14/50 [00:05<00:14,  2.46it/s]

Epoch 14 Samples 8000 Step 124 Training Loss 0.4580078721046448
Epoch 14 Validation Loss 0.4597514271736145


 30%|███       | 15/50 [00:06<00:14,  2.45it/s]

Epoch 15 Samples 8000 Step 124 Training Loss 0.4533943235874176
Epoch 15 Validation Loss 0.45993176102638245


 32%|███▏      | 16/50 [00:06<00:14,  2.43it/s]

Epoch 16 Samples 8000 Step 124 Training Loss 0.463189035654068
Epoch 16 Validation Loss 0.45820555090904236


 34%|███▍      | 17/50 [00:06<00:13,  2.46it/s]

Epoch 17 Samples 8000 Step 124 Training Loss 0.4687129855155945
Epoch 17 Validation Loss 0.4577281177043915


 36%|███▌      | 18/50 [00:07<00:12,  2.57it/s]

Epoch 18 Samples 8000 Step 124 Training Loss 0.45643243193626404
Epoch 18 Validation Loss 0.4614095389842987


 38%|███▊      | 19/50 [00:07<00:11,  2.62it/s]

Epoch 19 Samples 8000 Step 124 Training Loss 0.4499620199203491
Epoch 19 Validation Loss 0.45726659893989563


 40%|████      | 20/50 [00:07<00:11,  2.65it/s]

Epoch 20 Samples 8000 Step 124 Training Loss 0.4513970613479614
Epoch 20 Validation Loss 0.4557045102119446


 42%|████▏     | 21/50 [00:08<00:12,  2.42it/s]

Epoch 21 Samples 8000 Step 124 Training Loss 0.4499674141407013
Epoch 21 Validation Loss 0.45792946219444275


 44%|████▍     | 22/50 [00:08<00:11,  2.36it/s]

Epoch 22 Samples 8000 Step 124 Training Loss 0.4491993188858032
Epoch 22 Validation Loss 0.4558751583099365


 46%|████▌     | 23/50 [00:09<00:11,  2.39it/s]

Epoch 23 Samples 8000 Step 124 Training Loss 0.457042932510376
Epoch 23 Validation Loss 0.455463171005249


 48%|████▊     | 24/50 [00:09<00:10,  2.39it/s]

Epoch 24 Samples 8000 Step 124 Training Loss 0.4568024277687073
Epoch 24 Validation Loss 0.453528493642807


 50%|█████     | 25/50 [00:10<00:10,  2.42it/s]

Epoch 25 Samples 8000 Step 124 Training Loss 0.4537176191806793
Epoch 25 Validation Loss 0.45194247364997864


 52%|█████▏    | 26/50 [00:10<00:10,  2.39it/s]

Epoch 26 Samples 8000 Step 124 Training Loss 0.449684202671051
Epoch 26 Validation Loss 0.4511968493461609


 54%|█████▍    | 27/50 [00:10<00:09,  2.40it/s]

Epoch 27 Samples 8000 Step 124 Training Loss 0.4494398236274719
Epoch 27 Validation Loss 0.4497971832752228


 56%|█████▌    | 28/50 [00:11<00:09,  2.40it/s]

Epoch 28 Samples 8000 Step 124 Training Loss 0.4546521008014679
Epoch 28 Validation Loss 0.4534456431865692


 58%|█████▊    | 29/50 [00:11<00:08,  2.41it/s]

Epoch 29 Samples 8000 Step 124 Training Loss 0.4565465748310089
Epoch 29 Validation Loss 0.4491000473499298


 60%|██████    | 30/50 [00:12<00:08,  2.39it/s]

Epoch 30 Samples 8000 Step 124 Training Loss 0.44805067777633667
Epoch 30 Validation Loss 0.4464956223964691


 62%|██████▏   | 31/50 [00:12<00:08,  2.14it/s]

Epoch 31 Samples 8000 Step 124 Training Loss 0.44179442524909973
Epoch 31 Validation Loss 0.445743203163147


 64%|██████▍   | 32/50 [00:13<00:08,  2.24it/s]

Epoch 32 Samples 8000 Step 124 Training Loss 0.44382768869400024
Epoch 32 Validation Loss 0.44571101665496826


 66%|██████▌   | 33/50 [00:13<00:07,  2.35it/s]

Epoch 33 Samples 8000 Step 124 Training Loss 0.44582438468933105
Epoch 33 Validation Loss 0.44274473190307617


 68%|██████▊   | 34/50 [00:13<00:06,  2.44it/s]

Epoch 34 Samples 8000 Step 124 Training Loss 0.43358293175697327
Epoch 34 Validation Loss 0.4419269561767578


 70%|███████   | 35/50 [00:14<00:05,  2.53it/s]

Epoch 35 Samples 8000 Step 124 Training Loss 0.44564905762672424
Epoch 35 Validation Loss 0.44032686948776245


 72%|███████▏  | 36/50 [00:14<00:05,  2.34it/s]

Epoch 36 Samples 8000 Step 124 Training Loss 0.4414571225643158
Epoch 36 Validation Loss 0.4408358335494995


 74%|███████▍  | 37/50 [00:15<00:05,  2.40it/s]

Epoch 37 Samples 8000 Step 124 Training Loss 0.442792683839798
Epoch 37 Validation Loss 0.43556681275367737


 76%|███████▌  | 38/50 [00:15<00:04,  2.48it/s]

Epoch 38 Samples 8000 Step 124 Training Loss 0.43796032667160034
Epoch 38 Validation Loss 0.434333860874176


 78%|███████▊  | 39/50 [00:15<00:04,  2.50it/s]

Epoch 39 Samples 8000 Step 124 Training Loss 0.4291817247867584
Epoch 39 Validation Loss 0.43257591128349304


 80%|████████  | 40/50 [00:16<00:04,  2.50it/s]

Epoch 40 Samples 8000 Step 124 Training Loss 0.43951407074928284
Epoch 40 Validation Loss 0.43044471740722656


 82%|████████▏ | 41/50 [00:16<00:03,  2.50it/s]

Epoch 41 Samples 8000 Step 124 Training Loss 0.4164983928203583
Epoch 41 Validation Loss 0.4260748624801636


 84%|████████▍ | 42/50 [00:17<00:03,  2.43it/s]

Epoch 42 Samples 8000 Step 124 Training Loss 0.42477330565452576
Epoch 42 Validation Loss 0.42281869053840637


 86%|████████▌ | 43/50 [00:17<00:02,  2.40it/s]

Epoch 43 Samples 8000 Step 124 Training Loss 0.41281723976135254
Epoch 43 Validation Loss 0.42104485630989075


 88%|████████▊ | 44/50 [00:18<00:02,  2.31it/s]

Epoch 44 Samples 8000 Step 124 Training Loss 0.4225577414035797
Epoch 44 Validation Loss 0.4240054488182068


 90%|█████████ | 45/50 [00:18<00:02,  2.38it/s]

Epoch 45 Samples 8000 Step 124 Training Loss 0.42879509925842285
Epoch 45 Validation Loss 0.4238857328891754


 92%|█████████▏| 46/50 [00:18<00:01,  2.48it/s]

Epoch 46 Samples 8000 Step 124 Training Loss 0.42447471618652344
Epoch 46 Validation Loss 0.42857056856155396


 94%|█████████▍| 47/50 [00:19<00:01,  2.48it/s]

Epoch 47 Samples 8000 Step 124 Training Loss 0.40818047523498535
Epoch 47 Validation Loss 0.4120808243751526


 96%|█████████▌| 48/50 [00:19<00:00,  2.40it/s]

Epoch 48 Samples 8000 Step 124 Training Loss 0.405640184879303
Epoch 48 Validation Loss 0.4070811867713928


 98%|█████████▊| 49/50 [00:20<00:00,  2.38it/s]

Epoch 49 Samples 8000 Step 124 Training Loss 0.4067683815956116
Epoch 49 Validation Loss 0.39814648032188416


100%|██████████| 50/50 [00:20<00:00,  2.44it/s]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 50 Samples 8000 Step 124 Training Loss 0.39211350679397583
Epoch 50 Validation Loss 0.3948310315608978


0,1
epoch,▁▁▂▂▂▂▂▂▂▂▃▃▃▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▇▇▇▇▇███████
samples,▄▅▂▅▇█▇▂▇▂▇▁▆▄▁█▂▃▂▁▆▁▃▄▇▆▇█▁▆▁▃▄▄▆▇▆▇▄█
train_loss,█▇▅▅▅▄▄▄▅▅▄▄▄▄▄▄▅▄▄▅▄▄▄▄▄▄▅▄▄▃▄▃▂▂▃▃▁▂▁▁
val_loss,█▇▇▆▆▆▆▆▆▆▆▅▆▅▅▅▅▅▅▅▅▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▃▂▁

0,1
epoch,50.0
samples,8000.0
train_loss,0.39211
val_loss,0.39483


In [9]:
model_name = 'A1'
print(model_name)
print('B||M = KL(Bigram || Model), M||B = KL(Model || Bigram)', end='\n\n')
for i in range(5, 300, 5):
    try:
        bm, mb = bigram_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

A1
B||M = KL(Bigram || Model), M||B = KL(Model || Bigram)



In [65]:
model_name = 'A1'
print(model_name)
print('B||M = KL(Markov || Model), M||B = KL(Model || Markov)', end='\n\n')
for i in range(5, 300, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

A1
B||M = KL(Markov || Model), M||B = KL(Model || Markov)

Model 5: B||M - 0.391, M||B - 4.544
Model 10: B||M - 0.389, M||B - 4.520
Model 15: B||M - 0.388, M||B - 4.504
Model 20: B||M - 0.386, M||B - 4.478
Model 25: B||M - 0.377, M||B - 4.357
Model 30: B||M - 0.277, M||B - 2.963
Model 35: B||M - 0.131, M||B - 1.292
Model 40: B||M - 0.096, M||B - 0.959
Model 45: B||M - 0.075, M||B - 0.650
Model 50: B||M - 0.062, M||B - 0.563
Model 55: B||M - 0.058, M||B - 0.526
Model 60: B||M - 0.056, M||B - 0.497
Model 65: B||M - 0.068, M||B - 0.640
Model 70: B||M - 0.062, M||B - 0.578
Model 75: B||M - 0.066, M||B - 0.571
Model 80: B||M - 0.068, M||B - 0.553
Model 85: B||M - 0.081, M||B - 0.638
Model 90: B||M - 0.078, M||B - 0.581
Model 95: B||M - 0.084, M||B - 0.591
Model 100: B||M - 0.083, M||B - 0.580
Model 105: B||M - 0.084, M||B - 0.548
Model 110: B||M - 0.087, M||B - 0.557
Model 115: B||M - 0.098, M||B - 0.607
Model 120: B||M - 0.088, M||B - 0.555
Model 125: B||M - 0.091, M||B - 0.567
Model 130: 

In [66]:
model_name = 'A1'
epoch = 5
print(model_name, end='\n\n')
while True:
    try:
        print(f'Model_{epoch}')
        for i in test_on_all(load_model(f'{model_name}/model_{epoch}.pt', f'{model_name}/model_cfg.pt'), 9):
            print('|', end='')
            #print(f'Sequence: {i.tolist()}, Predictions: {model(i).argmax(dim=-1).flatten().tolist()}')
        print()
        epoch += 5
    except:
        break

A1

Model_5
Accuracy: 87.50 %
||||||||||||||||||
Model_10
Accuracy: 87.50 %
||||||||||||||||||
Model_15
Accuracy: 87.50 %
||||||||||||||||||
Model_20
Accuracy: 87.50 %
||||||||||||||||||
Model_25
Accuracy: 87.50 %
||||||||||||||||||
Model_30
Accuracy: 92.71 %
||||||||||||
Model_35
Accuracy: 95.83 %
||||||
Model_40
Accuracy: 96.88 %
||||||
Model_45
Accuracy: 96.88 %
||||
Model_50
Accuracy: 96.88 %
||||||
Model_55
Accuracy: 96.88 %
||||
Model_60
Accuracy: 96.88 %
||||||
Model_65
Accuracy: 96.88 %
||||||
Model_70
Accuracy: 96.88 %
||||||
Model_75
Accuracy: 96.88 %
||||
Model_80
Accuracy: 96.88 %
||||
Model_85
Accuracy: 96.88 %
||||||
Model_90
Accuracy: 96.88 %
||||
Model_95
Accuracy: 96.88 %
||||
Model_100
Accuracy: 96.88 %
||||
Model_105
Accuracy: 96.88 %
||||
Model_110
Accuracy: 96.88 %
||||
Model_115
Accuracy: 96.88 %
||||||
Model_120
Accuracy: 96.88 %
||||
Model_125
Accuracy: 96.88 %
||||||
Model_130
Accuracy: 96.88 %
||||
Model_135
Accuracy: 96.88 %
||||
Model_140
Accuracy: 96.88 %
|

# B1 - Layer Norm

In [77]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=4,
            attn_only=True,
            lr=0.1,
            normalization_type='LNPre',
            wandb=True,
            wandb_project_name='superposition',
            save_dir='B1_noscale/',
            save_every=5
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:04<20:10,  4.05s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6384778618812561
Epoch 1 Validation Loss 0.6402980089187622


  1%|          | 2/300 [00:07<17:11,  3.46s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6381754875183105
Epoch 2 Validation Loss 0.6335560083389282


  1%|          | 3/300 [00:09<14:56,  3.02s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6293154358863831
Epoch 3 Validation Loss 0.6274530291557312


  1%|▏         | 4/300 [00:12<13:49,  2.80s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.6169273257255554
Epoch 4 Validation Loss 0.6184610724449158


  2%|▏         | 5/300 [00:14<12:48,  2.60s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.602584958076477
Epoch 5 Validation Loss 0.602120578289032


  2%|▏         | 6/300 [00:16<12:30,  2.55s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.5920641422271729
Epoch 6 Validation Loss 0.5759753584861755


  2%|▏         | 7/300 [00:19<12:19,  2.52s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.5524297952651978
Epoch 7 Validation Loss 0.5528312921524048


  3%|▎         | 8/300 [00:22<12:51,  2.64s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.5506853461265564
Epoch 8 Validation Loss 0.5699166655540466


  3%|▎         | 9/300 [00:24<12:34,  2.59s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.5364776253700256
Epoch 9 Validation Loss 0.5289730429649353


  3%|▎         | 10/300 [00:27<12:37,  2.61s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.5123993158340454
Epoch 10 Validation Loss 0.5106461048126221


  4%|▎         | 11/300 [00:29<12:39,  2.63s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.500733494758606
Epoch 11 Validation Loss 0.5045009851455688


  4%|▍         | 12/300 [00:32<12:19,  2.57s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.511930525302887
Epoch 12 Validation Loss 0.49928390979766846


  4%|▍         | 13/300 [00:34<11:52,  2.48s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.550425112247467
Epoch 13 Validation Loss 0.5215328931808472


  5%|▍         | 14/300 [00:36<11:36,  2.43s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.510535478591919
Epoch 14 Validation Loss 0.4926074147224426


  5%|▌         | 15/300 [00:39<11:25,  2.40s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.4849042594432831
Epoch 15 Validation Loss 0.48665592074394226


  5%|▌         | 16/300 [00:41<11:39,  2.46s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.504124641418457
Epoch 16 Validation Loss 0.5296660661697388


  6%|▌         | 17/300 [00:44<11:30,  2.44s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.5115461945533752
Epoch 17 Validation Loss 0.49467208981513977


  6%|▌         | 18/300 [00:47<12:08,  2.58s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.4788138270378113
Epoch 18 Validation Loss 0.46759912371635437


  6%|▋         | 19/300 [00:49<11:43,  2.50s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.4818231463432312
Epoch 19 Validation Loss 0.4870152473449707


  7%|▋         | 20/300 [00:52<11:56,  2.56s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.45293691754341125
Epoch 20 Validation Loss 0.45019733905792236


  7%|▋         | 21/300 [00:54<11:39,  2.51s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.4419250190258026
Epoch 21 Validation Loss 0.4537978768348694


  7%|▋         | 22/300 [00:57<11:42,  2.53s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.44608786702156067
Epoch 22 Validation Loss 0.44834285974502563


  8%|▊         | 23/300 [00:59<11:36,  2.51s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.44331300258636475
Epoch 23 Validation Loss 0.49879154562950134


  8%|▊         | 24/300 [01:02<11:38,  2.53s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.4349231719970703
Epoch 24 Validation Loss 0.44685137271881104


  8%|▊         | 25/300 [01:04<11:08,  2.43s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.43007439374923706
Epoch 25 Validation Loss 0.4350889325141907


  9%|▊         | 26/300 [01:06<11:08,  2.44s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.4412708282470703
Epoch 26 Validation Loss 0.4337271749973297


  9%|▉         | 27/300 [01:09<11:10,  2.46s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.45290061831474304
Epoch 27 Validation Loss 0.5019891858100891


  9%|▉         | 28/300 [01:12<11:38,  2.57s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.41942936182022095
Epoch 28 Validation Loss 0.4332400858402252


 10%|▉         | 29/300 [01:14<11:16,  2.50s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.47452208399772644
Epoch 29 Validation Loss 0.43630552291870117


 10%|█         | 30/300 [01:17<11:24,  2.54s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.4293906092643738
Epoch 30 Validation Loss 0.440521240234375


 10%|█         | 31/300 [01:19<11:04,  2.47s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.42868924140930176
Epoch 31 Validation Loss 0.42729657888412476


 11%|█         | 32/300 [01:22<11:06,  2.49s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.4475589096546173
Epoch 32 Validation Loss 0.4492683410644531


 11%|█         | 33/300 [01:24<10:47,  2.42s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.45025870203971863
Epoch 33 Validation Loss 0.44633498787879944


 11%|█▏        | 34/300 [01:26<10:50,  2.45s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.42678600549697876
Epoch 34 Validation Loss 0.4261527359485626


 12%|█▏        | 35/300 [01:29<10:45,  2.44s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.41602227091789246
Epoch 35 Validation Loss 0.41954803466796875


 12%|█▏        | 36/300 [01:31<10:38,  2.42s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.40795376896858215
Epoch 36 Validation Loss 0.4235513210296631


 12%|█▏        | 37/300 [01:34<10:38,  2.43s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.4633607268333435
Epoch 37 Validation Loss 0.46685656905174255


 13%|█▎        | 38/300 [01:36<10:37,  2.43s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.389335572719574
Epoch 38 Validation Loss 0.4043152928352356
Epoch 39 Samples 8000 Step 124 Training Loss 0.4053613543510437


 13%|█▎        | 39/300 [01:39<11:02,  2.54s/it]

Epoch 39 Validation Loss 0.4074433445930481


 13%|█▎        | 40/300 [01:41<10:49,  2.50s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.3933524191379547
Epoch 40 Validation Loss 0.4049743413925171


 14%|█▎        | 41/300 [01:44<10:37,  2.46s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.4078123867511749
Epoch 41 Validation Loss 0.4059045612812042


 14%|█▍        | 42/300 [01:46<10:28,  2.44s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.4084678888320923
Epoch 42 Validation Loss 0.39537930488586426


 14%|█▍        | 43/300 [01:48<10:18,  2.41s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.3821086585521698
Epoch 43 Validation Loss 0.38269472122192383


 15%|█▍        | 44/300 [01:51<10:18,  2.42s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.38562101125717163
Epoch 44 Validation Loss 0.3873944580554962


 15%|█▌        | 45/300 [01:53<10:25,  2.45s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.3840903639793396
Epoch 45 Validation Loss 0.372856467962265


 15%|█▌        | 46/300 [01:56<10:34,  2.50s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.3586413264274597
Epoch 46 Validation Loss 0.3677917420864105


 16%|█▌        | 47/300 [01:58<10:35,  2.51s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.3673034608364105
Epoch 47 Validation Loss 0.3647848963737488


 16%|█▌        | 48/300 [02:01<10:40,  2.54s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.3587995767593384
Epoch 48 Validation Loss 0.3572161793708801


 16%|█▋        | 49/300 [02:04<11:02,  2.64s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.34762540459632874
Epoch 49 Validation Loss 0.38512349128723145


 17%|█▋        | 50/300 [02:07<11:11,  2.68s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.3681178390979767
Epoch 50 Validation Loss 0.3744417428970337


 17%|█▋        | 51/300 [02:09<10:35,  2.55s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.3721161484718323
Epoch 51 Validation Loss 0.5775871276855469


 17%|█▋        | 52/300 [02:11<10:19,  2.50s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.3364235758781433
Epoch 52 Validation Loss 0.34083229303359985


 18%|█▊        | 53/300 [02:14<10:22,  2.52s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.3337131440639496
Epoch 53 Validation Loss 0.3386968970298767


 18%|█▊        | 54/300 [02:17<10:41,  2.61s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.3383312523365021
Epoch 54 Validation Loss 0.33947813510894775


 18%|█▊        | 55/300 [02:19<10:31,  2.58s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.3228381872177124
Epoch 55 Validation Loss 0.33010929822921753


 19%|█▊        | 56/300 [02:22<10:44,  2.64s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.3342376947402954
Epoch 56 Validation Loss 0.3323339521884918


 19%|█▉        | 57/300 [02:24<10:29,  2.59s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.32895222306251526
Epoch 57 Validation Loss 0.32480281591415405


 19%|█▉        | 58/300 [02:27<10:23,  2.58s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.31223151087760925
Epoch 58 Validation Loss 0.3235371708869934


 20%|█▉        | 59/300 [02:29<10:10,  2.53s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.3124158978462219
Epoch 59 Validation Loss 0.31804540753364563


 20%|██        | 60/300 [02:32<09:51,  2.46s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.4283345639705658
Epoch 60 Validation Loss 0.3843940496444702


 20%|██        | 61/300 [02:34<10:08,  2.55s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.3102759122848511
Epoch 61 Validation Loss 0.31444016098976135


 21%|██        | 62/300 [02:37<09:52,  2.49s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.31151601672172546
Epoch 62 Validation Loss 0.3109532594680786


 21%|██        | 63/300 [02:39<09:50,  2.49s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.31056758761405945
Epoch 63 Validation Loss 0.3086536228656769


 21%|██▏       | 64/300 [02:42<09:41,  2.47s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.2979070544242859
Epoch 64 Validation Loss 0.30804693698883057


 22%|██▏       | 65/300 [02:44<09:33,  2.44s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.29825690388679504
Epoch 65 Validation Loss 0.3034905791282654


 22%|██▏       | 66/300 [02:46<09:23,  2.41s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.2947728633880615
Epoch 66 Validation Loss 0.30167895555496216


 22%|██▏       | 67/300 [02:49<09:32,  2.46s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.3039746880531311
Epoch 67 Validation Loss 0.30128854513168335


 23%|██▎       | 68/300 [02:51<09:24,  2.43s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.2969827353954315
Epoch 68 Validation Loss 0.3002732992172241


 23%|██▎       | 69/300 [02:54<09:23,  2.44s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.30204108357429504
Epoch 69 Validation Loss 0.30228596925735474


 23%|██▎       | 70/300 [02:56<09:14,  2.41s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.2886727750301361
Epoch 70 Validation Loss 0.29734212160110474


 24%|██▎       | 71/300 [02:58<09:05,  2.38s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.2919049859046936
Epoch 71 Validation Loss 0.299435555934906


 24%|██▍       | 72/300 [03:01<08:54,  2.35s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.2956831455230713
Epoch 72 Validation Loss 0.2987918257713318


 24%|██▍       | 73/300 [03:03<09:14,  2.44s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.3016809821128845
Epoch 73 Validation Loss 0.2977401316165924


 25%|██▍       | 74/300 [03:06<09:12,  2.44s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.2964446544647217
Epoch 74 Validation Loss 0.29525524377822876


 25%|██▌       | 75/300 [03:08<08:59,  2.40s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.2905855178833008
Epoch 75 Validation Loss 0.2943441569805145


 25%|██▌       | 76/300 [03:10<08:46,  2.35s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.28914889693260193
Epoch 76 Validation Loss 0.29567572474479675


 26%|██▌       | 77/300 [03:13<08:46,  2.36s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.2919655442237854
Epoch 77 Validation Loss 0.29524192214012146


 26%|██▌       | 78/300 [03:16<09:21,  2.53s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.2895541489124298
Epoch 78 Validation Loss 0.29525211453437805


 26%|██▋       | 79/300 [03:18<09:08,  2.48s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.2882944345474243
Epoch 79 Validation Loss 0.2938949465751648


 27%|██▋       | 80/300 [03:21<09:08,  2.49s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.29269692301750183
Epoch 80 Validation Loss 0.2964775860309601


 27%|██▋       | 81/300 [03:23<09:06,  2.50s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.2914937138557434
Epoch 81 Validation Loss 0.2913462519645691


 27%|██▋       | 82/300 [03:26<09:03,  2.49s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.2863066792488098
Epoch 82 Validation Loss 0.2908920645713806


 28%|██▊       | 83/300 [03:28<09:00,  2.49s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.28936952352523804
Epoch 83 Validation Loss 0.2913444936275482


 28%|██▊       | 84/300 [03:31<09:01,  2.51s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.2926890552043915
Epoch 84 Validation Loss 0.2909243106842041


 28%|██▊       | 85/300 [03:33<09:22,  2.62s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.2865939438343048
Epoch 85 Validation Loss 0.29070284962654114


 29%|██▊       | 86/300 [03:36<09:02,  2.53s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.3016468584537506
Epoch 86 Validation Loss 0.290424108505249


 29%|██▉       | 87/300 [03:39<09:21,  2.64s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.29495617747306824
Epoch 87 Validation Loss 0.290012925863266


 29%|██▉       | 88/300 [03:41<09:28,  2.68s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.2907554507255554
Epoch 88 Validation Loss 0.2903032600879669


 30%|██▉       | 89/300 [03:44<09:35,  2.73s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.2958168685436249
Epoch 89 Validation Loss 0.29098111391067505


 30%|███       | 90/300 [03:47<09:11,  2.62s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.2783985733985901
Epoch 90 Validation Loss 0.28879597783088684


 30%|███       | 91/300 [03:49<08:58,  2.58s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.2834348678588867
Epoch 91 Validation Loss 0.2883552610874176


 31%|███       | 92/300 [03:52<09:25,  2.72s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.2870749235153198
Epoch 92 Validation Loss 0.28908365964889526


 31%|███       | 93/300 [03:55<09:12,  2.67s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.28688904643058777
Epoch 93 Validation Loss 0.28697025775909424


 31%|███▏      | 94/300 [03:57<09:01,  2.63s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.2873426377773285
Epoch 94 Validation Loss 0.28666871786117554


 32%|███▏      | 95/300 [04:00<08:49,  2.58s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.28641200065612793
Epoch 95 Validation Loss 0.28753411769866943


 32%|███▏      | 96/300 [04:02<08:49,  2.60s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.28347599506378174
Epoch 96 Validation Loss 0.2878199815750122


 32%|███▏      | 97/300 [04:05<09:08,  2.70s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.2823185622692108
Epoch 97 Validation Loss 0.2878401279449463


 33%|███▎      | 98/300 [04:08<08:39,  2.57s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.28177911043167114
Epoch 98 Validation Loss 0.2857034504413605


 33%|███▎      | 99/300 [04:10<08:23,  2.51s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.2887181043624878
Epoch 99 Validation Loss 0.2859412729740143


 33%|███▎      | 100/300 [04:13<08:25,  2.53s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.28298696875572205
Epoch 100 Validation Loss 0.2853265702724457


 34%|███▎      | 101/300 [04:15<08:09,  2.46s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.28948062658309937
Epoch 101 Validation Loss 0.28574588894844055


 34%|███▍      | 102/300 [04:17<07:58,  2.42s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.27596551179885864
Epoch 102 Validation Loss 0.2860565781593323


 34%|███▍      | 103/300 [04:20<07:59,  2.44s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.28990769386291504
Epoch 103 Validation Loss 0.28582775592803955


 35%|███▍      | 104/300 [04:22<08:00,  2.45s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.2779483199119568
Epoch 104 Validation Loss 0.28444841504096985


 35%|███▌      | 105/300 [04:25<08:39,  2.66s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.27770718932151794
Epoch 105 Validation Loss 0.2845170795917511


 35%|███▌      | 106/300 [04:28<08:52,  2.75s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.280730277299881
Epoch 106 Validation Loss 0.2840731143951416


 36%|███▌      | 107/300 [04:31<08:28,  2.64s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.2784387469291687
Epoch 107 Validation Loss 0.283567875623703


 36%|███▌      | 108/300 [04:33<08:13,  2.57s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.28593024611473083
Epoch 108 Validation Loss 0.28732144832611084


 36%|███▋      | 109/300 [04:35<08:03,  2.53s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.2848581075668335
Epoch 109 Validation Loss 0.2835169732570648


 37%|███▋      | 110/300 [04:38<07:50,  2.47s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.2777983844280243
Epoch 110 Validation Loss 0.2830449640750885


 37%|███▋      | 111/300 [04:40<07:42,  2.44s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.2848619222640991
Epoch 111 Validation Loss 0.2830089032649994


 37%|███▋      | 112/300 [04:42<07:34,  2.42s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.27503377199172974
Epoch 112 Validation Loss 0.2826317250728607


 38%|███▊      | 113/300 [04:45<07:21,  2.36s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.283463716506958
Epoch 113 Validation Loss 0.282802939414978


 38%|███▊      | 114/300 [04:47<07:29,  2.42s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.2837565839290619
Epoch 114 Validation Loss 0.2823463976383209


 38%|███▊      | 115/300 [04:50<07:34,  2.45s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.2816658318042755
Epoch 115 Validation Loss 0.282371461391449


 39%|███▊      | 116/300 [04:52<07:13,  2.35s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.2799641489982605
Epoch 116 Validation Loss 0.2818019986152649


 39%|███▉      | 117/300 [04:54<07:05,  2.32s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.29218536615371704
Epoch 117 Validation Loss 0.2811947762966156


 39%|███▉      | 118/300 [04:57<07:05,  2.34s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.2879893183708191
Epoch 118 Validation Loss 0.28168004751205444


 40%|███▉      | 119/300 [04:59<07:15,  2.40s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.2809862196445465
Epoch 119 Validation Loss 0.2836640477180481


 40%|████      | 120/300 [05:02<07:13,  2.41s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.28074315190315247
Epoch 120 Validation Loss 0.2815702259540558


 40%|████      | 121/300 [05:05<07:45,  2.60s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.2781715393066406
Epoch 121 Validation Loss 0.28221726417541504


 41%|████      | 122/300 [05:07<07:39,  2.58s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.2754165530204773
Epoch 122 Validation Loss 0.27997663617134094


 41%|████      | 123/300 [05:10<07:44,  2.62s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.27503329515457153
Epoch 123 Validation Loss 0.279773473739624


 41%|████▏     | 124/300 [05:13<07:43,  2.63s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.27820470929145813
Epoch 124 Validation Loss 0.2795858383178711


 42%|████▏     | 125/300 [05:15<07:24,  2.54s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.27876824140548706
Epoch 125 Validation Loss 0.27974212169647217


 42%|████▏     | 126/300 [05:18<07:56,  2.74s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.28166165947914124
Epoch 126 Validation Loss 0.28129488229751587


 42%|████▏     | 127/300 [05:20<07:33,  2.62s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.27568235993385315
Epoch 127 Validation Loss 0.27990850806236267


 43%|████▎     | 128/300 [05:23<07:14,  2.53s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.2867003083229065
Epoch 128 Validation Loss 0.27904069423675537


 43%|████▎     | 129/300 [05:25<07:13,  2.54s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.2777557969093323
Epoch 129 Validation Loss 0.2792513966560364


 43%|████▎     | 130/300 [05:28<07:07,  2.52s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.273495614528656
Epoch 130 Validation Loss 0.2781181037425995


 44%|████▎     | 131/300 [05:30<06:59,  2.48s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.2747184634208679
Epoch 131 Validation Loss 0.2801433503627777


 44%|████▍     | 132/300 [05:32<06:51,  2.45s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.27195703983306885
Epoch 132 Validation Loss 0.2779169976711273


 44%|████▍     | 133/300 [05:35<06:47,  2.44s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.28585726022720337
Epoch 133 Validation Loss 0.2774672210216522


 45%|████▍     | 134/300 [05:38<06:56,  2.51s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.28442713618278503
Epoch 134 Validation Loss 0.27897483110427856


 45%|████▌     | 135/300 [05:40<07:00,  2.55s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.27553126215934753
Epoch 135 Validation Loss 0.2782911956310272


 45%|████▌     | 136/300 [05:43<06:48,  2.49s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.2736918330192566
Epoch 136 Validation Loss 0.2771643102169037


 46%|████▌     | 137/300 [05:45<06:50,  2.52s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.26814597845077515
Epoch 137 Validation Loss 0.2768275737762451


 46%|████▌     | 138/300 [05:48<06:41,  2.48s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.27886155247688293
Epoch 138 Validation Loss 0.2764889597892761


 46%|████▋     | 139/300 [05:50<06:35,  2.46s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.27546584606170654
Epoch 139 Validation Loss 0.2772355079650879


 47%|████▋     | 140/300 [05:52<06:32,  2.45s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.27397188544273376
Epoch 140 Validation Loss 0.27695760130882263


 47%|████▋     | 141/300 [05:55<06:31,  2.46s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.27601298689842224
Epoch 141 Validation Loss 0.27610480785369873


 47%|████▋     | 142/300 [05:57<06:29,  2.47s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.27503645420074463
Epoch 142 Validation Loss 0.2761237621307373


 48%|████▊     | 143/300 [06:00<06:26,  2.46s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.27580684423446655
Epoch 143 Validation Loss 0.27594491839408875


 48%|████▊     | 144/300 [06:03<06:50,  2.63s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.2766249477863312
Epoch 144 Validation Loss 0.2755085825920105


 48%|████▊     | 145/300 [06:06<06:50,  2.65s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.27170267701148987
Epoch 145 Validation Loss 0.27552974224090576


 49%|████▊     | 146/300 [06:08<06:36,  2.57s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.2738257646560669
Epoch 146 Validation Loss 0.2753669321537018


 49%|████▉     | 147/300 [06:10<06:28,  2.54s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.2869345545768738
Epoch 147 Validation Loss 0.2823283076286316


 49%|████▉     | 148/300 [06:13<06:25,  2.53s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.27162182331085205
Epoch 148 Validation Loss 0.2750518023967743


 50%|████▉     | 149/300 [06:16<06:43,  2.67s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.27516093850135803
Epoch 149 Validation Loss 0.27452781796455383


 50%|█████     | 150/300 [06:18<06:33,  2.62s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.27250272035598755
Epoch 150 Validation Loss 0.2745022177696228


 50%|█████     | 151/300 [06:21<06:23,  2.58s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.27059406042099
Epoch 151 Validation Loss 0.2745230793952942


 51%|█████     | 152/300 [06:23<06:19,  2.56s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.2865104377269745
Epoch 152 Validation Loss 0.28168490529060364


 51%|█████     | 153/300 [06:26<06:07,  2.50s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.2775147557258606
Epoch 153 Validation Loss 0.275603711605072


 51%|█████▏    | 154/300 [06:28<06:07,  2.51s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.272833913564682
Epoch 154 Validation Loss 0.27449509501457214


 52%|█████▏    | 155/300 [06:31<06:08,  2.54s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.2716713547706604
Epoch 155 Validation Loss 0.27379462122917175


 52%|█████▏    | 156/300 [06:33<05:57,  2.48s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.27261316776275635
Epoch 156 Validation Loss 0.2745026648044586


 52%|█████▏    | 157/300 [06:36<05:48,  2.44s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.2705855071544647
Epoch 157 Validation Loss 0.2731417417526245


 53%|█████▎    | 158/300 [06:38<05:54,  2.50s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.27307578921318054
Epoch 158 Validation Loss 0.27332180738449097


 53%|█████▎    | 159/300 [06:41<06:04,  2.58s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.2720329463481903
Epoch 159 Validation Loss 0.27380892634391785


 53%|█████▎    | 160/300 [06:43<05:52,  2.52s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.2671235203742981
Epoch 160 Validation Loss 0.2738058567047119


 54%|█████▎    | 161/300 [06:46<05:46,  2.49s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.27147847414016724
Epoch 161 Validation Loss 0.27282461524009705


 54%|█████▍    | 162/300 [06:48<05:41,  2.48s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.2696373462677002
Epoch 162 Validation Loss 0.2727726101875305


 54%|█████▍    | 163/300 [06:51<05:38,  2.47s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.27306103706359863
Epoch 163 Validation Loss 0.2736380100250244


 55%|█████▍    | 164/300 [06:53<05:38,  2.49s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.27393820881843567
Epoch 164 Validation Loss 0.2764213979244232


 55%|█████▌    | 165/300 [06:56<05:34,  2.48s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.27361080050468445
Epoch 165 Validation Loss 0.2751319408416748


 55%|█████▌    | 166/300 [06:58<05:25,  2.43s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.2700609564781189
Epoch 166 Validation Loss 0.27283406257629395


 56%|█████▌    | 167/300 [07:00<05:21,  2.42s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.2746230661869049
Epoch 167 Validation Loss 0.27307212352752686
Epoch 168 Samples 8000 Step 124 Training Loss 0.27217811346054077
Epoch 168 Validation Loss 0.2720493674278259


 56%|█████▋    | 169/300 [07:06<05:34,  2.56s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.2689257562160492
Epoch 169 Validation Loss 0.27324092388153076


 57%|█████▋    | 170/300 [07:08<05:24,  2.50s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.2794654369354248
Epoch 170 Validation Loss 0.2772763669490814


 57%|█████▋    | 171/300 [07:11<05:24,  2.51s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.27289894223213196
Epoch 171 Validation Loss 0.27338314056396484


 57%|█████▋    | 172/300 [07:13<05:31,  2.59s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.2724708318710327
Epoch 172 Validation Loss 0.27292129397392273


 58%|█████▊    | 173/300 [07:16<05:22,  2.54s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.27760347723960876
Epoch 173 Validation Loss 0.272623211145401


 58%|█████▊    | 174/300 [07:19<05:26,  2.59s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.27538883686065674
Epoch 174 Validation Loss 0.2730117738246918


 58%|█████▊    | 175/300 [07:21<05:16,  2.53s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.2759481370449066
Epoch 175 Validation Loss 0.27238157391548157


 59%|█████▊    | 176/300 [07:23<05:07,  2.48s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.2757584750652313
Epoch 176 Validation Loss 0.2823046147823334


 59%|█████▉    | 177/300 [07:26<05:16,  2.57s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.27553993463516235
Epoch 177 Validation Loss 0.27539530396461487


 59%|█████▉    | 178/300 [07:29<05:15,  2.59s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.2695971429347992
Epoch 178 Validation Loss 0.27244895696640015


 60%|█████▉    | 179/300 [07:31<05:14,  2.60s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.2799980938434601
Epoch 179 Validation Loss 0.27687835693359375


 60%|██████    | 180/300 [07:34<05:13,  2.62s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.271165668964386
Epoch 180 Validation Loss 0.27206090092658997


 60%|██████    | 181/300 [07:37<05:16,  2.66s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.2672325074672699
Epoch 181 Validation Loss 0.27318331599235535


 61%|██████    | 182/300 [07:39<05:06,  2.60s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.2714156210422516
Epoch 182 Validation Loss 0.2729335129261017


 61%|██████    | 183/300 [07:41<04:51,  2.49s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.36740297079086304
Epoch 183 Validation Loss 0.2915763258934021


 61%|██████▏   | 184/300 [07:44<04:48,  2.49s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.27424386143684387
Epoch 184 Validation Loss 0.2720055878162384


 62%|██████▏   | 185/300 [07:46<04:38,  2.42s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.26855558156967163
Epoch 185 Validation Loss 0.2722982168197632


 62%|██████▏   | 186/300 [07:49<04:36,  2.43s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.2739904522895813
Epoch 186 Validation Loss 0.27340245246887207


 62%|██████▏   | 187/300 [07:51<04:33,  2.42s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.2705652415752411
Epoch 187 Validation Loss 0.27291277050971985


 63%|██████▎   | 188/300 [07:54<04:33,  2.45s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.27101942896842957
Epoch 188 Validation Loss 0.27258893847465515


 63%|██████▎   | 189/300 [07:56<04:33,  2.46s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.2738180458545685
Epoch 189 Validation Loss 0.27425864338874817


 63%|██████▎   | 190/300 [07:59<04:31,  2.47s/it]

Epoch 190 Samples 8000 Step 124 Training Loss 0.2740550935268402
Epoch 190 Validation Loss 0.27280572056770325


 64%|██████▎   | 191/300 [08:01<04:26,  2.44s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.2717759907245636
Epoch 191 Validation Loss 0.27129513025283813


 64%|██████▍   | 192/300 [08:04<04:48,  2.67s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.2700135409832001
Epoch 192 Validation Loss 0.2715875208377838


 64%|██████▍   | 193/300 [08:07<04:48,  2.70s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.27295711636543274
Epoch 193 Validation Loss 0.27206769585609436


 65%|██████▍   | 194/300 [08:10<04:46,  2.70s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.2754947543144226
Epoch 194 Validation Loss 0.2719779312610626


 65%|██████▌   | 195/300 [08:12<04:33,  2.60s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.28001654148101807
Epoch 195 Validation Loss 0.27365225553512573


 65%|██████▌   | 196/300 [08:14<04:24,  2.54s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.2743215262889862
Epoch 196 Validation Loss 0.27599549293518066


 66%|██████▌   | 197/300 [08:17<04:12,  2.45s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.2700956165790558
Epoch 197 Validation Loss 0.2751537263393402


 66%|██████▌   | 198/300 [08:19<04:04,  2.40s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.2705385684967041
Epoch 198 Validation Loss 0.27275973558425903


 66%|██████▋   | 199/300 [08:21<04:06,  2.44s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.2723091244697571
Epoch 199 Validation Loss 0.2724993824958801


 67%|██████▋   | 200/300 [08:24<04:02,  2.43s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.2730042636394501
Epoch 200 Validation Loss 0.27340030670166016


 67%|██████▋   | 201/300 [08:26<04:01,  2.44s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.28159746527671814
Epoch 201 Validation Loss 0.28021439909935


 67%|██████▋   | 202/300 [08:29<03:59,  2.45s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.2808433473110199
Epoch 202 Validation Loss 0.28133806586265564


 68%|██████▊   | 203/300 [08:31<03:54,  2.42s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.2691231966018677
Epoch 203 Validation Loss 0.272944837808609


 68%|██████▊   | 204/300 [08:34<04:15,  2.66s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.2740817964076996
Epoch 204 Validation Loss 0.2737281024456024


 68%|██████▊   | 205/300 [08:37<04:13,  2.67s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.2717769145965576
Epoch 205 Validation Loss 0.2722814381122589


 69%|██████▊   | 206/300 [08:40<04:12,  2.69s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.27306291460990906
Epoch 206 Validation Loss 0.2719211280345917


 69%|██████▉   | 207/300 [08:43<04:20,  2.80s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.27343374490737915
Epoch 207 Validation Loss 0.27214232087135315


 69%|██████▉   | 208/300 [08:45<04:07,  2.69s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.2683466970920563
Epoch 208 Validation Loss 0.2717461585998535


 70%|██████▉   | 209/300 [08:48<04:01,  2.65s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.26677101850509644
Epoch 209 Validation Loss 0.2733886241912842


 70%|███████   | 210/300 [08:50<03:49,  2.55s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.2673622667789459
Epoch 210 Validation Loss 0.27176395058631897


 70%|███████   | 211/300 [08:53<03:57,  2.67s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.2706867754459381
Epoch 211 Validation Loss 0.27362653613090515


 71%|███████   | 212/300 [08:55<03:42,  2.53s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.27819377183914185
Epoch 212 Validation Loss 0.27292025089263916


 71%|███████   | 213/300 [08:58<03:36,  2.48s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.27522188425064087
Epoch 213 Validation Loss 0.27500981092453003


 71%|███████▏  | 214/300 [09:01<03:44,  2.61s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.2698158025741577
Epoch 214 Validation Loss 0.275132954120636


 72%|███████▏  | 215/300 [09:03<03:44,  2.64s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.2670845687389374
Epoch 215 Validation Loss 0.27138739824295044


 72%|███████▏  | 216/300 [09:08<04:28,  3.20s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.27556267380714417
Epoch 216 Validation Loss 0.27171483635902405


 72%|███████▏  | 217/300 [09:11<04:18,  3.11s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.27335160970687866
Epoch 217 Validation Loss 0.2765716314315796


 73%|███████▎  | 218/300 [09:14<04:12,  3.08s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.27185386419296265
Epoch 218 Validation Loss 0.2719601094722748


 73%|███████▎  | 219/300 [09:17<04:09,  3.08s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.26835042238235474
Epoch 219 Validation Loss 0.2715224623680115


 73%|███████▎  | 220/300 [09:20<04:03,  3.05s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.27276742458343506
Epoch 220 Validation Loss 0.2713916599750519


 74%|███████▎  | 221/300 [09:22<03:52,  2.94s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.27102896571159363
Epoch 221 Validation Loss 0.2719987630844116


 74%|███████▍  | 222/300 [09:27<04:22,  3.37s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.26429206132888794
Epoch 222 Validation Loss 0.2714507579803467


 74%|███████▍  | 223/300 [09:31<04:38,  3.62s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.2703240215778351
Epoch 223 Validation Loss 0.2730465531349182


 75%|███████▍  | 224/300 [09:34<04:15,  3.37s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.2687884569168091
Epoch 224 Validation Loss 0.2717609405517578


 75%|███████▌  | 225/300 [09:37<04:11,  3.36s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.2736412286758423
Epoch 225 Validation Loss 0.2787337005138397


 75%|███████▌  | 226/300 [09:40<04:03,  3.29s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.27351224422454834
Epoch 226 Validation Loss 0.2713615894317627


 76%|███████▌  | 227/300 [09:44<04:09,  3.42s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.2678705155849457
Epoch 227 Validation Loss 0.27156081795692444


 76%|███████▌  | 228/300 [09:47<03:48,  3.17s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.27171239256858826
Epoch 228 Validation Loss 0.2709607481956482


 76%|███████▋  | 229/300 [09:49<03:25,  2.90s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.27196693420410156
Epoch 229 Validation Loss 0.2710471451282501


 77%|███████▋  | 230/300 [09:51<03:12,  2.75s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.27103182673454285
Epoch 230 Validation Loss 0.27339908480644226


 77%|███████▋  | 231/300 [09:54<03:01,  2.63s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.28027355670928955
Epoch 231 Validation Loss 0.27078700065612793


 77%|███████▋  | 232/300 [09:56<02:52,  2.53s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.26888948678970337
Epoch 232 Validation Loss 0.2716914117336273


 78%|███████▊  | 233/300 [09:58<02:46,  2.49s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.27314338088035583
Epoch 233 Validation Loss 0.27111685276031494


 78%|███████▊  | 234/300 [10:01<02:40,  2.43s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.270891398191452
Epoch 234 Validation Loss 0.27096396684646606


 78%|███████▊  | 235/300 [10:03<02:42,  2.50s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.26644423604011536
Epoch 235 Validation Loss 0.2705821394920349


 79%|███████▊  | 236/300 [10:06<02:46,  2.61s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.2670568823814392
Epoch 236 Validation Loss 0.2751446068286896


 79%|███████▉  | 237/300 [10:08<02:37,  2.50s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.27222415804862976
Epoch 237 Validation Loss 0.27336248755455017
Epoch 238 Samples 8000 Step 124 Training Loss 0.270104318857193
Epoch 238 Validation Loss 0.2731638252735138


 80%|███████▉  | 239/300 [10:14<02:44,  2.69s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.2712378203868866
Epoch 239 Validation Loss 0.271928071975708


 80%|████████  | 240/300 [10:16<02:38,  2.65s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.27245181798934937
Epoch 240 Validation Loss 0.2742939591407776


 80%|████████  | 241/300 [10:19<02:42,  2.75s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.2693166136741638
Epoch 241 Validation Loss 0.2708132863044739


 81%|████████  | 242/300 [10:22<02:31,  2.60s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.27953994274139404
Epoch 242 Validation Loss 0.27912694215774536


 81%|████████  | 243/300 [10:24<02:30,  2.64s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.28534188866615295
Epoch 243 Validation Loss 0.28188836574554443


 81%|████████▏ | 244/300 [10:27<02:25,  2.60s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.2710934579372406
Epoch 244 Validation Loss 0.27106785774230957


 82%|████████▏ | 245/300 [10:30<02:24,  2.63s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.27187326550483704
Epoch 245 Validation Loss 0.27144351601600647


 82%|████████▏ | 246/300 [10:32<02:18,  2.57s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.27119433879852295
Epoch 246 Validation Loss 0.27394312620162964


 82%|████████▏ | 247/300 [10:34<02:12,  2.50s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.27448391914367676
Epoch 247 Validation Loss 0.272549033164978


 83%|████████▎ | 248/300 [10:37<02:17,  2.64s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.26655933260917664
Epoch 248 Validation Loss 0.2722247540950775


 83%|████████▎ | 249/300 [10:40<02:12,  2.60s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.34738707542419434
Epoch 249 Validation Loss 0.3730132281780243


 83%|████████▎ | 250/300 [10:43<02:19,  2.78s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.3119271695613861
Epoch 250 Validation Loss 0.3259327709674835


 84%|████████▎ | 251/300 [10:46<02:18,  2.83s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.321383535861969
Epoch 251 Validation Loss 0.31525298953056335


 84%|████████▍ | 252/300 [10:48<02:05,  2.61s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.31000012159347534
Epoch 252 Validation Loss 0.3135232925415039


 84%|████████▍ | 253/300 [10:50<01:49,  2.33s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.30688488483428955
Epoch 253 Validation Loss 0.3050428330898285


 85%|████████▍ | 254/300 [10:52<01:45,  2.29s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.2963607609272003
Epoch 254 Validation Loss 0.30087774991989136


 85%|████████▌ | 255/300 [10:54<01:41,  2.25s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.3017471432685852
Epoch 255 Validation Loss 0.2977112829685211


 85%|████████▌ | 256/300 [10:57<01:40,  2.29s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.30516669154167175
Epoch 256 Validation Loss 0.30112525820732117


 86%|████████▌ | 257/300 [10:59<01:35,  2.22s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.28677377104759216
Epoch 257 Validation Loss 0.29480165243148804


 86%|████████▌ | 258/300 [11:01<01:34,  2.24s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.2867658734321594
Epoch 258 Validation Loss 0.2931419014930725


 86%|████████▋ | 259/300 [11:03<01:32,  2.26s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.29644206166267395
Epoch 259 Validation Loss 0.29316458106040955


 87%|████████▋ | 260/300 [11:05<01:21,  2.04s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.29172539710998535
Epoch 260 Validation Loss 0.2905053198337555


 87%|████████▋ | 261/300 [11:07<01:17,  1.98s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.29775601625442505
Epoch 261 Validation Loss 0.2973615229129791


 87%|████████▋ | 262/300 [11:09<01:20,  2.13s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.29167628288269043
Epoch 262 Validation Loss 0.28713053464889526


 88%|████████▊ | 263/300 [11:11<01:19,  2.14s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.3818274438381195
Epoch 263 Validation Loss 0.3388155996799469


 88%|████████▊ | 264/300 [11:14<01:18,  2.19s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.2866971790790558
Epoch 264 Validation Loss 0.2849580943584442


 88%|████████▊ | 265/300 [11:16<01:16,  2.20s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.29297345876693726
Epoch 265 Validation Loss 0.2835872769355774


 89%|████████▊ | 266/300 [11:18<01:16,  2.24s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.2807770371437073
Epoch 266 Validation Loss 0.28348308801651


 89%|████████▉ | 267/300 [11:20<01:12,  2.18s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.27936872839927673
Epoch 267 Validation Loss 0.2820322811603546


 89%|████████▉ | 268/300 [11:22<01:10,  2.19s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.2806490957736969
Epoch 268 Validation Loss 0.28181159496307373


 90%|████████▉ | 269/300 [11:24<01:04,  2.07s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.2829253673553467
Epoch 269 Validation Loss 0.2817844748497009


 90%|█████████ | 270/300 [11:26<00:59,  1.97s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.27958419919013977
Epoch 270 Validation Loss 0.28154638409614563


 90%|█████████ | 271/300 [11:28<00:55,  1.91s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.27820488810539246
Epoch 271 Validation Loss 0.28040578961372375


 91%|█████████ | 272/300 [11:29<00:52,  1.86s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.28174668550491333
Epoch 272 Validation Loss 0.28078150749206543


 91%|█████████ | 273/300 [11:31<00:48,  1.81s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.2806292474269867
Epoch 273 Validation Loss 0.2831748425960541


 91%|█████████▏| 274/300 [11:33<00:46,  1.80s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.27808818221092224
Epoch 274 Validation Loss 0.27995967864990234


 92%|█████████▏| 275/300 [11:35<00:44,  1.77s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.28515520691871643
Epoch 275 Validation Loss 0.28004190325737


 92%|█████████▏| 276/300 [11:36<00:40,  1.68s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.27732163667678833
Epoch 276 Validation Loss 0.28054946660995483


 92%|█████████▏| 277/300 [11:38<00:38,  1.67s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.28859615325927734
Epoch 277 Validation Loss 0.2812163829803467


 93%|█████████▎| 278/300 [11:39<00:37,  1.72s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.2942206859588623
Epoch 278 Validation Loss 0.2881269156932831


 93%|█████████▎| 279/300 [11:41<00:37,  1.77s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.2735782563686371
Epoch 279 Validation Loss 0.28086215257644653


 93%|█████████▎| 280/300 [11:44<00:37,  1.88s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.2810857594013214
Epoch 280 Validation Loss 0.28074803948402405


 94%|█████████▎| 281/300 [11:46<00:39,  2.08s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.28053468465805054
Epoch 281 Validation Loss 0.28129610419273376


 94%|█████████▍| 282/300 [11:48<00:39,  2.18s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.28170090913772583
Epoch 282 Validation Loss 0.28049078583717346


 94%|█████████▍| 283/300 [11:51<00:37,  2.23s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.2768925726413727
Epoch 283 Validation Loss 0.2827404737472534


 95%|█████████▍| 284/300 [11:53<00:36,  2.26s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.2777286171913147
Epoch 284 Validation Loss 0.2808670401573181


 95%|█████████▌| 285/300 [11:56<00:34,  2.29s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.2785153090953827
Epoch 285 Validation Loss 0.28259947896003723


 95%|█████████▌| 286/300 [11:58<00:32,  2.32s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.2874833345413208
Epoch 286 Validation Loss 0.27887263894081116


 96%|█████████▌| 287/300 [12:00<00:30,  2.38s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.2832764685153961
Epoch 287 Validation Loss 0.2787608206272125


 96%|█████████▌| 288/300 [12:03<00:27,  2.32s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.26801925897598267
Epoch 288 Validation Loss 0.2796764075756073


 96%|█████████▋| 289/300 [12:05<00:24,  2.26s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.27646899223327637
Epoch 289 Validation Loss 0.2796024680137634


 97%|█████████▋| 290/300 [12:07<00:22,  2.23s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.278235524892807
Epoch 290 Validation Loss 0.2801596522331238


 97%|█████████▋| 291/300 [12:09<00:20,  2.23s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.2758626341819763
Epoch 291 Validation Loss 0.28025805950164795


 97%|█████████▋| 292/300 [12:11<00:17,  2.18s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.2774600386619568
Epoch 292 Validation Loss 0.2789100110530853


 98%|█████████▊| 293/300 [12:13<00:15,  2.15s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.2732389569282532
Epoch 293 Validation Loss 0.27904796600341797


 98%|█████████▊| 294/300 [12:15<00:12,  2.12s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.27759724855422974
Epoch 294 Validation Loss 0.27869912981987


 98%|█████████▊| 295/300 [12:17<00:10,  2.11s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.278051495552063
Epoch 295 Validation Loss 0.28072187304496765


 99%|█████████▊| 296/300 [12:20<00:08,  2.19s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.2717974781990051
Epoch 296 Validation Loss 0.2795358896255493


 99%|█████████▉| 297/300 [12:22<00:06,  2.15s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.2774728536605835
Epoch 297 Validation Loss 0.28071579337120056


 99%|█████████▉| 298/300 [12:24<00:04,  2.14s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.30191299319267273
Epoch 298 Validation Loss 0.28112322092056274


100%|█████████▉| 299/300 [12:27<00:02,  2.38s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.2816503643989563
Epoch 299 Validation Loss 0.2791288197040558


100%|██████████| 300/300 [12:29<00:00,  2.50s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.2786913514137268
Epoch 300 Validation Loss 0.27877387404441833


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▂▂▂▂▂▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▇▇▇▇▇▇████
samples,▁▄▅▆▄▇▅▇█▃▇▇▂▃▁▅▁▅█▆▅▂▃▃▇▁▇▂█▃▂▄█▆▆▄▆▆▅▃
train_loss,█▅▅▄▄▃▃▃▃▃▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▆▄▃▃▂▂▂▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▂▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.27869
val_loss,0.27877


In [None]:
model = train_model(
            dataset=dataset,
            n_epochs=25,
            d_model=4,
            attn_only=True,
            lr=0.2,
            normalization_type='LN',
            wandb=True,
            wandb_project_name='superposition',
            save_dir='B1_B',
            save_every=5
        )
wandb.finish()

Moving model to device:  cpu


  4%|▍         | 1/25 [00:06<02:43,  6.80s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.604846179485321
Epoch 1 Validation Loss 0.5970064401626587


  8%|▊         | 2/25 [00:13<02:30,  6.56s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.5687693953514099
Epoch 2 Validation Loss 0.5740916132926941


 12%|█▏        | 3/25 [00:19<02:23,  6.51s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6009655594825745
Epoch 3 Validation Loss 0.5926099419593811


 16%|█▌        | 4/25 [00:26<02:16,  6.48s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.591775119304657
Epoch 4 Validation Loss 0.6013076901435852


 20%|██        | 5/25 [00:32<02:08,  6.44s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.6074135303497314
Epoch 5 Validation Loss 0.5878259539604187


 24%|██▍       | 6/25 [00:38<02:02,  6.45s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.3404093086719513
Epoch 6 Validation Loss 0.36083856225013733
Epoch 7 Samples 8000 Step 124 Training Loss 0.38681620359420776
Epoch 7 Validation Loss 0.36029863357543945


 28%|██▊       | 7/25 [00:45<01:56,  6.47s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.3091886341571808
Epoch 8 Validation Loss 0.3038143217563629


 32%|███▏      | 8/25 [00:51<01:49,  6.46s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.2866921126842499


 36%|███▌      | 9/25 [00:58<01:44,  6.56s/it]

Epoch 9 Validation Loss 0.29604414105415344
Epoch 10 Samples 8000 Step 124 Training Loss 0.30738481879234314
Epoch 10 Validation Loss 0.30227792263031006


 40%|████      | 10/25 [01:05<01:39,  6.66s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.30910927057266235
Epoch 11 Validation Loss 0.3047047555446625


 48%|████▊     | 12/25 [01:18<01:26,  6.62s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.2768578827381134
Epoch 12 Validation Loss 0.27754801511764526
Epoch 13 Samples 8000 Step 124 Training Loss 0.5395041108131409
Epoch 13 Validation Loss 0.5325284004211426


 56%|█████▌    | 14/25 [01:32<01:13,  6.64s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.3201112449169159
Epoch 14 Validation Loss 0.32490816712379456
Epoch 15 Samples 8000 Step 124 Training Loss 0.2780906856060028
Epoch 15 Validation Loss 0.2780841886997223


 60%|██████    | 15/25 [01:38<01:06,  6.63s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.2659001052379608
Epoch 16 Validation Loss 0.26781296730041504


 68%|██████▊   | 17/25 [01:51<00:52,  6.61s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.27459821105003357
Epoch 17 Validation Loss 0.26633644104003906
Epoch 18 Samples 8000 Step 124 Training Loss 0.2721829414367676
Epoch 18 Validation Loss 0.26865091919898987


 72%|███████▏  | 18/25 [01:58<00:46,  6.58s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.27328869700431824
Epoch 19 Validation Loss 0.2801414430141449


 76%|███████▌  | 19/25 [02:04<00:39,  6.59s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.2587829828262329
Epoch 20 Validation Loss 0.25901129841804504


 80%|████████  | 20/25 [02:11<00:33,  6.61s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.26022177934646606
Epoch 21 Validation Loss 0.2608344852924347


 84%|████████▍ | 21/25 [02:18<00:26,  6.62s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.26407042145729065
Epoch 22 Validation Loss 0.2711472511291504


 92%|█████████▏| 23/25 [02:31<00:13,  6.63s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.2559430003166199
Epoch 23 Validation Loss 0.25720712542533875
Epoch 24 Samples 8000 Step 124 Training Loss 0.25245070457458496
Epoch 24 Validation Loss 0.256989449262619


 96%|█████████▌| 24/25 [02:38<00:06,  6.63s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.25624406337738037
Epoch 25 Validation Loss 0.2598312199115753


100%|██████████| 25/25 [02:44<00:00,  6.59s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▂▂▂▃▃▃▃▃▃▄▄▅▅▅▅▅▅▆▆▆▇▇▇▇▇▇▇▇██
samples,▁▃▅▃▆▃▃▂▅▁▅▁▄▇█▆▂▃▅▁▃▃▄▅▂▃▄▆▇▃▆▄▅▄█▇▄▆▆▄
train_loss,▇▇▆▆▇▆▄▃▆▃▃▅▂▂▂█▂▂▂▂▂▂▆▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▇███▃▃▂▂▂▂▁▇▂▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,25.0
samples,8000.0
train_loss,0.25624
val_loss,0.25983


In [None]:
model_name = 'A1'
print(model_name)
print('B||M = KL(Bigram || Model), M||B = KL(Bigram || Markov)', end='\n\n')
for i in range(5, 300, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

B1
B||M = KL(Bigram || Model), M||B = KL(Bigram || Markov)

Model 5: B||M - 0.383, M||B - 4.442
Model 10: B||M - 0.341, M||B - 3.899
Model 15: B||M - 0.292, M||B - 3.198
Model 20: B||M - 0.274, M||B - 3.012
Model 25: B||M - 0.246, M||B - 2.678
Model 30: B||M - 0.206, M||B - 2.282
Model 35: B||M - 0.304, M||B - 2.642
Model 40: B||M - 0.126, M||B - 1.281
Model 45: B||M - 0.106, M||B - 1.091
Model 50: B||M - 0.184, M||B - 1.548
Model 55: B||M - 0.082, M||B - 0.856
Model 60: B||M - 0.078, M||B - 0.818
Model 65: B||M - 0.072, M||B - 0.746
Model 70: B||M - 0.267, M||B - 2.807
Model 75: B||M - 0.098, M||B - 0.852
Model 80: B||M - 0.106, M||B - 0.742
Model 85: B||M - 0.114, M||B - 0.731
Model 90: B||M - 0.116, M||B - 0.725
Model 95: B||M - 0.121, M||B - 0.710
Model 100: B||M - 0.129, M||B - 0.729


In [58]:
model_name = 'B1'
print(model_name)
print('B||M = KL(Markov || Model), M||B = KL(Model || Markov)', end='\n\n')
for i in range(5, 300, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

B1
B||M = KL(Markov || Model), M||B = KL(Model || Markov)

Model 5: B||M - 0.383, M||B - 4.442
Model 10: B||M - 0.341, M||B - 3.899
Model 15: B||M - 0.292, M||B - 3.198
Model 20: B||M - 0.274, M||B - 3.012
Model 25: B||M - 0.246, M||B - 2.678
Model 30: B||M - 0.206, M||B - 2.282
Model 35: B||M - 0.304, M||B - 2.642
Model 40: B||M - 0.126, M||B - 1.281
Model 45: B||M - 0.106, M||B - 1.091
Model 50: B||M - 0.184, M||B - 1.548
Model 55: B||M - 0.082, M||B - 0.856
Model 60: B||M - 0.078, M||B - 0.818
Model 65: B||M - 0.072, M||B - 0.746
Model 70: B||M - 0.267, M||B - 2.807
Model 75: B||M - 0.098, M||B - 0.852
Model 80: B||M - 0.106, M||B - 0.742
Model 85: B||M - 0.114, M||B - 0.731
Model 90: B||M - 0.116, M||B - 0.725
Model 95: B||M - 0.121, M||B - 0.710
Model 100: B||M - 0.129, M||B - 0.729


In [56]:
model_name = 'B1'
epoch = 5
print(model_name, end='\n\n')
while True:
    try:
        print(f'Model_{epoch}')
        for i in test_on_all(load_model(f'{model_name}/model_{epoch}.pt', f'{model_name}/model_cfg.pt'), 9):
            print('|', end='')
            #print(f'Sequence: {i.tolist()}, Predictions: {model(i).argmax(dim=-1).flatten().tolist()}')
        print()
        epoch += 5
    except:
        break

B1

Model_5
Accuracy: 87.50 %
||||||||||||||||||
Model_10
Accuracy: 88.54 %
|||||||||||||||||
Model_15
Accuracy: 89.06 %
||||||||||||||||
Model_20
Accuracy: 88.54 %
||||||||||||||||
Model_25
Accuracy: 91.67 %
|||||||||||||
Model_30
Accuracy: 97.92 %
||||
Model_35
Accuracy: 91.67 %
||||||||||||||||
Model_40
Accuracy: 97.40 %
|||||
Model_45
Accuracy: 97.40 %
|||||
Model_50
Accuracy: 91.67 %
||||||||||||
Model_55
Accuracy: 97.40 %
|||||
Model_60
Accuracy: 97.40 %
|||||
Model_65
Accuracy: 97.40 %
|||||
Model_70
Accuracy: 94.79 %
||||||||||
Model_75
Accuracy: 98.96 %
||
Model_80
Accuracy: 99.48 %
|
Model_85
Accuracy: 98.96 %
||
Model_90
Accuracy: 98.96 %
||
Model_95
Accuracy: 100.00 %

Model_100
Accuracy: 100.00 %

Model_105


# C1 - MLP No LN

In [86]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=4,
            d_mlp=16,
            lr=0.1,
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='C1'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:01<08:06,  1.63s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6456921696662903
Epoch 1 Validation Loss 0.6470196843147278


  1%|          | 2/300 [00:03<07:50,  1.58s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6414846181869507
Epoch 2 Validation Loss 0.6419138312339783


  1%|          | 3/300 [00:04<07:42,  1.56s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6425970792770386
Epoch 3 Validation Loss 0.6397568583488464


  1%|▏         | 4/300 [00:06<08:51,  1.79s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.6394213438034058
Epoch 4 Validation Loss 0.6384862065315247


  2%|▏         | 5/300 [00:08<08:57,  1.82s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.6346855163574219
Epoch 5 Validation Loss 0.6375410556793213


  2%|▏         | 6/300 [00:10<08:51,  1.81s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.637339174747467
Epoch 6 Validation Loss 0.6368271708488464


  2%|▏         | 7/300 [00:12<08:51,  1.82s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.6359414458274841
Epoch 7 Validation Loss 0.6364356279373169


  3%|▎         | 8/300 [00:14<09:14,  1.90s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.6393077373504639
Epoch 8 Validation Loss 0.6358028054237366


  3%|▎         | 9/300 [00:16<09:27,  1.95s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.6394236087799072
Epoch 9 Validation Loss 0.6352559328079224


  3%|▎         | 10/300 [00:18<08:59,  1.86s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.6328982710838318
Epoch 10 Validation Loss 0.6347201466560364


  4%|▎         | 11/300 [00:19<08:41,  1.80s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.6291667222976685
Epoch 11 Validation Loss 0.6342573165893555


  4%|▍         | 12/300 [00:21<08:31,  1.78s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.6346768140792847
Epoch 12 Validation Loss 0.633610188961029


  4%|▍         | 13/300 [00:23<08:22,  1.75s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.6321232914924622
Epoch 13 Validation Loss 0.6329507231712341


  5%|▍         | 14/300 [00:25<08:25,  1.77s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.6349892616271973
Epoch 14 Validation Loss 0.6322609186172485


  5%|▌         | 15/300 [00:26<08:19,  1.75s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.6351009011268616
Epoch 15 Validation Loss 0.631578803062439


  5%|▌         | 16/300 [00:28<08:41,  1.84s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.6322143077850342
Epoch 16 Validation Loss 0.6307447552680969


  6%|▌         | 17/300 [00:30<08:39,  1.83s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.626013994216919
Epoch 17 Validation Loss 0.6299778819084167


  6%|▌         | 18/300 [00:32<08:48,  1.87s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.6282632946968079
Epoch 18 Validation Loss 0.62885981798172


  6%|▋         | 19/300 [00:34<09:19,  1.99s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.6310935616493225
Epoch 19 Validation Loss 0.62762850522995


  7%|▋         | 20/300 [00:36<08:48,  1.89s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.629884660243988
Epoch 20 Validation Loss 0.6262092590332031


  7%|▋         | 21/300 [00:38<08:32,  1.84s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.6210935115814209
Epoch 21 Validation Loss 0.6241752505302429


  7%|▋         | 22/300 [00:39<08:15,  1.78s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.6202161908149719
Epoch 22 Validation Loss 0.6221402883529663


  8%|▊         | 23/300 [00:41<08:02,  1.74s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.6151484847068787
Epoch 23 Validation Loss 0.6188517808914185


  8%|▊         | 24/300 [00:43<08:17,  1.80s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.6135756373405457
Epoch 24 Validation Loss 0.6140524744987488


  8%|▊         | 25/300 [00:45<08:19,  1.82s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.5963752269744873
Epoch 25 Validation Loss 0.6071016788482666


  9%|▊         | 26/300 [00:46<08:03,  1.76s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.5921199917793274
Epoch 26 Validation Loss 0.597851574420929


  9%|▉         | 27/300 [00:48<08:07,  1.78s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.5951511263847351
Epoch 27 Validation Loss 0.5863642692565918


  9%|▉         | 28/300 [00:50<07:49,  1.73s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.56923907995224
Epoch 28 Validation Loss 0.5702242255210876


 10%|▉         | 29/300 [00:52<07:56,  1.76s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.5290298461914062
Epoch 29 Validation Loss 0.5336078405380249


 10%|█         | 30/300 [00:53<07:46,  1.73s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.49704161286354065
Epoch 30 Validation Loss 0.4958018958568573


 10%|█         | 31/300 [00:55<07:40,  1.71s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.49492010474205017
Epoch 31 Validation Loss 0.5096004605293274


 11%|█         | 32/300 [00:57<07:35,  1.70s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.3838121294975281
Epoch 32 Validation Loss 0.4108237326145172


 11%|█         | 33/300 [00:59<07:53,  1.77s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.4428037703037262
Epoch 33 Validation Loss 0.43851035833358765


 11%|█▏        | 34/300 [01:01<08:28,  1.91s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.4049528241157532
Epoch 34 Validation Loss 0.4076817035675049


 12%|█▏        | 35/300 [01:03<08:27,  1.91s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.3934685289859772
Epoch 35 Validation Loss 0.3757965862751007


 12%|█▏        | 36/300 [01:04<08:04,  1.83s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.3919074237346649
Epoch 36 Validation Loss 0.38900113105773926


 12%|█▏        | 37/300 [01:06<08:17,  1.89s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.34174102544784546
Epoch 37 Validation Loss 0.34131473302841187


 13%|█▎        | 38/300 [01:09<08:32,  1.96s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.44649645686149597
Epoch 38 Validation Loss 0.42759275436401367


 13%|█▎        | 39/300 [01:10<08:22,  1.93s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.3625045120716095
Epoch 39 Validation Loss 0.359012633562088


 13%|█▎        | 40/300 [01:13<09:27,  2.18s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.33327046036720276
Epoch 40 Validation Loss 0.33562996983528137


 14%|█▎        | 41/300 [01:16<10:14,  2.37s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.3830874562263489
Epoch 41 Validation Loss 0.3605085611343384


 14%|█▍        | 42/300 [01:18<09:16,  2.16s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.3291630446910858
Epoch 42 Validation Loss 0.3355157673358917


 14%|█▍        | 43/300 [01:20<08:51,  2.07s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.3373147249221802
Epoch 43 Validation Loss 0.33454322814941406


 15%|█▍        | 44/300 [01:22<08:41,  2.04s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.34178560972213745
Epoch 44 Validation Loss 0.33505165576934814


 15%|█▌        | 45/300 [01:23<08:14,  1.94s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.3244019150733948
Epoch 45 Validation Loss 0.3132280707359314


 15%|█▌        | 46/300 [01:25<07:45,  1.83s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.3045639097690582
Epoch 46 Validation Loss 0.3062414824962616


 16%|█▌        | 47/300 [01:27<07:40,  1.82s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.2972843647003174
Epoch 47 Validation Loss 0.30264049768447876


 16%|█▌        | 48/300 [01:28<07:37,  1.82s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.2988334000110626
Epoch 48 Validation Loss 0.29839974641799927


 16%|█▋        | 49/300 [01:30<07:32,  1.80s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.30953872203826904
Epoch 49 Validation Loss 0.3077102303504944


 17%|█▋        | 50/300 [01:32<07:20,  1.76s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.32291197776794434
Epoch 50 Validation Loss 0.3459482192993164


 17%|█▋        | 51/300 [01:34<07:34,  1.82s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.27766379714012146
Epoch 51 Validation Loss 0.27888253331184387


 17%|█▋        | 52/300 [01:36<07:43,  1.87s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.27597376704216003
Epoch 52 Validation Loss 0.2791220247745514


 18%|█▊        | 53/300 [01:38<07:39,  1.86s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.29747676849365234
Epoch 53 Validation Loss 0.29363077878952026


 18%|█▊        | 54/300 [01:39<07:30,  1.83s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.2810266315937042
Epoch 54 Validation Loss 0.2801113724708557


 18%|█▊        | 55/300 [01:41<07:43,  1.89s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.285330593585968
Epoch 55 Validation Loss 0.2855236530303955


 19%|█▊        | 56/300 [01:43<07:22,  1.81s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.2737354040145874
Epoch 56 Validation Loss 0.2757982015609741


 19%|█▉        | 57/300 [01:45<07:59,  1.97s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.28661036491394043
Epoch 57 Validation Loss 0.2811639606952667


 19%|█▉        | 58/300 [01:48<08:10,  2.03s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.2813081741333008
Epoch 58 Validation Loss 0.2848288118839264


 20%|█▉        | 59/300 [01:50<08:17,  2.07s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.2836860418319702
Epoch 59 Validation Loss 0.2967475354671478


 20%|██        | 60/300 [01:51<07:49,  1.96s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.2693602740764618
Epoch 60 Validation Loss 0.2723768651485443


 20%|██        | 61/300 [01:53<07:28,  1.88s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.2663160264492035
Epoch 61 Validation Loss 0.2756396532058716


 21%|██        | 62/300 [01:55<07:56,  2.00s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.26711076498031616
Epoch 62 Validation Loss 0.27282628417015076


 21%|██        | 63/300 [01:57<07:50,  1.98s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.2755146026611328
Epoch 63 Validation Loss 0.2743277847766876


 21%|██▏       | 64/300 [01:59<07:35,  1.93s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.28682634234428406
Epoch 64 Validation Loss 0.2824811637401581


 22%|██▏       | 65/300 [02:01<07:12,  1.84s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.28225523233413696
Epoch 65 Validation Loss 0.2771766185760498


 22%|██▏       | 66/300 [02:03<07:11,  1.84s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.2862367033958435
Epoch 66 Validation Loss 0.28228381276130676


 22%|██▏       | 67/300 [02:04<06:52,  1.77s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.2743719518184662
Epoch 67 Validation Loss 0.27748116850852966


 23%|██▎       | 68/300 [02:06<06:45,  1.75s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.2897486090660095
Epoch 68 Validation Loss 0.29799261689186096


 23%|██▎       | 69/300 [02:08<07:00,  1.82s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.2785630524158478
Epoch 69 Validation Loss 0.28277403116226196


 23%|██▎       | 70/300 [02:10<06:51,  1.79s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.2670019268989563
Epoch 70 Validation Loss 0.26974213123321533


 24%|██▎       | 71/300 [02:12<07:03,  1.85s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.28297746181488037
Epoch 71 Validation Loss 0.30281031131744385


 24%|██▍       | 72/300 [02:14<07:27,  1.96s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.265929639339447
Epoch 72 Validation Loss 0.2693108022212982


 24%|██▍       | 73/300 [02:16<07:23,  1.95s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.2696814239025116
Epoch 73 Validation Loss 0.27551302313804626


 25%|██▍       | 74/300 [02:17<06:57,  1.85s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.27430856227874756
Epoch 74 Validation Loss 0.2682681381702423


 25%|██▌       | 75/300 [02:19<06:41,  1.78s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.2708795666694641
Epoch 75 Validation Loss 0.26899245381355286


 25%|██▌       | 76/300 [02:21<07:16,  1.95s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.27187949419021606
Epoch 76 Validation Loss 0.26820436120033264


 26%|██▌       | 77/300 [02:23<07:02,  1.89s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.2637191712856293
Epoch 77 Validation Loss 0.2683736979961395


 26%|██▌       | 78/300 [02:25<06:55,  1.87s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.26780471205711365
Epoch 78 Validation Loss 0.2657763957977295


 26%|██▋       | 79/300 [02:27<06:55,  1.88s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.27957260608673096
Epoch 79 Validation Loss 0.28015199303627014


 27%|██▋       | 80/300 [02:29<06:52,  1.88s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.27430999279022217
Epoch 80 Validation Loss 0.27301403880119324


 27%|██▋       | 81/300 [02:31<07:03,  1.93s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.29173019528388977
Epoch 81 Validation Loss 0.2823280990123749


 27%|██▋       | 82/300 [02:33<07:24,  2.04s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.2659665048122406
Epoch 82 Validation Loss 0.26605939865112305


 28%|██▊       | 83/300 [02:35<07:13,  2.00s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.27255237102508545
Epoch 83 Validation Loss 0.26775938272476196


 28%|██▊       | 84/300 [02:37<07:11,  2.00s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.2704407274723053
Epoch 84 Validation Loss 0.2740032374858856


 28%|██▊       | 85/300 [02:39<07:06,  1.99s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.2645317316055298
Epoch 85 Validation Loss 0.26508599519729614


 29%|██▊       | 86/300 [02:41<07:08,  2.00s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.2739335000514984
Epoch 86 Validation Loss 0.272373229265213


 29%|██▉       | 87/300 [02:43<06:41,  1.88s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.27830132842063904
Epoch 87 Validation Loss 0.275083988904953


 29%|██▉       | 88/300 [02:44<06:36,  1.87s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.26438575983047485
Epoch 88 Validation Loss 0.26528576016426086


 30%|██▉       | 89/300 [02:46<06:22,  1.81s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.27246424555778503
Epoch 89 Validation Loss 0.2672715485095978


 30%|███       | 90/300 [02:48<06:15,  1.79s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.2637835741043091
Epoch 90 Validation Loss 0.27028483152389526


 30%|███       | 91/300 [02:50<06:18,  1.81s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.26412129402160645
Epoch 91 Validation Loss 0.2642969489097595


 31%|███       | 92/300 [02:51<06:07,  1.77s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.2662070393562317
Epoch 92 Validation Loss 0.26347616314888


 31%|███       | 93/300 [02:53<06:05,  1.77s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.2594103515148163
Epoch 93 Validation Loss 0.26296892762184143


 31%|███▏      | 94/300 [02:55<05:58,  1.74s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.2668845057487488
Epoch 94 Validation Loss 0.26322638988494873


 32%|███▏      | 95/300 [02:57<06:23,  1.87s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.2683990001678467
Epoch 95 Validation Loss 0.26878491044044495


 32%|███▏      | 96/300 [02:59<06:15,  1.84s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.27792268991470337
Epoch 96 Validation Loss 0.26853224635124207


 32%|███▏      | 97/300 [03:00<05:59,  1.77s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.2693755328655243
Epoch 97 Validation Loss 0.2632337212562561


 33%|███▎      | 98/300 [03:02<06:13,  1.85s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.2902587056159973
Epoch 98 Validation Loss 0.29449304938316345


 33%|███▎      | 99/300 [03:04<05:59,  1.79s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.2637594938278198
Epoch 99 Validation Loss 0.26782041788101196


 33%|███▎      | 100/300 [03:06<06:13,  1.87s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.26567813754081726
Epoch 100 Validation Loss 0.26340511441230774


 34%|███▎      | 101/300 [03:08<05:54,  1.78s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.2627623379230499
Epoch 101 Validation Loss 0.2657095193862915


 34%|███▍      | 102/300 [03:09<05:43,  1.74s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.2733986973762512
Epoch 102 Validation Loss 0.27185702323913574


 34%|███▍      | 103/300 [03:11<05:50,  1.78s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.27078375220298767
Epoch 103 Validation Loss 0.2739086151123047


 35%|███▍      | 104/300 [03:13<06:01,  1.85s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.26012104749679565
Epoch 104 Validation Loss 0.2653065025806427


 35%|███▌      | 105/300 [03:15<06:15,  1.93s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.2673659920692444
Epoch 105 Validation Loss 0.26298975944519043


 35%|███▌      | 106/300 [03:17<06:14,  1.93s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.2600732743740082
Epoch 106 Validation Loss 0.2641275227069855


 36%|███▌      | 107/300 [03:19<06:05,  1.89s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.2619473934173584
Epoch 107 Validation Loss 0.2656416594982147


 36%|███▌      | 108/300 [03:21<06:30,  2.03s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.2638716995716095
Epoch 108 Validation Loss 0.26415207982063293


 36%|███▋      | 109/300 [03:24<06:37,  2.08s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.25686541199684143
Epoch 109 Validation Loss 0.26515787839889526


 37%|███▋      | 110/300 [03:26<06:37,  2.09s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.2671717405319214
Epoch 110 Validation Loss 0.26438644528388977


 37%|███▋      | 111/300 [03:28<06:25,  2.04s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.26953816413879395
Epoch 111 Validation Loss 0.2665835916996002


 37%|███▋      | 112/300 [03:30<06:20,  2.03s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.2677558660507202
Epoch 112 Validation Loss 0.27072006464004517


 38%|███▊      | 113/300 [03:32<06:50,  2.19s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.26419296860694885
Epoch 113 Validation Loss 0.2665087878704071


 38%|███▊      | 114/300 [03:35<07:33,  2.44s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.26396769285202026
Epoch 114 Validation Loss 0.26412418484687805


 38%|███▊      | 115/300 [03:37<06:52,  2.23s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.26367053389549255
Epoch 115 Validation Loss 0.2679396867752075


 39%|███▊      | 116/300 [03:39<06:23,  2.09s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.26403674483299255
Epoch 116 Validation Loss 0.2651022672653198


 39%|███▉      | 117/300 [03:41<06:18,  2.07s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.26859164237976074
Epoch 117 Validation Loss 0.2690441310405731


 39%|███▉      | 118/300 [03:43<06:14,  2.05s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.2739357650279999
Epoch 118 Validation Loss 0.27371692657470703


 40%|███▉      | 119/300 [03:45<06:02,  2.00s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.2664584517478943
Epoch 119 Validation Loss 0.2711542546749115


 40%|████      | 120/300 [03:47<06:02,  2.01s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.26535218954086304
Epoch 120 Validation Loss 0.2636519968509674


 40%|████      | 121/300 [03:49<05:52,  1.97s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.2691730260848999
Epoch 121 Validation Loss 0.270337849855423


 41%|████      | 122/300 [03:50<05:44,  1.94s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.26643940806388855
Epoch 122 Validation Loss 0.26530033349990845


 41%|████      | 123/300 [03:52<05:26,  1.84s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.2709057033061981
Epoch 123 Validation Loss 0.27229341864585876


 41%|████▏     | 124/300 [03:54<05:17,  1.80s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.2708129286766052
Epoch 124 Validation Loss 0.2631036043167114


 42%|████▏     | 125/300 [03:56<05:28,  1.88s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.2690921127796173
Epoch 125 Validation Loss 0.2656092643737793


 42%|████▏     | 126/300 [03:57<05:12,  1.79s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.2624235451221466
Epoch 126 Validation Loss 0.26443591713905334


 42%|████▏     | 127/300 [03:59<05:19,  1.85s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.2718565762042999
Epoch 127 Validation Loss 0.2714858949184418


 43%|████▎     | 128/300 [04:01<05:16,  1.84s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.26073113083839417
Epoch 128 Validation Loss 0.2650488615036011


 43%|████▎     | 129/300 [04:04<05:42,  2.00s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.2645643353462219
Epoch 129 Validation Loss 0.26387733221054077


 43%|████▎     | 130/300 [04:06<05:46,  2.04s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.262972891330719
Epoch 130 Validation Loss 0.26223623752593994


 44%|████▎     | 131/300 [04:08<05:57,  2.11s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.2660883069038391
Epoch 131 Validation Loss 0.269950270652771


 44%|████▍     | 132/300 [04:10<05:50,  2.09s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.26765820384025574
Epoch 132 Validation Loss 0.2646472156047821


 44%|████▍     | 133/300 [04:12<05:31,  1.98s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.26606324315071106
Epoch 133 Validation Loss 0.2643738090991974


 45%|████▍     | 134/300 [04:14<05:28,  1.98s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.264260470867157
Epoch 134 Validation Loss 0.26258042454719543


 45%|████▌     | 135/300 [04:15<05:16,  1.92s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.26130110025405884
Epoch 135 Validation Loss 0.262442022562027


 45%|████▌     | 136/300 [04:18<05:26,  1.99s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.29008859395980835
Epoch 136 Validation Loss 0.2682783305644989


 46%|████▌     | 137/300 [04:20<05:19,  1.96s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.2612682580947876
Epoch 137 Validation Loss 0.2613169848918915


 46%|████▌     | 138/300 [04:22<05:35,  2.07s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.2604823708534241
Epoch 138 Validation Loss 0.2625410854816437


 46%|████▋     | 139/300 [04:24<05:20,  1.99s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.26545068621635437
Epoch 139 Validation Loss 0.26534658670425415


 47%|████▋     | 140/300 [04:25<05:03,  1.90s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.2683749496936798
Epoch 140 Validation Loss 0.26732704043388367


 47%|████▋     | 141/300 [04:27<04:56,  1.86s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.26299533247947693
Epoch 141 Validation Loss 0.2622861862182617


 47%|████▋     | 142/300 [04:29<05:05,  1.93s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.2579597532749176
Epoch 142 Validation Loss 0.26252618432044983


 48%|████▊     | 143/300 [04:31<04:57,  1.90s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.2630715072154999
Epoch 143 Validation Loss 0.2633671760559082


 48%|████▊     | 144/300 [04:33<04:51,  1.87s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.264094740152359
Epoch 144 Validation Loss 0.26443713903427124


 48%|████▊     | 145/300 [04:35<04:43,  1.83s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.259897381067276
Epoch 145 Validation Loss 0.26284894347190857


 49%|████▊     | 146/300 [04:36<04:38,  1.81s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.26273155212402344
Epoch 146 Validation Loss 0.26271355152130127


 49%|████▉     | 147/300 [04:38<04:43,  1.85s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.2669016122817993
Epoch 147 Validation Loss 0.26322874426841736


 49%|████▉     | 148/300 [04:40<04:39,  1.84s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.27046099305152893
Epoch 148 Validation Loss 0.2641473710536957


 50%|████▉     | 149/300 [04:42<04:30,  1.79s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.2643577754497528
Epoch 149 Validation Loss 0.2626942992210388


 50%|█████     | 150/300 [04:44<04:32,  1.82s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.2605658173561096
Epoch 150 Validation Loss 0.26190850138664246


 50%|█████     | 151/300 [04:46<04:33,  1.84s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.2643813192844391
Epoch 151 Validation Loss 0.26340264081954956


 51%|█████     | 152/300 [04:47<04:24,  1.79s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.2628589868545532
Epoch 152 Validation Loss 0.2647347152233124


 51%|█████     | 153/300 [04:49<04:24,  1.80s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.2720327079296112
Epoch 153 Validation Loss 0.2687355875968933


 51%|█████▏    | 154/300 [04:51<04:18,  1.77s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.26312270760536194
Epoch 154 Validation Loss 0.26335081458091736


 52%|█████▏    | 155/300 [04:53<04:20,  1.79s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.26348164677619934
Epoch 155 Validation Loss 0.26497724652290344


 52%|█████▏    | 156/300 [04:54<04:20,  1.81s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.27287018299102783
Epoch 156 Validation Loss 0.26412129402160645


 52%|█████▏    | 157/300 [04:56<04:21,  1.83s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.26515501737594604
Epoch 157 Validation Loss 0.2647757828235626


 53%|█████▎    | 158/300 [04:58<04:16,  1.81s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.262249618768692
Epoch 158 Validation Loss 0.262014240026474


 53%|█████▎    | 159/300 [05:00<04:16,  1.82s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.2612464725971222
Epoch 159 Validation Loss 0.26069241762161255


 53%|█████▎    | 160/300 [05:02<04:08,  1.77s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.25835415720939636
Epoch 160 Validation Loss 0.2632964253425598


 54%|█████▎    | 161/300 [05:03<03:59,  1.72s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.2660805881023407
Epoch 161 Validation Loss 0.26357516646385193


 54%|█████▍    | 162/300 [05:05<03:55,  1.70s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.26297780871391296
Epoch 162 Validation Loss 0.26123711466789246


 54%|█████▍    | 163/300 [05:07<04:06,  1.80s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.2638093829154968
Epoch 163 Validation Loss 0.2613493502140045


 55%|█████▍    | 164/300 [05:08<03:57,  1.74s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.2598501741886139
Epoch 164 Validation Loss 0.2631172835826874


 55%|█████▌    | 165/300 [05:10<03:52,  1.72s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.2657826244831085
Epoch 165 Validation Loss 0.26696035265922546


 55%|█████▌    | 166/300 [05:12<03:51,  1.73s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.2632525563240051
Epoch 166 Validation Loss 0.26229897141456604


 56%|█████▌    | 167/300 [05:14<03:56,  1.78s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.2640692889690399
Epoch 167 Validation Loss 0.26278629899024963


 56%|█████▌    | 168/300 [05:16<04:04,  1.85s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.2581685781478882
Epoch 168 Validation Loss 0.26205289363861084


 56%|█████▋    | 169/300 [05:17<03:54,  1.79s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.26105624437332153
Epoch 169 Validation Loss 0.262280136346817


 57%|█████▋    | 170/300 [05:19<03:49,  1.76s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.26474568247795105
Epoch 170 Validation Loss 0.2625529170036316


 57%|█████▋    | 171/300 [05:21<03:54,  1.82s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.26361075043678284
Epoch 171 Validation Loss 0.2633652985095978


 57%|█████▋    | 172/300 [05:23<03:55,  1.84s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.26429903507232666
Epoch 172 Validation Loss 0.2614375650882721


 58%|█████▊    | 173/300 [05:25<03:51,  1.82s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.2618253231048584
Epoch 173 Validation Loss 0.2617417275905609


 58%|█████▊    | 174/300 [05:26<03:42,  1.76s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.25777381658554077
Epoch 174 Validation Loss 0.26471132040023804


 58%|█████▊    | 175/300 [05:28<03:52,  1.86s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.27543905377388
Epoch 175 Validation Loss 0.26784852147102356


 59%|█████▊    | 176/300 [05:30<03:43,  1.80s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.26166683435440063
Epoch 176 Validation Loss 0.26252326369285583


 59%|█████▉    | 177/300 [05:32<03:33,  1.74s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.26125913858413696
Epoch 177 Validation Loss 0.2642551064491272


 59%|█████▉    | 178/300 [05:34<03:38,  1.79s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.26118460297584534
Epoch 178 Validation Loss 0.2624703049659729


 60%|█████▉    | 179/300 [05:36<03:42,  1.84s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.2749778628349304
Epoch 179 Validation Loss 0.26669782400131226


 60%|██████    | 180/300 [05:37<03:38,  1.82s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.2627401649951935
Epoch 180 Validation Loss 0.2651281952857971


 60%|██████    | 181/300 [05:39<03:34,  1.80s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.2610075771808624
Epoch 181 Validation Loss 0.26360419392585754


 61%|██████    | 182/300 [05:41<03:28,  1.76s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.25903093814849854
Epoch 182 Validation Loss 0.26158416271209717


 61%|██████    | 183/300 [05:43<03:39,  1.88s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.26331907510757446
Epoch 183 Validation Loss 0.2611846327781677


 61%|██████▏   | 184/300 [05:45<03:45,  1.94s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.27239033579826355
Epoch 184 Validation Loss 0.2636042833328247


 62%|██████▏   | 185/300 [05:47<03:38,  1.90s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.26188862323760986
Epoch 185 Validation Loss 0.26164934039115906


 62%|██████▏   | 186/300 [05:49<03:43,  1.96s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.2678064703941345
Epoch 186 Validation Loss 0.262481153011322


 62%|██████▏   | 187/300 [05:51<03:32,  1.88s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.2667030990123749
Epoch 187 Validation Loss 0.26131319999694824


 63%|██████▎   | 188/300 [05:53<03:31,  1.89s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.27008354663848877
Epoch 188 Validation Loss 0.2644110321998596


 63%|██████▎   | 189/300 [05:54<03:19,  1.80s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.260810524225235
Epoch 189 Validation Loss 0.2629720866680145


 63%|██████▎   | 190/300 [05:56<03:15,  1.77s/it]

Epoch 190 Samples 8000 Step 124 Training Loss 0.26506879925727844
Epoch 190 Validation Loss 0.2628345787525177


 64%|██████▎   | 191/300 [05:58<03:12,  1.76s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.2660098373889923
Epoch 191 Validation Loss 0.2626749873161316


 64%|██████▍   | 192/300 [05:59<03:13,  1.79s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.26354125142097473
Epoch 192 Validation Loss 0.2641616463661194


 64%|██████▍   | 193/300 [06:01<03:06,  1.74s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.2567649185657501
Epoch 193 Validation Loss 0.26161709427833557


 65%|██████▍   | 194/300 [06:03<03:01,  1.71s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.25996899604797363
Epoch 194 Validation Loss 0.2619614005088806


 65%|██████▌   | 195/300 [06:04<02:57,  1.69s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.2652251422405243
Epoch 195 Validation Loss 0.26659059524536133


 65%|██████▌   | 196/300 [06:06<03:01,  1.74s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.27492567896842957
Epoch 196 Validation Loss 0.26509231328964233


 66%|██████▌   | 197/300 [06:08<02:57,  1.72s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.2683381736278534
Epoch 197 Validation Loss 0.2615979015827179


 66%|██████▌   | 198/300 [06:10<02:52,  1.70s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.2621709704399109
Epoch 198 Validation Loss 0.26261651515960693


 66%|██████▋   | 199/300 [06:11<02:56,  1.75s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.26519232988357544
Epoch 199 Validation Loss 0.2626687288284302


 67%|██████▋   | 200/300 [06:14<03:05,  1.86s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.26298636198043823
Epoch 200 Validation Loss 0.2638697922229767


 67%|██████▋   | 201/300 [06:15<03:07,  1.89s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.2633458375930786
Epoch 201 Validation Loss 0.26269397139549255


 67%|██████▋   | 202/300 [06:17<03:01,  1.86s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.26223355531692505
Epoch 202 Validation Loss 0.26059457659721375


 68%|██████▊   | 203/300 [06:19<02:55,  1.81s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.25924813747406006
Epoch 203 Validation Loss 0.2619714140892029


 68%|██████▊   | 204/300 [06:21<02:54,  1.82s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.26702791452407837
Epoch 204 Validation Loss 0.2656511664390564


 68%|██████▊   | 205/300 [06:23<02:52,  1.81s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.25906771421432495
Epoch 205 Validation Loss 0.26112961769104004


 69%|██████▊   | 206/300 [06:24<02:47,  1.78s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.2653602063655853
Epoch 206 Validation Loss 0.26324838399887085


 69%|██████▉   | 207/300 [06:26<02:51,  1.84s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.2606046199798584
Epoch 207 Validation Loss 0.2670572102069855


 69%|██████▉   | 208/300 [06:29<03:00,  1.96s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.2648395299911499
Epoch 208 Validation Loss 0.26273900270462036


 70%|██████▉   | 209/300 [06:30<02:57,  1.95s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.26201993227005005
Epoch 209 Validation Loss 0.2615381181240082


 70%|███████   | 210/300 [06:32<02:50,  1.89s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.25819164514541626
Epoch 210 Validation Loss 0.26197361946105957


 70%|███████   | 211/300 [06:34<02:42,  1.83s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.26263999938964844
Epoch 211 Validation Loss 0.2615759074687958


 71%|███████   | 212/300 [06:36<02:50,  1.93s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.2618309259414673
Epoch 212 Validation Loss 0.2618460953235626


 71%|███████   | 213/300 [06:38<02:54,  2.00s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.2627153694629669
Epoch 213 Validation Loss 0.26111388206481934


 71%|███████▏  | 214/300 [06:40<02:50,  1.99s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.2613302171230316
Epoch 214 Validation Loss 0.26438087224960327


 72%|███████▏  | 215/300 [06:42<02:46,  1.96s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.26240214705467224
Epoch 215 Validation Loss 0.2640262842178345


 72%|███████▏  | 216/300 [06:44<02:48,  2.01s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.2650141417980194
Epoch 216 Validation Loss 0.2639774680137634


 72%|███████▏  | 217/300 [06:46<02:51,  2.07s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.2650972604751587
Epoch 217 Validation Loss 0.26199913024902344


 73%|███████▎  | 218/300 [06:48<02:50,  2.08s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.261574387550354
Epoch 218 Validation Loss 0.2602090835571289


 73%|███████▎  | 219/300 [06:51<02:57,  2.19s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.26350414752960205
Epoch 219 Validation Loss 0.26506295800209045


 73%|███████▎  | 220/300 [06:53<02:50,  2.13s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.26168596744537354
Epoch 220 Validation Loss 0.2609049677848816


 74%|███████▎  | 221/300 [06:55<02:35,  1.96s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.26124051213264465
Epoch 221 Validation Loss 0.2614798843860626


 74%|███████▍  | 222/300 [06:56<02:26,  1.88s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.26188594102859497
Epoch 222 Validation Loss 0.26155635714530945


 74%|███████▍  | 223/300 [06:58<02:18,  1.80s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.2620367407798767
Epoch 223 Validation Loss 0.2658666968345642


 75%|███████▍  | 224/300 [07:00<02:16,  1.80s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.26092541217803955
Epoch 224 Validation Loss 0.26497983932495117


 75%|███████▌  | 225/300 [07:01<02:12,  1.76s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.2610868513584137
Epoch 225 Validation Loss 0.2620786726474762


 75%|███████▌  | 226/300 [07:03<02:07,  1.72s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.2703350782394409
Epoch 226 Validation Loss 0.26461097598075867


 76%|███████▌  | 227/300 [07:04<02:02,  1.68s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.27241092920303345
Epoch 227 Validation Loss 0.26822295784950256


 76%|███████▌  | 228/300 [07:06<02:03,  1.72s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.2626306414604187
Epoch 228 Validation Loss 0.26703476905822754


 76%|███████▋  | 229/300 [07:08<02:01,  1.71s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.26251354813575745
Epoch 229 Validation Loss 0.26035213470458984


 77%|███████▋  | 230/300 [07:10<01:57,  1.67s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.2649786174297333
Epoch 230 Validation Loss 0.2611571252346039


 77%|███████▋  | 231/300 [07:12<02:01,  1.76s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.25984475016593933
Epoch 231 Validation Loss 0.26329532265663147


 77%|███████▋  | 232/300 [07:13<02:03,  1.81s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.26173946261405945
Epoch 232 Validation Loss 0.2619320750236511


 78%|███████▊  | 233/300 [07:16<02:07,  1.90s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.25828874111175537
Epoch 233 Validation Loss 0.262729287147522


 78%|███████▊  | 234/300 [07:17<02:02,  1.86s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.2640042006969452
Epoch 234 Validation Loss 0.26207828521728516


 78%|███████▊  | 235/300 [07:19<01:56,  1.79s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.2602936625480652
Epoch 235 Validation Loss 0.26151010394096375


 79%|███████▊  | 236/300 [07:21<02:03,  1.93s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.2646636962890625
Epoch 236 Validation Loss 0.2617857754230499


 79%|███████▉  | 237/300 [07:23<01:57,  1.87s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.26733967661857605
Epoch 237 Validation Loss 0.26159587502479553


 79%|███████▉  | 238/300 [07:25<01:54,  1.84s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.2634532153606415
Epoch 238 Validation Loss 0.2666511535644531


 80%|███████▉  | 239/300 [07:26<01:50,  1.82s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.2584131956100464
Epoch 239 Validation Loss 0.26192712783813477


 80%|████████  | 240/300 [07:28<01:49,  1.83s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.2702186107635498
Epoch 240 Validation Loss 0.2680068016052246


 80%|████████  | 241/300 [07:30<01:48,  1.83s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.2730816602706909
Epoch 241 Validation Loss 0.2675016224384308


 81%|████████  | 242/300 [07:32<01:44,  1.80s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.26680251955986023
Epoch 242 Validation Loss 0.2627105116844177


 81%|████████  | 243/300 [07:34<01:43,  1.81s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.2600193917751312
Epoch 243 Validation Loss 0.2654784619808197


 81%|████████▏ | 244/300 [07:36<01:46,  1.90s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.2602663040161133
Epoch 244 Validation Loss 0.26077184081077576


 82%|████████▏ | 245/300 [07:37<01:39,  1.80s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.2657230794429779
Epoch 245 Validation Loss 0.26111167669296265


 82%|████████▏ | 246/300 [07:39<01:33,  1.73s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.2631959021091461
Epoch 246 Validation Loss 0.26328179240226746


 82%|████████▏ | 247/300 [07:41<01:30,  1.71s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.2654038667678833
Epoch 247 Validation Loss 0.2611091732978821


 83%|████████▎ | 248/300 [07:42<01:30,  1.75s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.2614920437335968
Epoch 248 Validation Loss 0.26040107011795044


 83%|████████▎ | 249/300 [07:44<01:26,  1.70s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.26821407675743103
Epoch 249 Validation Loss 0.26122143864631653


 83%|████████▎ | 250/300 [07:46<01:22,  1.66s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.26575592160224915
Epoch 250 Validation Loss 0.26501914858818054


 84%|████████▎ | 251/300 [07:47<01:20,  1.65s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.2642787992954254
Epoch 251 Validation Loss 0.2626265585422516


 84%|████████▍ | 252/300 [07:49<01:21,  1.70s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.2636815905570984
Epoch 252 Validation Loss 0.26252371072769165


 84%|████████▍ | 253/300 [07:51<01:18,  1.68s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.2603224217891693
Epoch 253 Validation Loss 0.26025712490081787


 85%|████████▍ | 254/300 [07:52<01:15,  1.65s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.2628364562988281
Epoch 254 Validation Loss 0.263874888420105


 85%|████████▌ | 255/300 [07:54<01:13,  1.63s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.26189664006233215
Epoch 255 Validation Loss 0.26186469197273254


 85%|████████▌ | 256/300 [07:56<01:13,  1.68s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.26018276810646057
Epoch 256 Validation Loss 0.2614128291606903


 86%|████████▌ | 257/300 [07:58<01:14,  1.73s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.2685232162475586
Epoch 257 Validation Loss 0.262613445520401


 86%|████████▌ | 258/300 [07:59<01:11,  1.70s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.2628740668296814
Epoch 258 Validation Loss 0.26164036989212036


 86%|████████▋ | 259/300 [08:01<01:08,  1.68s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.2645905613899231
Epoch 259 Validation Loss 0.26371103525161743


 87%|████████▋ | 260/300 [08:03<01:08,  1.71s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.26990869641304016
Epoch 260 Validation Loss 0.2646741569042206


 87%|████████▋ | 261/300 [08:04<01:08,  1.77s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.2623513340950012
Epoch 261 Validation Loss 0.2637470066547394


 87%|████████▋ | 262/300 [08:07<01:17,  2.05s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.2626173794269562
Epoch 262 Validation Loss 0.2626512944698334


 88%|████████▊ | 263/300 [08:09<01:12,  1.97s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.2609333395957947
Epoch 263 Validation Loss 0.2613317668437958


 88%|████████▊ | 264/300 [08:11<01:07,  1.87s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.2642022669315338
Epoch 264 Validation Loss 0.26178810000419617


 88%|████████▊ | 265/300 [08:12<01:04,  1.83s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.2879139482975006
Epoch 265 Validation Loss 0.27318495512008667


 89%|████████▊ | 266/300 [08:14<01:04,  1.90s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.2624758183956146
Epoch 266 Validation Loss 0.26241910457611084


 89%|████████▉ | 267/300 [08:16<01:02,  1.89s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.2645036578178406
Epoch 267 Validation Loss 0.2620174288749695


 89%|████████▉ | 268/300 [08:18<00:58,  1.82s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.26443418860435486
Epoch 268 Validation Loss 0.2646868824958801


 90%|████████▉ | 269/300 [08:19<00:54,  1.75s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.26272347569465637
Epoch 269 Validation Loss 0.2624812126159668


 90%|█████████ | 270/300 [08:21<00:54,  1.80s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.2600906491279602
Epoch 270 Validation Loss 0.2609974145889282


 90%|█████████ | 271/300 [08:23<00:54,  1.87s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.2604086697101593
Epoch 271 Validation Loss 0.26114898920059204


 91%|█████████ | 272/300 [08:26<00:54,  1.94s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.2681805193424225
Epoch 272 Validation Loss 0.26213544607162476


 91%|█████████ | 273/300 [08:28<00:59,  2.22s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.2592311501502991
Epoch 273 Validation Loss 0.26141348481178284


 91%|█████████▏| 274/300 [08:31<00:56,  2.18s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.25773561000823975
Epoch 274 Validation Loss 0.26000890135765076


 92%|█████████▏| 275/300 [08:33<00:53,  2.15s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.2631666958332062
Epoch 275 Validation Loss 0.26070937514305115


 92%|█████████▏| 276/300 [08:34<00:49,  2.06s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.2647877335548401
Epoch 276 Validation Loss 0.26198941469192505


 92%|█████████▏| 277/300 [08:36<00:44,  1.92s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.2589244246482849
Epoch 277 Validation Loss 0.26079875230789185


 93%|█████████▎| 278/300 [08:38<00:39,  1.80s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.2636309564113617
Epoch 278 Validation Loss 0.2616042196750641


 93%|█████████▎| 279/300 [08:39<00:38,  1.83s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.26382583379745483
Epoch 279 Validation Loss 0.26857316493988037


 93%|█████████▎| 280/300 [08:42<00:39,  1.97s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.26221972703933716
Epoch 280 Validation Loss 0.2608291804790497


 94%|█████████▎| 281/300 [08:43<00:34,  1.84s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.26487839221954346
Epoch 281 Validation Loss 0.2616874575614929


 94%|█████████▍| 282/300 [08:45<00:32,  1.81s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.2597561478614807
Epoch 282 Validation Loss 0.26168009638786316


 94%|█████████▍| 283/300 [08:47<00:29,  1.75s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.25982075929641724
Epoch 283 Validation Loss 0.2600182592868805


 95%|█████████▍| 284/300 [08:49<00:30,  1.93s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.2608393430709839
Epoch 284 Validation Loss 0.2619419991970062


 95%|█████████▌| 285/300 [08:51<00:28,  1.91s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.2659122943878174
Epoch 285 Validation Loss 0.26286497712135315


 95%|█████████▌| 286/300 [08:53<00:27,  1.99s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.26072049140930176
Epoch 286 Validation Loss 0.26072263717651367


 96%|█████████▌| 287/300 [08:55<00:24,  1.90s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.2630317807197571
Epoch 287 Validation Loss 0.2629314661026001


 96%|█████████▌| 288/300 [08:57<00:23,  1.99s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.2673128545284271
Epoch 288 Validation Loss 0.26183098554611206


 96%|█████████▋| 289/300 [08:59<00:23,  2.10s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.2597661018371582
Epoch 289 Validation Loss 0.26341909170150757


 97%|█████████▋| 290/300 [09:01<00:19,  1.99s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.2617536187171936
Epoch 290 Validation Loss 0.2599461078643799


 97%|█████████▋| 291/300 [09:03<00:16,  1.89s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.2561647891998291
Epoch 291 Validation Loss 0.26038816571235657


 97%|█████████▋| 292/300 [09:04<00:14,  1.87s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.26504623889923096
Epoch 292 Validation Loss 0.26494550704956055


 98%|█████████▊| 293/300 [09:06<00:12,  1.79s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.2623722553253174
Epoch 293 Validation Loss 0.2610274851322174


 98%|█████████▊| 294/300 [09:08<00:10,  1.75s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.2719202935695648
Epoch 294 Validation Loss 0.2632223963737488


 98%|█████████▊| 295/300 [09:10<00:09,  1.87s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.261019766330719
Epoch 295 Validation Loss 0.2603704333305359


 99%|█████████▊| 296/300 [09:12<00:08,  2.09s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.263712614774704
Epoch 296 Validation Loss 0.2670079171657562


 99%|█████████▉| 297/300 [09:15<00:06,  2.10s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.2788534462451935
Epoch 297 Validation Loss 0.2709982395172119


 99%|█████████▉| 298/300 [09:16<00:04,  2.02s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.2689315378665924
Epoch 298 Validation Loss 0.266701340675354


100%|█████████▉| 299/300 [09:18<00:01,  1.93s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.25705933570861816
Epoch 299 Validation Loss 0.2618131935596466


100%|██████████| 300/300 [09:20<00:00,  1.87s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.266920804977417
Epoch 300 Validation Loss 0.26216280460357666


0,1
epoch,▁▁▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▃▄▄▄▄▄▄▄▅▅▅▅▅▆▆▇▇▇▇▇▇██
samples,▇▃▂▇▁▆▆▃▃▃▂█▅▆█▂▇▅▃▂▅▄▁▆▆▃█▅▄▃▂▄▃▇▃▃▇▅▃▃
train_loss,█▄▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,████▇▄▂▂▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.26692
val_loss,0.26216


In [109]:
print('B||M = KL(Bigram || Model), M||B = KL(Model || Bigram)')
for i in range(5, 100, 5):
    try:
        bm, mb = bigram_kl(load_model(f'B1_B/model_{i}.pt', 'B1/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except:
        break

B||M = KL(Bigram || Model), M||B = KL(Model || Bigram)
Model 5: B||M - 0.077, M||B - 0.068
Model 10: B||M - 0.703, M||B - 0.335
Model 15: B||M - 1.068, M||B - 0.365
Model 20: B||M - 1.401, M||B - 0.386
Model 25: B||M - 1.567, M||B - 0.373


In [378]:
model_name = 'B1'
print(model_name)
print('B||M = KL(Markov || Model), M||B = KL(Model || Markov)', end='\n\n')
for i in range(5, 100, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

B1
B||M = KL(Markov || Model), M||B = KL(Model || Markov)

Model 5: B||M - 0.375, M||B - 4.106
Model 10: B||M - 0.087, M||B - 0.747
Model 15: B||M - 0.082, M||B - 0.495
Model 20: B||M - 0.116, M||B - 0.375
Model 25: B||M - 0.146, M||B - 0.359
Model 30: B||M - 0.282, M||B - 2.836
Model 35: B||M - 0.178, M||B - 0.913
Model 40: B||M - 0.181, M||B - 0.845
Model 45: B||M - 0.173, M||B - 0.583


In [376]:
model_name = 'B1'
epoch = 5
print(model_name, end='\n\n')
while True:
    try:
        print(f'Model_{epoch}')
        for i in test_on_all(load_model(f'{model_name}/model_{epoch}.pt', f'{model_name}/model_cfg.pt'), 30):
            print('|', end='')
            #print(f'Sequence: {i.tolist()}, Predictions: {model(i).argmax(dim=-1).flatten().tolist()}')
        print()
        epoch += 5
    except:
        break

B1

Model_5
Accuracy: 87.22 %
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# C1_

In [87]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=2,
            d_mlp=16,
            lr=0.1,
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='C1_'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:01<06:30,  1.31s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6661055088043213
Epoch 1 Validation Loss 0.660175621509552


  1%|          | 2/300 [00:02<06:27,  1.30s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6472493410110474
Epoch 2 Validation Loss 0.6473472714424133


  1%|          | 3/300 [00:03<06:22,  1.29s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6433057188987732
Epoch 3 Validation Loss 0.6431404948234558


  1%|▏         | 4/300 [00:05<06:44,  1.37s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.6436291337013245
Epoch 4 Validation Loss 0.6409651041030884


  2%|▏         | 5/300 [00:06<06:43,  1.37s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.641920268535614
Epoch 5 Validation Loss 0.6394059658050537


  2%|▏         | 6/300 [00:07<06:30,  1.33s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.6372483968734741
Epoch 6 Validation Loss 0.638312816619873


  2%|▏         | 7/300 [00:09<06:36,  1.35s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.638185977935791
Epoch 7 Validation Loss 0.6374650597572327


  3%|▎         | 8/300 [00:10<06:45,  1.39s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.6354606747627258
Epoch 8 Validation Loss 0.636804461479187


  3%|▎         | 9/300 [00:12<06:35,  1.36s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.6400820016860962
Epoch 9 Validation Loss 0.6361834406852722


  3%|▎         | 10/300 [00:13<06:31,  1.35s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.631060779094696
Epoch 10 Validation Loss 0.6357405781745911


  4%|▎         | 11/300 [00:14<06:35,  1.37s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.6392436623573303
Epoch 11 Validation Loss 0.6353441476821899


  4%|▍         | 12/300 [00:16<06:38,  1.38s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.6375721096992493
Epoch 12 Validation Loss 0.6348792910575867


  4%|▍         | 13/300 [00:17<06:34,  1.37s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.6297692060470581
Epoch 13 Validation Loss 0.6344603300094604


  5%|▍         | 14/300 [00:18<06:23,  1.34s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.6251161694526672
Epoch 14 Validation Loss 0.6339762210845947


  5%|▌         | 15/300 [00:20<06:31,  1.38s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.6240221858024597
Epoch 15 Validation Loss 0.6333433389663696


  5%|▌         | 16/300 [00:21<06:30,  1.37s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.6334325075149536
Epoch 16 Validation Loss 0.6326223015785217


  6%|▌         | 17/300 [00:22<06:09,  1.31s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.6285014152526855
Epoch 17 Validation Loss 0.6316307783126831


  6%|▌         | 18/300 [00:24<06:10,  1.32s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.6246925592422485
Epoch 18 Validation Loss 0.6304455399513245


  6%|▋         | 19/300 [00:25<06:41,  1.43s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.632171094417572
Epoch 19 Validation Loss 0.6289850473403931


  7%|▋         | 20/300 [00:27<06:49,  1.46s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.6283766627311707
Epoch 20 Validation Loss 0.6268329620361328


  7%|▋         | 21/300 [00:29<07:01,  1.51s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.6178391575813293
Epoch 21 Validation Loss 0.6234283447265625


  7%|▋         | 22/300 [00:30<07:17,  1.58s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.6073275804519653
Epoch 22 Validation Loss 0.6170437932014465


  8%|▊         | 23/300 [00:32<07:11,  1.56s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.5879260301589966
Epoch 23 Validation Loss 0.6064333319664001


  8%|▊         | 24/300 [00:33<07:03,  1.53s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.5792734622955322
Epoch 24 Validation Loss 0.5868691205978394


  8%|▊         | 25/300 [00:35<07:10,  1.56s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.5894477367401123
Epoch 25 Validation Loss 0.5766065716743469


  9%|▊         | 26/300 [00:37<07:25,  1.63s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.5534325242042542
Epoch 26 Validation Loss 0.5555077195167542


  9%|▉         | 27/300 [00:38<07:29,  1.65s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.5102058053016663
Epoch 27 Validation Loss 0.5212989449501038


  9%|▉         | 28/300 [00:40<07:32,  1.66s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.5500788688659668
Epoch 28 Validation Loss 0.5266996622085571


 10%|▉         | 29/300 [00:42<07:47,  1.73s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.5262649655342102
Epoch 29 Validation Loss 0.4991219937801361


 10%|█         | 30/300 [00:44<07:33,  1.68s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.4886032044887543
Epoch 30 Validation Loss 0.49872711300849915


 10%|█         | 31/300 [00:45<07:26,  1.66s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.48469048738479614
Epoch 31 Validation Loss 0.48416492342948914


 11%|█         | 32/300 [00:47<07:50,  1.76s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.5060605406761169
Epoch 32 Validation Loss 0.5066206455230713


 11%|█         | 33/300 [00:49<07:47,  1.75s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.47477027773857117
Epoch 33 Validation Loss 0.47504371404647827


 11%|█▏        | 34/300 [00:51<07:56,  1.79s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.4609447717666626
Epoch 34 Validation Loss 0.4759509265422821


 12%|█▏        | 35/300 [00:52<07:40,  1.74s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.4672278165817261
Epoch 35 Validation Loss 0.47524744272232056


 12%|█▏        | 36/300 [00:54<07:44,  1.76s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.4605536162853241
Epoch 36 Validation Loss 0.46384280920028687


 12%|█▏        | 37/300 [00:56<08:11,  1.87s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.5125386118888855
Epoch 37 Validation Loss 0.49529725313186646


 13%|█▎        | 38/300 [00:58<07:58,  1.83s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.4807891249656677
Epoch 38 Validation Loss 0.4574451148509979


 13%|█▎        | 39/300 [01:00<07:36,  1.75s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.45600345730781555
Epoch 39 Validation Loss 0.45914608240127563


 13%|█▎        | 40/300 [01:01<07:23,  1.70s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.5070249438285828
Epoch 40 Validation Loss 0.4767591953277588


 14%|█▎        | 41/300 [01:03<07:11,  1.67s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.4402526021003723
Epoch 41 Validation Loss 0.4550975561141968


 14%|█▍        | 42/300 [01:05<07:29,  1.74s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.43324488401412964
Epoch 42 Validation Loss 0.4480394124984741


 14%|█▍        | 43/300 [01:06<07:09,  1.67s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.48651617765426636
Epoch 43 Validation Loss 0.49459490180015564


 15%|█▍        | 44/300 [01:08<07:01,  1.65s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.4638482928276062
Epoch 44 Validation Loss 0.4575110673904419


 15%|█▌        | 45/300 [01:09<07:00,  1.65s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.4576978385448456
Epoch 45 Validation Loss 0.4503884017467499


 15%|█▌        | 46/300 [01:11<06:52,  1.63s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.4645150601863861
Epoch 46 Validation Loss 0.47238340973854065


 16%|█▌        | 47/300 [01:13<06:50,  1.62s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.48624399304389954
Epoch 47 Validation Loss 0.4640875458717346


 16%|█▌        | 48/300 [01:14<07:00,  1.67s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.46439340710639954
Epoch 48 Validation Loss 0.4667113125324249


 16%|█▋        | 49/300 [01:16<06:48,  1.63s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.4786469638347626
Epoch 49 Validation Loss 0.46700042486190796


 17%|█▋        | 50/300 [01:17<06:38,  1.59s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.4883267283439636
Epoch 50 Validation Loss 0.4924389123916626


 17%|█▋        | 51/300 [01:19<06:37,  1.60s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.41743767261505127
Epoch 51 Validation Loss 0.4402933120727539


 17%|█▋        | 52/300 [01:21<06:27,  1.56s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.4649442136287689
Epoch 52 Validation Loss 0.48096421360969543


 18%|█▊        | 53/300 [01:22<06:31,  1.59s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.4224601984024048
Epoch 53 Validation Loss 0.4311824440956116


 18%|█▊        | 54/300 [01:24<06:29,  1.58s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.47021836042404175
Epoch 54 Validation Loss 0.4576626718044281


 18%|█▊        | 55/300 [01:26<06:54,  1.69s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.4031932055950165
Epoch 55 Validation Loss 0.43065401911735535


 19%|█▊        | 56/300 [01:27<06:22,  1.57s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.42105361819267273
Epoch 56 Validation Loss 0.4317796230316162


 19%|█▉        | 57/300 [01:28<06:09,  1.52s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.46743252873420715
Epoch 57 Validation Loss 0.46221357583999634


 19%|█▉        | 58/300 [01:30<05:42,  1.41s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.4560052752494812
Epoch 58 Validation Loss 0.4566531479358673


 20%|█▉        | 59/300 [01:31<05:35,  1.39s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.4108343720436096
Epoch 59 Validation Loss 0.4261420965194702


 20%|██        | 60/300 [01:33<05:49,  1.45s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.4846639633178711
Epoch 60 Validation Loss 0.48009416460990906


 20%|██        | 61/300 [01:34<05:50,  1.46s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.4252175986766815
Epoch 61 Validation Loss 0.4249136447906494


 21%|██        | 62/300 [01:36<05:57,  1.50s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.44469571113586426
Epoch 62 Validation Loss 0.43839654326438904


 21%|██        | 63/300 [01:38<06:41,  1.69s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.5418400764465332
Epoch 63 Validation Loss 0.5879949927330017


 21%|██▏       | 64/300 [01:39<06:31,  1.66s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.40024685859680176
Epoch 64 Validation Loss 0.414626806974411


 22%|██▏       | 65/300 [01:41<06:26,  1.65s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.42122069001197815
Epoch 65 Validation Loss 0.40849852561950684


 22%|██▏       | 66/300 [01:43<06:40,  1.71s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.41336607933044434
Epoch 66 Validation Loss 0.4132790267467499


 22%|██▏       | 67/300 [01:44<06:37,  1.71s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.4118621051311493
Epoch 67 Validation Loss 0.40791699290275574


 23%|██▎       | 68/300 [01:46<06:40,  1.72s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.4064398407936096
Epoch 68 Validation Loss 0.41966745257377625


 23%|██▎       | 69/300 [01:48<06:29,  1.69s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.43821537494659424
Epoch 69 Validation Loss 0.442889004945755


 23%|██▎       | 70/300 [01:49<06:21,  1.66s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.4299148619174957
Epoch 70 Validation Loss 0.4373803734779358


 24%|██▎       | 71/300 [01:51<06:32,  1.72s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.41201433539390564
Epoch 71 Validation Loss 0.39567843079566956


 24%|██▍       | 72/300 [01:53<06:28,  1.70s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.3837244212627411
Epoch 72 Validation Loss 0.3898516297340393


 24%|██▍       | 73/300 [01:54<06:10,  1.63s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.4119763672351837
Epoch 73 Validation Loss 0.4368755519390106


 25%|██▍       | 74/300 [01:56<06:08,  1.63s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.3808285892009735
Epoch 74 Validation Loss 0.38449761271476746


 25%|██▌       | 75/300 [01:58<06:23,  1.70s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.4159493148326874
Epoch 75 Validation Loss 0.4676906168460846


 25%|██▌       | 76/300 [02:00<07:04,  1.89s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.40303534269332886
Epoch 76 Validation Loss 0.4090648591518402


 26%|██▌       | 77/300 [02:02<07:18,  1.97s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.3674284815788269
Epoch 77 Validation Loss 0.3806731104850769


 26%|██▌       | 78/300 [02:04<07:13,  1.95s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.3754965662956238
Epoch 78 Validation Loss 0.39440664649009705


 26%|██▋       | 79/300 [02:06<06:47,  1.84s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.37764772772789
Epoch 79 Validation Loss 0.37536630034446716


 27%|██▋       | 80/300 [02:08<06:28,  1.77s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.41327574849128723
Epoch 80 Validation Loss 0.41562527418136597


 27%|██▋       | 81/300 [02:10<06:43,  1.84s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.3779646158218384
Epoch 81 Validation Loss 0.3803098797798157


 27%|██▋       | 82/300 [02:11<06:32,  1.80s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.40385904908180237
Epoch 82 Validation Loss 0.42706242203712463


 28%|██▊       | 83/300 [02:13<06:16,  1.73s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.40927064418792725
Epoch 83 Validation Loss 0.39412909746170044


 28%|██▊       | 84/300 [02:14<05:59,  1.66s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.3884349763393402
Epoch 84 Validation Loss 0.3868826925754547


 28%|██▊       | 85/300 [02:16<05:50,  1.63s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.4204469621181488
Epoch 85 Validation Loss 0.45879533886909485


 29%|██▊       | 86/300 [02:18<05:56,  1.67s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.4491906464099884
Epoch 86 Validation Loss 0.5017224550247192


 29%|██▉       | 87/300 [02:19<05:46,  1.63s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.40923187136650085
Epoch 87 Validation Loss 0.4105078876018524


 29%|██▉       | 88/300 [02:21<05:39,  1.60s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.3798311948776245
Epoch 88 Validation Loss 0.3988756835460663


 30%|██▉       | 89/300 [02:22<05:43,  1.63s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.42133456468582153
Epoch 89 Validation Loss 0.3963022530078888


 30%|███       | 90/300 [02:24<05:39,  1.62s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.4020345211029053
Epoch 90 Validation Loss 0.3977523446083069


 30%|███       | 91/300 [02:26<05:49,  1.67s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.3698606491088867
Epoch 91 Validation Loss 0.3844437599182129


 31%|███       | 92/300 [02:27<05:44,  1.66s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.3889499604701996
Epoch 92 Validation Loss 0.40873983502388


 31%|███       | 93/300 [02:29<05:41,  1.65s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.3669065833091736
Epoch 93 Validation Loss 0.36802828311920166


 31%|███▏      | 94/300 [02:31<05:39,  1.65s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.37579649686813354
Epoch 94 Validation Loss 0.3696778118610382


 32%|███▏      | 95/300 [02:32<05:45,  1.69s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.39548400044441223
Epoch 95 Validation Loss 0.39864403009414673


 32%|███▏      | 96/300 [02:34<05:54,  1.74s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.46787217259407043
Epoch 96 Validation Loss 0.46619126200675964


 32%|███▏      | 97/300 [02:36<05:46,  1.71s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.3725089728832245
Epoch 97 Validation Loss 0.37832164764404297


 33%|███▎      | 98/300 [02:38<05:45,  1.71s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.4145829975605011
Epoch 98 Validation Loss 0.39882466197013855


 33%|███▎      | 99/300 [02:39<05:39,  1.69s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.37299036979675293
Epoch 99 Validation Loss 0.37017524242401123


 33%|███▎      | 100/300 [02:41<05:32,  1.66s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.37144026160240173
Epoch 100 Validation Loss 0.3758920431137085


 34%|███▎      | 101/300 [02:43<05:39,  1.71s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.4158756732940674
Epoch 101 Validation Loss 0.4206022322177887


 34%|███▍      | 102/300 [02:44<05:34,  1.69s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.36524516344070435
Epoch 102 Validation Loss 0.392770916223526


 34%|███▍      | 103/300 [02:46<05:29,  1.67s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.42665034532546997
Epoch 103 Validation Loss 0.4293508231639862


 35%|███▍      | 104/300 [02:48<05:30,  1.69s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.4004206955432892
Epoch 104 Validation Loss 0.40347903966903687


 35%|███▌      | 105/300 [02:49<05:24,  1.66s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.3717333972454071
Epoch 105 Validation Loss 0.38140785694122314


 35%|███▌      | 106/300 [02:51<05:43,  1.77s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.37063315510749817
Epoch 106 Validation Loss 0.35672253370285034


 36%|███▌      | 107/300 [02:53<05:39,  1.76s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.36486080288887024
Epoch 107 Validation Loss 0.37940484285354614


 36%|███▌      | 108/300 [02:55<05:38,  1.76s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.37555813789367676
Epoch 108 Validation Loss 0.40024077892303467


 36%|███▋      | 109/300 [02:57<05:55,  1.86s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.41841647028923035
Epoch 109 Validation Loss 0.4998156726360321


 37%|███▋      | 110/300 [02:59<06:09,  1.94s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.3613797426223755
Epoch 110 Validation Loss 0.3891279101371765


 37%|███▋      | 111/300 [03:01<06:05,  1.93s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.37172093987464905
Epoch 111 Validation Loss 0.38141298294067383


 37%|███▋      | 112/300 [03:03<05:45,  1.84s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.3831554651260376
Epoch 112 Validation Loss 0.38739314675331116


 38%|███▊      | 113/300 [03:04<05:30,  1.77s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.358590692281723
Epoch 113 Validation Loss 0.3565264046192169


 38%|███▊      | 114/300 [03:06<05:16,  1.70s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.3634093403816223
Epoch 114 Validation Loss 0.3628440797328949


 38%|███▊      | 115/300 [03:07<05:11,  1.68s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.37019041180610657
Epoch 115 Validation Loss 0.36046281456947327


 39%|███▊      | 116/300 [03:09<05:19,  1.73s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.3591196835041046
Epoch 116 Validation Loss 0.36062976717948914


 39%|███▉      | 117/300 [03:11<05:15,  1.72s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.3569406569004059
Epoch 117 Validation Loss 0.36255574226379395


 39%|███▉      | 118/300 [03:13<05:10,  1.71s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.36465010046958923
Epoch 118 Validation Loss 0.3777209520339966


 40%|███▉      | 119/300 [03:14<05:10,  1.71s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.40985530614852905
Epoch 119 Validation Loss 0.4476691782474518


 40%|████      | 120/300 [03:16<04:58,  1.66s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.3616626262664795
Epoch 120 Validation Loss 0.3593190312385559


 40%|████      | 121/300 [03:18<05:09,  1.73s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.3529983162879944
Epoch 121 Validation Loss 0.35789135098457336


 41%|████      | 122/300 [03:19<05:01,  1.69s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.34510499238967896
Epoch 122 Validation Loss 0.34818366169929504


 41%|████      | 123/300 [03:21<04:54,  1.66s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.36052510142326355
Epoch 123 Validation Loss 0.3555924594402313


 41%|████▏     | 124/300 [03:23<04:50,  1.65s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.34282705187797546
Epoch 124 Validation Loss 0.35342705249786377


 42%|████▏     | 125/300 [03:24<04:47,  1.64s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.3833504021167755
Epoch 125 Validation Loss 0.37050309777259827


 42%|████▏     | 126/300 [03:26<04:53,  1.69s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.33626589179039
Epoch 126 Validation Loss 0.34658172726631165


 42%|████▏     | 127/300 [03:28<04:48,  1.67s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.4178629517555237
Epoch 127 Validation Loss 0.3867214024066925


 43%|████▎     | 128/300 [03:29<04:44,  1.65s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.3372112512588501
Epoch 128 Validation Loss 0.368904173374176


 43%|████▎     | 129/300 [03:31<04:45,  1.67s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.36507245898246765
Epoch 129 Validation Loss 0.3774953782558441


 43%|████▎     | 130/300 [03:33<04:40,  1.65s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.47597673535346985
Epoch 130 Validation Loss 0.433108925819397


 44%|████▎     | 131/300 [03:35<04:54,  1.74s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.41651347279548645
Epoch 131 Validation Loss 0.38000956177711487


 44%|████▍     | 132/300 [03:36<04:51,  1.73s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.3439352214336395
Epoch 132 Validation Loss 0.34043049812316895


 44%|████▍     | 133/300 [03:38<04:43,  1.70s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.38448816537857056
Epoch 133 Validation Loss 0.37585315108299255


 45%|████▍     | 134/300 [03:39<04:38,  1.68s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.3729296922683716
Epoch 134 Validation Loss 0.3913504183292389


 45%|████▌     | 135/300 [03:41<04:28,  1.63s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.33964890241622925
Epoch 135 Validation Loss 0.34674468636512756


 45%|████▌     | 136/300 [03:43<04:36,  1.68s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.3364652395248413
Epoch 136 Validation Loss 0.3461223840713501


 46%|████▌     | 137/300 [03:44<04:33,  1.68s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.3292873203754425
Epoch 137 Validation Loss 0.339992880821228


 46%|████▌     | 138/300 [03:46<04:35,  1.70s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.38683322072029114
Epoch 138 Validation Loss 0.37589210271835327


 46%|████▋     | 139/300 [03:49<05:09,  1.92s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.3302360475063324
Epoch 139 Validation Loss 0.3549908399581909
Epoch 140 Samples 8000 Step 124 Training Loss 0.4430118799209595
Epoch 140 Validation Loss 0.43866610527038574


 47%|████▋     | 141/300 [03:54<06:12,  2.34s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.3272852301597595
Epoch 141 Validation Loss 0.34083110094070435


 47%|████▋     | 142/300 [03:56<05:34,  2.12s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.3328404724597931
Epoch 142 Validation Loss 0.33559858798980713


 48%|████▊     | 143/300 [03:58<05:46,  2.21s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.31828245520591736
Epoch 143 Validation Loss 0.3248765170574188


 48%|████▊     | 144/300 [04:01<05:45,  2.22s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.38448289036750793
Epoch 144 Validation Loss 0.39126530289649963


 48%|████▊     | 145/300 [04:02<05:26,  2.11s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.5613569617271423
Epoch 145 Validation Loss 0.5572143197059631


 49%|████▊     | 146/300 [04:04<05:05,  1.99s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.46832141280174255
Epoch 146 Validation Loss 0.4916287064552307


 49%|████▉     | 147/300 [04:06<05:10,  2.03s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.5748740434646606
Epoch 147 Validation Loss 0.5878748893737793


 49%|████▉     | 148/300 [04:08<05:18,  2.09s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.6327006220817566
Epoch 148 Validation Loss 0.5526881814002991


 50%|████▉     | 149/300 [04:11<05:53,  2.34s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.4315231442451477
Epoch 149 Validation Loss 0.43739601969718933


 50%|█████     | 150/300 [04:13<05:17,  2.11s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.4180821478366852
Epoch 150 Validation Loss 0.43593230843544006


 50%|█████     | 151/300 [04:15<04:59,  2.01s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.41266703605651855
Epoch 151 Validation Loss 0.42500054836273193


 51%|█████     | 152/300 [04:16<04:39,  1.89s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.6060639023780823
Epoch 152 Validation Loss 0.7077091336250305


 51%|█████     | 153/300 [04:18<04:26,  1.81s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.43876197934150696
Epoch 153 Validation Loss 0.4118640124797821


 51%|█████▏    | 154/300 [04:20<04:16,  1.75s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.4885779321193695
Epoch 154 Validation Loss 0.514654815196991


 52%|█████▏    | 155/300 [04:21<04:07,  1.71s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.36473914980888367
Epoch 155 Validation Loss 0.3796944320201874


 52%|█████▏    | 156/300 [04:23<04:08,  1.73s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.6124070286750793
Epoch 156 Validation Loss 0.701535165309906


 52%|█████▏    | 157/300 [04:25<04:02,  1.70s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.616700291633606
Epoch 157 Validation Loss 0.5852419137954712


 53%|█████▎    | 158/300 [04:26<03:56,  1.67s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.3638095259666443
Epoch 158 Validation Loss 0.3729568421840668


 53%|█████▎    | 159/300 [04:28<03:59,  1.70s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.38395777344703674
Epoch 159 Validation Loss 0.37774038314819336


 53%|█████▎    | 160/300 [04:30<03:55,  1.68s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.36206430196762085
Epoch 160 Validation Loss 0.3739040195941925


 54%|█████▎    | 161/300 [04:32<04:06,  1.77s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.4621464014053345
Epoch 161 Validation Loss 0.4824662506580353


 54%|█████▍    | 162/300 [04:33<03:58,  1.73s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.3705466389656067
Epoch 162 Validation Loss 0.37302348017692566


 54%|█████▍    | 163/300 [04:35<03:52,  1.70s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.4032641053199768
Epoch 163 Validation Loss 0.37718069553375244


 55%|█████▍    | 164/300 [04:37<03:49,  1.69s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.41375693678855896
Epoch 164 Validation Loss 0.4539587199687958


 55%|█████▌    | 165/300 [04:38<03:46,  1.68s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.7006784081459045
Epoch 165 Validation Loss 0.5401926040649414


 55%|█████▌    | 166/300 [04:40<04:04,  1.82s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.43473711609840393
Epoch 166 Validation Loss 0.45276233553886414


 56%|█████▌    | 167/300 [04:42<03:55,  1.77s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.35312122106552124
Epoch 167 Validation Loss 0.36399057507514954


 56%|█████▌    | 168/300 [04:44<03:50,  1.75s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.40987351536750793
Epoch 168 Validation Loss 0.3848418593406677


 56%|█████▋    | 169/300 [04:45<03:40,  1.68s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.35520780086517334
Epoch 169 Validation Loss 0.3625953197479248


 57%|█████▋    | 170/300 [04:47<03:35,  1.66s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.34846773743629456
Epoch 170 Validation Loss 0.3562581539154053


 57%|█████▋    | 171/300 [04:49<03:38,  1.70s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.4335825443267822
Epoch 171 Validation Loss 0.5271176695823669


 57%|█████▋    | 172/300 [04:50<03:38,  1.71s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.35201022028923035
Epoch 172 Validation Loss 0.35988980531692505


 58%|█████▊    | 173/300 [04:52<03:37,  1.71s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.3642919361591339
Epoch 173 Validation Loss 0.358832448720932


 58%|█████▊    | 174/300 [04:54<03:37,  1.73s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.5100635290145874
Epoch 174 Validation Loss 0.4120691120624542


 58%|█████▊    | 175/300 [04:55<03:32,  1.70s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.3496306836605072
Epoch 175 Validation Loss 0.3559868633747101


 59%|█████▊    | 176/300 [04:57<03:33,  1.72s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.378133088350296
Epoch 176 Validation Loss 0.37891024351119995


 59%|█████▉    | 177/300 [04:59<03:26,  1.68s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.34156325459480286
Epoch 177 Validation Loss 0.3740158975124359


 59%|█████▉    | 178/300 [05:00<03:19,  1.63s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.35372161865234375
Epoch 178 Validation Loss 0.34947866201400757


 60%|█████▉    | 179/300 [05:02<03:17,  1.63s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.3398120403289795
Epoch 179 Validation Loss 0.353211373090744


 60%|██████    | 180/300 [05:04<03:20,  1.67s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.33125990629196167
Epoch 180 Validation Loss 0.35922759771347046


 60%|██████    | 181/300 [05:05<03:19,  1.68s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.33438804745674133
Epoch 181 Validation Loss 0.34645575284957886


 61%|██████    | 182/300 [05:07<03:15,  1.65s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.3552033007144928
Epoch 182 Validation Loss 0.35726067423820496


 61%|██████    | 183/300 [05:09<03:19,  1.70s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.3402441740036011
Epoch 183 Validation Loss 0.3516283631324768


 61%|██████▏   | 184/300 [05:10<03:15,  1.68s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.35242608189582825
Epoch 184 Validation Loss 0.3516921401023865


 62%|██████▏   | 185/300 [05:12<03:19,  1.74s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.35391703248023987
Epoch 185 Validation Loss 0.3455154597759247


 62%|██████▏   | 186/300 [05:14<03:15,  1.71s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.366920530796051
Epoch 186 Validation Loss 0.37021324038505554


 62%|██████▏   | 187/300 [05:16<03:10,  1.69s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.481530100107193
Epoch 187 Validation Loss 0.40955865383148193


 63%|██████▎   | 188/300 [05:18<03:32,  1.89s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.33775436878204346
Epoch 188 Validation Loss 0.3482070863246918


 63%|██████▎   | 189/300 [05:20<03:21,  1.81s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.3487270474433899
Epoch 189 Validation Loss 0.34875425696372986


 63%|██████▎   | 190/300 [05:21<03:11,  1.74s/it]

Epoch 190 Samples 8000 Step 124 Training Loss 0.3730996251106262
Epoch 190 Validation Loss 0.35927122831344604


 64%|██████▎   | 191/300 [05:23<03:06,  1.71s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.36999765038490295
Epoch 191 Validation Loss 0.41171860694885254


 64%|██████▍   | 192/300 [05:25<03:13,  1.80s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.3426568806171417
Epoch 192 Validation Loss 0.3480720520019531


 64%|██████▍   | 193/300 [05:27<03:09,  1.77s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.48984330892562866
Epoch 193 Validation Loss 0.598501980304718


 65%|██████▍   | 194/300 [05:28<03:10,  1.80s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.331409752368927
Epoch 194 Validation Loss 0.34308305382728577


 65%|██████▌   | 195/300 [05:30<03:06,  1.77s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.440155029296875
Epoch 195 Validation Loss 0.4007446765899658


 65%|██████▌   | 196/300 [05:32<03:00,  1.74s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.36091241240501404
Epoch 196 Validation Loss 0.3539436161518097


 66%|██████▌   | 197/300 [05:33<02:56,  1.71s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.33413589000701904
Epoch 197 Validation Loss 0.3393700122833252


 66%|██████▌   | 198/300 [05:35<03:01,  1.78s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.3374561071395874
Epoch 198 Validation Loss 0.33801087737083435


 66%|██████▋   | 199/300 [05:37<02:52,  1.71s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.3358943462371826
Epoch 199 Validation Loss 0.3420788645744324


 67%|██████▋   | 200/300 [05:38<02:45,  1.65s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.3478846251964569
Epoch 200 Validation Loss 0.34666991233825684


 67%|██████▋   | 201/300 [05:40<02:44,  1.66s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.35807764530181885
Epoch 201 Validation Loss 0.35390543937683105


 67%|██████▋   | 202/300 [05:42<02:39,  1.63s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.34367111325263977
Epoch 202 Validation Loss 0.3423166871070862


 68%|██████▊   | 203/300 [05:44<02:46,  1.71s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.3442310690879822
Epoch 203 Validation Loss 0.339063823223114


 68%|██████▊   | 204/300 [05:45<02:39,  1.66s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.4419044554233551
Epoch 204 Validation Loss 0.5023391842842102


 68%|██████▊   | 205/300 [05:47<02:33,  1.61s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.4020549952983856
Epoch 205 Validation Loss 0.4483155310153961


 69%|██████▊   | 206/300 [05:48<02:34,  1.64s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.4012485444545746
Epoch 206 Validation Loss 0.4204237163066864


 69%|██████▉   | 207/300 [05:50<02:36,  1.68s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.3369618356227875
Epoch 207 Validation Loss 0.3375195860862732


 69%|██████▉   | 208/300 [05:52<02:31,  1.65s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.342664510011673
Epoch 208 Validation Loss 0.34221863746643066


 70%|██████▉   | 209/300 [05:53<02:27,  1.62s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.46307268738746643
Epoch 209 Validation Loss 0.45828133821487427


 70%|███████   | 210/300 [05:55<02:25,  1.61s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.38188308477401733
Epoch 210 Validation Loss 0.3787454068660736


 70%|███████   | 211/300 [05:56<02:23,  1.61s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.33571869134902954
Epoch 211 Validation Loss 0.3559734523296356


 71%|███████   | 212/300 [05:58<02:26,  1.66s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.35869306325912476
Epoch 212 Validation Loss 0.35621365904808044


 71%|███████   | 213/300 [06:00<02:25,  1.68s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.5576296448707581
Epoch 213 Validation Loss 0.4653277099132538


 71%|███████▏  | 214/300 [06:02<02:23,  1.67s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.3601130545139313
Epoch 214 Validation Loss 0.3877153694629669


 72%|███████▏  | 215/300 [06:03<02:21,  1.66s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.33780547976493835
Epoch 215 Validation Loss 0.3396906852722168


 72%|███████▏  | 216/300 [06:05<02:19,  1.66s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.33873099088668823
Epoch 216 Validation Loss 0.34330010414123535


 72%|███████▏  | 217/300 [06:07<02:23,  1.73s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.372008353471756
Epoch 217 Validation Loss 0.36007824540138245


 73%|███████▎  | 218/300 [06:08<02:20,  1.72s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.3418913781642914
Epoch 218 Validation Loss 0.3394224941730499


 73%|███████▎  | 219/300 [06:10<02:17,  1.69s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.336177259683609
Epoch 219 Validation Loss 0.33766743540763855


 73%|███████▎  | 220/300 [06:12<02:14,  1.69s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.32905134558677673
Epoch 220 Validation Loss 0.3366106450557709


 74%|███████▎  | 221/300 [06:13<02:13,  1.69s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.34765520691871643
Epoch 221 Validation Loss 0.3465663194656372


 74%|███████▍  | 222/300 [06:15<02:16,  1.75s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.33520984649658203
Epoch 222 Validation Loss 0.3394903838634491


 74%|███████▍  | 223/300 [06:17<02:13,  1.73s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.3632745146751404
Epoch 223 Validation Loss 0.36265525221824646


 75%|███████▍  | 224/300 [06:19<02:11,  1.73s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.3812256455421448
Epoch 224 Validation Loss 0.40337634086608887


 75%|███████▌  | 225/300 [06:20<02:09,  1.73s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.41295793652534485
Epoch 225 Validation Loss 0.5089616179466248


 75%|███████▌  | 226/300 [06:22<02:10,  1.76s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.34863799810409546
Epoch 226 Validation Loss 0.34363335371017456


 76%|███████▌  | 227/300 [06:24<02:06,  1.74s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.5561156868934631
Epoch 227 Validation Loss 1.0048483610153198


 76%|███████▌  | 228/300 [06:26<02:03,  1.72s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.3670678734779358
Epoch 228 Validation Loss 0.36447784304618835


 76%|███████▋  | 229/300 [06:27<01:59,  1.69s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.3531588613986969
Epoch 229 Validation Loss 0.36739838123321533


 77%|███████▋  | 230/300 [06:29<01:55,  1.66s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.6899505853652954
Epoch 230 Validation Loss 0.6381295919418335


 77%|███████▋  | 231/300 [06:31<02:00,  1.74s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.32799628376960754
Epoch 231 Validation Loss 0.33880332112312317


 77%|███████▋  | 232/300 [06:32<01:56,  1.72s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.334559828042984
Epoch 232 Validation Loss 0.3398040533065796


 78%|███████▊  | 233/300 [06:34<01:54,  1.70s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.3495519757270813
Epoch 233 Validation Loss 0.3466726541519165


 78%|███████▊  | 234/300 [06:36<01:52,  1.70s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.33438408374786377
Epoch 234 Validation Loss 0.3360637128353119


 78%|███████▊  | 235/300 [06:38<01:50,  1.70s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.3366066515445709
Epoch 235 Validation Loss 0.33136487007141113


 79%|███████▊  | 236/300 [06:39<01:51,  1.74s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.3393726646900177
Epoch 236 Validation Loss 0.35283324122428894


 79%|███████▉  | 237/300 [06:41<01:46,  1.69s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.33052653074264526
Epoch 237 Validation Loss 0.3309156000614166


 79%|███████▉  | 238/300 [06:42<01:42,  1.65s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.41708406805992126
Epoch 238 Validation Loss 0.4643150568008423


 80%|███████▉  | 239/300 [06:44<01:39,  1.62s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.33302730321884155
Epoch 239 Validation Loss 0.33697837591171265


 80%|████████  | 240/300 [06:46<01:40,  1.68s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.33934515714645386
Epoch 240 Validation Loss 0.34044235944747925


 80%|████████  | 241/300 [06:47<01:37,  1.66s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.3375924229621887
Epoch 241 Validation Loss 0.33265841007232666


 81%|████████  | 242/300 [06:49<01:34,  1.64s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.3562142848968506
Epoch 242 Validation Loss 0.350904643535614


 81%|████████  | 243/300 [06:51<01:33,  1.64s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.36718904972076416
Epoch 243 Validation Loss 0.3670312464237213


 81%|████████▏ | 244/300 [06:53<01:35,  1.71s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.3452565670013428
Epoch 244 Validation Loss 0.3644135296344757


 82%|████████▏ | 245/300 [06:54<01:32,  1.68s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.3272363543510437
Epoch 245 Validation Loss 0.3413750231266022


 82%|████████▏ | 246/300 [06:56<01:29,  1.65s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.36415359377861023
Epoch 246 Validation Loss 0.3635399341583252


 82%|████████▏ | 247/300 [06:57<01:26,  1.64s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.3285941779613495
Epoch 247 Validation Loss 0.33455246686935425


 83%|████████▎ | 248/300 [06:59<01:28,  1.71s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.3359427750110626
Epoch 248 Validation Loss 0.3321170508861542


 83%|████████▎ | 249/300 [07:01<01:26,  1.70s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.3262006938457489
Epoch 249 Validation Loss 0.32784539461135864


 83%|████████▎ | 250/300 [07:03<01:23,  1.68s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.32658064365386963
Epoch 250 Validation Loss 0.3380490243434906


 84%|████████▎ | 251/300 [07:04<01:22,  1.69s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.3330293595790863
Epoch 251 Validation Loss 0.3302496373653412


 84%|████████▍ | 252/300 [07:06<01:24,  1.76s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.32877638936042786
Epoch 252 Validation Loss 0.3347083032131195


 84%|████████▍ | 253/300 [07:08<01:23,  1.78s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.3680652379989624
Epoch 253 Validation Loss 0.36291053891181946


 85%|████████▍ | 254/300 [07:10<01:19,  1.73s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.4943335950374603
Epoch 254 Validation Loss 0.49504950642585754


 85%|████████▌ | 255/300 [07:11<01:16,  1.71s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.33386385440826416
Epoch 255 Validation Loss 0.3241675794124603


 85%|████████▌ | 256/300 [07:13<01:17,  1.76s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.42177245020866394
Epoch 256 Validation Loss 0.42248618602752686


 86%|████████▌ | 257/300 [07:15<01:14,  1.74s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.3301869034767151
Epoch 257 Validation Loss 0.33185485005378723


 86%|████████▌ | 258/300 [07:17<01:11,  1.71s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.31970205903053284
Epoch 258 Validation Loss 0.324435830116272


 86%|████████▋ | 259/300 [07:18<01:09,  1.69s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.36499935388565063
Epoch 259 Validation Loss 0.3736427128314972


 87%|████████▋ | 260/300 [07:20<01:06,  1.67s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.3513523042201996
Epoch 260 Validation Loss 0.362238347530365


 87%|████████▋ | 261/300 [07:22<01:07,  1.73s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.3224502205848694
Epoch 261 Validation Loss 0.32396841049194336


 87%|████████▋ | 262/300 [07:23<01:06,  1.75s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.3162775933742523
Epoch 262 Validation Loss 0.3210848867893219


 88%|████████▊ | 263/300 [07:25<01:04,  1.75s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.38751375675201416
Epoch 263 Validation Loss 0.4570141136646271


 88%|████████▊ | 264/300 [07:27<01:02,  1.74s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.37760990858078003
Epoch 264 Validation Loss 0.3612602949142456


 88%|████████▊ | 265/300 [07:29<01:02,  1.78s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.33992457389831543
Epoch 265 Validation Loss 0.3355276584625244


 89%|████████▊ | 266/300 [07:31<01:00,  1.77s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.3222964107990265
Epoch 266 Validation Loss 0.32892921566963196


 89%|████████▉ | 267/300 [07:33<01:01,  1.86s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.47829627990722656
Epoch 267 Validation Loss 0.395999938249588


 89%|████████▉ | 268/300 [07:35<01:04,  2.01s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.32742932438850403
Epoch 268 Validation Loss 0.32638078927993774


 90%|████████▉ | 269/300 [07:37<01:06,  2.15s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.3216314911842346
Epoch 269 Validation Loss 0.3279707133769989


 90%|█████████ | 270/300 [07:39<00:59,  1.99s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.662793755531311
Epoch 270 Validation Loss 1.124639630317688


 90%|█████████ | 271/300 [07:41<00:54,  1.88s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.3282932937145233
Epoch 271 Validation Loss 0.3356866240501404


 91%|█████████ | 272/300 [07:42<00:50,  1.80s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.32208889722824097
Epoch 272 Validation Loss 0.3235909640789032


 91%|█████████ | 273/300 [07:44<00:47,  1.74s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.3299230933189392
Epoch 273 Validation Loss 0.3434221148490906


 91%|█████████▏| 274/300 [07:46<00:46,  1.78s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.33053913712501526
Epoch 274 Validation Loss 0.3300848603248596


 92%|█████████▏| 275/300 [07:48<00:44,  1.77s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.33815252780914307
Epoch 275 Validation Loss 0.3340865969657898


 92%|█████████▏| 276/300 [07:49<00:43,  1.81s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.6595844030380249
Epoch 276 Validation Loss 1.206538200378418


 92%|█████████▏| 277/300 [07:52<00:45,  1.97s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.3355481028556824
Epoch 277 Validation Loss 0.33629027009010315


 93%|█████████▎| 278/300 [07:54<00:44,  2.01s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.3220022916793823
Epoch 278 Validation Loss 0.32777008414268494


 93%|█████████▎| 279/300 [07:56<00:41,  1.95s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.3753357529640198
Epoch 279 Validation Loss 0.37064647674560547


 93%|█████████▎| 280/300 [07:57<00:37,  1.87s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.64372318983078
Epoch 280 Validation Loss 0.4975125193595886


 94%|█████████▎| 281/300 [07:59<00:34,  1.84s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.33386296033859253
Epoch 281 Validation Loss 0.33375081419944763


 94%|█████████▍| 282/300 [08:01<00:32,  1.80s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.527108907699585
Epoch 282 Validation Loss 0.4592062830924988


 94%|█████████▍| 283/300 [08:03<00:30,  1.81s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.3256731927394867
Epoch 283 Validation Loss 0.3270694315433502


 95%|█████████▍| 284/300 [08:04<00:28,  1.76s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.34829461574554443
Epoch 284 Validation Loss 0.34010234475135803


 95%|█████████▌| 285/300 [08:06<00:26,  1.75s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.346306711435318
Epoch 285 Validation Loss 0.34731242060661316


 95%|█████████▌| 286/300 [08:08<00:25,  1.84s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.3365516662597656
Epoch 286 Validation Loss 0.33315759897232056


 96%|█████████▌| 287/300 [08:10<00:24,  1.87s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.3517780005931854
Epoch 287 Validation Loss 0.3502773344516754


 96%|█████████▌| 288/300 [08:12<00:21,  1.78s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.3388010859489441
Epoch 288 Validation Loss 0.3264785408973694


 96%|█████████▋| 289/300 [08:13<00:19,  1.80s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.32757505774497986
Epoch 289 Validation Loss 0.32750484347343445


 97%|█████████▋| 290/300 [08:15<00:17,  1.76s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.3434799909591675
Epoch 290 Validation Loss 0.3318430185317993


 97%|█████████▋| 291/300 [08:17<00:16,  1.83s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.3251454532146454
Epoch 291 Validation Loss 0.3248966336250305


 97%|█████████▋| 292/300 [08:19<00:14,  1.82s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.333282470703125
Epoch 292 Validation Loss 0.33591124415397644


 98%|█████████▊| 293/300 [08:21<00:12,  1.79s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.3562408685684204
Epoch 293 Validation Loss 0.34498876333236694


 98%|█████████▊| 294/300 [08:22<00:10,  1.77s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.7155210971832275
Epoch 294 Validation Loss 0.44284069538116455


 98%|█████████▊| 295/300 [08:24<00:09,  1.85s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.3409648537635803
Epoch 295 Validation Loss 0.3373817205429077


 99%|█████████▊| 296/300 [08:26<00:07,  1.79s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.3688375651836395
Epoch 296 Validation Loss 0.3646612763404846


 99%|█████████▉| 297/300 [08:29<00:06,  2.03s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.36822009086608887
Epoch 297 Validation Loss 0.3794456124305725


 99%|█████████▉| 298/300 [08:31<00:04,  2.06s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.35647720098495483
Epoch 298 Validation Loss 0.34663712978363037


100%|█████████▉| 299/300 [08:33<00:02,  2.04s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.38891327381134033
Epoch 299 Validation Loss 0.3617330491542816


100%|██████████| 300/300 [08:34<00:00,  1.72s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.3869464099407196
Epoch 300 Validation Loss 0.5135846138000488


0,1
epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇█████
samples,▄▄▇▇▅▆▃▁▆▁▇▂▆▆▁▆▄▅▅▂▄▆▇▂▇▄▇█▇▆▆▂▃▅▂▅▂▄▅█
train_loss,███▇▅▄▅▄▄▄▆▄▂▃▂▄▂▂▂▂▃▃▂▂▂▃▄▂▂▂▂▁▂▁▆▃▁▁▁▂
val_loss,███▅▅▅▄▄▄▄▃▂▂▂▂▂▂▁▂▆▁▂▂▂▃▁▅▁▂▁▁▁▁▁▂▁▁▁▁▄

0,1
epoch,300.0
samples,8000.0
train_loss,0.38695
val_loss,0.51358


# D1 - MLP + LN

In [98]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=4,
            d_mlp=16,
            lr=0.05,
            normalization_type='LN',
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='D1'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:01<05:20,  1.07s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6242347955703735
Epoch 1 Validation Loss 0.6167828440666199


  1%|          | 2/300 [00:02<04:58,  1.00s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.5873025059700012
Epoch 2 Validation Loss 0.5860589742660522


  1%|          | 3/300 [00:02<04:48,  1.03it/s]

Epoch 3 Samples 8000 Step 124 Training Loss 0.556725025177002
Epoch 3 Validation Loss 0.5459794998168945


  1%|▏         | 4/300 [00:04<05:05,  1.03s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.5128946900367737
Epoch 4 Validation Loss 0.5048330426216125


  2%|▏         | 5/300 [00:05<05:41,  1.16s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.4780747890472412
Epoch 5 Validation Loss 0.47816500067710876


  2%|▏         | 6/300 [00:06<05:41,  1.16s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.4530395567417145
Epoch 6 Validation Loss 0.45158204436302185


  2%|▏         | 7/300 [00:07<05:40,  1.16s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.4305723309516907
Epoch 7 Validation Loss 0.42059317231178284


  3%|▎         | 8/300 [00:08<05:40,  1.16s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.425962895154953
Epoch 8 Validation Loss 0.4161887764930725


  3%|▎         | 9/300 [00:10<06:30,  1.34s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.7723873853683472
Epoch 9 Validation Loss 0.7226327657699585


  3%|▎         | 10/300 [00:12<07:00,  1.45s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.5688483119010925
Epoch 10 Validation Loss 0.568030834197998
Epoch 11 Samples 8000 Step 124 Training Loss 0.5254412293434143
Epoch 11 Validation Loss 0.5579948425292969


  4%|▍         | 12/300 [00:15<07:28,  1.56s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.48706576228141785
Epoch 12 Validation Loss 0.4807043969631195


  4%|▍         | 13/300 [00:17<07:16,  1.52s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.45964741706848145
Epoch 13 Validation Loss 0.4647713303565979


  5%|▍         | 14/300 [00:18<07:23,  1.55s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.45216190814971924
Epoch 14 Validation Loss 0.4309268891811371


  5%|▌         | 15/300 [00:20<07:11,  1.51s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.4580436646938324
Epoch 15 Validation Loss 0.4269713759422302


  5%|▌         | 16/300 [00:21<07:06,  1.50s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.4286077618598938
Epoch 16 Validation Loss 0.4084683656692505


  6%|▌         | 17/300 [00:23<07:10,  1.52s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.40248602628707886
Epoch 17 Validation Loss 0.4062982201576233


  6%|▌         | 18/300 [00:24<07:09,  1.52s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.4444783926010132
Epoch 18 Validation Loss 0.4145708978176117


  6%|▋         | 19/300 [00:26<07:08,  1.53s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.3945063352584839
Epoch 19 Validation Loss 0.37696993350982666


  7%|▋         | 20/300 [00:27<06:38,  1.42s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.3823471665382385
Epoch 20 Validation Loss 0.3729195296764374


  7%|▋         | 21/300 [00:28<06:19,  1.36s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.4199431240558624
Epoch 21 Validation Loss 0.4144955575466156


  7%|▋         | 22/300 [00:30<07:08,  1.54s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.38165080547332764
Epoch 22 Validation Loss 0.3794635832309723


  8%|▊         | 23/300 [00:32<07:04,  1.53s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.3781351149082184
Epoch 23 Validation Loss 0.3709139823913574


  8%|▊         | 24/300 [00:33<07:05,  1.54s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.3586394488811493
Epoch 24 Validation Loss 0.36258596181869507


  8%|▊         | 25/300 [00:34<06:35,  1.44s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.3672923743724823
Epoch 25 Validation Loss 0.3688311278820038


  9%|▊         | 26/300 [00:36<06:57,  1.52s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.3633536696434021
Epoch 26 Validation Loss 0.3572222590446472


  9%|▉         | 27/300 [00:38<06:55,  1.52s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.3424663245677948
Epoch 27 Validation Loss 0.34205469489097595


  9%|▉         | 28/300 [00:39<06:32,  1.44s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.33779874444007874
Epoch 28 Validation Loss 0.34508568048477173


 10%|▉         | 29/300 [00:41<06:40,  1.48s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.3460845649242401
Epoch 29 Validation Loss 0.3378312289714813


 10%|█         | 30/300 [00:42<06:23,  1.42s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.34992796182632446
Epoch 30 Validation Loss 0.34680792689323425


 10%|█         | 31/300 [00:44<06:48,  1.52s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.3358473479747772
Epoch 31 Validation Loss 0.33594071865081787


 11%|█         | 32/300 [00:45<06:47,  1.52s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.3147548735141754
Epoch 32 Validation Loss 0.32269397377967834


 11%|█         | 33/300 [00:46<06:24,  1.44s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.3163585364818573
Epoch 33 Validation Loss 0.31405332684516907


 11%|█▏        | 34/300 [00:48<06:38,  1.50s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.3172552287578583
Epoch 34 Validation Loss 0.31582072377204895


 12%|█▏        | 35/300 [00:49<06:37,  1.50s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.3252098262310028
Epoch 35 Validation Loss 0.31864041090011597


 12%|█▏        | 36/300 [00:51<06:52,  1.56s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.32017356157302856
Epoch 36 Validation Loss 0.3426773250102997


 12%|█▏        | 37/300 [00:53<06:37,  1.51s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.3063649535179138
Epoch 37 Validation Loss 0.29722654819488525


 13%|█▎        | 38/300 [00:54<06:43,  1.54s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.29791581630706787
Epoch 38 Validation Loss 0.2945207357406616


 13%|█▎        | 39/300 [00:56<06:41,  1.54s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.4582941234111786
Epoch 39 Validation Loss 0.41707807779312134


 13%|█▎        | 40/300 [00:57<06:33,  1.51s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.2978211045265198
Epoch 40 Validation Loss 0.2868572175502777


 14%|█▎        | 41/300 [00:58<06:12,  1.44s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.2806989848613739
Epoch 41 Validation Loss 0.28274765610694885


 14%|█▍        | 42/300 [01:00<06:07,  1.42s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.3079104721546173
Epoch 42 Validation Loss 0.3066377341747284


 14%|█▍        | 43/300 [01:01<06:15,  1.46s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.2859407365322113
Epoch 43 Validation Loss 0.28910863399505615


 15%|█▍        | 44/300 [01:03<06:32,  1.53s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.28174594044685364
Epoch 44 Validation Loss 0.27667638659477234


 15%|█▌        | 45/300 [01:05<06:25,  1.51s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.2838239371776581
Epoch 45 Validation Loss 0.27627140283584595


 15%|█▌        | 46/300 [01:06<06:06,  1.44s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.27182427048683167
Epoch 46 Validation Loss 0.27033373713493347


 16%|█▌        | 47/300 [01:07<06:08,  1.46s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.27349603176116943
Epoch 47 Validation Loss 0.27367058396339417


 16%|█▌        | 48/300 [01:08<05:42,  1.36s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.29715147614479065
Epoch 48 Validation Loss 0.3081386089324951


 16%|█▋        | 49/300 [01:10<05:36,  1.34s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.2692581117153168
Epoch 49 Validation Loss 0.26632580161094666


 17%|█▋        | 50/300 [01:11<05:51,  1.40s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.268483430147171
Epoch 50 Validation Loss 0.2762598693370819


 17%|█▋        | 51/300 [01:13<06:09,  1.48s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.2622704803943634
Epoch 51 Validation Loss 0.2650584280490875


 17%|█▋        | 52/300 [01:15<06:25,  1.55s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.26194873452186584
Epoch 52 Validation Loss 0.26270240545272827


 18%|█▊        | 53/300 [01:17<06:56,  1.69s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.2696714699268341
Epoch 53 Validation Loss 0.2651872932910919


 18%|█▊        | 54/300 [01:18<06:44,  1.64s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.270717978477478
Epoch 54 Validation Loss 0.27135369181632996


 18%|█▊        | 55/300 [01:20<06:36,  1.62s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.2642287313938141
Epoch 55 Validation Loss 0.2632496953010559


 19%|█▊        | 56/300 [01:21<06:20,  1.56s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.26492807269096375
Epoch 56 Validation Loss 0.2638801634311676


 19%|█▉        | 57/300 [01:23<06:18,  1.56s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.2615455389022827
Epoch 57 Validation Loss 0.25980934500694275


 19%|█▉        | 58/300 [01:24<06:22,  1.58s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.25949227809906006
Epoch 58 Validation Loss 0.25877735018730164


 20%|█▉        | 59/300 [01:26<05:52,  1.46s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.2584884762763977
Epoch 59 Validation Loss 0.258806973695755


 20%|██        | 60/300 [01:27<05:28,  1.37s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.25632092356681824
Epoch 60 Validation Loss 0.2587151527404785


 20%|██        | 61/300 [01:28<05:28,  1.38s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.2603621482849121
Epoch 61 Validation Loss 0.2584012448787689


 21%|██        | 62/300 [01:29<05:08,  1.30s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.25894349813461304
Epoch 62 Validation Loss 0.257110059261322


 21%|██        | 63/300 [01:31<05:27,  1.38s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.2582022547721863
Epoch 63 Validation Loss 0.25808995962142944


 21%|██▏       | 64/300 [01:32<05:18,  1.35s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.2601415514945984
Epoch 64 Validation Loss 0.2603515386581421


 22%|██▏       | 65/300 [01:34<05:40,  1.45s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.25820115208625793
Epoch 65 Validation Loss 0.2573982775211334


 22%|██▏       | 66/300 [01:35<05:18,  1.36s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.2635380029678345
Epoch 66 Validation Loss 0.26280224323272705


 22%|██▏       | 67/300 [01:37<05:47,  1.49s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.3333728015422821
Epoch 67 Validation Loss 0.3276514708995819


 23%|██▎       | 68/300 [01:38<05:21,  1.39s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.28117215633392334
Epoch 68 Validation Loss 0.28241345286369324


 23%|██▎       | 69/300 [01:39<05:08,  1.33s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.2670848071575165
Epoch 69 Validation Loss 0.26848217844963074


 23%|██▎       | 70/300 [01:41<05:42,  1.49s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.29885369539260864
Epoch 70 Validation Loss 0.3218170404434204


 24%|██▎       | 71/300 [01:42<05:28,  1.44s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.2663422226905823
Epoch 71 Validation Loss 0.2675253450870514


 24%|██▍       | 72/300 [01:44<05:29,  1.44s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.30629003047943115
Epoch 72 Validation Loss 0.31378576159477234


 24%|██▍       | 73/300 [01:45<05:42,  1.51s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.2756924331188202
Epoch 73 Validation Loss 0.27485716342926025


 25%|██▍       | 74/300 [01:47<05:43,  1.52s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.26865217089653015
Epoch 74 Validation Loss 0.26482388377189636


 25%|██▌       | 75/300 [01:49<05:49,  1.55s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.2742561995983124
Epoch 75 Validation Loss 0.26484033465385437


 25%|██▌       | 76/300 [01:50<05:50,  1.57s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.2658599019050598
Epoch 76 Validation Loss 0.26836228370666504


 26%|██▌       | 77/300 [01:52<05:47,  1.56s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.2629699110984802
Epoch 77 Validation Loss 0.26134252548217773


 26%|██▌       | 78/300 [01:53<05:42,  1.54s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.2600238621234894
Epoch 78 Validation Loss 0.26076748967170715


 26%|██▋       | 79/300 [01:54<05:25,  1.47s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.26698148250579834
Epoch 79 Validation Loss 0.2657483220100403


 27%|██▋       | 80/300 [01:55<04:54,  1.34s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.2627975642681122
Epoch 80 Validation Loss 0.2605353891849518


 27%|██▋       | 81/300 [01:56<04:30,  1.24s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.27353379130363464
Epoch 81 Validation Loss 0.2652846872806549


 27%|██▋       | 82/300 [01:58<04:53,  1.35s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.2613382339477539
Epoch 82 Validation Loss 0.26051509380340576


 28%|██▊       | 83/300 [02:00<05:01,  1.39s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.2586028277873993
Epoch 83 Validation Loss 0.2624695301055908


 28%|██▊       | 84/300 [02:01<05:11,  1.44s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.26443716883659363
Epoch 84 Validation Loss 0.2640153765678406
Epoch 85 Samples 8000 Step 124 Training Loss 0.2557629644870758


 28%|██▊       | 85/300 [02:03<05:32,  1.55s/it]

Epoch 85 Validation Loss 0.2594450116157532


 29%|██▊       | 86/300 [02:05<05:56,  1.67s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.2608494460582733
Epoch 86 Validation Loss 0.26201772689819336


 29%|██▉       | 87/300 [02:07<06:06,  1.72s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.26210105419158936
Epoch 87 Validation Loss 0.26405975222587585


 29%|██▉       | 88/300 [02:08<05:54,  1.67s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.25812089443206787
Epoch 88 Validation Loss 0.25924965739250183


 30%|██▉       | 89/300 [02:10<05:44,  1.63s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.2638198137283325
Epoch 89 Validation Loss 0.26486432552337646


 30%|███       | 90/300 [02:11<05:39,  1.62s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.25985491275787354
Epoch 90 Validation Loss 0.26401084661483765


 30%|███       | 91/300 [02:14<06:12,  1.78s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.25942596793174744
Epoch 91 Validation Loss 0.26011741161346436


 31%|███       | 92/300 [02:15<06:08,  1.77s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.2546073794364929
Epoch 92 Validation Loss 0.2591080963611603


 31%|███       | 93/300 [02:16<05:27,  1.58s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.2563555836677551
Epoch 93 Validation Loss 0.2586123049259186


 31%|███▏      | 94/300 [02:18<04:59,  1.45s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.2565873861312866
Epoch 94 Validation Loss 0.2601763904094696


 32%|███▏      | 95/300 [02:19<05:09,  1.51s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.25690335035324097
Epoch 95 Validation Loss 0.2573361098766327


 32%|███▏      | 96/300 [02:21<05:11,  1.53s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.2571790814399719
Epoch 96 Validation Loss 0.25794127583503723


 32%|███▏      | 97/300 [02:22<05:10,  1.53s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.26577162742614746
Epoch 97 Validation Loss 0.2688312232494354


 33%|███▎      | 98/300 [02:24<05:09,  1.53s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.27268052101135254
Epoch 98 Validation Loss 0.2710358500480652


 33%|███▎      | 99/300 [02:25<04:59,  1.49s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.2565511465072632
Epoch 99 Validation Loss 0.25778457522392273


 33%|███▎      | 100/300 [02:27<04:48,  1.44s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.257941335439682
Epoch 100 Validation Loss 0.2575790584087372


 34%|███▎      | 101/300 [02:28<04:28,  1.35s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.2574377655982971
Epoch 101 Validation Loss 0.25870153307914734


 34%|███▍      | 102/300 [02:29<04:30,  1.37s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.2662979066371918
Epoch 102 Validation Loss 0.2643994987010956


 34%|███▍      | 103/300 [02:31<04:39,  1.42s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.25769197940826416
Epoch 103 Validation Loss 0.2598350942134857


 35%|███▍      | 104/300 [02:32<04:56,  1.51s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.2573195695877075
Epoch 104 Validation Loss 0.25762614607810974


 35%|███▌      | 105/300 [02:34<04:42,  1.45s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.2657793462276459
Epoch 105 Validation Loss 0.26462894678115845


 35%|███▌      | 106/300 [02:35<04:39,  1.44s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.2571347951889038
Epoch 106 Validation Loss 0.2583748698234558


 36%|███▌      | 107/300 [02:36<04:11,  1.30s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.25906965136528015
Epoch 107 Validation Loss 0.2567600905895233


 36%|███▌      | 108/300 [02:37<04:02,  1.26s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.2565038204193115
Epoch 108 Validation Loss 0.2576747238636017


 36%|███▋      | 109/300 [02:39<04:05,  1.29s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.25624898076057434
Epoch 109 Validation Loss 0.2607503831386566


 37%|███▋      | 110/300 [02:40<04:23,  1.39s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.2544766068458557
Epoch 110 Validation Loss 0.25690439343452454


 37%|███▋      | 111/300 [02:42<04:35,  1.46s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.2615934908390045
Epoch 111 Validation Loss 0.25881269574165344


 37%|███▋      | 112/300 [02:44<05:30,  1.76s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.2575293481349945
Epoch 112 Validation Loss 0.25716084241867065


 38%|███▊      | 113/300 [02:46<05:21,  1.72s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.2567434012889862
Epoch 113 Validation Loss 0.26176586747169495


 38%|███▊      | 114/300 [02:48<05:17,  1.70s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.257012277841568
Epoch 114 Validation Loss 0.2576090693473816


 38%|███▊      | 115/300 [02:49<05:05,  1.65s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.25675591826438904
Epoch 115 Validation Loss 0.2584208548069


 39%|███▊      | 116/300 [02:51<04:57,  1.61s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.25856754183769226
Epoch 116 Validation Loss 0.2579249143600464


 39%|███▉      | 117/300 [02:52<04:52,  1.60s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.2580292224884033
Epoch 117 Validation Loss 0.25841814279556274


 39%|███▉      | 118/300 [02:54<04:37,  1.53s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.2560659646987915
Epoch 118 Validation Loss 0.26094695925712585


 40%|███▉      | 119/300 [02:55<04:25,  1.47s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.25340962409973145
Epoch 119 Validation Loss 0.2570439577102661


 40%|████      | 120/300 [02:56<03:58,  1.32s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.2572072148323059
Epoch 120 Validation Loss 0.25739794969558716


 40%|████      | 121/300 [02:57<03:42,  1.24s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.25675299763679504
Epoch 121 Validation Loss 0.25649333000183105


 41%|████      | 122/300 [02:58<03:50,  1.30s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.2587859332561493
Epoch 122 Validation Loss 0.2563549876213074


 41%|████      | 123/300 [03:00<04:08,  1.40s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.25550681352615356
Epoch 123 Validation Loss 0.26029646396636963


 41%|████▏     | 124/300 [03:02<04:17,  1.46s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.2618362009525299
Epoch 124 Validation Loss 0.2581373155117035


 42%|████▏     | 125/300 [03:03<04:21,  1.49s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.2551366090774536
Epoch 125 Validation Loss 0.2600845992565155


 42%|████▏     | 126/300 [03:05<04:18,  1.49s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.25622227787971497
Epoch 126 Validation Loss 0.2558728754520416


 42%|████▏     | 127/300 [03:06<04:19,  1.50s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.2560485899448395
Epoch 127 Validation Loss 0.2567976415157318


 43%|████▎     | 128/300 [03:09<05:10,  1.80s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.2574845552444458
Epoch 128 Validation Loss 0.2585422396659851


 43%|████▎     | 129/300 [03:11<05:06,  1.79s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.257215678691864
Epoch 129 Validation Loss 0.25654661655426025


 43%|████▎     | 130/300 [03:12<04:50,  1.71s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.2599960267543793
Epoch 130 Validation Loss 0.25713473558425903


 44%|████▎     | 131/300 [03:14<04:37,  1.64s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.25672629475593567
Epoch 131 Validation Loss 0.2561345398426056
Epoch 132 Samples 8000 Step 124 Training Loss 0.25547316670417786


 44%|████▍     | 132/300 [03:15<04:36,  1.64s/it]

Epoch 132 Validation Loss 0.2579188346862793


 44%|████▍     | 133/300 [03:16<04:02,  1.45s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.2550937533378601
Epoch 133 Validation Loss 0.25612181425094604


 45%|████▍     | 134/300 [03:17<03:46,  1.36s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.25626465678215027
Epoch 134 Validation Loss 0.25657984614372253


 45%|████▌     | 135/300 [03:19<03:53,  1.42s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.25446951389312744
Epoch 135 Validation Loss 0.2570539712905884


 45%|████▌     | 136/300 [03:20<03:51,  1.41s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.2648029923439026
Epoch 136 Validation Loss 0.26916995644569397


 46%|████▌     | 137/300 [03:22<03:58,  1.46s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.2621384561061859
Epoch 137 Validation Loss 0.26377496123313904


 46%|████▌     | 138/300 [03:24<04:06,  1.52s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.2574111521244049
Epoch 138 Validation Loss 0.25631991028785706


 46%|████▋     | 139/300 [03:25<04:00,  1.50s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.25637131929397583
Epoch 139 Validation Loss 0.25800448656082153


 47%|████▋     | 140/300 [03:26<03:35,  1.35s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.25871387124061584
Epoch 140 Validation Loss 0.264830082654953


 47%|████▋     | 141/300 [03:27<03:24,  1.29s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.25656622648239136
Epoch 141 Validation Loss 0.25564080476760864


 47%|████▋     | 142/300 [03:29<03:31,  1.34s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.25578024983406067
Epoch 142 Validation Loss 0.256481409072876


 48%|████▊     | 143/300 [03:30<03:45,  1.44s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.254850834608078
Epoch 143 Validation Loss 0.2608759105205536


 48%|████▊     | 144/300 [03:32<03:50,  1.48s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.2573834955692291
Epoch 144 Validation Loss 0.2565162181854248


 48%|████▊     | 145/300 [03:33<03:52,  1.50s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.2560250163078308
Epoch 145 Validation Loss 0.2577170729637146


 49%|████▊     | 146/300 [03:35<03:44,  1.46s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.25893011689186096
Epoch 146 Validation Loss 0.25686514377593994


 49%|████▉     | 147/300 [03:36<03:37,  1.42s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.25602397322654724
Epoch 147 Validation Loss 0.2571747899055481


 49%|████▉     | 148/300 [03:37<03:21,  1.33s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.26405075192451477
Epoch 148 Validation Loss 0.259734183549881


 50%|████▉     | 149/300 [03:39<03:26,  1.37s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.25669047236442566
Epoch 149 Validation Loss 0.25617972016334534


 50%|█████     | 150/300 [03:40<03:34,  1.43s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.2551618814468384
Epoch 150 Validation Loss 0.2556488513946533


 50%|█████     | 151/300 [03:42<03:35,  1.45s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.257223904132843
Epoch 151 Validation Loss 0.25641483068466187


 51%|█████     | 152/300 [03:43<03:43,  1.51s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.2563076615333557
Epoch 152 Validation Loss 0.25650152564048767


 51%|█████     | 153/300 [03:45<03:43,  1.52s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.26040711998939514
Epoch 153 Validation Loss 0.26306766271591187


 51%|█████▏    | 154/300 [03:46<03:44,  1.54s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.25553783774375916
Epoch 154 Validation Loss 0.256088525056839


 52%|█████▏    | 155/300 [03:48<03:55,  1.62s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.2555314600467682
Epoch 155 Validation Loss 0.2568455934524536


 52%|█████▏    | 156/300 [03:50<03:50,  1.60s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.2678314745426178
Epoch 156 Validation Loss 0.26002442836761475


 52%|█████▏    | 157/300 [03:51<03:45,  1.58s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.25645411014556885
Epoch 157 Validation Loss 0.25880300998687744


 53%|█████▎    | 158/300 [03:53<03:42,  1.57s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.25675156712532043
Epoch 158 Validation Loss 0.25656402111053467


 53%|█████▎    | 159/300 [03:54<03:33,  1.51s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.25599128007888794
Epoch 159 Validation Loss 0.2564271092414856


 53%|█████▎    | 160/300 [03:56<03:24,  1.46s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.26149436831474304
Epoch 160 Validation Loss 0.26208385825157166


 54%|█████▎    | 161/300 [03:57<03:15,  1.41s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.26279133558273315
Epoch 161 Validation Loss 0.25851890444755554


 54%|█████▍    | 162/300 [03:58<03:12,  1.39s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.256969153881073
Epoch 162 Validation Loss 0.2572137117385864


 54%|█████▍    | 163/300 [04:00<03:15,  1.43s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.25585800409317017
Epoch 163 Validation Loss 0.25629737973213196


 55%|█████▍    | 164/300 [04:01<03:22,  1.49s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.25556614995002747
Epoch 164 Validation Loss 0.2556185722351074


 55%|█████▌    | 165/300 [04:03<03:24,  1.52s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.25824782252311707
Epoch 165 Validation Loss 0.26284682750701904


 55%|█████▌    | 166/300 [04:05<03:31,  1.58s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.2575458586215973
Epoch 166 Validation Loss 0.2588200867176056


 56%|█████▌    | 167/300 [04:06<03:06,  1.40s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.2619302272796631
Epoch 167 Validation Loss 0.2637285888195038


 56%|█████▌    | 168/300 [04:07<02:48,  1.28s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.3017888367176056
Epoch 168 Validation Loss 0.2968607246875763


 56%|█████▋    | 169/300 [04:08<02:46,  1.27s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.25817596912384033
Epoch 169 Validation Loss 0.2599413990974426


 57%|█████▋    | 170/300 [04:10<03:09,  1.46s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.2551301419734955
Epoch 170 Validation Loss 0.2575334310531616


 57%|█████▋    | 171/300 [04:11<03:08,  1.46s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.26017922163009644
Epoch 171 Validation Loss 0.2579048275947571


 57%|█████▋    | 172/300 [04:13<03:10,  1.49s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.2721130847930908
Epoch 172 Validation Loss 0.26597750186920166
Epoch 173 Samples 8000 Step 124 Training Loss 0.2553579807281494
Epoch 173 Validation Loss 0.25656285881996155


 58%|█████▊    | 174/300 [04:16<03:25,  1.63s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.2554989457130432
Epoch 174 Validation Loss 0.25768348574638367
Epoch 175 Samples 8000 Step 124 Training Loss 0.25802749395370483


 58%|█████▊    | 175/300 [04:18<03:15,  1.57s/it]

Epoch 175 Validation Loss 0.25744134187698364


 59%|█████▊    | 176/300 [04:19<03:18,  1.60s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.2551155984401703
Epoch 176 Validation Loss 0.2561783194541931


 59%|█████▉    | 177/300 [04:21<03:13,  1.57s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.25622084736824036
Epoch 177 Validation Loss 0.2590603530406952


 59%|█████▉    | 178/300 [04:23<03:20,  1.64s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.26007282733917236
Epoch 178 Validation Loss 0.2585102915763855


 60%|█████▉    | 179/300 [04:24<03:00,  1.49s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.28782087564468384
Epoch 179 Validation Loss 0.30741235613822937


 60%|██████    | 180/300 [04:25<02:57,  1.48s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.2563115358352661
Epoch 180 Validation Loss 0.25609269738197327


 60%|██████    | 181/300 [04:27<03:01,  1.53s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.26020270586013794
Epoch 181 Validation Loss 0.2564961910247803


 61%|██████    | 182/300 [04:29<03:15,  1.66s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.25629565119743347
Epoch 182 Validation Loss 0.2557923495769501


 61%|██████    | 183/300 [04:30<03:07,  1.61s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.25602594017982483
Epoch 183 Validation Loss 0.2583615183830261


 61%|██████▏   | 184/300 [04:32<03:07,  1.61s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.2646818161010742
Epoch 184 Validation Loss 0.2588805854320526
Epoch 185 Samples 8000 Step 124 Training Loss 0.2552925646305084


 62%|██████▏   | 185/300 [04:34<03:04,  1.60s/it]

Epoch 185 Validation Loss 0.25562435388565063


 62%|██████▏   | 186/300 [04:36<03:16,  1.72s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.2542227804660797
Epoch 186 Validation Loss 0.25626373291015625


 62%|██████▏   | 187/300 [04:37<02:58,  1.58s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.25741297006607056
Epoch 187 Validation Loss 0.2562069892883301


 63%|██████▎   | 188/300 [04:38<02:44,  1.47s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.26114732027053833
Epoch 188 Validation Loss 0.260260671377182


 63%|██████▎   | 189/300 [04:39<02:34,  1.39s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.25518444180488586
Epoch 189 Validation Loss 0.2556358873844147


 63%|██████▎   | 190/300 [04:42<03:17,  1.80s/it]

Epoch 190 Samples 8000 Step 124 Training Loss 0.2589358687400818
Epoch 190 Validation Loss 0.25633466243743896


 64%|██████▎   | 191/300 [04:43<03:01,  1.67s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.25876355171203613
Epoch 191 Validation Loss 0.25593775510787964


 64%|██████▍   | 192/300 [04:45<02:58,  1.65s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.25523045659065247
Epoch 192 Validation Loss 0.2593836784362793


 64%|██████▍   | 193/300 [04:47<03:14,  1.82s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.25424766540527344
Epoch 193 Validation Loss 0.25667890906333923


 65%|██████▍   | 194/300 [04:49<02:59,  1.70s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.25674501061439514
Epoch 194 Validation Loss 0.2558991014957428
Epoch 195 Samples 8000 Step 124 Training Loss 0.25427210330963135


 65%|██████▌   | 195/300 [04:50<03:01,  1.72s/it]

Epoch 195 Validation Loss 0.2580028176307678


 65%|██████▌   | 196/300 [04:52<02:46,  1.60s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.2557164430618286
Epoch 196 Validation Loss 0.25768759846687317
Epoch 197 Samples 8000 Step 124 Training Loss 0.2599239647388458


 66%|██████▌   | 197/300 [04:53<02:43,  1.59s/it]

Epoch 197 Validation Loss 0.2577575147151947


 66%|██████▌   | 198/300 [04:55<02:35,  1.52s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.2572835087776184
Epoch 198 Validation Loss 0.25558456778526306


 66%|██████▋   | 199/300 [04:56<02:25,  1.44s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.25609132647514343
Epoch 199 Validation Loss 0.2554105222225189


 67%|██████▋   | 200/300 [04:58<02:35,  1.56s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.25739866495132446
Epoch 200 Validation Loss 0.25587189197540283


 67%|██████▋   | 201/300 [04:59<02:39,  1.61s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.25780799984931946
Epoch 201 Validation Loss 0.25818392634391785


 67%|██████▋   | 202/300 [05:01<02:26,  1.49s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.2567167580127716
Epoch 202 Validation Loss 0.2568192481994629


 68%|██████▊   | 203/300 [05:02<02:26,  1.51s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.2549145221710205
Epoch 203 Validation Loss 0.25597354769706726


 68%|██████▊   | 204/300 [05:03<02:13,  1.39s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.2584531009197235
Epoch 204 Validation Loss 0.2595441937446594


 68%|██████▊   | 205/300 [05:05<02:09,  1.36s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.25754019618034363
Epoch 205 Validation Loss 0.2564089894294739


 69%|██████▊   | 206/300 [05:06<02:11,  1.40s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.25371190905570984
Epoch 206 Validation Loss 0.2568473517894745


 69%|██████▉   | 207/300 [05:07<02:07,  1.37s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.2543054223060608
Epoch 207 Validation Loss 0.255776584148407


 69%|██████▉   | 208/300 [05:09<02:08,  1.40s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.2611754536628723
Epoch 208 Validation Loss 0.2603818476200104


 70%|██████▉   | 209/300 [05:10<02:11,  1.45s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.2550979554653168
Epoch 209 Validation Loss 0.2550537586212158


 70%|███████   | 210/300 [05:12<02:14,  1.50s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.2550726532936096
Epoch 210 Validation Loss 0.25531166791915894


 70%|███████   | 211/300 [05:13<02:08,  1.44s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.25410038232803345
Epoch 211 Validation Loss 0.2558065354824066


 71%|███████   | 212/300 [05:15<02:11,  1.49s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.25386279821395874
Epoch 212 Validation Loss 0.2554696798324585
Epoch 213 Samples 8000 Step 124 Training Loss 0.25686898827552795


 71%|███████   | 213/300 [05:16<02:07,  1.47s/it]

Epoch 213 Validation Loss 0.25554391741752625


 71%|███████▏  | 214/300 [05:18<01:57,  1.36s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.26026052236557007
Epoch 214 Validation Loss 0.2640822231769562


 72%|███████▏  | 215/300 [05:19<01:47,  1.26s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.25988540053367615
Epoch 215 Validation Loss 0.2574780583381653


 72%|███████▏  | 216/300 [05:20<01:58,  1.41s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.2541866600513458
Epoch 216 Validation Loss 0.2553296983242035


 72%|███████▏  | 217/300 [05:21<01:51,  1.34s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.2576249837875366
Epoch 217 Validation Loss 0.2569268047809601


 73%|███████▎  | 218/300 [05:23<01:52,  1.38s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.26034072041511536
Epoch 218 Validation Loss 0.2555515766143799


 73%|███████▎  | 219/300 [05:24<01:47,  1.32s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.2558918297290802
Epoch 219 Validation Loss 0.25733745098114014


 73%|███████▎  | 220/300 [05:26<01:59,  1.49s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.2587980329990387
Epoch 220 Validation Loss 0.25833407044410706


 74%|███████▎  | 221/300 [05:27<01:52,  1.42s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.25520798563957214
Epoch 221 Validation Loss 0.25523898005485535


 74%|███████▍  | 222/300 [05:29<01:47,  1.38s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.25566965341567993
Epoch 222 Validation Loss 0.25557073950767517


 74%|███████▍  | 223/300 [05:30<01:49,  1.42s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.25804150104522705
Epoch 223 Validation Loss 0.2557009160518646


 75%|███████▍  | 224/300 [05:32<02:05,  1.65s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.25606241822242737
Epoch 224 Validation Loss 0.25521203875541687


 75%|███████▌  | 225/300 [05:33<01:52,  1.50s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.2544819116592407
Epoch 225 Validation Loss 0.2555481195449829


 75%|███████▌  | 226/300 [05:35<01:44,  1.41s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.25385162234306335
Epoch 226 Validation Loss 0.2556852400302887


 76%|███████▌  | 227/300 [05:37<01:54,  1.56s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.2564524710178375
Epoch 227 Validation Loss 0.2590121030807495


 76%|███████▌  | 228/300 [05:39<02:02,  1.70s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.2557399272918701
Epoch 228 Validation Loss 0.2572518289089203


 76%|███████▋  | 229/300 [05:40<01:52,  1.59s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.25872838497161865
Epoch 229 Validation Loss 0.25591373443603516


 77%|███████▋  | 230/300 [05:41<01:40,  1.43s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.2603754699230194
Epoch 230 Validation Loss 0.25745072960853577


 77%|███████▋  | 231/300 [05:42<01:30,  1.31s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.25441738963127136
Epoch 231 Validation Loss 0.25682133436203003


 77%|███████▋  | 232/300 [05:44<01:39,  1.46s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.2560218870639801
Epoch 232 Validation Loss 0.25528281927108765


 78%|███████▊  | 233/300 [05:45<01:42,  1.52s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.25346386432647705
Epoch 233 Validation Loss 0.255563884973526


 78%|███████▊  | 234/300 [05:47<01:38,  1.49s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.2623704969882965
Epoch 234 Validation Loss 0.25906050205230713


 78%|███████▊  | 235/300 [05:49<01:42,  1.57s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.2556214928627014
Epoch 235 Validation Loss 0.2561582028865814


 79%|███████▊  | 236/300 [05:50<01:39,  1.56s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.2550009787082672
Epoch 236 Validation Loss 0.25550559163093567


 79%|███████▉  | 237/300 [05:52<01:39,  1.59s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.25854894518852234
Epoch 237 Validation Loss 0.25596821308135986


 79%|███████▉  | 238/300 [05:53<01:36,  1.56s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.25598976016044617
Epoch 238 Validation Loss 0.2583729326725006


 80%|███████▉  | 239/300 [05:55<01:30,  1.48s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.25766339898109436
Epoch 239 Validation Loss 0.2553131878376007


 80%|████████  | 240/300 [05:56<01:22,  1.38s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.2566310167312622
Epoch 240 Validation Loss 0.25625818967819214


 80%|████████  | 241/300 [05:58<01:42,  1.73s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.26036980748176575
Epoch 241 Validation Loss 0.26366573572158813


 81%|████████  | 242/300 [06:00<01:40,  1.74s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.2576950490474701
Epoch 242 Validation Loss 0.2576245963573456


 81%|████████  | 243/300 [06:02<01:35,  1.68s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.27394017577171326
Epoch 243 Validation Loss 0.2659986913204193


 81%|████████▏ | 244/300 [06:03<01:33,  1.66s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.2569819390773773
Epoch 244 Validation Loss 0.2558732032775879


 82%|████████▏ | 245/300 [06:05<01:31,  1.67s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.25656411051750183
Epoch 245 Validation Loss 0.2562319040298462


 82%|████████▏ | 246/300 [06:06<01:23,  1.54s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.2540039122104645
Epoch 246 Validation Loss 0.25544264912605286


 82%|████████▏ | 247/300 [06:07<01:17,  1.47s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.2564680576324463
Epoch 247 Validation Loss 0.25547167658805847


 83%|████████▎ | 248/300 [06:09<01:22,  1.59s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.2594170570373535
Epoch 248 Validation Loss 0.2554900050163269


 83%|████████▎ | 249/300 [06:11<01:21,  1.59s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.256547749042511
Epoch 249 Validation Loss 0.25608664751052856


 83%|████████▎ | 250/300 [06:13<01:19,  1.59s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.25689393281936646
Epoch 250 Validation Loss 0.25471583008766174


 84%|████████▎ | 251/300 [06:14<01:13,  1.49s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.25760069489479065
Epoch 251 Validation Loss 0.25503405928611755


 84%|████████▍ | 252/300 [06:15<01:12,  1.51s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.25553202629089355
Epoch 252 Validation Loss 0.25498491525650024


 84%|████████▍ | 253/300 [06:17<01:10,  1.50s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.2555759847164154
Epoch 253 Validation Loss 0.25516992807388306


 85%|████████▍ | 254/300 [06:18<01:09,  1.50s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.2598506510257721
Epoch 254 Validation Loss 0.2570955753326416


 85%|████████▌ | 255/300 [06:20<01:09,  1.54s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.2550945281982422
Epoch 255 Validation Loss 0.2551788091659546


 85%|████████▌ | 256/300 [06:22<01:11,  1.62s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.25396618247032166
Epoch 256 Validation Loss 0.25685569643974304


 86%|████████▌ | 257/300 [06:23<01:04,  1.49s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.2590814232826233
Epoch 257 Validation Loss 0.25659406185150146


 86%|████████▌ | 258/300 [06:24<00:59,  1.42s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.2546667754650116
Epoch 258 Validation Loss 0.2545868158340454


 86%|████████▋ | 259/300 [06:26<00:59,  1.46s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.2546831965446472
Epoch 259 Validation Loss 0.2564520835876465


 87%|████████▋ | 260/300 [06:27<00:54,  1.37s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.25914058089256287
Epoch 260 Validation Loss 0.2617705762386322


 87%|████████▋ | 261/300 [06:28<00:52,  1.34s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.25627148151397705
Epoch 261 Validation Loss 0.2551218569278717


 87%|████████▋ | 262/300 [06:30<00:52,  1.39s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.25374722480773926
Epoch 262 Validation Loss 0.2559869587421417


 88%|████████▊ | 263/300 [06:31<00:51,  1.40s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.25478100776672363
Epoch 263 Validation Loss 0.255107045173645


 88%|████████▊ | 264/300 [06:33<00:51,  1.43s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.2537555396556854
Epoch 264 Validation Loss 0.2556001543998718


 88%|████████▊ | 265/300 [06:34<00:47,  1.34s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.3084127604961395
Epoch 265 Validation Loss 0.33239156007766724


 89%|████████▊ | 266/300 [06:35<00:48,  1.43s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.2574455440044403
Epoch 266 Validation Loss 0.2562561333179474


 89%|████████▉ | 267/300 [06:37<00:50,  1.54s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.25565996766090393
Epoch 267 Validation Loss 0.25489282608032227


 89%|████████▉ | 268/300 [06:39<00:49,  1.55s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.26000598073005676
Epoch 268 Validation Loss 0.25736188888549805


 90%|████████▉ | 269/300 [06:40<00:47,  1.54s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.2550935447216034
Epoch 269 Validation Loss 0.2556353807449341


 90%|█████████ | 270/300 [06:42<00:47,  1.57s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.25616252422332764
Epoch 270 Validation Loss 0.25492849946022034


 90%|█████████ | 271/300 [06:44<00:47,  1.63s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.2540132701396942
Epoch 271 Validation Loss 0.25563332438468933


 91%|█████████ | 272/300 [06:45<00:42,  1.53s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.2547496557235718
Epoch 272 Validation Loss 0.2548454701900482


 91%|█████████ | 273/300 [06:46<00:38,  1.43s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.25491341948509216
Epoch 273 Validation Loss 0.2547460198402405


 91%|█████████▏| 274/300 [06:47<00:34,  1.33s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.2543598413467407
Epoch 274 Validation Loss 0.25457170605659485


 92%|█████████▏| 275/300 [06:49<00:34,  1.39s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.26100215315818787
Epoch 275 Validation Loss 0.26051032543182373


 92%|█████████▏| 276/300 [06:50<00:34,  1.44s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.2531585097312927
Epoch 276 Validation Loss 0.25611576437950134


 92%|█████████▏| 277/300 [06:52<00:35,  1.53s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.25562572479248047
Epoch 277 Validation Loss 0.25935399532318115


 93%|█████████▎| 278/300 [06:54<00:33,  1.52s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.2552098333835602
Epoch 278 Validation Loss 0.25495150685310364


 93%|█████████▎| 279/300 [06:55<00:28,  1.35s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.2560010254383087
Epoch 279 Validation Loss 0.26139798760414124


 93%|█████████▎| 280/300 [06:56<00:27,  1.35s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.25502070784568787
Epoch 280 Validation Loss 0.2546744644641876


 94%|█████████▎| 281/300 [06:57<00:26,  1.42s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.25456503033638
Epoch 281 Validation Loss 0.255430668592453


 94%|█████████▍| 282/300 [06:59<00:26,  1.45s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.2545674443244934
Epoch 282 Validation Loss 0.25509580969810486


 94%|█████████▍| 283/300 [07:01<00:25,  1.50s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.25530415773391724
Epoch 283 Validation Loss 0.2549512982368469


 95%|█████████▍| 284/300 [07:02<00:24,  1.53s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.2544839084148407
Epoch 284 Validation Loss 0.254983127117157


 95%|█████████▌| 285/300 [07:04<00:23,  1.56s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.2631240487098694
Epoch 285 Validation Loss 0.2575734853744507


 95%|█████████▌| 286/300 [07:05<00:20,  1.47s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.25399428606033325
Epoch 286 Validation Loss 0.256552129983902


 96%|█████████▌| 287/300 [07:06<00:17,  1.36s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.2589711844921112
Epoch 287 Validation Loss 0.25638124346733093


 96%|█████████▌| 288/300 [07:08<00:16,  1.37s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.25454986095428467
Epoch 288 Validation Loss 0.25517934560775757


 96%|█████████▋| 289/300 [07:09<00:16,  1.51s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.2549287676811218
Epoch 289 Validation Loss 0.25494474172592163


 97%|█████████▋| 290/300 [07:11<00:15,  1.52s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.2539260983467102
Epoch 290 Validation Loss 0.25462231040000916


 97%|█████████▋| 291/300 [07:13<00:14,  1.58s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.2522372603416443
Epoch 291 Validation Loss 0.2563735544681549


 97%|█████████▋| 292/300 [07:14<00:11,  1.48s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.2555028796195984
Epoch 292 Validation Loss 0.25657278299331665


 98%|█████████▊| 293/300 [07:15<00:10,  1.50s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.25303739309310913
Epoch 293 Validation Loss 0.255730003118515


 98%|█████████▊| 294/300 [07:17<00:09,  1.54s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.2919362783432007
Epoch 294 Validation Loss 0.2889162600040436


 98%|█████████▊| 295/300 [07:19<00:07,  1.55s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.26640281081199646
Epoch 295 Validation Loss 0.2641060948371887


 99%|█████████▊| 296/300 [07:20<00:06,  1.53s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.2593550682067871
Epoch 296 Validation Loss 0.2608878016471863


 99%|█████████▉| 297/300 [07:22<00:04,  1.57s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.27050355076789856
Epoch 297 Validation Loss 0.2698807120323181


 99%|█████████▉| 298/300 [07:23<00:03,  1.59s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.25987595319747925
Epoch 298 Validation Loss 0.2604096829891205


100%|█████████▉| 299/300 [07:25<00:01,  1.45s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.2583736181259155
Epoch 299 Validation Loss 0.2592010498046875


100%|██████████| 300/300 [07:26<00:00,  1.49s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.26113560795783997
Epoch 300 Validation Loss 0.2589398920536041


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▅▅▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇█
samples,▂▃▂▅█▂▂▁▅▆▇▁█▇▂▄▇▁▄▃▅▂▅▅▄▁▄▁▃▄▄▇█▄▃▄▁▂█▅
train_loss,██▇▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▄▃▃▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.26114
val_loss,0.25894


In [99]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=4,
            d_mlp=16,
            lr=0.05,
            normalization_type='LNPre',
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='D1_noscale'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:01<05:56,  1.19s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6370563507080078
Epoch 1 Validation Loss 0.6186138987541199


  1%|          | 2/300 [00:02<07:39,  1.54s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6218891143798828
Epoch 2 Validation Loss 0.619475781917572


  1%|          | 3/300 [00:04<07:57,  1.61s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.585067629814148
Epoch 3 Validation Loss 0.5775183439254761


  1%|▏         | 4/300 [00:06<07:40,  1.56s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.5426615476608276
Epoch 4 Validation Loss 0.5414972901344299


  2%|▏         | 5/300 [00:07<07:27,  1.52s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.503032922744751
Epoch 5 Validation Loss 0.504136860370636


  2%|▏         | 6/300 [00:09<07:24,  1.51s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.6800130605697632
Epoch 6 Validation Loss 0.6809961199760437


  2%|▏         | 7/300 [00:10<07:21,  1.51s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.60581374168396
Epoch 7 Validation Loss 0.5992417931556702


  3%|▎         | 8/300 [00:11<06:56,  1.43s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.5680682063102722
Epoch 8 Validation Loss 0.5695109367370605


  3%|▎         | 9/300 [00:12<06:12,  1.28s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.5196594595909119
Epoch 9 Validation Loss 0.5054609775543213


  3%|▎         | 10/300 [00:14<06:16,  1.30s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.8019002079963684
Epoch 10 Validation Loss 0.5802028775215149


  4%|▎         | 11/300 [00:15<06:38,  1.38s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.3830370306968689
Epoch 11 Validation Loss 0.3737981915473938


  4%|▍         | 12/300 [00:17<06:41,  1.39s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.6721823215484619
Epoch 12 Validation Loss 0.8576330542564392


  4%|▍         | 13/300 [00:18<06:50,  1.43s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.33675330877304077
Epoch 13 Validation Loss 0.3429413139820099


  5%|▍         | 14/300 [00:20<06:59,  1.47s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.3252705931663513
Epoch 14 Validation Loss 0.3234413266181946


  5%|▌         | 15/300 [00:21<07:00,  1.48s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.31040647625923157
Epoch 15 Validation Loss 0.31618812680244446


  5%|▌         | 16/300 [00:23<06:46,  1.43s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.30804359912872314
Epoch 16 Validation Loss 0.31181588768959045


  6%|▌         | 17/300 [00:24<07:02,  1.49s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.32989221811294556
Epoch 17 Validation Loss 0.3479411005973816


  6%|▌         | 18/300 [00:25<06:31,  1.39s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.3062782883644104
Epoch 18 Validation Loss 0.3122566044330597


  6%|▋         | 19/300 [00:27<06:25,  1.37s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.303444504737854
Epoch 19 Validation Loss 0.3032970726490021
Epoch 20 Samples 8000 Step 124 Training Loss 0.33610790967941284


  7%|▋         | 20/300 [00:28<06:31,  1.40s/it]

Epoch 20 Validation Loss 0.31472447514533997


  7%|▋         | 21/300 [00:29<06:29,  1.40s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.2935221195220947
Epoch 21 Validation Loss 0.298871248960495


  7%|▋         | 22/300 [00:31<06:22,  1.37s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.30766621232032776
Epoch 22 Validation Loss 0.3060036301612854


  8%|▊         | 23/300 [00:33<07:12,  1.56s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.2965591251850128
Epoch 23 Validation Loss 0.2961205542087555


  8%|▊         | 24/300 [00:34<07:09,  1.56s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.3001216650009155
Epoch 24 Validation Loss 0.30024731159210205


  8%|▊         | 25/300 [00:36<06:36,  1.44s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.2902006208896637
Epoch 25 Validation Loss 0.29260897636413574


  9%|▊         | 26/300 [00:38<07:26,  1.63s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.28616657853126526
Epoch 26 Validation Loss 0.29121989011764526


  9%|▉         | 27/300 [00:39<06:41,  1.47s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.28808239102363586
Epoch 27 Validation Loss 0.2934300899505615


  9%|▉         | 28/300 [00:40<06:22,  1.41s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.28774958848953247
Epoch 28 Validation Loss 0.28866398334503174


 10%|▉         | 29/300 [00:41<06:29,  1.44s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.29482173919677734
Epoch 29 Validation Loss 0.2920655310153961


 10%|█         | 30/300 [00:43<06:17,  1.40s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.31127625703811646
Epoch 30 Validation Loss 0.2998308837413788


 10%|█         | 31/300 [00:45<06:48,  1.52s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.28735727071762085
Epoch 31 Validation Loss 0.2900561988353729


 11%|█         | 32/300 [00:46<06:37,  1.48s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.28926074504852295
Epoch 32 Validation Loss 0.28555312752723694


 11%|█         | 33/300 [00:47<06:36,  1.48s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.29572945833206177
Epoch 33 Validation Loss 0.2853752672672272


 11%|█▏        | 34/300 [00:49<06:51,  1.55s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.27696382999420166
Epoch 34 Validation Loss 0.28358063101768494


 12%|█▏        | 35/300 [00:51<06:50,  1.55s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.29137301445007324
Epoch 35 Validation Loss 0.28215157985687256


 12%|█▏        | 36/300 [00:52<06:39,  1.51s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.2850968539714813
Epoch 36 Validation Loss 0.29031479358673096


 12%|█▏        | 37/300 [00:53<06:08,  1.40s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.28783831000328064
Epoch 37 Validation Loss 0.28404688835144043


 13%|█▎        | 38/300 [00:54<05:49,  1.33s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.2849526107311249
Epoch 38 Validation Loss 0.2841242551803589


 13%|█▎        | 39/300 [00:56<06:03,  1.39s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.2852647006511688
Epoch 39 Validation Loss 0.2831054627895355


 13%|█▎        | 40/300 [00:58<06:12,  1.43s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.28583312034606934
Epoch 40 Validation Loss 0.27984485030174255


 14%|█▎        | 41/300 [00:59<06:10,  1.43s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.2752540111541748
Epoch 41 Validation Loss 0.281295508146286


 14%|█▍        | 42/300 [01:00<06:09,  1.43s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.2895888686180115
Epoch 42 Validation Loss 0.34621095657348633


 14%|█▍        | 43/300 [01:02<06:25,  1.50s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.27699515223503113
Epoch 43 Validation Loss 0.27973514795303345


 15%|█▍        | 44/300 [01:04<07:11,  1.68s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.28548434376716614
Epoch 44 Validation Loss 0.2843819558620453


 15%|█▌        | 45/300 [01:06<06:50,  1.61s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.27652478218078613
Epoch 45 Validation Loss 0.27883586287498474


 15%|█▌        | 46/300 [01:07<07:00,  1.66s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.277823269367218
Epoch 46 Validation Loss 0.2795531451702118


 16%|█▌        | 47/300 [01:09<06:49,  1.62s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.27035999298095703
Epoch 47 Validation Loss 0.2781866490840912


 16%|█▌        | 48/300 [01:10<06:36,  1.57s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.2771015465259552
Epoch 48 Validation Loss 0.27722054719924927


 16%|█▋        | 49/300 [01:12<06:45,  1.62s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.2895883619785309
Epoch 49 Validation Loss 0.2864754796028137


 17%|█▋        | 50/300 [01:14<06:47,  1.63s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.27401334047317505
Epoch 50 Validation Loss 0.283147931098938


 17%|█▋        | 51/300 [01:15<06:40,  1.61s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.2730443775653839
Epoch 51 Validation Loss 0.27374646067619324


 17%|█▋        | 52/300 [01:17<06:25,  1.55s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.27471187710762024
Epoch 52 Validation Loss 0.27271968126296997


 18%|█▊        | 53/300 [01:18<05:46,  1.40s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.2767309546470642
Epoch 53 Validation Loss 0.27852335572242737


 18%|█▊        | 54/300 [01:19<05:41,  1.39s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.2703953683376312
Epoch 54 Validation Loss 0.27228111028671265


 18%|█▊        | 55/300 [01:21<05:51,  1.43s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.2666469216346741
Epoch 55 Validation Loss 0.2756998836994171


 19%|█▊        | 56/300 [01:22<06:10,  1.52s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.2737426459789276
Epoch 56 Validation Loss 0.2712635099887848


 19%|█▉        | 57/300 [01:24<06:13,  1.54s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.2702879011631012
Epoch 57 Validation Loss 0.2723899483680725


 19%|█▉        | 58/300 [01:25<06:12,  1.54s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.2741469442844391
Epoch 58 Validation Loss 0.2747003436088562


 20%|█▉        | 59/300 [01:27<06:10,  1.54s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.2814425528049469
Epoch 59 Validation Loss 0.2741014361381531


 20%|██        | 60/300 [01:29<06:05,  1.52s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.2959131896495819
Epoch 60 Validation Loss 0.29209578037261963


 20%|██        | 61/300 [01:30<06:14,  1.57s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.2830350697040558
Epoch 61 Validation Loss 0.28578776121139526


 21%|██        | 62/300 [01:32<06:18,  1.59s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.2724987864494324
Epoch 62 Validation Loss 0.2742719352245331


 21%|██        | 63/300 [01:33<05:54,  1.49s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.2709384858608246
Epoch 63 Validation Loss 0.27452078461647034


 21%|██▏       | 64/300 [01:34<05:27,  1.39s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.27844786643981934
Epoch 64 Validation Loss 0.2925656735897064


 22%|██▏       | 65/300 [01:36<05:38,  1.44s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.27176839113235474
Epoch 65 Validation Loss 0.2680237591266632


 22%|██▏       | 66/300 [01:37<05:32,  1.42s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.2710665762424469
Epoch 66 Validation Loss 0.26699987053871155


 22%|██▏       | 67/300 [01:39<05:41,  1.47s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.26763832569122314
Epoch 67 Validation Loss 0.26642265915870667


 23%|██▎       | 68/300 [01:40<05:35,  1.45s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.2715663015842438
Epoch 68 Validation Loss 0.27175402641296387


 23%|██▎       | 69/300 [01:42<05:29,  1.43s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.26255497336387634
Epoch 69 Validation Loss 0.26616090536117554


 23%|██▎       | 70/300 [01:43<05:47,  1.51s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.2659333348274231
Epoch 70 Validation Loss 0.2658160328865051


 24%|██▎       | 71/300 [01:46<06:48,  1.78s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.2715516984462738
Epoch 71 Validation Loss 0.27265265583992004


 24%|██▍       | 72/300 [01:47<06:27,  1.70s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.26838475465774536
Epoch 72 Validation Loss 0.2684173583984375


 24%|██▍       | 73/300 [01:50<07:26,  1.97s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.2699931561946869
Epoch 73 Validation Loss 0.2713053524494171


 25%|██▍       | 74/300 [01:51<06:42,  1.78s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.2766765356063843
Epoch 74 Validation Loss 0.26737266778945923


 25%|██▌       | 75/300 [01:53<06:41,  1.78s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.2694357633590698
Epoch 75 Validation Loss 0.29287055134773254


 25%|██▌       | 76/300 [01:54<06:24,  1.71s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.26533544063568115
Epoch 76 Validation Loss 0.26385697722435


 26%|██▌       | 77/300 [01:56<06:13,  1.67s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.26264849305152893
Epoch 77 Validation Loss 0.26356789469718933


 26%|██▌       | 78/300 [01:57<05:39,  1.53s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.262337327003479
Epoch 78 Validation Loss 0.2637658715248108


 26%|██▋       | 79/300 [01:59<05:35,  1.52s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.26784732937812805
Epoch 79 Validation Loss 0.26367923617362976


 27%|██▋       | 80/300 [02:00<05:26,  1.48s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.26679033041000366
Epoch 80 Validation Loss 0.26269397139549255


 27%|██▋       | 81/300 [02:02<05:39,  1.55s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.26926618814468384
Epoch 81 Validation Loss 0.26394227147102356


 27%|██▋       | 82/300 [02:03<05:33,  1.53s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.2585251033306122
Epoch 82 Validation Loss 0.26231518387794495


 28%|██▊       | 83/300 [02:05<05:31,  1.53s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.2664337754249573
Epoch 83 Validation Loss 0.2662631869316101


 28%|██▊       | 84/300 [02:07<05:48,  1.61s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.2747851014137268
Epoch 84 Validation Loss 0.272146999835968


 28%|██▊       | 85/300 [02:09<06:37,  1.85s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.2638363242149353
Epoch 85 Validation Loss 0.2629678249359131


 29%|██▊       | 86/300 [02:11<06:22,  1.79s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.2604092061519623
Epoch 86 Validation Loss 0.2627897560596466
Epoch 87 Samples 8000 Step 124 Training Loss 0.2628095746040344


 29%|██▉       | 87/300 [02:13<06:29,  1.83s/it]

Epoch 87 Validation Loss 0.26373061537742615


 29%|██▉       | 88/300 [02:14<05:53,  1.67s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.2595508098602295
Epoch 88 Validation Loss 0.2613265812397003


 30%|██▉       | 89/300 [02:16<06:05,  1.73s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.2681380808353424
Epoch 89 Validation Loss 0.26660987734794617


 30%|███       | 90/300 [02:17<05:57,  1.70s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.2613847553730011
Epoch 90 Validation Loss 0.2632666826248169


 30%|███       | 91/300 [02:19<05:45,  1.65s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.26167893409729004
Epoch 91 Validation Loss 0.2610456943511963


 31%|███       | 92/300 [02:20<05:29,  1.58s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.2630283236503601
Epoch 92 Validation Loss 0.2624909281730652


 31%|███       | 93/300 [02:22<05:35,  1.62s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.26314693689346313
Epoch 93 Validation Loss 0.26147642731666565


 31%|███▏      | 94/300 [02:24<05:22,  1.57s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.26234716176986694
Epoch 94 Validation Loss 0.2628875970840454


 32%|███▏      | 95/300 [02:25<05:29,  1.61s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.26427415013313293
Epoch 95 Validation Loss 0.26119139790534973


 32%|███▏      | 96/300 [02:27<05:07,  1.51s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.2613620460033417
Epoch 96 Validation Loss 0.26087695360183716


 32%|███▏      | 97/300 [02:28<05:01,  1.48s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.2675476372241974
Epoch 97 Validation Loss 0.2622144818305969


 33%|███▎      | 98/300 [02:29<05:00,  1.49s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.27066174149513245
Epoch 98 Validation Loss 0.2688521444797516


 33%|███▎      | 99/300 [02:31<05:02,  1.51s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.2669738829135895
Epoch 99 Validation Loss 0.26166659593582153


 33%|███▎      | 100/300 [02:32<04:57,  1.49s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.2620986998081207
Epoch 100 Validation Loss 0.2603766620159149


 34%|███▎      | 101/300 [02:34<05:10,  1.56s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.2636882960796356
Epoch 101 Validation Loss 0.2644784450531006


 34%|███▍      | 102/300 [02:35<04:52,  1.48s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.2642851769924164
Epoch 102 Validation Loss 0.26025938987731934


 34%|███▍      | 103/300 [02:37<04:48,  1.47s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.9086925983428955
Epoch 103 Validation Loss 0.4838939607143402


 35%|███▍      | 104/300 [02:38<04:47,  1.47s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.2609998285770416
Epoch 104 Validation Loss 0.2607114017009735


 35%|███▌      | 105/300 [02:40<04:50,  1.49s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.2653924226760864
Epoch 105 Validation Loss 0.2650342583656311


 35%|███▌      | 106/300 [02:41<04:44,  1.46s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.26210153102874756
Epoch 106 Validation Loss 0.2610304653644562


 36%|███▌      | 107/300 [02:43<04:45,  1.48s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.2626441717147827
Epoch 107 Validation Loss 0.26036497950553894


 36%|███▌      | 108/300 [02:44<04:45,  1.49s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.2610630393028259
Epoch 108 Validation Loss 0.26075315475463867


 36%|███▋      | 109/300 [02:46<04:51,  1.53s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.2612910270690918
Epoch 109 Validation Loss 0.26033997535705566


 37%|███▋      | 110/300 [02:47<04:49,  1.52s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.26223474740982056
Epoch 110 Validation Loss 0.260479211807251


 37%|███▋      | 111/300 [02:49<05:03,  1.60s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.26837557554244995
Epoch 111 Validation Loss 0.2601899206638336


 37%|███▋      | 112/300 [02:51<05:01,  1.60s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.26140978932380676
Epoch 112 Validation Loss 0.2603437304496765


 38%|███▊      | 113/300 [02:52<04:50,  1.55s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.2633005380630493
Epoch 113 Validation Loss 0.26181718707084656


 38%|███▊      | 114/300 [02:54<04:54,  1.59s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.26019543409347534
Epoch 114 Validation Loss 0.25976499915122986


 38%|███▊      | 115/300 [02:55<04:41,  1.52s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.26377981901168823
Epoch 115 Validation Loss 0.26128536462783813


 39%|███▊      | 116/300 [02:57<04:33,  1.48s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.2624412775039673
Epoch 116 Validation Loss 0.26063862442970276


 39%|███▉      | 117/300 [02:58<04:44,  1.55s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.2575811743736267
Epoch 117 Validation Loss 0.26148971915245056


 39%|███▉      | 118/300 [03:00<04:38,  1.53s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.2639828324317932
Epoch 118 Validation Loss 0.26252296566963196


 40%|███▉      | 119/300 [03:01<04:36,  1.53s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.26150208711624146
Epoch 119 Validation Loss 0.26141828298568726


 40%|████      | 120/300 [03:03<04:33,  1.52s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.26234304904937744
Epoch 120 Validation Loss 0.2609384059906006


 40%|████      | 121/300 [03:05<04:42,  1.58s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.2584279477596283
Epoch 121 Validation Loss 0.26019489765167236


 41%|████      | 122/300 [03:06<04:42,  1.59s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.2605772614479065
Epoch 122 Validation Loss 0.25975704193115234


 41%|████      | 123/300 [03:08<04:38,  1.57s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.25878679752349854
Epoch 123 Validation Loss 0.26050296425819397


 41%|████▏     | 124/300 [03:09<04:36,  1.57s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.2641092538833618
Epoch 124 Validation Loss 0.25969791412353516


 42%|████▏     | 125/300 [03:11<04:28,  1.54s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.2634337246417999
Epoch 125 Validation Loss 0.2609434127807617


 42%|████▏     | 126/300 [03:12<04:20,  1.50s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.2607058584690094
Epoch 126 Validation Loss 0.2594313323497772


 42%|████▏     | 127/300 [03:14<04:22,  1.52s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.2615765631198883
Epoch 127 Validation Loss 0.26041197776794434


 43%|████▎     | 128/300 [03:15<04:15,  1.48s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.26139476895332336
Epoch 128 Validation Loss 0.26260650157928467


 43%|████▎     | 129/300 [03:17<04:22,  1.53s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.26075878739356995
Epoch 129 Validation Loss 0.2600410580635071


 43%|████▎     | 130/300 [03:18<04:17,  1.52s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.2609773874282837
Epoch 130 Validation Loss 0.2598402202129364


 44%|████▎     | 131/300 [03:20<04:19,  1.53s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.26234328746795654
Epoch 131 Validation Loss 0.25977757573127747


 44%|████▍     | 132/300 [03:21<04:16,  1.53s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.2610970139503479
Epoch 132 Validation Loss 0.25979384779930115


 44%|████▍     | 133/300 [03:23<04:13,  1.52s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.2625223398208618
Epoch 133 Validation Loss 0.2598576247692108


 45%|████▍     | 134/300 [03:25<04:17,  1.55s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.2607284486293793
Epoch 134 Validation Loss 0.25968804955482483


 45%|████▌     | 135/300 [03:26<04:16,  1.55s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.2578021287918091
Epoch 135 Validation Loss 0.26025667786598206


 45%|████▌     | 136/300 [03:28<04:10,  1.53s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.26832324266433716
Epoch 136 Validation Loss 0.2641202509403229


 46%|████▌     | 137/300 [03:29<04:08,  1.52s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.2619038224220276
Epoch 137 Validation Loss 0.26034635305404663


 46%|████▌     | 138/300 [03:31<04:08,  1.53s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.2589784264564514
Epoch 138 Validation Loss 0.25990283489227295


 46%|████▋     | 139/300 [03:32<04:07,  1.54s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.2652965784072876
Epoch 139 Validation Loss 0.2597987651824951


 47%|████▋     | 140/300 [03:34<04:06,  1.54s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.26112547516822815
Epoch 140 Validation Loss 0.2600661516189575


 47%|████▋     | 141/300 [03:35<04:07,  1.56s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.2599312961101532
Epoch 141 Validation Loss 0.25923267006874084


 47%|████▋     | 142/300 [03:37<04:18,  1.64s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.25771719217300415
Epoch 142 Validation Loss 0.2589416205883026


 48%|████▊     | 143/300 [03:39<04:15,  1.63s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.2613222301006317
Epoch 143 Validation Loss 0.26154664158821106


 48%|████▊     | 144/300 [03:40<04:11,  1.61s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.26073309779167175
Epoch 144 Validation Loss 0.2601843476295471


 48%|████▊     | 145/300 [03:42<04:04,  1.58s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.2603820562362671
Epoch 145 Validation Loss 0.2597409188747406


 49%|████▊     | 146/300 [03:43<04:00,  1.56s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.2635417580604553
Epoch 146 Validation Loss 0.2599577009677887


 49%|████▉     | 147/300 [03:45<04:07,  1.61s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.2629781663417816
Epoch 147 Validation Loss 0.2617776393890381


 49%|████▉     | 148/300 [03:47<04:01,  1.59s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.2602359652519226
Epoch 148 Validation Loss 0.25875288248062134


 50%|████▉     | 149/300 [03:48<03:54,  1.55s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.26227569580078125
Epoch 149 Validation Loss 0.25897136330604553


 50%|█████     | 150/300 [03:50<03:59,  1.60s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.25981423258781433
Epoch 150 Validation Loss 0.258588969707489


 50%|█████     | 151/300 [03:51<03:51,  1.55s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.2608402967453003
Epoch 151 Validation Loss 0.26003849506378174


 51%|█████     | 152/300 [03:53<03:57,  1.61s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.2603719234466553
Epoch 152 Validation Loss 0.25916290283203125


 51%|█████     | 153/300 [03:55<04:05,  1.67s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.261729896068573
Epoch 153 Validation Loss 0.2623448073863983


 51%|█████▏    | 154/300 [03:56<03:37,  1.49s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.2594718039035797
Epoch 154 Validation Loss 0.2593981623649597


 52%|█████▏    | 155/300 [03:57<03:42,  1.54s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.2623458206653595
Epoch 155 Validation Loss 0.25910162925720215


 52%|█████▏    | 156/300 [03:59<03:31,  1.47s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.2630373239517212
Epoch 156 Validation Loss 0.2590084969997406


 52%|█████▏    | 157/300 [04:01<03:44,  1.57s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.2634316086769104
Epoch 157 Validation Loss 0.26076921820640564


 53%|█████▎    | 158/300 [04:02<03:41,  1.56s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.2610444724559784
Epoch 158 Validation Loss 0.25907906889915466


 53%|█████▎    | 159/300 [04:04<03:54,  1.67s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.25858211517333984
Epoch 159 Validation Loss 0.2591175436973572


 53%|█████▎    | 160/300 [04:06<03:52,  1.66s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.2591790556907654
Epoch 160 Validation Loss 0.2610045373439789


 54%|█████▎    | 161/300 [04:07<03:27,  1.49s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.26188230514526367
Epoch 161 Validation Loss 0.2600719630718231


 54%|█████▍    | 162/300 [04:08<03:25,  1.49s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.26011067628860474
Epoch 162 Validation Loss 0.2586279511451721


 54%|█████▍    | 163/300 [04:10<03:27,  1.51s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.26193925738334656
Epoch 163 Validation Loss 0.25919657945632935


 55%|█████▍    | 164/300 [04:12<03:36,  1.59s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.25731009244918823
Epoch 164 Validation Loss 0.2591238021850586


 55%|█████▌    | 165/300 [04:13<03:36,  1.60s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.2585894763469696
Epoch 165 Validation Loss 0.26005277037620544


 55%|█████▌    | 166/300 [04:15<03:36,  1.61s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.2598564922809601
Epoch 166 Validation Loss 0.2601495683193207


 56%|█████▌    | 167/300 [04:16<03:18,  1.49s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.263554185628891
Epoch 167 Validation Loss 0.26060760021209717


 56%|█████▌    | 168/300 [04:18<03:22,  1.54s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.25939780473709106
Epoch 168 Validation Loss 0.2591611444950104


 56%|█████▋    | 169/300 [04:19<03:20,  1.53s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.25936102867126465
Epoch 169 Validation Loss 0.2590692937374115


 57%|█████▋    | 170/300 [04:21<03:28,  1.61s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.2617705166339874
Epoch 170 Validation Loss 0.25885647535324097


 57%|█████▋    | 171/300 [04:23<03:25,  1.59s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.2596482038497925
Epoch 171 Validation Loss 0.2592456340789795


 57%|█████▋    | 172/300 [04:24<03:26,  1.61s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.26909300684928894
Epoch 172 Validation Loss 0.2591809928417206


 58%|█████▊    | 173/300 [04:26<03:20,  1.58s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.2615260183811188
Epoch 173 Validation Loss 0.25997573137283325


 58%|█████▊    | 174/300 [04:27<03:24,  1.62s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.26493632793426514
Epoch 174 Validation Loss 0.262721449136734


 58%|█████▊    | 175/300 [04:29<03:21,  1.61s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.26303908228874207
Epoch 175 Validation Loss 0.2608264684677124


 59%|█████▊    | 176/300 [04:31<03:17,  1.59s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.2577936351299286
Epoch 176 Validation Loss 0.2592637836933136


 59%|█████▉    | 177/300 [04:32<03:15,  1.59s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.26069343090057373
Epoch 177 Validation Loss 0.2594085931777954


 59%|█████▉    | 178/300 [04:34<03:12,  1.58s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.25641000270843506
Epoch 178 Validation Loss 0.2586748003959656


 60%|█████▉    | 179/300 [04:35<03:07,  1.55s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.2574361562728882
Epoch 179 Validation Loss 0.260667085647583


 60%|██████    | 180/300 [04:37<02:59,  1.50s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.258657842874527
Epoch 180 Validation Loss 0.25883129239082336


 60%|██████    | 181/300 [04:38<03:09,  1.59s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.2584298849105835
Epoch 181 Validation Loss 0.2589457929134369


 61%|██████    | 182/300 [04:40<03:00,  1.53s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.2602360248565674
Epoch 182 Validation Loss 0.25857067108154297


 61%|██████    | 183/300 [04:41<03:01,  1.55s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.261260986328125
Epoch 183 Validation Loss 0.2646596133708954


 61%|██████▏   | 184/300 [04:43<03:02,  1.58s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.26202479004859924
Epoch 184 Validation Loss 0.2588203251361847


 62%|██████▏   | 185/300 [04:45<02:59,  1.56s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.2596132755279541
Epoch 185 Validation Loss 0.25929051637649536


 62%|██████▏   | 186/300 [04:46<03:03,  1.61s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.2617962956428528
Epoch 186 Validation Loss 0.2589869797229767


 62%|██████▏   | 187/300 [04:48<02:56,  1.57s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.2590266466140747
Epoch 187 Validation Loss 0.2583026885986328


 63%|██████▎   | 188/300 [04:50<03:02,  1.63s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.2615866959095001
Epoch 188 Validation Loss 0.25989285111427307


 63%|██████▎   | 189/300 [04:51<02:57,  1.60s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.2579420208930969
Epoch 189 Validation Loss 0.259073942899704


 63%|██████▎   | 190/300 [04:53<02:55,  1.59s/it]

Epoch 190 Samples 8000 Step 124 Training Loss 0.26020342111587524
Epoch 190 Validation Loss 0.2592371106147766


 64%|██████▎   | 191/300 [04:54<02:49,  1.55s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.26102885603904724
Epoch 191 Validation Loss 0.25858980417251587


 64%|██████▍   | 192/300 [04:56<02:46,  1.54s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.2588430643081665
Epoch 192 Validation Loss 0.26057225465774536


 64%|██████▍   | 193/300 [04:57<02:43,  1.53s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.25702881813049316
Epoch 193 Validation Loss 0.258484810590744


 65%|██████▍   | 194/300 [04:59<02:46,  1.57s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.25878217816352844
Epoch 194 Validation Loss 0.25883424282073975


 65%|██████▌   | 195/300 [05:00<02:41,  1.54s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.260463148355484
Epoch 195 Validation Loss 0.2586919367313385


 65%|██████▌   | 196/300 [05:02<02:46,  1.60s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.26021191477775574
Epoch 196 Validation Loss 0.2625245153903961


 66%|██████▌   | 197/300 [05:04<02:46,  1.61s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.26046696305274963
Epoch 197 Validation Loss 0.2587292194366455


 66%|██████▌   | 198/300 [05:06<02:56,  1.73s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.2611030042171478
Epoch 198 Validation Loss 0.2583022713661194


 66%|██████▋   | 199/300 [05:07<02:44,  1.62s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.26069384813308716
Epoch 199 Validation Loss 0.25859302282333374


 67%|██████▋   | 200/300 [05:08<02:32,  1.53s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.25892964005470276
Epoch 200 Validation Loss 0.2581022083759308


 67%|██████▋   | 201/300 [05:10<02:34,  1.56s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.2625851035118103
Epoch 201 Validation Loss 0.26083818078041077


 67%|██████▋   | 202/300 [05:12<02:50,  1.74s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.26961830258369446
Epoch 202 Validation Loss 0.2723245918750763


 68%|██████▊   | 203/300 [05:14<02:54,  1.80s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.2620110809803009
Epoch 203 Validation Loss 0.26381027698516846


 68%|██████▊   | 204/300 [05:16<03:03,  1.91s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.2644256055355072
Epoch 204 Validation Loss 0.2641323208808899


 68%|██████▊   | 205/300 [05:18<03:02,  1.92s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.2639232277870178
Epoch 205 Validation Loss 0.26168256998062134


 69%|██████▊   | 206/300 [05:21<03:20,  2.14s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.26305291056632996
Epoch 206 Validation Loss 0.2664852738380432


 69%|██████▉   | 207/300 [05:23<03:24,  2.19s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.260286420583725
Epoch 207 Validation Loss 0.26118895411491394


 69%|██████▉   | 208/300 [05:25<03:13,  2.11s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.26374727487564087
Epoch 208 Validation Loss 0.2596578299999237


 70%|██████▉   | 209/300 [05:27<03:02,  2.00s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.2563641667366028
Epoch 209 Validation Loss 0.2586568593978882


 70%|███████   | 210/300 [05:28<02:45,  1.83s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.2609393894672394
Epoch 210 Validation Loss 0.26067453622817993


 70%|███████   | 211/300 [05:32<03:22,  2.27s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.25641733407974243
Epoch 211 Validation Loss 0.259499192237854


 71%|███████   | 212/300 [05:33<03:07,  2.13s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.2607036232948303
Epoch 212 Validation Loss 0.258929580450058


 71%|███████   | 213/300 [05:35<02:48,  1.94s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.25988227128982544
Epoch 213 Validation Loss 0.2585872411727905
Epoch 214 Samples 8000 Step 124 Training Loss 0.2608158588409424
Epoch 214 Validation Loss 0.26025626063346863


 72%|███████▏  | 215/300 [05:37<02:17,  1.62s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.2604096829891205
Epoch 215 Validation Loss 0.25931695103645325


 72%|███████▏  | 216/300 [05:39<02:04,  1.48s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.26092082262039185
Epoch 216 Validation Loss 0.258632630109787


 72%|███████▏  | 217/300 [05:41<02:33,  1.85s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.26727569103240967
Epoch 217 Validation Loss 0.2684783339500427


 73%|███████▎  | 218/300 [05:43<02:37,  1.92s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.2569262981414795
Epoch 218 Validation Loss 0.25770458579063416


 73%|███████▎  | 219/300 [05:45<02:24,  1.78s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.26004016399383545
Epoch 219 Validation Loss 0.2602923512458801
Epoch 220 Samples 8000 Step 124 Training Loss 0.25948086380958557


 73%|███████▎  | 220/300 [05:46<02:17,  1.72s/it]

Epoch 220 Validation Loss 0.2596991956233978


 74%|███████▎  | 221/300 [05:48<02:06,  1.60s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.2584412395954132
Epoch 221 Validation Loss 0.2578636407852173


 74%|███████▍  | 222/300 [05:49<02:05,  1.61s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.25846827030181885
Epoch 222 Validation Loss 0.25814273953437805


 74%|███████▍  | 223/300 [05:51<02:06,  1.65s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.26040706038475037
Epoch 223 Validation Loss 0.25885123014450073


 75%|███████▍  | 224/300 [05:53<02:04,  1.64s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.2571614384651184
Epoch 224 Validation Loss 0.2584826350212097


 75%|███████▌  | 225/300 [05:55<02:04,  1.66s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.2578466534614563
Epoch 225 Validation Loss 0.2580691874027252


 75%|███████▌  | 226/300 [05:57<02:14,  1.81s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.25707313418388367
Epoch 226 Validation Loss 0.25786834955215454


 76%|███████▌  | 227/300 [05:58<02:06,  1.74s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.2600271701812744
Epoch 227 Validation Loss 0.2590998709201813


 76%|███████▌  | 228/300 [06:00<01:59,  1.66s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.2575722634792328
Epoch 228 Validation Loss 0.2619127035140991


 76%|███████▋  | 229/300 [06:01<01:53,  1.60s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.25827181339263916
Epoch 229 Validation Loss 0.2575990557670593


 77%|███████▋  | 230/300 [06:03<01:52,  1.60s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.26646924018859863
Epoch 230 Validation Loss 0.2605016827583313


 77%|███████▋  | 231/300 [06:05<01:53,  1.65s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.2554275393486023
Epoch 231 Validation Loss 0.2591686546802521


 77%|███████▋  | 232/300 [06:06<01:51,  1.64s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.257660835981369
Epoch 232 Validation Loss 0.2570628225803375


 78%|███████▊  | 233/300 [06:07<01:42,  1.53s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.25630128383636475
Epoch 233 Validation Loss 0.2592432200908661


 78%|███████▊  | 234/300 [06:09<01:35,  1.45s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.2589186429977417
Epoch 234 Validation Loss 0.2584228217601776


 78%|███████▊  | 235/300 [06:10<01:38,  1.52s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.25697481632232666
Epoch 235 Validation Loss 0.2577686309814453


 79%|███████▊  | 236/300 [06:12<01:46,  1.67s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.25950074195861816
Epoch 236 Validation Loss 0.2573307752609253


 79%|███████▉  | 237/300 [06:14<01:43,  1.64s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.2572827637195587
Epoch 237 Validation Loss 0.25805413722991943


 79%|███████▉  | 238/300 [06:16<01:45,  1.71s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.25507301092147827
Epoch 238 Validation Loss 0.2577454149723053


 80%|███████▉  | 239/300 [06:17<01:37,  1.60s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.2584669291973114
Epoch 239 Validation Loss 0.25738534331321716


 80%|████████  | 240/300 [06:19<01:35,  1.60s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.25754430890083313
Epoch 240 Validation Loss 0.2577110528945923


 80%|████████  | 241/300 [06:20<01:31,  1.54s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.2644612193107605
Epoch 241 Validation Loss 0.26178404688835144


 81%|████████  | 242/300 [06:22<01:34,  1.63s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.26307135820388794
Epoch 242 Validation Loss 0.2576882839202881


 81%|████████  | 243/300 [06:24<01:35,  1.68s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.2622057795524597
Epoch 243 Validation Loss 0.25897297263145447


 81%|████████▏ | 244/300 [06:26<01:35,  1.71s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.2580086290836334
Epoch 244 Validation Loss 0.2577914297580719
Epoch 245 Samples 8000 Step 124 Training Loss 0.25945723056793213
Epoch 245 Validation Loss 0.26085373759269714


 82%|████████▏ | 246/300 [06:30<01:38,  1.83s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.2579493820667267
Epoch 246 Validation Loss 0.25778576731681824


 82%|████████▏ | 247/300 [06:31<01:31,  1.73s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.259713739156723
Epoch 247 Validation Loss 0.25699999928474426


 83%|████████▎ | 248/300 [06:33<01:27,  1.68s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.25730687379837036
Epoch 248 Validation Loss 0.2571163773536682
Epoch 249 Samples 8000 Step 124 Training Loss 0.2583727538585663


 83%|████████▎ | 249/300 [06:34<01:23,  1.64s/it]

Epoch 249 Validation Loss 0.25677210092544556


 83%|████████▎ | 250/300 [06:36<01:19,  1.59s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.2593016028404236
Epoch 250 Validation Loss 0.2577234208583832


 84%|████████▎ | 251/300 [06:37<01:13,  1.51s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.2581852376461029
Epoch 251 Validation Loss 0.25768741965293884


 84%|████████▍ | 252/300 [06:38<01:10,  1.47s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.25861650705337524
Epoch 252 Validation Loss 0.25703078508377075


 84%|████████▍ | 253/300 [06:40<01:06,  1.42s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.256518691778183
Epoch 253 Validation Loss 0.25726011395454407


 85%|████████▍ | 254/300 [06:41<01:10,  1.53s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.25892531871795654
Epoch 254 Validation Loss 0.25875794887542725


 85%|████████▌ | 255/300 [06:43<01:11,  1.59s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.2589411437511444
Epoch 255 Validation Loss 0.25794631242752075


 85%|████████▌ | 256/300 [06:44<01:06,  1.50s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.2568342387676239
Epoch 256 Validation Loss 0.25782567262649536


 86%|████████▌ | 257/300 [06:46<01:03,  1.47s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.2614287734031677
Epoch 257 Validation Loss 0.2580453157424927


 86%|████████▌ | 258/300 [06:47<00:59,  1.41s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.2591521143913269
Epoch 258 Validation Loss 0.25697922706604004


 86%|████████▋ | 259/300 [06:49<00:59,  1.44s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.25487226247787476
Epoch 259 Validation Loss 0.25827789306640625


 87%|████████▋ | 260/300 [06:50<00:57,  1.44s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.26236531138420105
Epoch 260 Validation Loss 0.26150619983673096


 87%|████████▋ | 261/300 [06:52<00:57,  1.48s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.25876113772392273
Epoch 261 Validation Loss 0.2570570111274719


 87%|████████▋ | 262/300 [06:53<00:56,  1.49s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.2587640583515167
Epoch 262 Validation Loss 0.25827908515930176


 88%|████████▊ | 263/300 [06:55<00:59,  1.60s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.2596469223499298
Epoch 263 Validation Loss 0.2569062113761902


 88%|████████▊ | 264/300 [06:56<00:56,  1.56s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.25864291191101074
Epoch 264 Validation Loss 0.2580612897872925


 88%|████████▊ | 265/300 [06:58<00:57,  1.65s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.2752634286880493
Epoch 265 Validation Loss 0.2599223256111145


 89%|████████▊ | 266/300 [07:00<00:53,  1.56s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.2592999041080475
Epoch 266 Validation Loss 0.2579551339149475


 89%|████████▉ | 267/300 [07:01<00:48,  1.48s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.25555726885795593
Epoch 267 Validation Loss 0.25698909163475037


 89%|████████▉ | 268/300 [07:03<00:50,  1.56s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.2591378390789032
Epoch 268 Validation Loss 0.2577490210533142


 90%|████████▉ | 269/300 [07:04<00:44,  1.45s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.25666865706443787
Epoch 269 Validation Loss 0.25748029351234436


 90%|█████████ | 270/300 [07:06<00:45,  1.51s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.25637954473495483
Epoch 270 Validation Loss 0.2574650049209595
Epoch 271 Samples 8000 Step 124 Training Loss 0.25926709175109863
Epoch 271 Validation Loss 0.2576862871646881


 91%|█████████ | 272/300 [07:09<00:42,  1.52s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.2562118172645569
Epoch 272 Validation Loss 0.25931066274642944


 91%|█████████ | 273/300 [07:10<00:40,  1.51s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.2593741714954376
Epoch 273 Validation Loss 0.2576867938041687


 91%|█████████▏| 274/300 [07:12<00:42,  1.63s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.25854820013046265
Epoch 274 Validation Loss 0.2571414113044739


 92%|█████████▏| 275/300 [07:14<00:38,  1.54s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.25904256105422974
Epoch 275 Validation Loss 0.25710511207580566


 92%|█████████▏| 276/300 [07:15<00:38,  1.59s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.2770158648490906
Epoch 276 Validation Loss 0.2763853967189789


 92%|█████████▏| 277/300 [07:17<00:34,  1.49s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.26760512590408325
Epoch 277 Validation Loss 0.2669582664966583


 93%|█████████▎| 278/300 [07:18<00:31,  1.45s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.2612422704696655
Epoch 278 Validation Loss 0.2622300684452057


 93%|█████████▎| 279/300 [07:19<00:28,  1.38s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.2608936131000519
Epoch 279 Validation Loss 0.26527372002601624


 93%|█████████▎| 280/300 [07:20<00:27,  1.35s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.26359614729881287
Epoch 280 Validation Loss 0.260733425617218


 94%|█████████▎| 281/300 [07:22<00:26,  1.40s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.2655165493488312
Epoch 281 Validation Loss 0.2614613473415375


 94%|█████████▍| 282/300 [07:23<00:25,  1.42s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.2616235911846161
Epoch 282 Validation Loss 0.2616574168205261


 94%|█████████▍| 283/300 [07:25<00:24,  1.45s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.2601725459098816
Epoch 283 Validation Loss 0.2605980932712555


 95%|█████████▍| 284/300 [07:26<00:23,  1.48s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.2628226578235626
Epoch 284 Validation Loss 0.2600799798965454


 95%|█████████▌| 285/300 [07:28<00:22,  1.50s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.26689213514328003
Epoch 285 Validation Loss 0.26126769185066223


 95%|█████████▌| 286/300 [07:30<00:21,  1.52s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.26052233576774597
Epoch 286 Validation Loss 0.26088085770606995


 96%|█████████▌| 287/300 [07:31<00:19,  1.51s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.2647247016429901
Epoch 287 Validation Loss 0.2593461871147156


 96%|█████████▌| 288/300 [07:33<00:18,  1.53s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.2607596516609192
Epoch 288 Validation Loss 0.2596336901187897


 96%|█████████▋| 289/300 [07:34<00:17,  1.56s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.2672380805015564
Epoch 289 Validation Loss 0.2609838843345642


 97%|█████████▋| 290/300 [07:36<00:15,  1.53s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.2581298351287842
Epoch 290 Validation Loss 0.2592271566390991


 97%|█████████▋| 291/300 [07:37<00:13,  1.51s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.257050096988678
Epoch 291 Validation Loss 0.2597185969352722


 97%|█████████▋| 292/300 [07:39<00:11,  1.47s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.2579597234725952
Epoch 292 Validation Loss 0.2613307237625122


 98%|█████████▊| 293/300 [07:40<00:10,  1.52s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.2584095895290375
Epoch 293 Validation Loss 0.25986093282699585


 98%|█████████▊| 294/300 [07:42<00:09,  1.54s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.2594662308692932
Epoch 294 Validation Loss 0.2590188980102539


 98%|█████████▊| 295/300 [07:43<00:07,  1.59s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.260593980550766
Epoch 295 Validation Loss 0.2595008611679077


 99%|█████████▊| 296/300 [07:45<00:06,  1.53s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.2568143904209137
Epoch 296 Validation Loss 0.2597379684448242


 99%|█████████▉| 297/300 [07:46<00:04,  1.53s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.2619757056236267
Epoch 297 Validation Loss 0.2595401406288147


 99%|█████████▉| 298/300 [07:48<00:03,  1.53s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.2622140645980835
Epoch 298 Validation Loss 0.25928953289985657


100%|█████████▉| 299/300 [07:49<00:01,  1.51s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.2619682550430298
Epoch 299 Validation Loss 0.2606712579727173


100%|██████████| 300/300 [07:51<00:00,  1.57s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.26109614968299866
Epoch 300 Validation Loss 0.25858303904533386


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▅▅▅▅▅▆▆▆▆▆▇▇▇▇▇▇▇████
samples,▆▇▇▆▁▂█▇▄▄▆▂▁▇▄▆▂▅▂▁▄▆█▁▂▃▅▄▅▅▄▄▆▃▄▆▆▅▃▅
train_loss,█▄▂▂▂▂▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
val_loss,█▂▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.2611
val_loss,0.25858


In [None]:
model_name = 'B1'
print(model_name)
print('B||M = KL(Markov || Model), M||B = KL(Model || Markov)', end='\n\n')
for i in range(5, 100, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

B1
B||M = KL(Markov || Model), M||B = KL(Model || Markov)

Model 5: B||M - 0.375, M||B - 4.106
Model 10: B||M - 0.087, M||B - 0.747
Model 15: B||M - 0.082, M||B - 0.495
Model 20: B||M - 0.116, M||B - 0.375
Model 25: B||M - 0.146, M||B - 0.359
Model 30: B||M - 0.282, M||B - 2.836
Model 35: B||M - 0.178, M||B - 0.913
Model 40: B||M - 0.181, M||B - 0.845
Model 45: B||M - 0.173, M||B - 0.583


In [None]:
model_name = 'B1'
epoch = 5
print(model_name, end='\n\n')
while True:
    try:
        print(f'Model_{epoch}')
        for i in test_on_all(load_model(f'{model_name}/model_{epoch}.pt', f'{model_name}/model_cfg.pt'), 30):
            print('|', end='')
            #print(f'Sequence: {i.tolist()}, Predictions: {model(i).argmax(dim=-1).flatten().tolist()}')
        print()
        epoch += 5
    except:
        break

B1

Model_5
Accuracy: 87.22 %
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||

# D1_

In [102]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=2,
            d_mlp=16,
            lr=0.05,
            normalization_type='LN',
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='D1_'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:00<04:37,  1.08it/s]

Epoch 1 Samples 8000 Step 124 Training Loss 0.687646746635437
Epoch 1 Validation Loss 0.6893687844276428


  1%|          | 2/300 [00:01<04:44,  1.05it/s]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6901377439498901
Epoch 2 Validation Loss 0.6893411874771118


  1%|          | 3/300 [00:02<04:59,  1.01s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6885019540786743
Epoch 3 Validation Loss 0.689323365688324


  1%|▏         | 4/300 [00:03<04:58,  1.01s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.6873109936714172
Epoch 4 Validation Loss 0.6893401145935059


  2%|▏         | 5/300 [00:04<04:53,  1.01it/s]

Epoch 5 Samples 8000 Step 124 Training Loss 0.6924023032188416
Epoch 5 Validation Loss 0.6893031001091003


  2%|▏         | 6/300 [00:06<04:59,  1.02s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.6861796975135803
Epoch 6 Validation Loss 0.6893067359924316


  2%|▏         | 7/300 [00:06<04:55,  1.01s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.6867983341217041
Epoch 7 Validation Loss 0.6892961859703064


  3%|▎         | 8/300 [00:07<04:45,  1.02it/s]

Epoch 8 Samples 8000 Step 124 Training Loss 0.6885842680931091
Epoch 8 Validation Loss 0.6893796920776367


  3%|▎         | 9/300 [00:08<04:50,  1.00it/s]

Epoch 9 Samples 8000 Step 124 Training Loss 0.6853165030479431
Epoch 9 Validation Loss 0.689298152923584


  3%|▎         | 10/300 [00:09<04:52,  1.01s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.6866243481636047
Epoch 10 Validation Loss 0.6893114447593689


  4%|▎         | 11/300 [00:11<04:53,  1.02s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.6880746483802795
Epoch 11 Validation Loss 0.6893063187599182


  4%|▍         | 12/300 [00:12<05:11,  1.08s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.686742901802063
Epoch 12 Validation Loss 0.6893002986907959


  4%|▍         | 13/300 [00:13<05:20,  1.12s/it]

Epoch 13 Samples 8000 Step 124 Training Loss 0.6884385943412781
Epoch 13 Validation Loss 0.6903355717658997


  5%|▍         | 14/300 [00:14<05:37,  1.18s/it]

Epoch 14 Samples 8000 Step 124 Training Loss 0.6880144476890564
Epoch 14 Validation Loss 0.6903524994850159


  5%|▌         | 15/300 [00:15<05:24,  1.14s/it]

Epoch 15 Samples 8000 Step 124 Training Loss 0.6866859197616577
Epoch 15 Validation Loss 0.6902957558631897


  5%|▌         | 16/300 [00:17<06:23,  1.35s/it]

Epoch 16 Samples 8000 Step 124 Training Loss 0.69077068567276
Epoch 16 Validation Loss 0.6903154850006104


  6%|▌         | 17/300 [00:18<06:12,  1.32s/it]

Epoch 17 Samples 8000 Step 124 Training Loss 0.6870015263557434
Epoch 17 Validation Loss 0.6903296113014221


  6%|▌         | 18/300 [00:20<06:29,  1.38s/it]

Epoch 18 Samples 8000 Step 124 Training Loss 0.6902754306793213
Epoch 18 Validation Loss 0.6903446316719055


  6%|▋         | 19/300 [00:21<06:37,  1.41s/it]

Epoch 19 Samples 8000 Step 124 Training Loss 0.6897672414779663
Epoch 19 Validation Loss 0.6903701424598694


  7%|▋         | 20/300 [00:23<06:51,  1.47s/it]

Epoch 20 Samples 8000 Step 124 Training Loss 0.687443196773529
Epoch 20 Validation Loss 0.6903871297836304


  7%|▋         | 21/300 [00:24<06:47,  1.46s/it]

Epoch 21 Samples 8000 Step 124 Training Loss 0.6890210509300232
Epoch 21 Validation Loss 0.691546618938446


  7%|▋         | 22/300 [00:26<06:46,  1.46s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.6917401552200317
Epoch 22 Validation Loss 0.6903084516525269


  8%|▊         | 23/300 [00:27<06:35,  1.43s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.6875320672988892
Epoch 23 Validation Loss 0.690315306186676


  8%|▊         | 24/300 [00:29<06:47,  1.48s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.6873933672904968
Epoch 24 Validation Loss 0.6903329491615295


  8%|▊         | 25/300 [00:30<06:45,  1.48s/it]

Epoch 25 Samples 8000 Step 124 Training Loss 0.6896949410438538
Epoch 25 Validation Loss 0.6903194785118103


  9%|▊         | 26/300 [00:32<06:25,  1.41s/it]

Epoch 26 Samples 8000 Step 124 Training Loss 0.6849755644798279
Epoch 26 Validation Loss 0.6884238719940186


  9%|▉         | 27/300 [00:33<06:30,  1.43s/it]

Epoch 27 Samples 8000 Step 124 Training Loss 0.6882847547531128
Epoch 27 Validation Loss 0.6869795322418213


  9%|▉         | 28/300 [00:35<06:31,  1.44s/it]

Epoch 28 Samples 8000 Step 124 Training Loss 0.6955597996711731
Epoch 28 Validation Loss 0.686962366104126


 10%|▉         | 29/300 [00:37<07:23,  1.64s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.6896714568138123
Epoch 29 Validation Loss 0.6869672536849976


 10%|█         | 30/300 [00:38<07:04,  1.57s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.6871662735939026
Epoch 30 Validation Loss 0.6869769096374512


 10%|█         | 31/300 [00:39<06:40,  1.49s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.6882233619689941
Epoch 31 Validation Loss 0.6869669556617737


 11%|█         | 32/300 [00:41<06:45,  1.51s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.6890627145767212
Epoch 32 Validation Loss 0.6869620084762573


 11%|█         | 33/300 [00:42<06:40,  1.50s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.6871774196624756
Epoch 33 Validation Loss 0.6869713068008423


 11%|█▏        | 34/300 [00:44<06:41,  1.51s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.6816071271896362
Epoch 34 Validation Loss 0.6869944334030151


 12%|█▏        | 35/300 [00:46<07:00,  1.59s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.6831792593002319
Epoch 35 Validation Loss 0.6869881749153137


 12%|█▏        | 36/300 [00:47<06:53,  1.56s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.6877122521400452
Epoch 36 Validation Loss 0.6869603991508484


 12%|█▏        | 37/300 [00:48<06:30,  1.48s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.6929473280906677
Epoch 37 Validation Loss 0.6869741082191467


 13%|█▎        | 38/300 [00:50<06:16,  1.44s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.6889568567276001
Epoch 38 Validation Loss 0.6869524717330933


 13%|█▎        | 39/300 [00:51<06:31,  1.50s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.6818695664405823
Epoch 39 Validation Loss 0.6870157718658447


 13%|█▎        | 40/300 [00:53<06:16,  1.45s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.6876118779182434
Epoch 40 Validation Loss 0.6870003938674927


 14%|█▎        | 41/300 [00:55<06:40,  1.54s/it]

Epoch 41 Samples 8000 Step 124 Training Loss 0.6876384019851685
Epoch 41 Validation Loss 0.6869478821754456


 14%|█▍        | 42/300 [00:56<06:48,  1.58s/it]

Epoch 42 Samples 8000 Step 124 Training Loss 0.6888099908828735
Epoch 42 Validation Loss 0.6869564652442932


 14%|█▍        | 43/300 [00:58<06:34,  1.54s/it]

Epoch 43 Samples 8000 Step 124 Training Loss 0.6858811974525452
Epoch 43 Validation Loss 0.6869513988494873


 15%|█▍        | 44/300 [00:59<06:20,  1.49s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.6917259097099304
Epoch 44 Validation Loss 0.6869522929191589


 15%|█▌        | 45/300 [01:01<06:26,  1.52s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.6895830035209656
Epoch 45 Validation Loss 0.6869441866874695


 15%|█▌        | 46/300 [01:02<06:22,  1.51s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.6837029457092285
Epoch 46 Validation Loss 0.6869939565658569


 16%|█▌        | 47/300 [01:04<06:20,  1.50s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.6748412847518921
Epoch 47 Validation Loss 0.6776456236839294


 16%|█▌        | 48/300 [01:06<06:59,  1.66s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.687249481678009
Epoch 48 Validation Loss 0.6872197985649109


 16%|█▋        | 49/300 [01:07<06:45,  1.61s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.6854515075683594
Epoch 49 Validation Loss 0.6870977878570557


 17%|█▋        | 50/300 [01:09<06:41,  1.61s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.6867052316665649
Epoch 50 Validation Loss 0.687117338180542


 17%|█▋        | 51/300 [01:10<06:39,  1.61s/it]

Epoch 51 Samples 8000 Step 124 Training Loss 0.6857470273971558
Epoch 51 Validation Loss 0.6870004534721375


 17%|█▋        | 52/300 [01:12<06:37,  1.60s/it]

Epoch 52 Samples 8000 Step 124 Training Loss 0.6886670589447021
Epoch 52 Validation Loss 0.6871185302734375


 18%|█▊        | 53/300 [01:13<06:23,  1.55s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.6893559694290161
Epoch 53 Validation Loss 0.6870058178901672


 18%|█▊        | 54/300 [01:15<06:03,  1.48s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.6878771781921387
Epoch 54 Validation Loss 0.6869889497756958


 18%|█▊        | 55/300 [01:16<06:17,  1.54s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.6764359474182129
Epoch 55 Validation Loss 0.6804263591766357


 19%|█▊        | 56/300 [01:18<06:04,  1.49s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.6702904105186462
Epoch 56 Validation Loss 0.6728286147117615
Epoch 57 Samples 8000 Step 124 Training Loss 0.6757416129112244


 19%|█▉        | 57/300 [01:19<06:06,  1.51s/it]

Epoch 57 Validation Loss 0.6706715226173401


 19%|█▉        | 58/300 [01:21<06:41,  1.66s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.6731832027435303
Epoch 58 Validation Loss 0.6689379215240479


 20%|█▉        | 59/300 [01:23<06:30,  1.62s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.6659521460533142
Epoch 59 Validation Loss 0.6689445972442627


 20%|██        | 60/300 [01:24<06:20,  1.59s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.6661703586578369
Epoch 60 Validation Loss 0.6689349412918091


 20%|██        | 61/300 [01:26<06:06,  1.53s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.6733673810958862
Epoch 61 Validation Loss 0.6719779372215271


 21%|██        | 62/300 [01:27<06:01,  1.52s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.6677931547164917
Epoch 62 Validation Loss 0.6689467430114746


 21%|██        | 63/300 [01:29<05:55,  1.50s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.6772023439407349
Epoch 63 Validation Loss 0.6689314842224121


 21%|██▏       | 64/300 [01:31<06:31,  1.66s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.6755821704864502
Epoch 64 Validation Loss 0.6689183115959167


 22%|██▏       | 65/300 [01:32<06:15,  1.60s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.6701772809028625
Epoch 65 Validation Loss 0.6689345240592957


 22%|██▏       | 66/300 [01:34<06:13,  1.60s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.6592648029327393
Epoch 66 Validation Loss 0.6691967844963074


 22%|██▏       | 67/300 [01:35<06:00,  1.55s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.6653467416763306
Epoch 67 Validation Loss 0.6691803336143494


 23%|██▎       | 68/300 [01:37<06:17,  1.63s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.668167769908905
Epoch 68 Validation Loss 0.669174313545227


 23%|██▎       | 69/300 [01:39<06:27,  1.68s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.6693316102027893
Epoch 69 Validation Loss 0.6690277457237244


 23%|██▎       | 70/300 [01:40<06:24,  1.67s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.6767953038215637
Epoch 70 Validation Loss 0.6773110032081604


 24%|██▎       | 71/300 [01:42<05:54,  1.55s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.6785895228385925
Epoch 71 Validation Loss 0.6747367978096008


 24%|██▍       | 72/300 [01:43<05:46,  1.52s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.6630132794380188
Epoch 72 Validation Loss 0.6615925431251526


 24%|██▍       | 73/300 [01:44<05:13,  1.38s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.65212482213974
Epoch 73 Validation Loss 0.6534708142280579


 25%|██▍       | 74/300 [01:46<05:18,  1.41s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.6454823017120361
Epoch 74 Validation Loss 0.6516703963279724


 25%|██▌       | 75/300 [01:47<05:34,  1.48s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.6507226824760437
Epoch 75 Validation Loss 0.6515480875968933


 25%|██▌       | 76/300 [01:49<05:44,  1.54s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.6515985727310181
Epoch 76 Validation Loss 0.6517229676246643


 26%|██▌       | 77/300 [01:50<05:35,  1.51s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.650703489780426
Epoch 77 Validation Loss 0.6509086489677429


 26%|██▌       | 78/300 [01:52<05:29,  1.49s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.6479701399803162
Epoch 78 Validation Loss 0.6500918865203857


 26%|██▋       | 79/300 [01:53<05:34,  1.51s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.6518923044204712
Epoch 79 Validation Loss 0.6511421799659729


 27%|██▋       | 80/300 [01:55<05:36,  1.53s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.6464359760284424
Epoch 80 Validation Loss 0.6441050171852112


 27%|██▋       | 81/300 [01:57<05:36,  1.53s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.6372522711753845
Epoch 81 Validation Loss 0.6495304703712463


 27%|██▋       | 82/300 [01:58<05:27,  1.50s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.6393689513206482
Epoch 82 Validation Loss 0.6401277184486389


 28%|██▊       | 83/300 [02:00<05:32,  1.53s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.6401365399360657
Epoch 83 Validation Loss 0.6400485038757324


 28%|██▊       | 84/300 [02:01<05:26,  1.51s/it]

Epoch 84 Samples 8000 Step 124 Training Loss 0.6381321549415588
Epoch 84 Validation Loss 0.6401183605194092


 28%|██▊       | 85/300 [02:02<05:19,  1.49s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.6440672874450684
Epoch 85 Validation Loss 0.6405292749404907


 29%|██▊       | 86/300 [02:04<05:20,  1.50s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.6410046219825745
Epoch 86 Validation Loss 0.6402848958969116


 29%|██▉       | 87/300 [02:05<05:13,  1.47s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.6364834308624268
Epoch 87 Validation Loss 0.6381984353065491


 29%|██▉       | 88/300 [02:07<05:17,  1.50s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.6377787590026855
Epoch 88 Validation Loss 0.6382970213890076


 30%|██▉       | 89/300 [02:08<05:13,  1.48s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.6409363150596619
Epoch 89 Validation Loss 0.6386144161224365


 30%|███       | 90/300 [02:10<05:07,  1.46s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.6398687958717346
Epoch 90 Validation Loss 0.6384303569793701


 30%|███       | 91/300 [02:12<05:29,  1.57s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.6381477117538452
Epoch 91 Validation Loss 0.6380909085273743


 31%|███       | 92/300 [02:13<05:20,  1.54s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.6318126916885376
Epoch 92 Validation Loss 0.6382419466972351


 31%|███       | 93/300 [02:15<05:15,  1.52s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.6381160616874695
Epoch 93 Validation Loss 0.6388822793960571


 31%|███▏      | 94/300 [02:16<05:16,  1.54s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.6392024755477905
Epoch 94 Validation Loss 0.6388825178146362


 32%|███▏      | 95/300 [02:18<05:08,  1.51s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.6377936005592346
Epoch 95 Validation Loss 0.6388828158378601


 32%|███▏      | 96/300 [02:19<05:11,  1.53s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.6392578482627869
Epoch 96 Validation Loss 0.6388844847679138


 32%|███▏      | 97/300 [02:21<05:06,  1.51s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.6369941234588623
Epoch 97 Validation Loss 0.6388828158378601


 33%|███▎      | 98/300 [02:22<05:01,  1.49s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.639159083366394
Epoch 98 Validation Loss 0.6387920379638672


 33%|███▎      | 99/300 [02:24<05:05,  1.52s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.634088397026062
Epoch 99 Validation Loss 0.6367208957672119


 33%|███▎      | 100/300 [02:25<04:58,  1.49s/it]

Epoch 100 Samples 8000 Step 124 Training Loss 0.6359544396400452
Epoch 100 Validation Loss 0.6389298439025879


 34%|███▎      | 101/300 [02:27<05:01,  1.51s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.6334169507026672
Epoch 101 Validation Loss 0.6359971761703491


 34%|███▍      | 102/300 [02:28<04:57,  1.50s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.634035587310791
Epoch 102 Validation Loss 0.6363176107406616


 34%|███▍      | 103/300 [02:30<04:53,  1.49s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.6358725428581238
Epoch 103 Validation Loss 0.6364184617996216


 35%|███▍      | 104/300 [02:31<04:49,  1.48s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.6293041110038757
Epoch 104 Validation Loss 0.6358664035797119


 35%|███▌      | 105/300 [02:33<04:51,  1.49s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.6358346343040466
Epoch 105 Validation Loss 0.6360384821891785


 35%|███▌      | 106/300 [02:34<04:44,  1.47s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.6393191814422607
Epoch 106 Validation Loss 0.6360329985618591


 36%|███▌      | 107/300 [02:36<04:45,  1.48s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.6369458436965942
Epoch 107 Validation Loss 0.6359989643096924


 36%|███▌      | 108/300 [02:37<05:01,  1.57s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.6366843581199646
Epoch 108 Validation Loss 0.6359923481941223


 36%|███▋      | 109/300 [02:39<04:52,  1.53s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.6336795091629028
Epoch 109 Validation Loss 0.6374979615211487


 37%|███▋      | 110/300 [02:40<04:51,  1.54s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.6343982815742493
Epoch 110 Validation Loss 0.6363467574119568


 37%|███▋      | 111/300 [02:42<04:42,  1.50s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.6376064419746399
Epoch 111 Validation Loss 0.6363464593887329


 37%|███▋      | 112/300 [02:43<04:38,  1.48s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.6342387199401855
Epoch 112 Validation Loss 0.6388878226280212


 38%|███▊      | 113/300 [02:45<04:51,  1.56s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.6354299187660217
Epoch 113 Validation Loss 0.6363422870635986


 38%|███▊      | 114/300 [02:46<04:43,  1.52s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.631331205368042
Epoch 114 Validation Loss 0.6359707713127136


 38%|███▊      | 115/300 [02:48<04:41,  1.52s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.6365726590156555
Epoch 115 Validation Loss 0.6359953284263611


 39%|███▊      | 116/300 [02:49<04:36,  1.51s/it]

Epoch 116 Samples 8000 Step 124 Training Loss 0.6335614919662476
Epoch 116 Validation Loss 0.6359817385673523


 39%|███▉      | 117/300 [02:51<04:30,  1.48s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.6354407668113708
Epoch 117 Validation Loss 0.6362233757972717


 39%|███▉      | 118/300 [02:52<04:33,  1.50s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.628519594669342
Epoch 118 Validation Loss 0.6359957456588745


 40%|███▉      | 119/300 [02:54<04:29,  1.49s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.633720874786377
Epoch 119 Validation Loss 0.6360039114952087


 40%|████      | 120/300 [02:56<04:47,  1.60s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.6351293921470642
Epoch 120 Validation Loss 0.6362175345420837


 40%|████      | 121/300 [02:57<04:38,  1.56s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.6374510526657104
Epoch 121 Validation Loss 0.6359502077102661


 41%|████      | 122/300 [02:59<04:38,  1.56s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.6316948533058167
Epoch 122 Validation Loss 0.6364179849624634


 41%|████      | 123/300 [03:00<04:30,  1.53s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.638934850692749
Epoch 123 Validation Loss 0.6360053420066833


 41%|████▏     | 124/300 [03:02<04:29,  1.53s/it]

Epoch 124 Samples 8000 Step 124 Training Loss 0.6326600313186646
Epoch 124 Validation Loss 0.635806679725647


 42%|████▏     | 125/300 [03:03<04:21,  1.50s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.637971043586731
Epoch 125 Validation Loss 0.6360660791397095


 42%|████▏     | 126/300 [03:05<04:19,  1.49s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.6342838406562805
Epoch 126 Validation Loss 0.6360442042350769


 42%|████▏     | 127/300 [03:06<04:21,  1.51s/it]

Epoch 127 Samples 8000 Step 124 Training Loss 0.6396873593330383
Epoch 127 Validation Loss 0.636290431022644


 43%|████▎     | 128/300 [03:07<04:13,  1.48s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.6381339430809021
Epoch 128 Validation Loss 0.6363357901573181


 43%|████▎     | 129/300 [03:09<04:15,  1.50s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.6365244388580322
Epoch 129 Validation Loss 0.6361746191978455


 43%|████▎     | 130/300 [03:10<04:13,  1.49s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.6324405074119568
Epoch 130 Validation Loss 0.6363718509674072


 44%|████▎     | 131/300 [03:12<04:08,  1.47s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.6346251964569092
Epoch 131 Validation Loss 0.6359868049621582


 44%|████▍     | 132/300 [03:14<04:18,  1.54s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.6301891207695007
Epoch 132 Validation Loss 0.6359601020812988


 44%|████▍     | 133/300 [03:15<04:17,  1.54s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.6347598433494568
Epoch 133 Validation Loss 0.6360180377960205


 45%|████▍     | 134/300 [03:17<04:15,  1.54s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.6321693658828735
Epoch 134 Validation Loss 0.6360505223274231


 45%|████▌     | 135/300 [03:18<04:09,  1.51s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.6385371685028076
Epoch 135 Validation Loss 0.6363370418548584


 45%|████▌     | 136/300 [03:20<04:02,  1.48s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.6382260918617249
Epoch 136 Validation Loss 0.636309802532196


 46%|████▌     | 137/300 [03:21<03:57,  1.46s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.6356144547462463
Epoch 137 Validation Loss 0.6360015869140625


 46%|████▌     | 138/300 [03:23<04:01,  1.49s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.6348992586135864
Epoch 138 Validation Loss 0.6360153555870056


 46%|████▋     | 139/300 [03:24<03:58,  1.48s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.6329225897789001
Epoch 139 Validation Loss 0.635999858379364


 47%|████▋     | 140/300 [03:26<04:07,  1.54s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.6354894042015076
Epoch 140 Validation Loss 0.6358760595321655


 47%|████▋     | 141/300 [03:27<03:58,  1.50s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.6374943256378174
Epoch 141 Validation Loss 0.6359012722969055


 47%|████▋     | 142/300 [03:29<03:56,  1.50s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.6343408226966858
Epoch 142 Validation Loss 0.6360649466514587


 48%|████▊     | 143/300 [03:30<03:57,  1.51s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.6339124441146851
Epoch 143 Validation Loss 0.6360095739364624


 48%|████▊     | 144/300 [03:31<03:50,  1.47s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.6332169771194458
Epoch 144 Validation Loss 0.6359981298446655


 48%|████▊     | 145/300 [03:33<03:46,  1.46s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.6390973329544067
Epoch 145 Validation Loss 0.6363520622253418


 49%|████▊     | 146/300 [03:34<03:48,  1.48s/it]

Epoch 146 Samples 8000 Step 124 Training Loss 0.6351521015167236
Epoch 146 Validation Loss 0.6359796524047852


 49%|████▉     | 147/300 [03:36<03:43,  1.46s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.6371122002601624
Epoch 147 Validation Loss 0.6361013054847717


 49%|████▉     | 148/300 [03:37<03:45,  1.48s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.6427883505821228
Epoch 148 Validation Loss 0.6360678672790527


 50%|████▉     | 149/300 [03:39<03:43,  1.48s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.6332709193229675
Epoch 149 Validation Loss 0.6359640955924988


 50%|█████     | 150/300 [03:40<03:43,  1.49s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.6301822662353516
Epoch 150 Validation Loss 0.635845959186554


 50%|█████     | 151/300 [03:42<03:41,  1.48s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.6328608989715576
Epoch 151 Validation Loss 0.6360872983932495


 51%|█████     | 152/300 [03:43<03:37,  1.47s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.6362151503562927
Epoch 152 Validation Loss 0.6360571980476379


 51%|█████     | 153/300 [03:45<03:41,  1.51s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.6353201866149902
Epoch 153 Validation Loss 0.6360633373260498


 51%|█████▏    | 154/300 [03:46<03:35,  1.47s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.635242223739624
Epoch 154 Validation Loss 0.6361212134361267


 52%|█████▏    | 155/300 [03:48<03:37,  1.50s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.6345018148422241
Epoch 155 Validation Loss 0.6358107328414917


 52%|█████▏    | 156/300 [03:49<03:33,  1.49s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.6368955969810486
Epoch 156 Validation Loss 0.6363385915756226


 52%|█████▏    | 157/300 [03:51<03:31,  1.48s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.6357027292251587
Epoch 157 Validation Loss 0.6360685229301453


 53%|█████▎    | 158/300 [03:52<03:31,  1.49s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.6351341009140015
Epoch 158 Validation Loss 0.6360206007957458


 53%|█████▎    | 159/300 [03:54<03:26,  1.47s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.6362941265106201
Epoch 159 Validation Loss 0.6360005140304565


 53%|█████▎    | 160/300 [03:55<03:33,  1.52s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.6307263374328613
Epoch 160 Validation Loss 0.6360334157943726


 54%|█████▎    | 161/300 [03:57<03:33,  1.53s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.6364302635192871
Epoch 161 Validation Loss 0.6362847685813904


 54%|█████▍    | 162/300 [03:58<03:28,  1.51s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.6346456408500671
Epoch 162 Validation Loss 0.6361036896705627


 54%|█████▍    | 163/300 [04:00<03:28,  1.52s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.639112114906311
Epoch 163 Validation Loss 0.6361113786697388


 55%|█████▍    | 164/300 [04:01<03:21,  1.48s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.6305848956108093
Epoch 164 Validation Loss 0.6359890103340149


 55%|█████▌    | 165/300 [04:03<03:20,  1.48s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.638589084148407
Epoch 165 Validation Loss 0.6360092163085938


 55%|█████▌    | 166/300 [04:04<03:18,  1.48s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.6388776898384094
Epoch 166 Validation Loss 0.6365020275115967


 56%|█████▌    | 167/300 [04:06<03:16,  1.48s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.6369264721870422
Epoch 167 Validation Loss 0.6360334157943726


 56%|█████▌    | 168/300 [04:07<03:10,  1.45s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.6382832527160645
Epoch 168 Validation Loss 0.636072039604187


 56%|█████▋    | 169/300 [04:08<02:55,  1.34s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.6333857178688049
Epoch 169 Validation Loss 0.6360125541687012


 57%|█████▋    | 170/300 [04:10<02:55,  1.35s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.635611355304718
Epoch 170 Validation Loss 0.6357874274253845


 57%|█████▋    | 171/300 [04:11<02:58,  1.38s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.6392349600791931
Epoch 171 Validation Loss 0.6359978914260864


 57%|█████▋    | 172/300 [04:12<02:54,  1.36s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.6378461718559265
Epoch 172 Validation Loss 0.635999321937561


 58%|█████▊    | 173/300 [04:14<03:02,  1.44s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.634516179561615
Epoch 173 Validation Loss 0.6360043287277222


 58%|█████▊    | 174/300 [04:15<02:55,  1.39s/it]

Epoch 174 Samples 8000 Step 124 Training Loss 0.633283257484436
Epoch 174 Validation Loss 0.6360113024711609


 58%|█████▊    | 175/300 [04:16<02:48,  1.35s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.6374213099479675
Epoch 175 Validation Loss 0.6363445520401001


 59%|█████▊    | 176/300 [04:18<03:00,  1.46s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.6351428627967834
Epoch 176 Validation Loss 0.6360257267951965


 59%|█████▉    | 177/300 [04:20<02:58,  1.45s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.6298498511314392
Epoch 177 Validation Loss 0.6360000967979431


 59%|█████▉    | 178/300 [04:21<03:07,  1.53s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.6406856775283813
Epoch 178 Validation Loss 0.6360162496566772


 60%|█████▉    | 179/300 [04:23<03:07,  1.55s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.6335100531578064
Epoch 179 Validation Loss 0.6360141634941101
Epoch 180 Samples 8000 Step 124 Training Loss 0.6337809562683105


 60%|██████    | 180/300 [04:24<03:04,  1.54s/it]

Epoch 180 Validation Loss 0.6360032558441162


 60%|██████    | 181/300 [04:26<02:58,  1.50s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.6327809691429138
Epoch 181 Validation Loss 0.6360782384872437


 61%|██████    | 182/300 [04:27<02:56,  1.50s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.6355870962142944
Epoch 182 Validation Loss 0.6359594464302063


 61%|██████    | 183/300 [04:29<02:57,  1.52s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.6304256916046143
Epoch 183 Validation Loss 0.6359972953796387


 61%|██████▏   | 184/300 [04:30<02:54,  1.51s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.6343385577201843
Epoch 184 Validation Loss 0.6360204815864563


 62%|██████▏   | 185/300 [04:32<02:50,  1.49s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.6363810896873474
Epoch 185 Validation Loss 0.6359786987304688


 62%|██████▏   | 186/300 [04:34<02:56,  1.55s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.629479706287384
Epoch 186 Validation Loss 0.6360178589820862


 62%|██████▏   | 187/300 [04:35<03:05,  1.64s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.6346314549446106
Epoch 187 Validation Loss 0.6360209584236145


 63%|██████▎   | 188/300 [04:37<03:07,  1.67s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.6352092623710632
Epoch 188 Validation Loss 0.6360290050506592


 63%|██████▎   | 189/300 [04:38<02:48,  1.52s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.6399544477462769
Epoch 189 Validation Loss 0.6359081268310547
Epoch 190 Samples 8000 Step 124 Training Loss 0.6377682089805603


 63%|██████▎   | 190/300 [04:40<02:46,  1.51s/it]

Epoch 190 Validation Loss 0.6359841227531433


 64%|██████▎   | 191/300 [04:41<02:39,  1.46s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.6331021785736084
Epoch 191 Validation Loss 0.636114239692688


 64%|██████▍   | 192/300 [04:42<02:28,  1.37s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.6403564810752869
Epoch 192 Validation Loss 0.6359802484512329
Epoch 193 Samples 8000 Step 124 Training Loss 0.6389158964157104


 64%|██████▍   | 193/300 [04:44<02:35,  1.45s/it]

Epoch 193 Validation Loss 0.6359797716140747


 65%|██████▍   | 194/300 [04:46<02:42,  1.53s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.6321617364883423
Epoch 194 Validation Loss 0.6360523700714111


 65%|██████▌   | 195/300 [04:47<02:50,  1.62s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.633379340171814
Epoch 195 Validation Loss 0.6360114216804504


 65%|██████▌   | 196/300 [04:49<02:50,  1.64s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.6404362320899963
Epoch 196 Validation Loss 0.6360145211219788


 66%|██████▌   | 197/300 [04:51<02:39,  1.55s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.6335791349411011
Epoch 197 Validation Loss 0.6360173225402832


 66%|██████▌   | 198/300 [04:52<02:35,  1.53s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.6357662081718445
Epoch 198 Validation Loss 0.6359983682632446


 66%|██████▋   | 199/300 [04:53<02:22,  1.41s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.6345531940460205
Epoch 199 Validation Loss 0.6360568404197693


 67%|██████▋   | 200/300 [04:54<02:17,  1.38s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.6388195753097534
Epoch 200 Validation Loss 0.6360039114952087


 67%|██████▋   | 201/300 [04:56<02:21,  1.43s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.6381306052207947
Epoch 201 Validation Loss 0.6360599994659424


 67%|██████▋   | 202/300 [04:57<02:13,  1.37s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.6384575963020325
Epoch 202 Validation Loss 0.636105477809906


 68%|██████▊   | 203/300 [04:59<02:17,  1.41s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.6383235454559326
Epoch 203 Validation Loss 0.6357987523078918


 68%|██████▊   | 204/300 [05:00<02:10,  1.36s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.6329963207244873
Epoch 204 Validation Loss 0.6360925436019897


 68%|██████▊   | 205/300 [05:01<02:09,  1.36s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.6345869302749634
Epoch 205 Validation Loss 0.6360388994216919


 69%|██████▊   | 206/300 [05:03<02:08,  1.37s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.6308069825172424
Epoch 206 Validation Loss 0.636012613773346
Epoch 207 Samples 8000 Step 124 Training Loss 0.6369494199752808


 69%|██████▉   | 207/300 [05:04<02:12,  1.42s/it]

Epoch 207 Validation Loss 0.636355459690094


 69%|██████▉   | 208/300 [05:06<02:07,  1.39s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.6324692964553833
Epoch 208 Validation Loss 0.6360911130905151


 70%|██████▉   | 209/300 [05:07<01:58,  1.30s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.6340782046318054
Epoch 209 Validation Loss 0.6361120939254761


 70%|███████   | 210/300 [05:08<02:03,  1.37s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.6337940692901611
Epoch 210 Validation Loss 0.6360098123550415


 70%|███████   | 211/300 [05:10<02:02,  1.38s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.631711483001709
Epoch 211 Validation Loss 0.6360350251197815


 71%|███████   | 212/300 [05:11<02:02,  1.39s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.6345585584640503
Epoch 212 Validation Loss 0.6359604001045227


 71%|███████   | 213/300 [05:13<02:05,  1.45s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.6343846321105957
Epoch 213 Validation Loss 0.6360170245170593


 71%|███████▏  | 214/300 [05:14<02:04,  1.45s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.6332952976226807
Epoch 214 Validation Loss 0.6360148191452026


 72%|███████▏  | 215/300 [05:16<02:05,  1.48s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.6352651715278625
Epoch 215 Validation Loss 0.6363469362258911


 72%|███████▏  | 216/300 [05:17<02:08,  1.52s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.6352257132530212
Epoch 216 Validation Loss 0.6360737085342407


 72%|███████▏  | 217/300 [05:19<02:05,  1.52s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.6361006498336792
Epoch 217 Validation Loss 0.6360102295875549


 73%|███████▎  | 218/300 [05:20<02:02,  1.50s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.6400493383407593
Epoch 218 Validation Loss 0.6360216736793518


 73%|███████▎  | 219/300 [05:21<01:50,  1.36s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.6320754289627075
Epoch 219 Validation Loss 0.6357882618904114


 73%|███████▎  | 220/300 [05:22<01:43,  1.30s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.6360923647880554
Epoch 220 Validation Loss 0.6362277865409851


 74%|███████▎  | 221/300 [05:24<01:45,  1.34s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.6397412419319153
Epoch 221 Validation Loss 0.6360177993774414


 74%|███████▍  | 222/300 [05:25<01:44,  1.35s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.6312065124511719
Epoch 222 Validation Loss 0.636023759841919


 74%|███████▍  | 223/300 [05:27<01:45,  1.37s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.6362773776054382
Epoch 223 Validation Loss 0.636005699634552


 75%|███████▍  | 224/300 [05:28<01:46,  1.40s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.6332316994667053
Epoch 224 Validation Loss 0.6360230445861816


 75%|███████▌  | 225/300 [05:30<01:47,  1.43s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.6351098418235779
Epoch 225 Validation Loss 0.6360120177268982


 75%|███████▌  | 226/300 [05:31<01:46,  1.44s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.6374900341033936
Epoch 226 Validation Loss 0.6366791725158691


 76%|███████▌  | 227/300 [05:32<01:44,  1.43s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.6373072862625122
Epoch 227 Validation Loss 0.6360188126564026


 76%|███████▌  | 228/300 [05:34<01:40,  1.40s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.6381359100341797
Epoch 228 Validation Loss 0.6361512541770935


 76%|███████▋  | 229/300 [05:36<01:49,  1.54s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.6401299834251404
Epoch 229 Validation Loss 0.6362577676773071


 77%|███████▋  | 230/300 [05:37<01:41,  1.45s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.637748658657074
Epoch 230 Validation Loss 0.6360170841217041


 77%|███████▋  | 231/300 [05:38<01:41,  1.47s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.6374589800834656
Epoch 231 Validation Loss 0.6358333826065063


 77%|███████▋  | 232/300 [05:40<01:40,  1.47s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.6359958648681641
Epoch 232 Validation Loss 0.6360304355621338


 78%|███████▊  | 233/300 [05:41<01:32,  1.38s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.6351528763771057
Epoch 233 Validation Loss 0.6360618472099304


 78%|███████▊  | 234/300 [05:42<01:24,  1.28s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.6354437470436096
Epoch 234 Validation Loss 0.6362240314483643


 78%|███████▊  | 235/300 [05:43<01:26,  1.33s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.6403777599334717
Epoch 235 Validation Loss 0.6360125541687012


 79%|███████▊  | 236/300 [05:45<01:30,  1.41s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.6343191862106323
Epoch 236 Validation Loss 0.636015772819519


 79%|███████▉  | 237/300 [05:47<01:33,  1.48s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.6398011445999146
Epoch 237 Validation Loss 0.6360012292861938


 79%|███████▉  | 238/300 [05:49<01:41,  1.63s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.6343835592269897
Epoch 238 Validation Loss 0.6358065605163574


 80%|███████▉  | 239/300 [05:50<01:33,  1.53s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.6299882531166077
Epoch 239 Validation Loss 0.6360123753547668


 80%|████████  | 240/300 [05:51<01:27,  1.45s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.6344772577285767
Epoch 240 Validation Loss 0.6360111832618713


 80%|████████  | 241/300 [05:53<01:22,  1.40s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.6349501609802246
Epoch 241 Validation Loss 0.6359936594963074
Epoch 242 Samples 8000 Step 124 Training Loss 0.6335786581039429


 81%|████████  | 242/300 [05:54<01:28,  1.52s/it]

Epoch 242 Validation Loss 0.6359723806381226


 81%|████████  | 243/300 [05:57<01:38,  1.73s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.6381842494010925
Epoch 243 Validation Loss 0.6360996961593628


 81%|████████▏ | 244/300 [05:58<01:36,  1.72s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.6391469240188599
Epoch 244 Validation Loss 0.636025071144104


 82%|████████▏ | 245/300 [05:59<01:24,  1.54s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.6325773596763611
Epoch 245 Validation Loss 0.6360262036323547


 82%|████████▏ | 246/300 [06:00<01:15,  1.40s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.6374074816703796
Epoch 246 Validation Loss 0.635860025882721


 82%|████████▏ | 247/300 [06:02<01:09,  1.31s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.6332182884216309
Epoch 247 Validation Loss 0.6360014081001282


 83%|████████▎ | 248/300 [06:03<01:04,  1.24s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.6365426182746887
Epoch 248 Validation Loss 0.6360174417495728


 83%|████████▎ | 249/300 [06:04<01:00,  1.18s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.6378363370895386
Epoch 249 Validation Loss 0.6360258460044861


 83%|████████▎ | 250/300 [06:05<00:57,  1.15s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.6377905607223511
Epoch 250 Validation Loss 0.6359975934028625


 84%|████████▎ | 251/300 [06:06<00:55,  1.13s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.6351061463356018
Epoch 251 Validation Loss 0.635988712310791


 84%|████████▍ | 252/300 [06:07<00:53,  1.11s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.6307149529457092
Epoch 252 Validation Loss 0.6360883712768555


 84%|████████▍ | 253/300 [06:08<00:50,  1.08s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.6373560428619385
Epoch 253 Validation Loss 0.6360247135162354


 85%|████████▍ | 254/300 [06:09<00:54,  1.18s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.6315816044807434
Epoch 254 Validation Loss 0.6366644501686096


 85%|████████▌ | 255/300 [06:11<01:00,  1.35s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.6394404768943787
Epoch 255 Validation Loss 0.6363850831985474


 85%|████████▌ | 256/300 [06:13<01:01,  1.39s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.6411553025245667
Epoch 256 Validation Loss 0.6360188126564026


 86%|████████▌ | 257/300 [06:14<00:56,  1.31s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.6309882998466492
Epoch 257 Validation Loss 0.6360132098197937


 86%|████████▌ | 258/300 [06:15<00:52,  1.25s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.6332429647445679
Epoch 258 Validation Loss 0.6368948221206665


 86%|████████▋ | 259/300 [06:16<00:49,  1.21s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.6352077126502991
Epoch 259 Validation Loss 0.636471152305603


 87%|████████▋ | 260/300 [06:17<00:46,  1.17s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.6339796781539917
Epoch 260 Validation Loss 0.6360989809036255


 87%|████████▋ | 261/300 [06:18<00:44,  1.15s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.6377219557762146
Epoch 261 Validation Loss 0.6360885500907898


 87%|████████▋ | 262/300 [06:19<00:43,  1.13s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.6330181360244751
Epoch 262 Validation Loss 0.6363976001739502


 88%|████████▊ | 263/300 [06:20<00:41,  1.12s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.6327738761901855
Epoch 263 Validation Loss 0.6360642313957214


 88%|████████▊ | 264/300 [06:21<00:39,  1.11s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.6335251331329346
Epoch 264 Validation Loss 0.6360188722610474


 88%|████████▊ | 265/300 [06:22<00:38,  1.10s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.6350173354148865
Epoch 265 Validation Loss 0.6360058188438416


 89%|████████▊ | 266/300 [06:24<00:37,  1.09s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.6335611939430237
Epoch 266 Validation Loss 0.6360924243927002


 89%|████████▉ | 267/300 [06:25<00:36,  1.09s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.6364952325820923
Epoch 267 Validation Loss 0.6361029148101807


 89%|████████▉ | 268/300 [06:26<00:36,  1.13s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.6439188122749329
Epoch 268 Validation Loss 0.6360656023025513


 90%|████████▉ | 269/300 [06:27<00:34,  1.13s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.6383972764015198
Epoch 269 Validation Loss 0.6360647678375244


 90%|█████████ | 270/300 [06:28<00:33,  1.11s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.6323986649513245
Epoch 270 Validation Loss 0.6358561515808105


 90%|█████████ | 271/300 [06:29<00:32,  1.12s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.6370746493339539
Epoch 271 Validation Loss 0.6360136270523071


 91%|█████████ | 272/300 [06:30<00:29,  1.07s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.6300991773605347
Epoch 272 Validation Loss 0.6360158920288086


 91%|█████████ | 273/300 [06:31<00:27,  1.03s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.6337972283363342
Epoch 273 Validation Loss 0.6359821557998657


 91%|█████████▏| 274/300 [06:32<00:25,  1.02it/s]

Epoch 274 Samples 8000 Step 124 Training Loss 0.6352723836898804
Epoch 274 Validation Loss 0.6359919905662537


 92%|█████████▏| 275/300 [06:33<00:23,  1.04it/s]

Epoch 275 Samples 8000 Step 124 Training Loss 0.6365847587585449
Epoch 275 Validation Loss 0.6362080574035645


 92%|█████████▏| 276/300 [06:34<00:24,  1.02s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.6367815732955933
Epoch 276 Validation Loss 0.6360015869140625


 92%|█████████▏| 277/300 [06:35<00:23,  1.01s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.633877158164978
Epoch 277 Validation Loss 0.6360167264938354


 93%|█████████▎| 278/300 [06:36<00:21,  1.03it/s]

Epoch 278 Samples 8000 Step 124 Training Loss 0.6348957419395447
Epoch 278 Validation Loss 0.636237382888794


 93%|█████████▎| 279/300 [06:37<00:19,  1.05it/s]

Epoch 279 Samples 8000 Step 124 Training Loss 0.6331403851509094
Epoch 279 Validation Loss 0.6360082626342773


 93%|█████████▎| 280/300 [06:38<00:19,  1.04it/s]

Epoch 280 Samples 8000 Step 124 Training Loss 0.6358729600906372
Epoch 280 Validation Loss 0.6357930898666382


 94%|█████████▎| 281/300 [06:39<00:18,  1.02it/s]

Epoch 281 Samples 8000 Step 124 Training Loss 0.6311281323432922
Epoch 281 Validation Loss 0.6360570192337036


 94%|█████████▍| 282/300 [06:40<00:17,  1.01it/s]

Epoch 282 Samples 8000 Step 124 Training Loss 0.6336044073104858
Epoch 282 Validation Loss 0.636109471321106


 94%|█████████▍| 283/300 [06:41<00:16,  1.04it/s]

Epoch 283 Samples 8000 Step 124 Training Loss 0.6360567212104797
Epoch 283 Validation Loss 0.6360794901847839


 95%|█████████▍| 284/300 [06:42<00:14,  1.07it/s]

Epoch 284 Samples 8000 Step 124 Training Loss 0.6371652483940125
Epoch 284 Validation Loss 0.6360553503036499


 95%|█████████▌| 285/300 [06:42<00:13,  1.08it/s]

Epoch 285 Samples 8000 Step 124 Training Loss 0.635330080986023
Epoch 285 Validation Loss 0.6362187266349792


 95%|█████████▌| 286/300 [06:43<00:13,  1.08it/s]

Epoch 286 Samples 8000 Step 124 Training Loss 0.63422030210495
Epoch 286 Validation Loss 0.6360305547714233


 96%|█████████▌| 287/300 [06:44<00:11,  1.10it/s]

Epoch 287 Samples 8000 Step 124 Training Loss 0.6375008225440979
Epoch 287 Validation Loss 0.6363370418548584


 96%|█████████▌| 288/300 [06:45<00:11,  1.07it/s]

Epoch 288 Samples 8000 Step 124 Training Loss 0.6316652894020081
Epoch 288 Validation Loss 0.6358882188796997


 96%|█████████▋| 289/300 [06:46<00:10,  1.08it/s]

Epoch 289 Samples 8000 Step 124 Training Loss 0.6340997815132141
Epoch 289 Validation Loss 0.6360033750534058


 97%|█████████▋| 290/300 [06:47<00:09,  1.09it/s]

Epoch 290 Samples 8000 Step 124 Training Loss 0.633334219455719
Epoch 290 Validation Loss 0.6378750205039978


 97%|█████████▋| 291/300 [06:48<00:08,  1.10it/s]

Epoch 291 Samples 8000 Step 124 Training Loss 0.6366493701934814
Epoch 291 Validation Loss 0.6361036896705627


 97%|█████████▋| 292/300 [06:49<00:07,  1.11it/s]

Epoch 292 Samples 8000 Step 124 Training Loss 0.6364687085151672
Epoch 292 Validation Loss 0.6360965371131897


 98%|█████████▊| 293/300 [06:50<00:06,  1.04it/s]

Epoch 293 Samples 8000 Step 124 Training Loss 0.6313006281852722
Epoch 293 Validation Loss 0.6361062526702881


 98%|█████████▊| 294/300 [06:51<00:05,  1.04it/s]

Epoch 294 Samples 8000 Step 124 Training Loss 0.639096200466156
Epoch 294 Validation Loss 0.6361081600189209


 98%|█████████▊| 295/300 [06:52<00:04,  1.07it/s]

Epoch 295 Samples 8000 Step 124 Training Loss 0.6344103217124939
Epoch 295 Validation Loss 0.6360615491867065


 99%|█████████▊| 296/300 [06:53<00:03,  1.01it/s]

Epoch 296 Samples 8000 Step 124 Training Loss 0.6383646130561829
Epoch 296 Validation Loss 0.6360035538673401


 99%|█████████▉| 297/300 [06:54<00:03,  1.01s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.6335003972053528
Epoch 297 Validation Loss 0.6360923647880554


 99%|█████████▉| 298/300 [06:55<00:02,  1.05s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.6329668760299683
Epoch 298 Validation Loss 0.6359714269638062


100%|█████████▉| 299/300 [06:56<00:01,  1.13s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.6358596086502075
Epoch 299 Validation Loss 0.6359239220619202


100%|██████████| 300/300 [06:58<00:00,  1.39s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.635223925113678
Epoch 300 Validation Loss 0.6361018419265747


0,1
epoch,▁▁▁▁▁▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▄▄▄▅▅▅▅▆▆▆▆▆▆▇▇▇▇███
samples,▅█▇▆█▃▆▆▇▃▆█▃▄▅▂▅▄▄▃▂▇▄▇▂▁▂▅▁▆▅█▃█▃█▄▄█▅
train_loss,██▇▇█▅▇▆▄▂▂▁▂▂▁▂▂▂▁▂▁▁▂▂▁▂▁▂▁▂▂▂▂▂▂▂▂▂▂▂
val_loss,██████▅▅▄▃▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.63522
val_loss,0.6361


In [103]:
model = train_model(
            dataset=dataset,
            n_epochs=300,
            n_layers=1,
            d_model=2,
            d_mlp=16,
            lr=0.05,
            normalization_type='LNPre',
            wandb=True,
            wandb_project_name='superposition',
            save_every=5,
            save_dir='D1__noscale'
        )
wandb.finish()

Moving model to device:  cpu


  0%|          | 1/300 [00:01<06:05,  1.22s/it]

Epoch 1 Samples 8000 Step 124 Training Loss 0.6927733421325684
Epoch 1 Validation Loss 0.6925334334373474


  1%|          | 2/300 [00:02<05:40,  1.14s/it]

Epoch 2 Samples 8000 Step 124 Training Loss 0.6920993328094482
Epoch 2 Validation Loss 0.6925355195999146


  1%|          | 3/300 [00:03<05:10,  1.05s/it]

Epoch 3 Samples 8000 Step 124 Training Loss 0.6930111646652222
Epoch 3 Validation Loss 0.6925277709960938


  1%|▏         | 4/300 [00:04<05:10,  1.05s/it]

Epoch 4 Samples 8000 Step 124 Training Loss 0.6918786764144897
Epoch 4 Validation Loss 0.6925217509269714


  2%|▏         | 5/300 [00:05<05:05,  1.03s/it]

Epoch 5 Samples 8000 Step 124 Training Loss 0.692318320274353
Epoch 5 Validation Loss 0.6925208568572998


  2%|▏         | 6/300 [00:06<05:01,  1.03s/it]

Epoch 6 Samples 8000 Step 124 Training Loss 0.6920473575592041
Epoch 6 Validation Loss 0.692521333694458


  2%|▏         | 7/300 [00:07<04:58,  1.02s/it]

Epoch 7 Samples 8000 Step 124 Training Loss 0.6922563314437866
Epoch 7 Validation Loss 0.6925255060195923


  3%|▎         | 8/300 [00:08<05:10,  1.06s/it]

Epoch 8 Samples 8000 Step 124 Training Loss 0.6921833157539368
Epoch 8 Validation Loss 0.6925252079963684


  3%|▎         | 9/300 [00:09<05:04,  1.05s/it]

Epoch 9 Samples 8000 Step 124 Training Loss 0.6925240755081177
Epoch 9 Validation Loss 0.6925233006477356


  3%|▎         | 10/300 [00:10<04:52,  1.01s/it]

Epoch 10 Samples 8000 Step 124 Training Loss 0.6932123899459839
Epoch 10 Validation Loss 0.6925265192985535


  4%|▎         | 11/300 [00:11<04:59,  1.04s/it]

Epoch 11 Samples 8000 Step 124 Training Loss 0.6923285126686096
Epoch 11 Validation Loss 0.6925254464149475


  4%|▍         | 12/300 [00:12<04:49,  1.01s/it]

Epoch 12 Samples 8000 Step 124 Training Loss 0.6917792558670044
Epoch 12 Validation Loss 0.692524254322052


  4%|▍         | 13/300 [00:13<04:32,  1.05it/s]

Epoch 13 Samples 8000 Step 124 Training Loss 0.6928605437278748
Epoch 13 Validation Loss 0.6925203204154968


  5%|▍         | 14/300 [00:14<04:19,  1.10it/s]

Epoch 14 Samples 8000 Step 124 Training Loss 0.6937559247016907
Epoch 14 Validation Loss 0.6925228834152222


  5%|▌         | 15/300 [00:15<04:22,  1.08it/s]

Epoch 15 Samples 8000 Step 124 Training Loss 0.6937373876571655
Epoch 15 Validation Loss 0.6925199627876282


  5%|▌         | 16/300 [00:15<04:21,  1.08it/s]

Epoch 16 Samples 8000 Step 124 Training Loss 0.6917330026626587
Epoch 16 Validation Loss 0.6925234198570251


  6%|▌         | 17/300 [00:16<04:19,  1.09it/s]

Epoch 17 Samples 8000 Step 124 Training Loss 0.6920897364616394
Epoch 17 Validation Loss 0.6925196051597595


  6%|▌         | 18/300 [00:17<04:18,  1.09it/s]

Epoch 18 Samples 8000 Step 124 Training Loss 0.6938918828964233
Epoch 18 Validation Loss 0.6925264000892639


  6%|▋         | 19/300 [00:18<04:21,  1.08it/s]

Epoch 19 Samples 8000 Step 124 Training Loss 0.6912672519683838
Epoch 19 Validation Loss 0.6925234198570251


  7%|▋         | 20/300 [00:19<04:20,  1.07it/s]

Epoch 20 Samples 8000 Step 124 Training Loss 0.6913573145866394
Epoch 20 Validation Loss 0.6925208568572998


  7%|▋         | 21/300 [00:20<04:19,  1.07it/s]

Epoch 21 Samples 8000 Step 124 Training Loss 0.6927623152732849
Epoch 21 Validation Loss 0.6925249695777893


  7%|▋         | 22/300 [00:21<04:40,  1.01s/it]

Epoch 22 Samples 8000 Step 124 Training Loss 0.6932848691940308
Epoch 22 Validation Loss 0.6925228834152222


  8%|▊         | 23/300 [00:22<04:48,  1.04s/it]

Epoch 23 Samples 8000 Step 124 Training Loss 0.6945580840110779
Epoch 23 Validation Loss 0.6925239562988281


  8%|▊         | 24/300 [00:23<04:38,  1.01s/it]

Epoch 24 Samples 8000 Step 124 Training Loss 0.6926736235618591
Epoch 24 Validation Loss 0.6925249099731445


  8%|▊         | 25/300 [00:24<04:32,  1.01it/s]

Epoch 25 Samples 8000 Step 124 Training Loss 0.6924599409103394
Epoch 25 Validation Loss 0.6925212740898132


  9%|▊         | 26/300 [00:25<04:31,  1.01it/s]

Epoch 26 Samples 8000 Step 124 Training Loss 0.6923425197601318
Epoch 26 Validation Loss 0.692521870136261


  9%|▉         | 27/300 [00:26<04:32,  1.00it/s]

Epoch 27 Samples 8000 Step 124 Training Loss 0.6922028660774231
Epoch 27 Validation Loss 0.6925212740898132


  9%|▉         | 28/300 [00:27<04:25,  1.02it/s]

Epoch 28 Samples 8000 Step 124 Training Loss 0.6935475468635559
Epoch 28 Validation Loss 0.6925181746482849


 10%|▉         | 29/300 [00:28<04:41,  1.04s/it]

Epoch 29 Samples 8000 Step 124 Training Loss 0.6921998262405396
Epoch 29 Validation Loss 0.6925203204154968


 10%|█         | 30/300 [00:30<04:45,  1.06s/it]

Epoch 30 Samples 8000 Step 124 Training Loss 0.6930839419364929
Epoch 30 Validation Loss 0.6925179362297058


 10%|█         | 31/300 [00:31<04:42,  1.05s/it]

Epoch 31 Samples 8000 Step 124 Training Loss 0.6929033994674683
Epoch 31 Validation Loss 0.6925181746482849


 11%|█         | 32/300 [00:31<04:32,  1.01s/it]

Epoch 32 Samples 8000 Step 124 Training Loss 0.6921201944351196
Epoch 32 Validation Loss 0.6925226449966431


 11%|█         | 33/300 [00:33<04:39,  1.05s/it]

Epoch 33 Samples 8000 Step 124 Training Loss 0.6917879581451416
Epoch 33 Validation Loss 0.6925290822982788


 11%|█▏        | 34/300 [00:34<04:38,  1.05s/it]

Epoch 34 Samples 8000 Step 124 Training Loss 0.6932753324508667
Epoch 34 Validation Loss 0.692518413066864


 12%|█▏        | 35/300 [00:35<04:31,  1.02s/it]

Epoch 35 Samples 8000 Step 124 Training Loss 0.6927276849746704
Epoch 35 Validation Loss 0.6925234198570251


 12%|█▏        | 36/300 [00:36<04:32,  1.03s/it]

Epoch 36 Samples 8000 Step 124 Training Loss 0.692223310470581
Epoch 36 Validation Loss 0.6925224661827087


 12%|█▏        | 37/300 [00:37<04:36,  1.05s/it]

Epoch 37 Samples 8000 Step 124 Training Loss 0.6929582357406616
Epoch 37 Validation Loss 0.6925187706947327


 13%|█▎        | 38/300 [00:38<04:41,  1.07s/it]

Epoch 38 Samples 8000 Step 124 Training Loss 0.6928185224533081
Epoch 38 Validation Loss 0.6920556426048279


 13%|█▎        | 39/300 [00:39<04:27,  1.02s/it]

Epoch 39 Samples 8000 Step 124 Training Loss 0.6918617486953735
Epoch 39 Validation Loss 0.692011833190918


 13%|█▎        | 40/300 [00:40<04:25,  1.02s/it]

Epoch 40 Samples 8000 Step 124 Training Loss 0.6907568573951721
Epoch 40 Validation Loss 0.6920183897018433


 14%|█▎        | 41/300 [00:41<04:17,  1.01it/s]

Epoch 41 Samples 8000 Step 124 Training Loss 0.6923244595527649
Epoch 41 Validation Loss 0.692009449005127


 14%|█▍        | 42/300 [00:42<04:13,  1.02it/s]

Epoch 42 Samples 8000 Step 124 Training Loss 0.6933313608169556
Epoch 42 Validation Loss 0.6920238137245178


 14%|█▍        | 43/300 [00:43<04:11,  1.02it/s]

Epoch 43 Samples 8000 Step 124 Training Loss 0.6928418278694153
Epoch 43 Validation Loss 0.6920043230056763


 15%|█▍        | 44/300 [00:44<04:33,  1.07s/it]

Epoch 44 Samples 8000 Step 124 Training Loss 0.6918562650680542
Epoch 44 Validation Loss 0.6920042634010315


 15%|█▌        | 45/300 [00:45<04:27,  1.05s/it]

Epoch 45 Samples 8000 Step 124 Training Loss 0.6924055814743042
Epoch 45 Validation Loss 0.6920052766799927


 15%|█▌        | 46/300 [00:46<04:28,  1.06s/it]

Epoch 46 Samples 8000 Step 124 Training Loss 0.6926416754722595
Epoch 46 Validation Loss 0.692010223865509


 16%|█▌        | 47/300 [00:47<04:25,  1.05s/it]

Epoch 47 Samples 8000 Step 124 Training Loss 0.6925246119499207
Epoch 47 Validation Loss 0.6920103430747986


 16%|█▌        | 48/300 [00:48<04:22,  1.04s/it]

Epoch 48 Samples 8000 Step 124 Training Loss 0.6922394037246704
Epoch 48 Validation Loss 0.6920057535171509


 16%|█▋        | 49/300 [00:49<04:21,  1.04s/it]

Epoch 49 Samples 8000 Step 124 Training Loss 0.6920728087425232
Epoch 49 Validation Loss 0.6920049786567688


 17%|█▋        | 50/300 [00:50<04:12,  1.01s/it]

Epoch 50 Samples 8000 Step 124 Training Loss 0.6913420557975769
Epoch 50 Validation Loss 0.6920030117034912


 17%|█▋        | 51/300 [00:51<04:05,  1.01it/s]

Epoch 51 Samples 8000 Step 124 Training Loss 0.6932714581489563
Epoch 51 Validation Loss 0.6920033693313599


 17%|█▋        | 52/300 [00:52<03:59,  1.04it/s]

Epoch 52 Samples 8000 Step 124 Training Loss 0.6916705965995789
Epoch 52 Validation Loss 0.6920042037963867


 18%|█▊        | 53/300 [00:53<04:09,  1.01s/it]

Epoch 53 Samples 8000 Step 124 Training Loss 0.6910449862480164
Epoch 53 Validation Loss 0.6920047998428345


 18%|█▊        | 54/300 [00:54<04:31,  1.10s/it]

Epoch 54 Samples 8000 Step 124 Training Loss 0.6918396949768066
Epoch 54 Validation Loss 0.692003071308136


 18%|█▊        | 55/300 [00:56<04:40,  1.14s/it]

Epoch 55 Samples 8000 Step 124 Training Loss 0.6919072866439819
Epoch 55 Validation Loss 0.6920043230056763


 19%|█▊        | 56/300 [00:57<04:36,  1.13s/it]

Epoch 56 Samples 8000 Step 124 Training Loss 0.6913014054298401
Epoch 56 Validation Loss 0.6920079588890076


 19%|█▉        | 57/300 [00:58<04:26,  1.10s/it]

Epoch 57 Samples 8000 Step 124 Training Loss 0.6930692195892334
Epoch 57 Validation Loss 0.6920042037963867


 19%|█▉        | 58/300 [00:59<04:23,  1.09s/it]

Epoch 58 Samples 8000 Step 124 Training Loss 0.6913164258003235
Epoch 58 Validation Loss 0.692002534866333


 20%|█▉        | 59/300 [01:00<04:25,  1.10s/it]

Epoch 59 Samples 8000 Step 124 Training Loss 0.6917823553085327
Epoch 59 Validation Loss 0.6920142769813538


 20%|██        | 60/300 [01:01<04:24,  1.10s/it]

Epoch 60 Samples 8000 Step 124 Training Loss 0.6916152834892273
Epoch 60 Validation Loss 0.6920028328895569


 20%|██        | 61/300 [01:02<04:21,  1.09s/it]

Epoch 61 Samples 8000 Step 124 Training Loss 0.6913115382194519
Epoch 61 Validation Loss 0.6920018792152405


 21%|██        | 62/300 [01:03<04:18,  1.09s/it]

Epoch 62 Samples 8000 Step 124 Training Loss 0.6931767463684082
Epoch 62 Validation Loss 0.6920107007026672


 21%|██        | 63/300 [01:04<04:19,  1.10s/it]

Epoch 63 Samples 8000 Step 124 Training Loss 0.6927049160003662
Epoch 63 Validation Loss 0.6920021772384644


 21%|██▏       | 64/300 [01:05<04:11,  1.07s/it]

Epoch 64 Samples 8000 Step 124 Training Loss 0.6926114559173584
Epoch 64 Validation Loss 0.6920056343078613


 22%|██▏       | 65/300 [01:06<04:07,  1.05s/it]

Epoch 65 Samples 8000 Step 124 Training Loss 0.6927443742752075
Epoch 65 Validation Loss 0.6920016407966614


 22%|██▏       | 66/300 [01:08<04:18,  1.11s/it]

Epoch 66 Samples 8000 Step 124 Training Loss 0.6911991238594055
Epoch 66 Validation Loss 0.692003071308136


 22%|██▏       | 67/300 [01:09<04:26,  1.14s/it]

Epoch 67 Samples 8000 Step 124 Training Loss 0.6924930214881897
Epoch 67 Validation Loss 0.6920014023780823


 23%|██▎       | 68/300 [01:10<04:19,  1.12s/it]

Epoch 68 Samples 8000 Step 124 Training Loss 0.6908232569694519
Epoch 68 Validation Loss 0.6920020580291748


 23%|██▎       | 69/300 [01:11<04:13,  1.10s/it]

Epoch 69 Samples 8000 Step 124 Training Loss 0.6919029355049133
Epoch 69 Validation Loss 0.6920033693313599


 23%|██▎       | 70/300 [01:12<04:08,  1.08s/it]

Epoch 70 Samples 8000 Step 124 Training Loss 0.692216694355011
Epoch 70 Validation Loss 0.6920037865638733


 24%|██▎       | 71/300 [01:13<04:07,  1.08s/it]

Epoch 71 Samples 8000 Step 124 Training Loss 0.6913883090019226
Epoch 71 Validation Loss 0.6920019388198853


 24%|██▍       | 72/300 [01:14<04:01,  1.06s/it]

Epoch 72 Samples 8000 Step 124 Training Loss 0.6925220489501953
Epoch 72 Validation Loss 0.692002534866333


 24%|██▍       | 73/300 [01:15<04:02,  1.07s/it]

Epoch 73 Samples 8000 Step 124 Training Loss 0.6929120421409607
Epoch 73 Validation Loss 0.6920008063316345


 25%|██▍       | 74/300 [01:16<03:56,  1.04s/it]

Epoch 74 Samples 8000 Step 124 Training Loss 0.6917188167572021
Epoch 74 Validation Loss 0.6920024156570435


 25%|██▌       | 75/300 [01:17<04:09,  1.11s/it]

Epoch 75 Samples 8000 Step 124 Training Loss 0.6930281519889832
Epoch 75 Validation Loss 0.6920138597488403


 25%|██▌       | 76/300 [01:19<04:35,  1.23s/it]

Epoch 76 Samples 8000 Step 124 Training Loss 0.6917305588722229
Epoch 76 Validation Loss 0.6920011043548584


 26%|██▌       | 77/300 [01:20<04:44,  1.27s/it]

Epoch 77 Samples 8000 Step 124 Training Loss 0.6927284598350525
Epoch 77 Validation Loss 0.6920009851455688


 26%|██▌       | 78/300 [01:22<04:51,  1.31s/it]

Epoch 78 Samples 8000 Step 124 Training Loss 0.6901026964187622
Epoch 78 Validation Loss 0.6920151114463806


 26%|██▋       | 79/300 [01:23<05:02,  1.37s/it]

Epoch 79 Samples 8000 Step 124 Training Loss 0.6924541592597961
Epoch 79 Validation Loss 0.6920139193534851


 27%|██▋       | 80/300 [01:25<05:07,  1.40s/it]

Epoch 80 Samples 8000 Step 124 Training Loss 0.6908017992973328
Epoch 80 Validation Loss 0.6920117139816284


 27%|██▋       | 81/300 [01:26<05:20,  1.46s/it]

Epoch 81 Samples 8000 Step 124 Training Loss 0.6900596618652344
Epoch 81 Validation Loss 0.6920042634010315


 27%|██▋       | 82/300 [01:28<05:18,  1.46s/it]

Epoch 82 Samples 8000 Step 124 Training Loss 0.6925234198570251
Epoch 82 Validation Loss 0.6920034885406494


 28%|██▊       | 83/300 [01:29<05:14,  1.45s/it]

Epoch 83 Samples 8000 Step 124 Training Loss 0.6922678351402283
Epoch 83 Validation Loss 0.6920016407966614
Epoch 84 Samples 8000 Step 124 Training Loss 0.6918812394142151


 28%|██▊       | 84/300 [01:31<05:18,  1.47s/it]

Epoch 84 Validation Loss 0.69200199842453


 28%|██▊       | 85/300 [01:32<05:27,  1.52s/it]

Epoch 85 Samples 8000 Step 124 Training Loss 0.6928366422653198
Epoch 85 Validation Loss 0.6920024156570435


 29%|██▊       | 86/300 [01:34<05:48,  1.63s/it]

Epoch 86 Samples 8000 Step 124 Training Loss 0.691138505935669
Epoch 86 Validation Loss 0.692000150680542


 29%|██▉       | 87/300 [01:36<05:32,  1.56s/it]

Epoch 87 Samples 8000 Step 124 Training Loss 0.6912211179733276
Epoch 87 Validation Loss 0.6920011639595032


 29%|██▉       | 88/300 [01:38<06:05,  1.72s/it]

Epoch 88 Samples 8000 Step 124 Training Loss 0.6925740838050842
Epoch 88 Validation Loss 0.6920022964477539


 30%|██▉       | 89/300 [01:39<05:57,  1.70s/it]

Epoch 89 Samples 8000 Step 124 Training Loss 0.6921970248222351
Epoch 89 Validation Loss 0.6920005679130554


 30%|███       | 90/300 [01:41<06:12,  1.77s/it]

Epoch 90 Samples 8000 Step 124 Training Loss 0.6914912462234497
Epoch 90 Validation Loss 0.6920230388641357


 30%|███       | 91/300 [01:43<05:51,  1.68s/it]

Epoch 91 Samples 8000 Step 124 Training Loss 0.6937936544418335
Epoch 91 Validation Loss 0.6920033693313599


 31%|███       | 92/300 [01:44<05:58,  1.72s/it]

Epoch 92 Samples 8000 Step 124 Training Loss 0.6920933127403259
Epoch 92 Validation Loss 0.6920070052146912


 31%|███       | 93/300 [01:46<05:51,  1.70s/it]

Epoch 93 Samples 8000 Step 124 Training Loss 0.691157877445221
Epoch 93 Validation Loss 0.69200199842453


 31%|███▏      | 94/300 [01:48<05:34,  1.62s/it]

Epoch 94 Samples 8000 Step 124 Training Loss 0.6919824481010437
Epoch 94 Validation Loss 0.6920017004013062


 32%|███▏      | 95/300 [01:49<05:23,  1.58s/it]

Epoch 95 Samples 8000 Step 124 Training Loss 0.6930343508720398
Epoch 95 Validation Loss 0.692003071308136


 32%|███▏      | 96/300 [01:51<05:34,  1.64s/it]

Epoch 96 Samples 8000 Step 124 Training Loss 0.6913949251174927
Epoch 96 Validation Loss 0.6920017004013062


 32%|███▏      | 97/300 [01:53<05:33,  1.64s/it]

Epoch 97 Samples 8000 Step 124 Training Loss 0.6912178993225098
Epoch 97 Validation Loss 0.6920071244239807


 33%|███▎      | 98/300 [01:54<05:27,  1.62s/it]

Epoch 98 Samples 8000 Step 124 Training Loss 0.6924534440040588
Epoch 98 Validation Loss 0.691999614238739


 33%|███▎      | 99/300 [01:55<05:14,  1.56s/it]

Epoch 99 Samples 8000 Step 124 Training Loss 0.6919103860855103
Epoch 99 Validation Loss 0.6920011639595032
Epoch 100 Samples 8000 Step 124 Training Loss 0.6930744051933289


 33%|███▎      | 100/300 [01:57<05:21,  1.61s/it]

Epoch 100 Validation Loss 0.6920074820518494


 34%|███▎      | 101/300 [01:59<05:24,  1.63s/it]

Epoch 101 Samples 8000 Step 124 Training Loss 0.6939053535461426
Epoch 101 Validation Loss 0.6920065879821777


 34%|███▍      | 102/300 [02:01<05:29,  1.66s/it]

Epoch 102 Samples 8000 Step 124 Training Loss 0.6930809020996094
Epoch 102 Validation Loss 0.6919999718666077


 34%|███▍      | 103/300 [02:02<05:14,  1.59s/it]

Epoch 103 Samples 8000 Step 124 Training Loss 0.6924625039100647
Epoch 103 Validation Loss 0.6920009255409241


 35%|███▍      | 104/300 [02:03<05:01,  1.54s/it]

Epoch 104 Samples 8000 Step 124 Training Loss 0.6927512884140015
Epoch 104 Validation Loss 0.6920017600059509


 35%|███▌      | 105/300 [02:05<04:55,  1.52s/it]

Epoch 105 Samples 8000 Step 124 Training Loss 0.6903534531593323
Epoch 105 Validation Loss 0.6920039057731628


 35%|███▌      | 106/300 [02:06<04:51,  1.50s/it]

Epoch 106 Samples 8000 Step 124 Training Loss 0.6922447085380554
Epoch 106 Validation Loss 0.6920106410980225


 36%|███▌      | 107/300 [02:08<04:46,  1.48s/it]

Epoch 107 Samples 8000 Step 124 Training Loss 0.6911536455154419
Epoch 107 Validation Loss 0.6920108199119568


 36%|███▌      | 108/300 [02:09<04:43,  1.48s/it]

Epoch 108 Samples 8000 Step 124 Training Loss 0.692272961139679
Epoch 108 Validation Loss 0.691999614238739


 36%|███▋      | 109/300 [02:11<04:38,  1.46s/it]

Epoch 109 Samples 8000 Step 124 Training Loss 0.6918676495552063
Epoch 109 Validation Loss 0.6919991970062256


 37%|███▋      | 110/300 [02:12<04:15,  1.34s/it]

Epoch 110 Samples 8000 Step 124 Training Loss 0.6926888227462769
Epoch 110 Validation Loss 0.6920257806777954


 37%|███▋      | 111/300 [02:13<04:29,  1.43s/it]

Epoch 111 Samples 8000 Step 124 Training Loss 0.6922897100448608
Epoch 111 Validation Loss 0.6920018196105957


 37%|███▋      | 112/300 [02:15<04:32,  1.45s/it]

Epoch 112 Samples 8000 Step 124 Training Loss 0.692055344581604
Epoch 112 Validation Loss 0.6919998526573181


 38%|███▊      | 113/300 [02:16<04:37,  1.48s/it]

Epoch 113 Samples 8000 Step 124 Training Loss 0.6939899921417236
Epoch 113 Validation Loss 0.6920021772384644


 38%|███▊      | 114/300 [02:18<04:35,  1.48s/it]

Epoch 114 Samples 8000 Step 124 Training Loss 0.6908327341079712
Epoch 114 Validation Loss 0.6920067071914673


 38%|███▊      | 115/300 [02:19<04:33,  1.48s/it]

Epoch 115 Samples 8000 Step 124 Training Loss 0.6920492053031921
Epoch 115 Validation Loss 0.6920017600059509
Epoch 116 Samples 8000 Step 124 Training Loss 0.691024661064148


 39%|███▊      | 116/300 [02:21<04:40,  1.53s/it]

Epoch 116 Validation Loss 0.692012369632721


 39%|███▉      | 117/300 [02:23<05:02,  1.65s/it]

Epoch 117 Samples 8000 Step 124 Training Loss 0.6910024285316467
Epoch 117 Validation Loss 0.6920032501220703


 39%|███▉      | 118/300 [02:25<04:51,  1.60s/it]

Epoch 118 Samples 8000 Step 124 Training Loss 0.6927241683006287
Epoch 118 Validation Loss 0.6920009851455688


 40%|███▉      | 119/300 [02:26<04:50,  1.60s/it]

Epoch 119 Samples 8000 Step 124 Training Loss 0.6912224888801575
Epoch 119 Validation Loss 0.6920004487037659


 40%|████      | 120/300 [02:28<04:41,  1.57s/it]

Epoch 120 Samples 8000 Step 124 Training Loss 0.6920091509819031
Epoch 120 Validation Loss 0.6920031905174255


 40%|████      | 121/300 [02:29<04:33,  1.53s/it]

Epoch 121 Samples 8000 Step 124 Training Loss 0.6913889646530151
Epoch 121 Validation Loss 0.6920029520988464


 41%|████      | 122/300 [02:31<04:31,  1.53s/it]

Epoch 122 Samples 8000 Step 124 Training Loss 0.6935503482818604
Epoch 122 Validation Loss 0.6920033693313599


 41%|████      | 123/300 [02:32<04:37,  1.57s/it]

Epoch 123 Samples 8000 Step 124 Training Loss 0.691724419593811
Epoch 123 Validation Loss 0.691999077796936
Epoch 124 Samples 8000 Step 124 Training Loss 0.6924852132797241


 41%|████▏     | 124/300 [02:34<04:50,  1.65s/it]

Epoch 124 Validation Loss 0.6920040249824524


 42%|████▏     | 125/300 [02:36<05:08,  1.76s/it]

Epoch 125 Samples 8000 Step 124 Training Loss 0.691893994808197
Epoch 125 Validation Loss 0.6920053958892822


 42%|████▏     | 126/300 [02:38<05:11,  1.79s/it]

Epoch 126 Samples 8000 Step 124 Training Loss 0.6914897561073303
Epoch 126 Validation Loss 0.6919997334480286
Epoch 127 Samples 8000 Step 124 Training Loss 0.6910786628723145


 42%|████▏     | 127/300 [02:40<05:16,  1.83s/it]

Epoch 127 Validation Loss 0.6920042037963867


 43%|████▎     | 128/300 [02:42<05:06,  1.78s/it]

Epoch 128 Samples 8000 Step 124 Training Loss 0.6918404698371887
Epoch 128 Validation Loss 0.6919999122619629


 43%|████▎     | 129/300 [02:43<04:55,  1.73s/it]

Epoch 129 Samples 8000 Step 124 Training Loss 0.6929922699928284
Epoch 129 Validation Loss 0.6920046210289001


 43%|████▎     | 130/300 [02:44<04:22,  1.54s/it]

Epoch 130 Samples 8000 Step 124 Training Loss 0.6916854381561279
Epoch 130 Validation Loss 0.6920003890991211


 44%|████▎     | 131/300 [02:45<04:01,  1.43s/it]

Epoch 131 Samples 8000 Step 124 Training Loss 0.6925297975540161
Epoch 131 Validation Loss 0.6919991374015808


 44%|████▍     | 132/300 [02:47<04:27,  1.59s/it]

Epoch 132 Samples 8000 Step 124 Training Loss 0.6923843026161194
Epoch 132 Validation Loss 0.6920008063316345


 44%|████▍     | 133/300 [02:50<05:04,  1.82s/it]

Epoch 133 Samples 8000 Step 124 Training Loss 0.6913503408432007
Epoch 133 Validation Loss 0.6919983625411987


 45%|████▍     | 134/300 [02:51<04:30,  1.63s/it]

Epoch 134 Samples 8000 Step 124 Training Loss 0.6933311223983765
Epoch 134 Validation Loss 0.6920070648193359


 45%|████▌     | 135/300 [02:52<04:21,  1.58s/it]

Epoch 135 Samples 8000 Step 124 Training Loss 0.6910083293914795
Epoch 135 Validation Loss 0.692000150680542


 45%|████▌     | 136/300 [02:54<04:30,  1.65s/it]

Epoch 136 Samples 8000 Step 124 Training Loss 0.6918050646781921
Epoch 136 Validation Loss 0.692000687122345


 46%|████▌     | 137/300 [02:56<04:36,  1.69s/it]

Epoch 137 Samples 8000 Step 124 Training Loss 0.691343367099762
Epoch 137 Validation Loss 0.6920055150985718


 46%|████▌     | 138/300 [02:57<04:04,  1.51s/it]

Epoch 138 Samples 8000 Step 124 Training Loss 0.6921870708465576
Epoch 138 Validation Loss 0.6920021772384644


 46%|████▋     | 139/300 [02:59<04:03,  1.51s/it]

Epoch 139 Samples 8000 Step 124 Training Loss 0.6909912824630737
Epoch 139 Validation Loss 0.691999614238739


 47%|████▋     | 140/300 [03:00<04:01,  1.51s/it]

Epoch 140 Samples 8000 Step 124 Training Loss 0.6912548542022705
Epoch 140 Validation Loss 0.6919981837272644


 47%|████▋     | 141/300 [03:02<04:08,  1.56s/it]

Epoch 141 Samples 8000 Step 124 Training Loss 0.6918730139732361
Epoch 141 Validation Loss 0.6920091509819031


 47%|████▋     | 142/300 [03:03<03:59,  1.52s/it]

Epoch 142 Samples 8000 Step 124 Training Loss 0.6921893358230591
Epoch 142 Validation Loss 0.6920028328895569


 48%|████▊     | 143/300 [03:05<03:59,  1.53s/it]

Epoch 143 Samples 8000 Step 124 Training Loss 0.6910712122917175
Epoch 143 Validation Loss 0.6919985413551331


 48%|████▊     | 144/300 [03:06<04:08,  1.59s/it]

Epoch 144 Samples 8000 Step 124 Training Loss 0.6918905973434448
Epoch 144 Validation Loss 0.692000150680542


 48%|████▊     | 145/300 [03:08<03:49,  1.48s/it]

Epoch 145 Samples 8000 Step 124 Training Loss 0.6911419034004211
Epoch 145 Validation Loss 0.69200199842453
Epoch 146 Samples 8000 Step 124 Training Loss 0.6922732591629028


 49%|████▊     | 146/300 [03:10<04:32,  1.77s/it]

Epoch 146 Validation Loss 0.6919977068901062


 49%|████▉     | 147/300 [03:13<05:02,  1.98s/it]

Epoch 147 Samples 8000 Step 124 Training Loss 0.6933801174163818
Epoch 147 Validation Loss 0.6919977068901062


 49%|████▉     | 148/300 [03:14<04:40,  1.84s/it]

Epoch 148 Samples 8000 Step 124 Training Loss 0.6906822323799133
Epoch 148 Validation Loss 0.6920138597488403


 50%|████▉     | 149/300 [03:16<04:55,  1.95s/it]

Epoch 149 Samples 8000 Step 124 Training Loss 0.6933206915855408
Epoch 149 Validation Loss 0.6919988989830017


 50%|█████     | 150/300 [03:18<04:50,  1.94s/it]

Epoch 150 Samples 8000 Step 124 Training Loss 0.6931220889091492
Epoch 150 Validation Loss 0.6920185685157776


 50%|█████     | 151/300 [03:20<04:42,  1.89s/it]

Epoch 151 Samples 8000 Step 124 Training Loss 0.6910825967788696
Epoch 151 Validation Loss 0.6919999718666077


 51%|█████     | 152/300 [03:22<04:41,  1.90s/it]

Epoch 152 Samples 8000 Step 124 Training Loss 0.6922399997711182
Epoch 152 Validation Loss 0.691997230052948


 51%|█████     | 153/300 [03:24<04:24,  1.80s/it]

Epoch 153 Samples 8000 Step 124 Training Loss 0.6918651461601257
Epoch 153 Validation Loss 0.6920008063316345


 51%|█████▏    | 154/300 [03:25<04:25,  1.82s/it]

Epoch 154 Samples 8000 Step 124 Training Loss 0.6914075016975403
Epoch 154 Validation Loss 0.6919991970062256


 52%|█████▏    | 155/300 [03:27<04:09,  1.72s/it]

Epoch 155 Samples 8000 Step 124 Training Loss 0.6920529007911682
Epoch 155 Validation Loss 0.6919976472854614


 52%|█████▏    | 156/300 [03:28<03:51,  1.61s/it]

Epoch 156 Samples 8000 Step 124 Training Loss 0.6904996633529663
Epoch 156 Validation Loss 0.6919981241226196


 52%|█████▏    | 157/300 [03:30<03:50,  1.61s/it]

Epoch 157 Samples 8000 Step 124 Training Loss 0.6937254667282104
Epoch 157 Validation Loss 0.692000150680542


 53%|█████▎    | 158/300 [03:32<04:12,  1.78s/it]

Epoch 158 Samples 8000 Step 124 Training Loss 0.6927582025527954
Epoch 158 Validation Loss 0.6919997334480286


 53%|█████▎    | 159/300 [03:33<03:51,  1.64s/it]

Epoch 159 Samples 8000 Step 124 Training Loss 0.6928990483283997
Epoch 159 Validation Loss 0.6919967532157898


 53%|█████▎    | 160/300 [03:34<03:28,  1.49s/it]

Epoch 160 Samples 8000 Step 124 Training Loss 0.6934666633605957
Epoch 160 Validation Loss 0.692011833190918


 54%|█████▎    | 161/300 [03:36<03:33,  1.54s/it]

Epoch 161 Samples 8000 Step 124 Training Loss 0.694017767906189
Epoch 161 Validation Loss 0.6919966340065002


 54%|█████▍    | 162/300 [03:38<03:31,  1.53s/it]

Epoch 162 Samples 8000 Step 124 Training Loss 0.6927714943885803
Epoch 162 Validation Loss 0.6919966340065002


 54%|█████▍    | 163/300 [03:39<03:14,  1.42s/it]

Epoch 163 Samples 8000 Step 124 Training Loss 0.6917229294776917
Epoch 163 Validation Loss 0.6919944882392883


 55%|█████▍    | 164/300 [03:40<03:20,  1.48s/it]

Epoch 164 Samples 8000 Step 124 Training Loss 0.6910645961761475
Epoch 164 Validation Loss 0.6913827061653137


 55%|█████▌    | 165/300 [03:42<03:28,  1.54s/it]

Epoch 165 Samples 8000 Step 124 Training Loss 0.6917432546615601
Epoch 165 Validation Loss 0.6913931369781494


 55%|█████▌    | 166/300 [03:44<03:22,  1.51s/it]

Epoch 166 Samples 8000 Step 124 Training Loss 0.6888759732246399
Epoch 166 Validation Loss 0.6913851499557495


 56%|█████▌    | 167/300 [03:45<03:28,  1.57s/it]

Epoch 167 Samples 8000 Step 124 Training Loss 0.6924312114715576
Epoch 167 Validation Loss 0.6913840770721436


 56%|█████▌    | 168/300 [03:47<03:38,  1.65s/it]

Epoch 168 Samples 8000 Step 124 Training Loss 0.6881006360054016
Epoch 168 Validation Loss 0.6914048790931702


 56%|█████▋    | 169/300 [03:49<03:27,  1.59s/it]

Epoch 169 Samples 8000 Step 124 Training Loss 0.6908835768699646
Epoch 169 Validation Loss 0.6905282735824585


 57%|█████▋    | 170/300 [03:50<03:24,  1.57s/it]

Epoch 170 Samples 8000 Step 124 Training Loss 0.6918864846229553
Epoch 170 Validation Loss 0.6905187368392944


 57%|█████▋    | 171/300 [03:51<03:15,  1.51s/it]

Epoch 171 Samples 8000 Step 124 Training Loss 0.6897691488265991
Epoch 171 Validation Loss 0.6905168294906616


 57%|█████▋    | 172/300 [03:53<03:09,  1.48s/it]

Epoch 172 Samples 8000 Step 124 Training Loss 0.6890938878059387
Epoch 172 Validation Loss 0.6905117630958557


 58%|█████▊    | 173/300 [03:54<03:09,  1.49s/it]

Epoch 173 Samples 8000 Step 124 Training Loss 0.6883867979049683
Epoch 173 Validation Loss 0.6905110478401184
Epoch 174 Samples 8000 Step 124 Training Loss 0.6895580887794495


 58%|█████▊    | 174/300 [03:56<03:25,  1.63s/it]

Epoch 174 Validation Loss 0.6890979409217834


 58%|█████▊    | 175/300 [03:58<03:27,  1.66s/it]

Epoch 175 Samples 8000 Step 124 Training Loss 0.6898612380027771
Epoch 175 Validation Loss 0.6889541745185852


 59%|█████▊    | 176/300 [04:00<03:50,  1.86s/it]

Epoch 176 Samples 8000 Step 124 Training Loss 0.6883450150489807
Epoch 176 Validation Loss 0.6903007626533508


 59%|█████▉    | 177/300 [04:03<04:01,  1.96s/it]

Epoch 177 Samples 8000 Step 124 Training Loss 0.6894129514694214
Epoch 177 Validation Loss 0.6883425116539001


 59%|█████▉    | 178/300 [04:04<03:30,  1.73s/it]

Epoch 178 Samples 8000 Step 124 Training Loss 0.6849892735481262
Epoch 178 Validation Loss 0.6883357167243958


 60%|█████▉    | 179/300 [04:06<03:59,  1.98s/it]

Epoch 179 Samples 8000 Step 124 Training Loss 0.6890956163406372
Epoch 179 Validation Loss 0.6883468627929688


 60%|██████    | 180/300 [04:08<03:42,  1.85s/it]

Epoch 180 Samples 8000 Step 124 Training Loss 0.6872346997261047
Epoch 180 Validation Loss 0.6883333325386047


 60%|██████    | 181/300 [04:09<03:19,  1.67s/it]

Epoch 181 Samples 8000 Step 124 Training Loss 0.6908268332481384
Epoch 181 Validation Loss 0.6883323788642883


 61%|██████    | 182/300 [04:12<03:45,  1.91s/it]

Epoch 182 Samples 8000 Step 124 Training Loss 0.6875814199447632
Epoch 182 Validation Loss 0.6883296966552734


 61%|██████    | 183/300 [04:13<03:23,  1.74s/it]

Epoch 183 Samples 8000 Step 124 Training Loss 0.6878709197044373
Epoch 183 Validation Loss 0.6883285045623779


 61%|██████▏   | 184/300 [04:15<03:49,  1.98s/it]

Epoch 184 Samples 8000 Step 124 Training Loss 0.6873676776885986
Epoch 184 Validation Loss 0.6883261799812317


 62%|██████▏   | 185/300 [04:18<04:02,  2.10s/it]

Epoch 185 Samples 8000 Step 124 Training Loss 0.687682569026947
Epoch 185 Validation Loss 0.6883262395858765


 62%|██████▏   | 186/300 [04:19<03:38,  1.92s/it]

Epoch 186 Samples 8000 Step 124 Training Loss 0.6881564855575562
Epoch 186 Validation Loss 0.6883259415626526


 62%|██████▏   | 187/300 [04:21<03:25,  1.82s/it]

Epoch 187 Samples 8000 Step 124 Training Loss 0.68636155128479
Epoch 187 Validation Loss 0.6883199214935303


 63%|██████▎   | 188/300 [04:22<03:13,  1.72s/it]

Epoch 188 Samples 8000 Step 124 Training Loss 0.6839146614074707
Epoch 188 Validation Loss 0.6883276104927063


 63%|██████▎   | 189/300 [04:24<03:14,  1.75s/it]

Epoch 189 Samples 8000 Step 124 Training Loss 0.6883457899093628
Epoch 189 Validation Loss 0.6883091330528259
Epoch 190 Samples 8000 Step 124 Training Loss 0.6868680715560913
Epoch 190 Validation Loss 0.6881939768791199


 64%|██████▎   | 191/300 [04:27<03:03,  1.68s/it]

Epoch 191 Samples 8000 Step 124 Training Loss 0.6868616342544556
Epoch 191 Validation Loss 0.6877605319023132


 64%|██████▍   | 192/300 [04:29<02:53,  1.61s/it]

Epoch 192 Samples 8000 Step 124 Training Loss 0.6880296468734741
Epoch 192 Validation Loss 0.6877647042274475


 64%|██████▍   | 193/300 [04:30<02:45,  1.55s/it]

Epoch 193 Samples 8000 Step 124 Training Loss 0.6856095790863037
Epoch 193 Validation Loss 0.6877569556236267


 65%|██████▍   | 194/300 [04:31<02:32,  1.44s/it]

Epoch 194 Samples 8000 Step 124 Training Loss 0.6893007755279541
Epoch 194 Validation Loss 0.6877580881118774


 65%|██████▌   | 195/300 [04:33<02:28,  1.41s/it]

Epoch 195 Samples 8000 Step 124 Training Loss 0.6858335137367249
Epoch 195 Validation Loss 0.6877561807632446


 65%|██████▌   | 196/300 [04:34<02:28,  1.43s/it]

Epoch 196 Samples 8000 Step 124 Training Loss 0.6862361431121826
Epoch 196 Validation Loss 0.6877564787864685


 66%|██████▌   | 197/300 [04:36<02:32,  1.48s/it]

Epoch 197 Samples 8000 Step 124 Training Loss 0.6860724091529846
Epoch 197 Validation Loss 0.6877536177635193


 66%|██████▌   | 198/300 [04:38<02:40,  1.57s/it]

Epoch 198 Samples 8000 Step 124 Training Loss 0.6871348023414612
Epoch 198 Validation Loss 0.6877580881118774


 66%|██████▋   | 199/300 [04:39<02:28,  1.47s/it]

Epoch 199 Samples 8000 Step 124 Training Loss 0.6835481524467468
Epoch 199 Validation Loss 0.6877515912055969


 67%|██████▋   | 200/300 [04:40<02:30,  1.51s/it]

Epoch 200 Samples 8000 Step 124 Training Loss 0.6872252225875854
Epoch 200 Validation Loss 0.6877514719963074


 67%|██████▋   | 201/300 [04:42<02:30,  1.52s/it]

Epoch 201 Samples 8000 Step 124 Training Loss 0.6849112510681152
Epoch 201 Validation Loss 0.6877521276473999


 67%|██████▋   | 202/300 [04:43<02:21,  1.45s/it]

Epoch 202 Samples 8000 Step 124 Training Loss 0.6862448453903198
Epoch 202 Validation Loss 0.6877502799034119


 68%|██████▊   | 203/300 [04:45<02:19,  1.43s/it]

Epoch 203 Samples 8000 Step 124 Training Loss 0.6857687830924988
Epoch 203 Validation Loss 0.687750518321991


 68%|██████▊   | 204/300 [04:46<02:17,  1.44s/it]

Epoch 204 Samples 8000 Step 124 Training Loss 0.688724935054779
Epoch 204 Validation Loss 0.6877685785293579


 68%|██████▊   | 205/300 [04:48<02:16,  1.43s/it]

Epoch 205 Samples 8000 Step 124 Training Loss 0.6858420968055725
Epoch 205 Validation Loss 0.6877498626708984


 69%|██████▊   | 206/300 [04:49<02:20,  1.50s/it]

Epoch 206 Samples 8000 Step 124 Training Loss 0.6898208856582642
Epoch 206 Validation Loss 0.6877513527870178


 69%|██████▉   | 207/300 [04:51<02:16,  1.46s/it]

Epoch 207 Samples 8000 Step 124 Training Loss 0.6862247586250305
Epoch 207 Validation Loss 0.6877506971359253


 69%|██████▉   | 208/300 [04:52<02:15,  1.47s/it]

Epoch 208 Samples 8000 Step 124 Training Loss 0.685894787311554
Epoch 208 Validation Loss 0.6877518892288208


 70%|██████▉   | 209/300 [04:54<02:13,  1.46s/it]

Epoch 209 Samples 8000 Step 124 Training Loss 0.6888011693954468
Epoch 209 Validation Loss 0.687748908996582


 70%|███████   | 210/300 [04:55<02:14,  1.49s/it]

Epoch 210 Samples 8000 Step 124 Training Loss 0.6875914931297302
Epoch 210 Validation Loss 0.687749981880188


 70%|███████   | 211/300 [04:56<02:09,  1.45s/it]

Epoch 211 Samples 8000 Step 124 Training Loss 0.6883975267410278
Epoch 211 Validation Loss 0.6877482533454895


 71%|███████   | 212/300 [04:58<02:08,  1.46s/it]

Epoch 212 Samples 8000 Step 124 Training Loss 0.6868199110031128
Epoch 212 Validation Loss 0.6877517700195312


 71%|███████   | 213/300 [04:59<02:06,  1.46s/it]

Epoch 213 Samples 8000 Step 124 Training Loss 0.6886996626853943
Epoch 213 Validation Loss 0.6877486705780029


 71%|███████▏  | 214/300 [05:01<02:06,  1.47s/it]

Epoch 214 Samples 8000 Step 124 Training Loss 0.6921563148498535
Epoch 214 Validation Loss 0.6877507567405701


 72%|███████▏  | 215/300 [05:02<02:03,  1.45s/it]

Epoch 215 Samples 8000 Step 124 Training Loss 0.6865912079811096
Epoch 215 Validation Loss 0.6877484917640686


 72%|███████▏  | 216/300 [05:04<02:02,  1.46s/it]

Epoch 216 Samples 8000 Step 124 Training Loss 0.6866697072982788
Epoch 216 Validation Loss 0.687747597694397


 72%|███████▏  | 217/300 [05:05<02:04,  1.50s/it]

Epoch 217 Samples 8000 Step 124 Training Loss 0.6853355765342712
Epoch 217 Validation Loss 0.6877498626708984


 73%|███████▎  | 218/300 [05:07<02:06,  1.55s/it]

Epoch 218 Samples 8000 Step 124 Training Loss 0.6898636817932129
Epoch 218 Validation Loss 0.6877490282058716


 73%|███████▎  | 219/300 [05:08<02:01,  1.50s/it]

Epoch 219 Samples 8000 Step 124 Training Loss 0.6866240501403809
Epoch 219 Validation Loss 0.6877543330192566


 73%|███████▎  | 220/300 [05:10<02:05,  1.57s/it]

Epoch 220 Samples 8000 Step 124 Training Loss 0.6869903206825256
Epoch 220 Validation Loss 0.6877497434616089


 74%|███████▎  | 221/300 [05:12<01:59,  1.51s/it]

Epoch 221 Samples 8000 Step 124 Training Loss 0.6871706247329712
Epoch 221 Validation Loss 0.687752366065979


 74%|███████▍  | 222/300 [05:13<01:46,  1.37s/it]

Epoch 222 Samples 8000 Step 124 Training Loss 0.6886881589889526
Epoch 222 Validation Loss 0.6877548694610596


 74%|███████▍  | 223/300 [05:14<01:51,  1.44s/it]

Epoch 223 Samples 8000 Step 124 Training Loss 0.6889270544052124
Epoch 223 Validation Loss 0.6877493262290955


 75%|███████▍  | 224/300 [05:17<02:18,  1.83s/it]

Epoch 224 Samples 8000 Step 124 Training Loss 0.6870608329772949
Epoch 224 Validation Loss 0.6877527236938477


 75%|███████▌  | 225/300 [05:19<02:28,  1.98s/it]

Epoch 225 Samples 8000 Step 124 Training Loss 0.6884055733680725
Epoch 225 Validation Loss 0.6877484917640686


 75%|███████▌  | 226/300 [05:21<02:19,  1.88s/it]

Epoch 226 Samples 8000 Step 124 Training Loss 0.6871261596679688
Epoch 226 Validation Loss 0.6877502799034119


 76%|███████▌  | 227/300 [05:22<02:09,  1.77s/it]

Epoch 227 Samples 8000 Step 124 Training Loss 0.6892795562744141
Epoch 227 Validation Loss 0.6877614259719849


 76%|███████▌  | 228/300 [05:24<02:03,  1.72s/it]

Epoch 228 Samples 8000 Step 124 Training Loss 0.6837993860244751
Epoch 228 Validation Loss 0.6877651810646057


 76%|███████▋  | 229/300 [05:25<01:55,  1.62s/it]

Epoch 229 Samples 8000 Step 124 Training Loss 0.6890614032745361
Epoch 229 Validation Loss 0.687747061252594


 77%|███████▋  | 230/300 [05:27<01:50,  1.58s/it]

Epoch 230 Samples 8000 Step 124 Training Loss 0.6896336078643799
Epoch 230 Validation Loss 0.6877495050430298


 77%|███████▋  | 231/300 [05:28<01:44,  1.51s/it]

Epoch 231 Samples 8000 Step 124 Training Loss 0.6876636147499084
Epoch 231 Validation Loss 0.6877479553222656


 77%|███████▋  | 232/300 [05:30<01:42,  1.50s/it]

Epoch 232 Samples 8000 Step 124 Training Loss 0.6914889812469482
Epoch 232 Validation Loss 0.6877496242523193


 78%|███████▊  | 233/300 [05:31<01:40,  1.50s/it]

Epoch 233 Samples 8000 Step 124 Training Loss 0.6872393488883972
Epoch 233 Validation Loss 0.6877467632293701


 78%|███████▊  | 234/300 [05:33<01:39,  1.51s/it]

Epoch 234 Samples 8000 Step 124 Training Loss 0.6891822814941406
Epoch 234 Validation Loss 0.6877484321594238


 78%|███████▊  | 235/300 [05:34<01:35,  1.47s/it]

Epoch 235 Samples 8000 Step 124 Training Loss 0.6870685815811157
Epoch 235 Validation Loss 0.6877483129501343


 79%|███████▊  | 236/300 [05:36<01:36,  1.51s/it]

Epoch 236 Samples 8000 Step 124 Training Loss 0.6857621669769287
Epoch 236 Validation Loss 0.6877532601356506


 79%|███████▉  | 237/300 [05:37<01:32,  1.47s/it]

Epoch 237 Samples 8000 Step 124 Training Loss 0.6872473955154419
Epoch 237 Validation Loss 0.6877487897872925


 79%|███████▉  | 238/300 [05:39<01:30,  1.46s/it]

Epoch 238 Samples 8000 Step 124 Training Loss 0.6859226822853088
Epoch 238 Validation Loss 0.687746524810791


 80%|███████▉  | 239/300 [05:40<01:30,  1.48s/it]

Epoch 239 Samples 8000 Step 124 Training Loss 0.6850003004074097
Epoch 239 Validation Loss 0.6877502799034119


 80%|████████  | 240/300 [05:41<01:27,  1.46s/it]

Epoch 240 Samples 8000 Step 124 Training Loss 0.6919053792953491
Epoch 240 Validation Loss 0.687751293182373


 80%|████████  | 241/300 [05:43<01:30,  1.54s/it]

Epoch 241 Samples 8000 Step 124 Training Loss 0.6891576647758484
Epoch 241 Validation Loss 0.687746524810791


 81%|████████  | 242/300 [05:45<01:32,  1.60s/it]

Epoch 242 Samples 8000 Step 124 Training Loss 0.6886643171310425
Epoch 242 Validation Loss 0.6877471804618835


 81%|████████  | 243/300 [05:46<01:27,  1.53s/it]

Epoch 243 Samples 8000 Step 124 Training Loss 0.6836809515953064
Epoch 243 Validation Loss 0.6877541542053223


 81%|████████▏ | 244/300 [05:48<01:26,  1.54s/it]

Epoch 244 Samples 8000 Step 124 Training Loss 0.68647700548172
Epoch 244 Validation Loss 0.6877463459968567


 82%|████████▏ | 245/300 [05:49<01:22,  1.50s/it]

Epoch 245 Samples 8000 Step 124 Training Loss 0.6904190182685852
Epoch 245 Validation Loss 0.6877496242523193


 82%|████████▏ | 246/300 [05:51<01:22,  1.52s/it]

Epoch 246 Samples 8000 Step 124 Training Loss 0.6895979642868042
Epoch 246 Validation Loss 0.6877484917640686


 82%|████████▏ | 247/300 [05:52<01:20,  1.51s/it]

Epoch 247 Samples 8000 Step 124 Training Loss 0.6893556714057922
Epoch 247 Validation Loss 0.6877476572990417


 83%|████████▎ | 248/300 [05:54<01:16,  1.47s/it]

Epoch 248 Samples 8000 Step 124 Training Loss 0.6859007477760315
Epoch 248 Validation Loss 0.6877530813217163


 83%|████████▎ | 249/300 [05:55<01:15,  1.49s/it]

Epoch 249 Samples 8000 Step 124 Training Loss 0.6890655755996704
Epoch 249 Validation Loss 0.6877458095550537


 83%|████████▎ | 250/300 [05:57<01:13,  1.47s/it]

Epoch 250 Samples 8000 Step 124 Training Loss 0.6856434345245361
Epoch 250 Validation Loss 0.687745988368988


 84%|████████▎ | 251/300 [05:58<01:10,  1.43s/it]

Epoch 251 Samples 8000 Step 124 Training Loss 0.6865379810333252
Epoch 251 Validation Loss 0.6877456307411194


 84%|████████▍ | 252/300 [05:59<01:08,  1.43s/it]

Epoch 252 Samples 8000 Step 124 Training Loss 0.6882796287536621
Epoch 252 Validation Loss 0.6877463459968567


 84%|████████▍ | 253/300 [06:01<01:07,  1.43s/it]

Epoch 253 Samples 8000 Step 124 Training Loss 0.6859452724456787
Epoch 253 Validation Loss 0.6877455711364746


 85%|████████▍ | 254/300 [06:02<01:03,  1.39s/it]

Epoch 254 Samples 8000 Step 124 Training Loss 0.687127411365509
Epoch 254 Validation Loss 0.6877458095550537


 85%|████████▌ | 255/300 [06:04<01:03,  1.41s/it]

Epoch 255 Samples 8000 Step 124 Training Loss 0.6847674250602722
Epoch 255 Validation Loss 0.6877472996711731


 85%|████████▌ | 256/300 [06:05<01:00,  1.37s/it]

Epoch 256 Samples 8000 Step 124 Training Loss 0.6858135461807251
Epoch 256 Validation Loss 0.6877586245536804


 86%|████████▌ | 257/300 [06:06<01:01,  1.43s/it]

Epoch 257 Samples 8000 Step 124 Training Loss 0.6875902414321899
Epoch 257 Validation Loss 0.687752902507782


 86%|████████▌ | 258/300 [06:08<00:59,  1.42s/it]

Epoch 258 Samples 8000 Step 124 Training Loss 0.6860899925231934
Epoch 258 Validation Loss 0.687747061252594


 86%|████████▋ | 259/300 [06:09<00:58,  1.43s/it]

Epoch 259 Samples 8000 Step 124 Training Loss 0.6871883869171143
Epoch 259 Validation Loss 0.6877487301826477


 87%|████████▋ | 260/300 [06:11<00:56,  1.41s/it]

Epoch 260 Samples 8000 Step 124 Training Loss 0.6860650181770325
Epoch 260 Validation Loss 0.6877455711364746


 87%|████████▋ | 261/300 [06:12<00:55,  1.43s/it]

Epoch 261 Samples 8000 Step 124 Training Loss 0.6872034668922424
Epoch 261 Validation Loss 0.6877448558807373


 87%|████████▋ | 262/300 [06:13<00:53,  1.40s/it]

Epoch 262 Samples 8000 Step 124 Training Loss 0.6882050633430481
Epoch 262 Validation Loss 0.6877482533454895


 88%|████████▊ | 263/300 [06:15<00:53,  1.46s/it]

Epoch 263 Samples 8000 Step 124 Training Loss 0.6892325282096863
Epoch 263 Validation Loss 0.6877452731132507


 88%|████████▊ | 264/300 [06:17<00:54,  1.53s/it]

Epoch 264 Samples 8000 Step 124 Training Loss 0.6859182715415955
Epoch 264 Validation Loss 0.6877472996711731


 88%|████████▊ | 265/300 [06:18<00:52,  1.49s/it]

Epoch 265 Samples 8000 Step 124 Training Loss 0.6870753169059753
Epoch 265 Validation Loss 0.6877467036247253


 89%|████████▊ | 266/300 [06:20<00:50,  1.49s/it]

Epoch 266 Samples 8000 Step 124 Training Loss 0.6886636018753052
Epoch 266 Validation Loss 0.6877551078796387


 89%|████████▉ | 267/300 [06:21<00:48,  1.46s/it]

Epoch 267 Samples 8000 Step 124 Training Loss 0.6869927644729614
Epoch 267 Validation Loss 0.6877455711364746


 89%|████████▉ | 268/300 [06:23<00:46,  1.46s/it]

Epoch 268 Samples 8000 Step 124 Training Loss 0.6853466629981995
Epoch 268 Validation Loss 0.6877486109733582


 90%|████████▉ | 269/300 [06:24<00:44,  1.43s/it]

Epoch 269 Samples 8000 Step 124 Training Loss 0.6859437227249146
Epoch 269 Validation Loss 0.6877462863922119


 90%|█████████ | 270/300 [06:25<00:43,  1.44s/it]

Epoch 270 Samples 8000 Step 124 Training Loss 0.6905195713043213
Epoch 270 Validation Loss 0.6877442598342896


 90%|█████████ | 271/300 [06:27<00:40,  1.41s/it]

Epoch 271 Samples 8000 Step 124 Training Loss 0.6850324869155884
Epoch 271 Validation Loss 0.6877490282058716


 91%|█████████ | 272/300 [06:28<00:40,  1.43s/it]

Epoch 272 Samples 8000 Step 124 Training Loss 0.6835477948188782
Epoch 272 Validation Loss 0.6877438426017761


 91%|█████████ | 273/300 [06:30<00:38,  1.42s/it]

Epoch 273 Samples 8000 Step 124 Training Loss 0.6845075488090515
Epoch 273 Validation Loss 0.6877497434616089


 91%|█████████▏| 274/300 [06:31<00:37,  1.43s/it]

Epoch 274 Samples 8000 Step 124 Training Loss 0.6854078769683838
Epoch 274 Validation Loss 0.6877474188804626


 92%|█████████▏| 275/300 [06:32<00:35,  1.44s/it]

Epoch 275 Samples 8000 Step 124 Training Loss 0.6842653155326843
Epoch 275 Validation Loss 0.6877456903457642


 92%|█████████▏| 276/300 [06:34<00:33,  1.41s/it]

Epoch 276 Samples 8000 Step 124 Training Loss 0.687813937664032
Epoch 276 Validation Loss 0.6877479553222656


 92%|█████████▏| 277/300 [06:35<00:31,  1.38s/it]

Epoch 277 Samples 8000 Step 124 Training Loss 0.6849322319030762
Epoch 277 Validation Loss 0.6877477169036865


 93%|█████████▎| 278/300 [06:37<00:30,  1.41s/it]

Epoch 278 Samples 8000 Step 124 Training Loss 0.691670298576355
Epoch 278 Validation Loss 0.6877466440200806


 93%|█████████▎| 279/300 [06:38<00:29,  1.39s/it]

Epoch 279 Samples 8000 Step 124 Training Loss 0.6876951456069946
Epoch 279 Validation Loss 0.6877482533454895


 93%|█████████▎| 280/300 [06:39<00:28,  1.41s/it]

Epoch 280 Samples 8000 Step 124 Training Loss 0.6877522468566895
Epoch 280 Validation Loss 0.6877474188804626


 94%|█████████▎| 281/300 [06:41<00:27,  1.44s/it]

Epoch 281 Samples 8000 Step 124 Training Loss 0.6895061731338501
Epoch 281 Validation Loss 0.687751829624176


 94%|█████████▍| 282/300 [06:42<00:25,  1.42s/it]

Epoch 282 Samples 8000 Step 124 Training Loss 0.6889714002609253
Epoch 282 Validation Loss 0.6877444982528687


 94%|█████████▍| 283/300 [06:44<00:23,  1.39s/it]

Epoch 283 Samples 8000 Step 124 Training Loss 0.6880241632461548
Epoch 283 Validation Loss 0.6877536177635193


 95%|█████████▍| 284/300 [06:45<00:23,  1.44s/it]

Epoch 284 Samples 8000 Step 124 Training Loss 0.687100887298584
Epoch 284 Validation Loss 0.687747597694397


 95%|█████████▌| 285/300 [06:47<00:21,  1.43s/it]

Epoch 285 Samples 8000 Step 124 Training Loss 0.6890726685523987
Epoch 285 Validation Loss 0.6877450346946716


 95%|█████████▌| 286/300 [06:48<00:20,  1.47s/it]

Epoch 286 Samples 8000 Step 124 Training Loss 0.6856589913368225
Epoch 286 Validation Loss 0.6877471804618835


 96%|█████████▌| 287/300 [06:50<00:19,  1.47s/it]

Epoch 287 Samples 8000 Step 124 Training Loss 0.6866145730018616
Epoch 287 Validation Loss 0.6877467036247253


 96%|█████████▌| 288/300 [06:51<00:16,  1.40s/it]

Epoch 288 Samples 8000 Step 124 Training Loss 0.6847906708717346
Epoch 288 Validation Loss 0.6877443194389343


 96%|█████████▋| 289/300 [06:53<00:16,  1.50s/it]

Epoch 289 Samples 8000 Step 124 Training Loss 0.6863297820091248
Epoch 289 Validation Loss 0.6877447366714478


 97%|█████████▋| 290/300 [06:54<00:14,  1.42s/it]

Epoch 290 Samples 8000 Step 124 Training Loss 0.6890578866004944
Epoch 290 Validation Loss 0.687766432762146


 97%|█████████▋| 291/300 [06:55<00:13,  1.45s/it]

Epoch 291 Samples 8000 Step 124 Training Loss 0.6874317526817322
Epoch 291 Validation Loss 0.687748908996582


 97%|█████████▋| 292/300 [06:56<00:10,  1.32s/it]

Epoch 292 Samples 8000 Step 124 Training Loss 0.6891323328018188
Epoch 292 Validation Loss 0.687749981880188


 98%|█████████▊| 293/300 [06:57<00:08,  1.25s/it]

Epoch 293 Samples 8000 Step 124 Training Loss 0.6887152194976807
Epoch 293 Validation Loss 0.687747061252594


 98%|█████████▊| 294/300 [06:58<00:07,  1.19s/it]

Epoch 294 Samples 8000 Step 124 Training Loss 0.6899051666259766
Epoch 294 Validation Loss 0.687747597694397


 98%|█████████▊| 295/300 [07:00<00:06,  1.26s/it]

Epoch 295 Samples 8000 Step 124 Training Loss 0.6873441934585571
Epoch 295 Validation Loss 0.6877461671829224


 99%|█████████▊| 296/300 [07:01<00:04,  1.17s/it]

Epoch 296 Samples 8000 Step 124 Training Loss 0.6878337264060974
Epoch 296 Validation Loss 0.6877435445785522


 99%|█████████▉| 297/300 [07:03<00:04,  1.36s/it]

Epoch 297 Samples 8000 Step 124 Training Loss 0.6890788674354553
Epoch 297 Validation Loss 0.6877437233924866


 99%|█████████▉| 298/300 [07:04<00:02,  1.40s/it]

Epoch 298 Samples 8000 Step 124 Training Loss 0.6862351894378662
Epoch 298 Validation Loss 0.6877444982528687


100%|█████████▉| 299/300 [07:05<00:01,  1.36s/it]

Epoch 299 Samples 8000 Step 124 Training Loss 0.6899139881134033
Epoch 299 Validation Loss 0.6877452731132507


100%|██████████| 300/300 [07:07<00:00,  1.43s/it]
[34m[1mwandb[0m: [32m[41mERROR[0m The nbformat package was not found. It is required to save notebook history.


Epoch 300 Samples 8000 Step 124 Training Loss 0.6833004355430603
Epoch 300 Validation Loss 0.687743604183197


0,1
epoch,▁▁▁▁▂▂▂▂▂▂▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▇▇▇▇▇▇▇████
samples,▆▇▃▅▄▅▂▂▃▄██▆▆▆▆▁▄▃▇▆▇▂▁▅▆▆▆▆▅▄▃▂▇▆▅▅▅▆▇
train_loss,▇▇██▇▇▆▇▇▇▇▇▆▇▇▇▇▆▆▇▆█▇▇▆▅▃▂▄▃▃▃▄▁▆▅▄▅▂▄
val_loss,█████▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▇▅▂▂▁▁▁▁▁▁▁▁▁▁▁▁

0,1
epoch,300.0
samples,8000.0
train_loss,0.6833
val_loss,0.68774


In [None]:
model_name = 'B1'
print(model_name)
print('B||M = KL(Markov || Model), M||B = KL(Model || Markov)', end='\n\n')
for i in range(5, 100, 5):
    try:
        bm, mb = markov_kl(load_model(f'{model_name}/model_{i}.pt', f'{model_name}/model_cfg.pt'))
        print(f"Model {i}: B||M - {bm:.3f}, M||B - {mb:.3f}")
    except Exception:
        break

B1
B||M = KL(Markov || Model), M||B = KL(Model || Markov)

Model 5: B||M - 0.375, M||B - 4.106
Model 10: B||M - 0.087, M||B - 0.747
Model 15: B||M - 0.082, M||B - 0.495
Model 20: B||M - 0.116, M||B - 0.375
Model 25: B||M - 0.146, M||B - 0.359
Model 30: B||M - 0.282, M||B - 2.836
Model 35: B||M - 0.178, M||B - 0.913
Model 40: B||M - 0.181, M||B - 0.845
Model 45: B||M - 0.173, M||B - 0.583


In [None]:
model_name = 'B1'
epoch = 5
print(model_name, end='\n\n')
while True:
    try:
        print(f'Model_{epoch}')
        for i in test_on_all(load_model(f'{model_name}/model_{epoch}.pt', f'{model_name}/model_cfg.pt'), 30):
            print('|', end='')
            #print(f'Sequence: {i.tolist()}, Predictions: {model(i).argmax(dim=-1).flatten().tolist()}')
        print()
        epoch += 5
    except:
        break

B1

Model_5
Accuracy: 87.22 %
||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||||