In [1]:
import torch
import torch.nn as nn
from transformer_lens import HookedTransformer, HookedTransformerConfig

# Configuration
config = HookedTransformerConfig(
    n_layers=2,
    n_heads=8,
    d_model=128,
    d_head=16,  # d_model / n_heads
    d_mlp=None,  # No MLPs (attention-only)
    act_fn=None,  # No activation (no MLPs)
    attention_dir="causal",  # Causal attention
    attn_only=True,  # Attention-only model
    normalization_type=None,  # No LayerNorm for simplicity
    d_vocab=50,  # 26 letters + 10 digits + special tokens
    n_ctx=60,  # Max sequence length
    init_weights=True,
    device="cuda" if torch.cuda.is_available() else "cpu"
)

model = HookedTransformer(config)
print(f"Model parameters: {sum(p.numel() for p in model.parameters()):,}")
print(f"Model device: {next(model.parameters()).device}")
print(f"Model device name: {torch.cuda.get_device_name(0)}")

Model parameters: 152,626
Model device: cuda:0
Model device name: NVIDIA GeForce RTX 4080 SUPER


In [2]:
class Vocabulary:
    class TrieNode:
        def __init__(self):
            self.id = None
            self.next = {}

    def __init__(self):
        self.root = self.TrieNode()
        self.token_map = {} # Stores mapping of ID to token
        self.size = 0

    # Adds a new token into the vocabulary
    def add_token(self, token):
        node = self.root
        for c in token:
            if c not in node.next:
                node.next[c] = self.TrieNode()
            node = node.next[c]
        if node.id is None:
            node.id = self.size
            self.token_map[self.size] = token
            self.size += 1

    # Finds id of longest prefix token of text[start:end], and returns length of token
    def longest_prefix_token(self, text, start):
        longest_token = None
        longest_length = 0
        node = self.root
        for i in range(start, len(text)):
            if text[i] not in node.next:
                break
            node = node.next[text[i]]
            if node.id is not None:
                longest_token = node.id
                longest_length = i - start + 1
        assert longest_token is not None
        return longest_token, longest_length

    # Converts an id to the corresponding token
    def get_token(self, id):
        return self.token_map[id]

In [3]:
# Simple character-level tokenizer
class CountingTokenizer:
    def __init__(self):
        # Vocabulary: letters + digits + special tokens
        self.vocab = Vocabulary()
        chars = list("abcdefghijklmnopqrstuvwxyz0123456789")
        special = ["<PAD>", "<BOS>", "<EOS>", ":", " ", "Count", "the", "letter", "in"]
        raw_vocab = special + chars
        for token in raw_vocab:
            self.vocab.add_token(token)

    def encode(self, text, include_lengths = False):
        """Convert text to token IDs"""
        ids = []
        i = 0
        while i < len(text):
            id, token_length = self.vocab.longest_prefix_token(text, i)
            assert id != -1
            if include_lengths:
                ids.append((id, token_length))
            else:
                ids.append(id)
            i += token_length
        return ids
    
    def decode(self, ids):
        """Convert token IDs to text"""
        return "".join([self.vocab.get_token(id) for id in ids])

    def apply_bpe(self, words, max_token_length=3):
        """Adds merge rules based on a list of words for BPE"""
        text = "".join([f"<BOS>{word}<EOS>" for word in words])
        ignore_tokens = ["<PAD>", "<BOS>", "<EOS>", ":", " "]
        while True:
            encoded = self.encode(text, include_lengths=True)
            pairs = {}
            merge_pair = ()
            for i in range(len(encoded) - 1):
                token_pair = encoded[i], encoded[i + 1]
                if token_pair[0][1] + token_pair[1][1] > max_token_length:
                    continue
                if any(self.vocab.get_token(token[0]) in ignore_tokens for token in token_pair):
                    continue
                pairs[token_pair] = pairs.get(token_pair, 0) + 1
                if not merge_pair or pairs[merge_pair] < pairs[token_pair]:
                    merge_pair = token_pair
            if not merge_pair or pairs[merge_pair] < 2:
                break
            self.vocab.add_token("".join([self.vocab.get_token(token[0]) for token in merge_pair]))

tokenizer = CountingTokenizer()

In [4]:
import random

def generate_counting_question(target_letter='a', multiplicity_range=(1, 2), length_range=(5, 10)):
    """
    Generates a new word based on input parameters and gives the answer to the question
    """
    
    # Sample words with target letter
    words_with_target = []
    count = random.randint(*multiplicity_range)
    
    # Generate string with exact count of target letter
    chars = list("abcdefghijklmnopqrstuvwxyz")
    chars.remove(target_letter)
    
    length = random.randint(max(count, length_range[0]), length_range[1])
    string_chars = random.choices(chars, k=length - count)
    
    # Insert target letters
    positions = random.sample(range(length), count)
    for pos in positions:
        string_chars.insert(pos, target_letter)
    
    input_string = "".join(string_chars[:length])
    question = f"Count the letter {target_letter} in: {input_string}"
    answer = str(count)

    return question, answer

def generate_counting_example(qa, tokenizer=None):
    """
    Generate: "Count the letter a in: banana" -> "3"
    Format: [question tokens] [answer token]
    """
    question, answer = qa
    
    # Tokenize
    question_tokens = tokenizer.encode(question)
    question_tokens_decoded = [tokenizer.decode([token]) for token in question_tokens]
    answer_token = tokenizer.encode(answer)[0]  # Single digit
    
    # Combine: question + answer
    full_tokens = question_tokens + [answer_token]
    
    return {
        'tokens': full_tokens,
        'question_tokens_decoded': question_tokens_decoded,
        'question_length': len(question_tokens),  # For loss masking
        'answer': int(answer),
        'text': question + answer
    }

# Test
tokenizer = CountingTokenizer()
qa = generate_counting_question()
example = generate_counting_example(qa, tokenizer=tokenizer)
print(f"Text: {example['text']}")
print(f"Tokens: {example['tokens']}")
print(f"Question Tokens Decoded: {example['question_tokens_decoded']}")
print(f"Question length: {example['question_length']}")
print(f"Answer: {example['answer']}")

Text: Count the letter a in: nzrwqall1
Tokens: [5, 4, 6, 4, 7, 4, 9, 4, 8, 3, 4, 22, 34, 26, 31, 25, 9, 20, 20, 36]
Question Tokens Decoded: ['Count', ' ', 'the', ' ', 'letter', ' ', 'a', ' ', 'in', ':', ' ', 'n', 'z', 'r', 'w', 'q', 'a', 'l', 'l']
Question length: 19
Answer: 1


In [5]:
# Test
qas = [generate_counting_question() for i in range(10)]
tokenizer = CountingTokenizer()
tokenizer.apply_bpe([qa[0] for qa in qas])
qa = generate_counting_question()
example = generate_counting_example(qa, tokenizer=tokenizer)
print(f"Sample Pre-Processing Text: {"".join([f"<BOS>{qa[0]}<EOS>" for qa in qas])}")
print(f"Text: {example['text']}")
print(f"Tokens: {example['tokens']}")
print(f"Question Tokens Decoded: {example['question_tokens_decoded']}")
print(f"Question length: {example['question_length']}")
print(f"Answer: {example['answer']}")

Sample Pre-Processing Text: <BOS>Count the letter a in: rqarehi<EOS><BOS>Count the letter a in: unawt<EOS><BOS>Count the letter a in: ibanwuxtn<EOS><BOS>Count the letter a in: hmndrna<EOS><BOS>Count the letter a in: zvbagxcy<EOS><BOS>Count the letter a in: umxjmayhub<EOS><BOS>Count the letter a in: laegcy<EOS><BOS>Count the letter a in: jitaabdtc<EOS><BOS>Count the letter a in: kpooqaflyr<EOS><BOS>Count the letter a in: nqldbozaga<EOS>
Text: Count the letter a in: efanj1
Tokens: [5, 4, 6, 4, 7, 4, 9, 4, 8, 3, 4, 13, 14, 9, 22, 18, 36]
Question Tokens Decoded: ['Count', ' ', 'the', ' ', 'letter', ' ', 'a', ' ', 'in', ':', ' ', 'e', 'f', 'a', 'n', 'j']
Question length: 16
Answer: 1


In [7]:
import pickle
from torch.utils.data import Dataset, DataLoader

GENERATE_NEW = False # IMPORTANT: Set this to True only if you want to generate new datasets

class CountingDataset(Dataset):
    def __init__(self, n_examples=50000, difficulty='easy', tokenizer=None):
        """
        difficulty: 'easy', 'bpe-hard', 'mult-hard', etc.
        """
        self.tokenizer = tokenizer
        self.examples = []

        use_bpe = False
        # Set parameters based on difficulty
        if difficulty == 'easy':
            mult_range = (1, 2)
            len_range = (5, 10)
        elif difficulty == 'bpe-hard':
            mult_range = (1, 2)
            len_range = (5, 10)
            use_bpe = True
        elif difficulty == 'mult-hard':
            mult_range = (3, 10)
            len_range = (5, 10)
        elif difficulty == 'length-hard':
            mult_range = (1, 2)
            len_range = (20, 50)
        elif difficulty == 'all-hard':
            mult_range = (3, 10)
            len_range = (20, 50)
            use_bpe = True
        elif difficulty == 'mixed':
            mult_range = (1,10)
            len_range = (5, 50)
            use_bpe = True
        else:
            assert False
        
        # Generate basic words
        target_letters = list("abcdefghijklmnopqrstuvwxyz")
        qas = []
        for _ in range(n_examples):
            target = random.choice(target_letters)
            qa = generate_counting_question(
                target_letter=target,
                multiplicity_range=mult_range,
                length_range=len_range,
            )
            qas.append(qa)

        # Add basic words to the vocabulary if we are applying BPE
        if use_bpe:
            tokenizer.apply_bpe([qa[0] for qa in qas])

        # Generate the full questions and examples
        basic_tokenizer = CountingTokenizer()
        if difficulty == "mixed":
            bpe_set = random.sample(range(len(qas)), len(qas) // 2)
        else:
            bpe_set = range(len(qas))
        for i in range(len(qas)):
            example = generate_counting_example(
                qas[i],
                tokenizer=tokenizer if i in bpe_set else basic_tokenizer
            )
            self.examples.append(example)
    
    def __len__(self):
        return len(self.examples)
    
    def __getitem__(self, idx):
        return self.examples[idx]

def collate_fn(batch, pad_id=0, max_len=60):
    """Pad sequences to same length"""
    # Pad tokens
    tokens = [ex['tokens'] for ex in batch]
    max_batch_len = min(max(len(t) for t in tokens), max_len)
    
    padded_tokens = []
    masks = []  # Loss mask: 1 for answer token, 0 elsewhere
    
    for ex in batch:
        seq = ex['tokens'][:max_batch_len]
        q_len = min(ex['question_length'], max_batch_len - 1)
        
        # Pad sequence
        padded = seq + [pad_id] * (max_batch_len - len(seq))
        padded_tokens.append(padded)
        
        # Create mask: only compute loss on answer token
        mask = [0] * max_batch_len
        if q_len < len(seq):  # If answer token exists
            mask[q_len] = 1  # Answer is right after question
        masks.append(mask)
    
    return {
        'input_ids': torch.tensor(padded_tokens, dtype=torch.long),
        'loss_mask': torch.tensor(masks, dtype=torch.float),
        'answers': torch.tensor([ex['answer'] for ex in batch], dtype=torch.long)
    }

# Create dataloaders
train_dataset_names = ["easy", "bpe-hard", "mult-hard", "length-hard", "all-hard", "mixed"]

# Training dataloaders
train_datasets = {}
train_loaders = {}
train_tokenizers = {}
for name in train_dataset_names:
    if GENERATE_NEW:
        train_tokenizers[name] = CountingTokenizer()
        train_datasets[name] = CountingDataset(n_examples=20000, difficulty=name, tokenizer=train_tokenizers[name])
        with open(f"train-{name}-dataset.pkl", "wb") as f:
            pickle.dump(train_datasets[name], f)
        with open(f"train-{name}-tokenizer.pkl", "wb") as f:
            pickle.dump(train_tokenizers[name], f)
    else:
        with open(f"train-{name}-dataset.pkl", "rb") as f:
            train_datasets[name] = pickle.load(f)
        with open(f"train-{name}-tokenizer.pkl", "rb") as f:
            train_tokenizers[name] = pickle.load(f)
            print(train_tokenizers[name].vocab.size)
    train_loaders[name] = DataLoader(train_datasets[name], batch_size=64, shuffle=True, collate_fn=collate_fn)

# Testing dataloaders
test_dataset_names = ["easy", "bpe-hard", "mult-hard", "length-hard", "all-hard"]
test_datasets = {}
test_tokenizers = {}
for name in test_dataset_names:
    if GENERATE_NEW:
        test_tokenizers[name] = CountingTokenizer()
        test_datasets[name] = CountingDataset(n_examples=2000, difficulty=name, tokenizer=test_tokenizers[name])
        with open(f"test-{name}-dataset.pkl", "wb") as f:
            pickle.dump(test_datasets[name], f)
        with open(f"test-{name}-tokenizer.pkl", "wb") as f:
            pickle.dump(test_tokenizers[name], f)
    else:
        with open(f"test-{name}-dataset.pkl", "rb") as f:
            test_datasets[name] = pickle.load(f)
        with open(f"test-{name}-tokenizer.pkl", "rb") as f:
            test_tokenizers[name] = pickle.load(f)

# Test batch
batch = next(iter(train_loaders["bpe-hard"]))
print(f"Input shape: {batch['input_ids'].shape}")
print(f"Mask shape: {batch['loss_mask'].shape}")
print(f"Example tokens: {batch['input_ids'][0]}")
print(f"Example mask: {batch['loss_mask'][0]}")

45
3004
45
45
3588
3785
Input shape: torch.Size([64, 17])
Mask shape: torch.Size([64, 17])
Example tokens: tensor([  5,   4,   6,   4,   7,   4,  15,   4,   8,   3,   4, 534, 196, 117,
         36,   0,   0])
Example mask: tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0.])


In [8]:
import torch.optim as optim
from tqdm import tqdm

RETRAIN = True # IMPORTANT: Only toggle to True if you want to actually train new models

def train_model(model, train_loader, n_epochs=10, lr=1e-3, device='cuda'):
    """
    Train with MASKED loss (only on answer token)
    
    Classification loss: CrossEntropy on vocabulary
    (Can also try regression loss on digit value)
    """
    model = model.to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=n_epochs)
    
    # Classification loss (predict token ID)
    criterion = nn.CrossEntropyLoss(reduction='none')  # Per-token loss
    
    for epoch in range(n_epochs):
        model.train()
        total_loss = 0
        correct = 0
        total = 0
        
        pbar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{n_epochs}")
        for batch in pbar:
            input_ids = batch['input_ids'].to(device)  # [batch, seq_len]
            loss_mask = batch['loss_mask'].to(device)  # [batch, seq_len]
            
            # Forward pass
            # Input: all tokens except last
            # Target: all tokens except first (shifted by 1)
            logits = model(input_ids[:, :-1])  # [batch, seq_len-1, vocab]
            targets = input_ids[:, 1:]  # [batch, seq_len-1]
            
            # Compute loss
            loss_per_token = criterion(
                logits.reshape(-1, logits.size(-1)),  # [batch*(seq_len-1), vocab]
                targets.reshape(-1)  # [batch*(seq_len-1)]
            )
            loss_per_token = loss_per_token.reshape(targets.shape)  # [batch, seq_len-1]
            
            # Apply mask: only compute loss on answer token
            mask = loss_mask[:, 1:]  # Align with targets
            masked_loss = (loss_per_token * mask).sum() / mask.sum()
            
            # Backward pass
            optimizer.zero_grad()
            masked_loss.backward()
            optimizer.step()
            
            # Metrics
            total_loss += masked_loss.item()
            
            # Accuracy: check if predicted answer digit is correct
            preds = logits.argmax(dim=-1)  # [batch, seq_len-1]
            answer_positions = mask.bool()
            if answer_positions.any():
                correct += (preds[answer_positions] == targets[answer_positions]).sum().item()
                total += answer_positions.sum().item()
            
            pbar.set_postfix({'loss': masked_loss.item(), 
                            'acc': correct/total if total > 0 else 0})
        
        scheduler.step()
        
        print(f"Epoch {epoch+1}: Loss={total_loss/len(train_loader):.4f}, "
              f"Acc={correct/total:.4f}")
        
        # Save checkpoint
        if (epoch + 1) % 25 == 0:
            torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
            }, f'checkpoint_epoch_{epoch+1}.pt')
    
    return model

# Train!
models = {}
for name in train_dataset_names:
    config = HookedTransformerConfig(
        n_layers=2,
        n_heads=8,
        d_model=128,
        d_head=16,  # d_model / n_heads
        d_mlp=None,  # No MLPs (attention-only)
        act_fn=None,  # No activation (no MLPs)
        attention_dir="causal",  # Causal attention
        attn_only=True,  # Attention-only model
        normalization_type=None,  # No LayerNorm for simplicity
        d_vocab=train_tokenizers[name].vocab.size + 5,  # Total vocab size for particular tokenizer
        n_ctx=60,  # Max sequence length
        init_weights=True,
        device="cuda" if torch.cuda.is_available() else "cpu"
    )
    print(f"Starting to train model for {name} with vocab size {train_tokenizers[name].vocab.size}")
    model = HookedTransformer(config)
    models[name] = train_model(model, train_loaders[name], n_epochs=100, lr=1e-3)

Starting to train model for easy with vocab size 45
Moving model to device:  cuda


Epoch 1/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 263.68it/s, loss=0.682, acc=0.494]


Epoch 1: Loss=0.7863, Acc=0.4941


Epoch 2/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 320.83it/s, loss=0.701, acc=0.506]


Epoch 2: Loss=0.6985, Acc=0.5065


Epoch 3/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.60it/s, loss=0.686, acc=0.51]


Epoch 3: Loss=0.6963, Acc=0.5098


Epoch 4/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 323.40it/s, loss=0.681, acc=0.509]


Epoch 4: Loss=0.6960, Acc=0.5095


Epoch 5/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 338.94it/s, loss=0.652, acc=0.534]


Epoch 5: Loss=0.6909, Acc=0.5343


Epoch 6/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 328.91it/s, loss=0.662, acc=0.572]


Epoch 6: Loss=0.6792, Acc=0.5716


Epoch 7/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 335.35it/s, loss=0.525, acc=0.63]


Epoch 7: Loss=0.6372, Acc=0.6301


Epoch 8/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 335.45it/s, loss=0.425, acc=0.836]


Epoch 8: Loss=0.3594, Acc=0.8365


Epoch 9/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 328.61it/s, loss=0.002, acc=0.969]


Epoch 9: Loss=0.0931, Acc=0.9688


Epoch 10/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 319.38it/s, loss=0.0261, acc=0.979]


Epoch 10: Loss=0.0630, Acc=0.9794


Epoch 11/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 317.98it/s, loss=0.0105, acc=0.995]


Epoch 11: Loss=0.0211, Acc=0.9946


Epoch 12/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 325.85it/s, loss=0.000596, acc=0.994]


Epoch 12: Loss=0.0194, Acc=0.9942


Epoch 13/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 324.74it/s, loss=0.00166, acc=0.997]


Epoch 13: Loss=0.0131, Acc=0.9966


Epoch 14/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 332.70it/s, loss=0.00438, acc=0.992]


Epoch 14: Loss=0.0264, Acc=0.9923


Epoch 15/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 341.58it/s, loss=9.85e-5, acc=1]


Epoch 15: Loss=0.0013, Acc=1.0000


Epoch 16/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 335.40it/s, loss=0.00012, acc=1]


Epoch 16: Loss=0.0006, Acc=1.0000


Epoch 17/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 331.86it/s, loss=0.0003, acc=1]


Epoch 17: Loss=0.0003, Acc=1.0000


Epoch 18/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.97it/s, loss=1.27e-5, acc=1]


Epoch 18: Loss=0.0002, Acc=1.0000


Epoch 19/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.85it/s, loss=5.12e-5, acc=1]


Epoch 19: Loss=0.0001, Acc=1.0000


Epoch 20/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 332.08it/s, loss=0.000143, acc=1]


Epoch 20: Loss=0.0001, Acc=1.0000


Epoch 21/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 337.53it/s, loss=0.000169, acc=1]


Epoch 21: Loss=0.0001, Acc=1.0000


Epoch 22/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.06it/s, loss=3.47e-5, acc=1]


Epoch 22: Loss=0.0001, Acc=1.0000


Epoch 23/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.21it/s, loss=2.31e-5, acc=1]


Epoch 23: Loss=0.0000, Acc=1.0000


Epoch 24/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.41it/s, loss=9.02e-5, acc=1]


Epoch 24: Loss=0.0000, Acc=1.0000


Epoch 25/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.50it/s, loss=4.66e-7, acc=1]


Epoch 25: Loss=0.0000, Acc=1.0000


Epoch 26/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.08it/s, loss=5.48e-5, acc=1]


Epoch 26: Loss=0.0000, Acc=1.0000


Epoch 27/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.56it/s, loss=3.41e-5, acc=1]


Epoch 27: Loss=0.0000, Acc=1.0000


Epoch 28/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.98it/s, loss=4.95e-5, acc=1]


Epoch 28: Loss=0.0000, Acc=1.0000


Epoch 29/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.24it/s, loss=1.13e-5, acc=1]


Epoch 29: Loss=0.0000, Acc=1.0000


Epoch 30/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 332.11it/s, loss=7.36e-6, acc=1]


Epoch 30: Loss=0.0000, Acc=1.0000


Epoch 31/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 315.80it/s, loss=0.000936, acc=0.965]


Epoch 31: Loss=0.1179, Acc=0.9648


Epoch 32/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 325.69it/s, loss=0.000667, acc=1]


Epoch 32: Loss=0.0023, Acc=0.9997


Epoch 33/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 336.99it/s, loss=0.00978, acc=0.995]


Epoch 33: Loss=0.0184, Acc=0.9950


Epoch 34/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 333.07it/s, loss=0.000325, acc=0.997]


Epoch 34: Loss=0.0112, Acc=0.9972


Epoch 35/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 328.62it/s, loss=0.0133, acc=0.995]


Epoch 35: Loss=0.0196, Acc=0.9951


Epoch 36/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 335.75it/s, loss=0.00254, acc=0.996]


Epoch 36: Loss=0.0184, Acc=0.9962


Epoch 37/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 327.44it/s, loss=0.000729, acc=0.998]


Epoch 37: Loss=0.0087, Acc=0.9977


Epoch 38/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 324.79it/s, loss=0.000489, acc=0.999]


Epoch 38: Loss=0.0040, Acc=0.9993


Epoch 39/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 330.85it/s, loss=0.000142, acc=1]


Epoch 39: Loss=0.0004, Acc=1.0000


Epoch 40/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 335.25it/s, loss=0.000171, acc=1]


Epoch 40: Loss=0.0002, Acc=1.0000


Epoch 41/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 333.24it/s, loss=0.000327, acc=1]


Epoch 41: Loss=0.0001, Acc=1.0000


Epoch 42/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 335.22it/s, loss=0.0001, acc=1]


Epoch 42: Loss=0.0001, Acc=1.0000


Epoch 43/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 318.18it/s, loss=5.56e-5, acc=1]


Epoch 43: Loss=0.0001, Acc=1.0000


Epoch 44/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.56it/s, loss=2.33e-5, acc=1]


Epoch 44: Loss=0.0001, Acc=1.0000


Epoch 45/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.98it/s, loss=7.49e-6, acc=1]


Epoch 45: Loss=0.0000, Acc=1.0000


Epoch 46/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.02it/s, loss=2.78e-5, acc=1]


Epoch 46: Loss=0.0000, Acc=1.0000


Epoch 47/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.91it/s, loss=1.58e-5, acc=1]


Epoch 47: Loss=0.0000, Acc=1.0000


Epoch 48/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.82it/s, loss=4.59e-5, acc=1]


Epoch 48: Loss=0.0000, Acc=1.0000


Epoch 49/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.68it/s, loss=4.38e-5, acc=1]


Epoch 49: Loss=0.0000, Acc=1.0000


Epoch 50/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 320.83it/s, loss=8.23e-7, acc=1]


Epoch 50: Loss=0.0000, Acc=1.0000


Epoch 51/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 320.90it/s, loss=2.91e-6, acc=1]


Epoch 51: Loss=0.0000, Acc=1.0000


Epoch 52/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.54it/s, loss=9.51e-6, acc=1]


Epoch 52: Loss=0.0000, Acc=1.0000


Epoch 53/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 339.15it/s, loss=2.47e-5, acc=1]


Epoch 53: Loss=0.0000, Acc=1.0000


Epoch 54/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.27it/s, loss=5.33e-6, acc=1]


Epoch 54: Loss=0.0000, Acc=1.0000


Epoch 55/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.29it/s, loss=4.47e-7, acc=1]


Epoch 55: Loss=0.0000, Acc=1.0000


Epoch 56/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 319.61it/s, loss=1.22e-6, acc=1]


Epoch 56: Loss=0.0000, Acc=1.0000


Epoch 57/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.26it/s, loss=1.83e-5, acc=1]


Epoch 57: Loss=0.0000, Acc=1.0000


Epoch 58/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.27it/s, loss=1.37e-5, acc=1]


Epoch 58: Loss=0.0000, Acc=1.0000


Epoch 59/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.05it/s, loss=4.47e-7, acc=1]


Epoch 59: Loss=0.0000, Acc=1.0000


Epoch 60/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 332.23it/s, loss=2.05e-7, acc=1]


Epoch 60: Loss=0.0000, Acc=1.0000


Epoch 61/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.14it/s, loss=6.71e-7, acc=1]


Epoch 61: Loss=0.0000, Acc=1.0000


Epoch 62/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.91it/s, loss=5.52e-6, acc=1]


Epoch 62: Loss=0.0000, Acc=1.0000


Epoch 63/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 325.58it/s, loss=2.72e-6, acc=1]


Epoch 63: Loss=0.0000, Acc=1.0000


Epoch 64/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 325.62it/s, loss=2.94e-7, acc=1]


Epoch 64: Loss=0.0000, Acc=1.0000


Epoch 65/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.50it/s, loss=2.24e-7, acc=1]


Epoch 65: Loss=0.0000, Acc=1.0000


Epoch 66/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.65it/s, loss=1.86e-6, acc=1]


Epoch 66: Loss=0.0000, Acc=1.0000


Epoch 67/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.68it/s, loss=7.45e-6, acc=1]


Epoch 67: Loss=0.0000, Acc=1.0000


Epoch 68/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.81it/s, loss=1.79e-7, acc=1]


Epoch 68: Loss=0.0000, Acc=1.0000


Epoch 69/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.94it/s, loss=1.19e-6, acc=1]


Epoch 69: Loss=0.0000, Acc=1.0000


Epoch 70/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.60it/s, loss=6.67e-7, acc=1]


Epoch 70: Loss=0.0000, Acc=1.0000


Epoch 71/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.18it/s, loss=6.15e-7, acc=1]


Epoch 71: Loss=0.0000, Acc=1.0000


Epoch 72/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.71it/s, loss=8.68e-7, acc=1]


Epoch 72: Loss=0.0000, Acc=1.0000


Epoch 73/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 341.59it/s, loss=4.66e-7, acc=1]


Epoch 73: Loss=0.0000, Acc=1.0000


Epoch 74/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 337.68it/s, loss=4.84e-8, acc=1]


Epoch 74: Loss=0.0000, Acc=1.0000


Epoch 75/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 333.08it/s, loss=9.24e-7, acc=1]


Epoch 75: Loss=0.0000, Acc=1.0000


Epoch 76/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.00it/s, loss=3.35e-8, acc=1]


Epoch 76: Loss=0.0000, Acc=1.0000


Epoch 77/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 319.70it/s, loss=1.83e-7, acc=1]


Epoch 77: Loss=0.0000, Acc=1.0000


Epoch 78/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 319.12it/s, loss=1.12e-8, acc=1]


Epoch 78: Loss=0.0000, Acc=1.0000


Epoch 79/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.54it/s, loss=2.98e-8, acc=1]


Epoch 79: Loss=0.0000, Acc=1.0000


Epoch 80/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 339.90it/s, loss=6.33e-8, acc=1]


Epoch 80: Loss=0.0000, Acc=1.0000


Epoch 81/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.26it/s, loss=6.71e-8, acc=1]


Epoch 81: Loss=0.0000, Acc=1.0000


Epoch 82/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.53it/s, loss=2.76e-7, acc=1]


Epoch 82: Loss=0.0000, Acc=1.0000


Epoch 83/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.56it/s, loss=1.12e-8, acc=1]


Epoch 83: Loss=0.0000, Acc=1.0000


Epoch 84/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.03it/s, loss=1.49e-8, acc=1]


Epoch 84: Loss=0.0000, Acc=1.0000


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.61it/s, loss=2.24e-8, acc=1]


Epoch 85: Loss=0.0000, Acc=1.0000


Epoch 86/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 316.55it/s, loss=2.5e-7, acc=1]


Epoch 86: Loss=0.0000, Acc=1.0000


Epoch 87/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.60it/s, loss=1.49e-8, acc=1]


Epoch 87: Loss=0.0000, Acc=1.0000


Epoch 88/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.83it/s, loss=1.53e-7, acc=1]


Epoch 88: Loss=0.0000, Acc=1.0000


Epoch 89/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.26it/s, loss=8.94e-8, acc=1]


Epoch 89: Loss=0.0000, Acc=1.0000


Epoch 90/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.69it/s, loss=7.45e-9, acc=1]


Epoch 90: Loss=0.0000, Acc=1.0000


Epoch 91/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.33it/s, loss=1.56e-7, acc=1]


Epoch 91: Loss=0.0000, Acc=1.0000


Epoch 92/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.02it/s, loss=1.86e-8, acc=1]


Epoch 92: Loss=0.0000, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.79it/s, loss=1.23e-7, acc=1]


Epoch 93: Loss=0.0000, Acc=1.0000


Epoch 94/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 318.70it/s, loss=1.49e-7, acc=1]


Epoch 94: Loss=0.0000, Acc=1.0000


Epoch 95/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.18it/s, loss=3.73e-9, acc=1]


Epoch 95: Loss=0.0000, Acc=1.0000


Epoch 96/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.12it/s, loss=3.73e-9, acc=1]


Epoch 96: Loss=0.0000, Acc=1.0000


Epoch 97/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 334.26it/s, loss=3.73e-9, acc=1]


Epoch 97: Loss=0.0000, Acc=1.0000


Epoch 98/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 338.38it/s, loss=4.1e-8, acc=1]


Epoch 98: Loss=0.0000, Acc=1.0000


Epoch 99/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.23it/s, loss=1.86e-8, acc=1]


Epoch 99: Loss=0.0000, Acc=1.0000


Epoch 100/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 327.74it/s, loss=7.45e-9, acc=1]


Epoch 100: Loss=0.0000, Acc=1.0000
Starting to train model for bpe-hard with vocab size 3004
Moving model to device:  cuda


Epoch 1/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.14it/s, loss=0.72, acc=0.503]


Epoch 1: Loss=0.9507, Acc=0.5030


Epoch 2/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 324.64it/s, loss=0.687, acc=0.555]


Epoch 2: Loss=0.6836, Acc=0.5554


Epoch 3/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 338.33it/s, loss=0.632, acc=0.585]


Epoch 3: Loss=0.6588, Acc=0.5853


Epoch 4/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.39it/s, loss=0.65, acc=0.628]


Epoch 4: Loss=0.6194, Acc=0.6276


Epoch 5/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 333.78it/s, loss=0.642, acc=0.663]


Epoch 5: Loss=0.5802, Acc=0.6634


Epoch 6/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.58it/s, loss=0.521, acc=0.69]


Epoch 6: Loss=0.5434, Acc=0.6897


Epoch 7/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 329.67it/s, loss=0.619, acc=0.723]


Epoch 7: Loss=0.5029, Acc=0.7233


Epoch 8/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 335.61it/s, loss=0.598, acc=0.753]


Epoch 8: Loss=0.4556, Acc=0.7533


Epoch 9/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 325.35it/s, loss=0.402, acc=0.785]


Epoch 9: Loss=0.4134, Acc=0.7849


Epoch 10/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 330.42it/s, loss=0.497, acc=0.811]


Epoch 10: Loss=0.3690, Acc=0.8111


Epoch 11/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 335.18it/s, loss=0.315, acc=0.835]


Epoch 11: Loss=0.3305, Acc=0.8354


Epoch 12/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 336.35it/s, loss=0.119, acc=0.854]


Epoch 12: Loss=0.3047, Acc=0.8542


Epoch 13/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 330.79it/s, loss=0.286, acc=0.881]


Epoch 13: Loss=0.2591, Acc=0.8808


Epoch 14/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 337.87it/s, loss=0.233, acc=0.899]


Epoch 14: Loss=0.2244, Acc=0.8993


Epoch 15/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 337.97it/s, loss=0.486, acc=0.91]


Epoch 15: Loss=0.2011, Acc=0.9097


Epoch 16/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 334.63it/s, loss=0.102, acc=0.93]


Epoch 16: Loss=0.1642, Acc=0.9297


Epoch 17/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 333.28it/s, loss=0.269, acc=0.94]


Epoch 17: Loss=0.1424, Acc=0.9399


Epoch 18/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 338.61it/s, loss=0.272, acc=0.953]


Epoch 18: Loss=0.1160, Acc=0.9528


Epoch 19/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 335.18it/s, loss=0.211, acc=0.96]


Epoch 19: Loss=0.0992, Acc=0.9600


Epoch 20/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 336.28it/s, loss=0.0208, acc=0.966]


Epoch 20: Loss=0.0907, Acc=0.9658


Epoch 21/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 342.48it/s, loss=0.0105, acc=0.971]


Epoch 21: Loss=0.0787, Acc=0.9714


Epoch 22/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 338.64it/s, loss=0.0886, acc=0.976]


Epoch 22: Loss=0.0643, Acc=0.9763


Epoch 23/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 327.76it/s, loss=0.0162, acc=0.98]


Epoch 23: Loss=0.0556, Acc=0.9798


Epoch 24/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.11it/s, loss=0.12, acc=0.98]


Epoch 24: Loss=0.0552, Acc=0.9804


Epoch 25/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 334.37it/s, loss=0.00993, acc=0.986]


Epoch 25: Loss=0.0407, Acc=0.9858


Epoch 26/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 328.51it/s, loss=0.0356, acc=0.987]


Epoch 26: Loss=0.0391, Acc=0.9869


Epoch 27/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 334.07it/s, loss=0.0872, acc=0.988]


Epoch 27: Loss=0.0342, Acc=0.9885


Epoch 28/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 341.66it/s, loss=0.0391, acc=0.99]


Epoch 28: Loss=0.0290, Acc=0.9898


Epoch 29/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 337.22it/s, loss=0.00224, acc=0.987]


Epoch 29: Loss=0.0374, Acc=0.9872


Epoch 30/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 342.02it/s, loss=0.131, acc=0.991]


Epoch 30: Loss=0.0302, Acc=0.9905


Epoch 31/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 330.24it/s, loss=0.0261, acc=0.991]


Epoch 31: Loss=0.0280, Acc=0.9910


Epoch 32/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 333.73it/s, loss=0.0012, acc=0.992]


Epoch 32: Loss=0.0262, Acc=0.9916


Epoch 33/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 330.29it/s, loss=0.0224, acc=0.996]


Epoch 33: Loss=0.0151, Acc=0.9955


Epoch 34/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 340.63it/s, loss=0.0882, acc=0.996]


Epoch 34: Loss=0.0141, Acc=0.9957


Epoch 35/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 331.23it/s, loss=0.00255, acc=0.995]


Epoch 35: Loss=0.0169, Acc=0.9948


Epoch 36/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 340.97it/s, loss=0.122, acc=0.994]


Epoch 36: Loss=0.0182, Acc=0.9944


Epoch 37/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 329.14it/s, loss=0.0904, acc=0.992]


Epoch 37: Loss=0.0265, Acc=0.9921


Epoch 38/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 328.73it/s, loss=0.141, acc=0.996]


Epoch 38: Loss=0.0147, Acc=0.9959


Epoch 39/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 332.11it/s, loss=0.0383, acc=0.996]


Epoch 39: Loss=0.0137, Acc=0.9960


Epoch 40/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 343.84it/s, loss=0.00261, acc=0.996]


Epoch 40: Loss=0.0135, Acc=0.9959


Epoch 41/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 329.76it/s, loss=0.0302, acc=0.997]


Epoch 41: Loss=0.0103, Acc=0.9969


Epoch 42/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 326.10it/s, loss=0.0604, acc=0.996]


Epoch 42: Loss=0.0110, Acc=0.9961


Epoch 43/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 322.01it/s, loss=0.0971, acc=0.997]


Epoch 43: Loss=0.0105, Acc=0.9968


Epoch 44/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 324.84it/s, loss=0.146, acc=0.999]


Epoch 44: Loss=0.0043, Acc=0.9988


Epoch 45/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 327.57it/s, loss=7.86e-5, acc=0.998]


Epoch 45: Loss=0.0065, Acc=0.9978


Epoch 46/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 324.81it/s, loss=0.000221, acc=0.998]


Epoch 46: Loss=0.0072, Acc=0.9983


Epoch 47/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 331.40it/s, loss=6.65e-5, acc=0.999]


Epoch 47: Loss=0.0047, Acc=0.9988


Epoch 48/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 328.53it/s, loss=0.143, acc=0.999]


Epoch 48: Loss=0.0058, Acc=0.9986


Epoch 49/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 327.11it/s, loss=0.000359, acc=0.999]


Epoch 49: Loss=0.0044, Acc=0.9987


Epoch 50/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 330.30it/s, loss=0.000405, acc=0.997]


Epoch 50: Loss=0.0086, Acc=0.9970


Epoch 51/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 331.62it/s, loss=0.00272, acc=0.998]


Epoch 51: Loss=0.0053, Acc=0.9980


Epoch 52/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 330.13it/s, loss=7.45e-5, acc=0.999]


Epoch 52: Loss=0.0032, Acc=0.9988


Epoch 53/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.82it/s, loss=7.85e-6, acc=1]


Epoch 53: Loss=0.0014, Acc=0.9997


Epoch 54/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 327.49it/s, loss=0.000625, acc=0.999]


Epoch 54: Loss=0.0027, Acc=0.9993


Epoch 55/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 317.89it/s, loss=4.37e-6, acc=0.999]


Epoch 55: Loss=0.0018, Acc=0.9992


Epoch 56/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 331.94it/s, loss=0.0237, acc=0.999]


Epoch 56: Loss=0.0029, Acc=0.9991


Epoch 57/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 329.67it/s, loss=0.00344, acc=0.998]


Epoch 57: Loss=0.0080, Acc=0.9980


Epoch 58/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.88it/s, loss=6.59e-5, acc=1]


Epoch 58: Loss=0.0005, Acc=1.0000


Epoch 59/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 341.69it/s, loss=1.43e-6, acc=1]


Epoch 59: Loss=0.0001, Acc=1.0000


Epoch 60/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 334.11it/s, loss=2.19e-5, acc=1]


Epoch 60: Loss=0.0000, Acc=1.0000


Epoch 61/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 338.95it/s, loss=1.14e-6, acc=1]


Epoch 61: Loss=0.0000, Acc=1.0000


Epoch 62/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.61it/s, loss=7.44e-5, acc=1]


Epoch 62: Loss=0.0000, Acc=1.0000


Epoch 63/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 336.36it/s, loss=1.17e-5, acc=1]


Epoch 63: Loss=0.0000, Acc=1.0000


Epoch 64/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.10it/s, loss=2.07e-6, acc=1]


Epoch 64: Loss=0.0000, Acc=1.0000


Epoch 65/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.81it/s, loss=1.49e-6, acc=1]


Epoch 65: Loss=0.0000, Acc=1.0000


Epoch 66/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.84it/s, loss=9.16e-7, acc=1]


Epoch 66: Loss=0.0000, Acc=1.0000


Epoch 67/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.44it/s, loss=2.92e-6, acc=1]


Epoch 67: Loss=0.0000, Acc=1.0000


Epoch 68/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 326.60it/s, loss=2.2e-7, acc=1]


Epoch 68: Loss=0.0000, Acc=1.0000


Epoch 69/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.70it/s, loss=3.81e-6, acc=1]


Epoch 69: Loss=0.0000, Acc=1.0000


Epoch 70/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.18it/s, loss=1.08e-6, acc=1]


Epoch 70: Loss=0.0000, Acc=1.0000


Epoch 71/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 336.23it/s, loss=8.01e-7, acc=1]


Epoch 71: Loss=0.0000, Acc=1.0000


Epoch 72/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 332.93it/s, loss=1.53e-6, acc=1]


Epoch 72: Loss=0.0000, Acc=1.0000


Epoch 73/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 337.26it/s, loss=2.11e-5, acc=1]


Epoch 73: Loss=0.0000, Acc=1.0000


Epoch 74/100: 100%|███████████████████████████████████████| 313/313 [00:00<00:00, 332.85it/s, loss=0, acc=1]


Epoch 74: Loss=0.0000, Acc=1.0000


Epoch 75/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 334.71it/s, loss=4.1e-8, acc=1]


Epoch 75: Loss=0.0000, Acc=1.0000


Epoch 76/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 340.16it/s, loss=2.94e-7, acc=1]


Epoch 76: Loss=0.0000, Acc=1.0000


Epoch 77/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 332.18it/s, loss=1.23e-7, acc=1]


Epoch 77: Loss=0.0000, Acc=1.0000


Epoch 78/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.95it/s, loss=1.86e-8, acc=1]


Epoch 78: Loss=0.0000, Acc=1.0000


Epoch 79/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.72it/s, loss=3.48e-6, acc=1]


Epoch 79: Loss=0.0000, Acc=1.0000


Epoch 80/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.81it/s, loss=8.94e-8, acc=1]


Epoch 80: Loss=0.0000, Acc=1.0000


Epoch 81/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 335.27it/s, loss=1.49e-6, acc=1]


Epoch 81: Loss=0.0000, Acc=1.0000


Epoch 82/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 342.95it/s, loss=3.76e-7, acc=1]


Epoch 82: Loss=0.0000, Acc=1.0000


Epoch 83/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.33it/s, loss=9.69e-8, acc=1]


Epoch 83: Loss=0.0000, Acc=1.0000


Epoch 84/100: 100%|███████████████████████████████████████| 313/313 [00:00<00:00, 324.59it/s, loss=0, acc=1]


Epoch 84: Loss=0.0000, Acc=1.0000


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.68it/s, loss=2.98e-8, acc=1]


Epoch 85: Loss=0.0000, Acc=1.0000


Epoch 86/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 337.19it/s, loss=3.73e-9, acc=1]


Epoch 86: Loss=0.0000, Acc=1.0000


Epoch 87/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 333.69it/s, loss=7.82e-8, acc=1]


Epoch 87: Loss=0.0000, Acc=1.0000


Epoch 88/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 339.11it/s, loss=1.97e-7, acc=1]


Epoch 88: Loss=0.0000, Acc=1.0000


Epoch 89/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.67it/s, loss=3.76e-7, acc=1]


Epoch 89: Loss=0.0000, Acc=1.0000


Epoch 90/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 320.49it/s, loss=2.27e-7, acc=1]


Epoch 90: Loss=0.0000, Acc=1.0000


Epoch 91/100: 100%|███████████████████████████████████████| 313/313 [00:00<00:00, 326.77it/s, loss=0, acc=1]


Epoch 91: Loss=0.0000, Acc=1.0000


Epoch 92/100: 100%|███████████████████████████████████████| 313/313 [00:00<00:00, 331.57it/s, loss=0, acc=1]


Epoch 92: Loss=0.0000, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.06it/s, loss=2.98e-8, acc=1]


Epoch 93: Loss=0.0000, Acc=1.0000


Epoch 94/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.15it/s, loss=1.86e-8, acc=1]


Epoch 94: Loss=0.0000, Acc=1.0000


Epoch 95/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 322.48it/s, loss=6.78e-7, acc=1]


Epoch 95: Loss=0.0000, Acc=1.0000


Epoch 96/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 330.95it/s, loss=2.61e-7, acc=1]


Epoch 96: Loss=0.0000, Acc=1.0000


Epoch 97/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 326.04it/s, loss=2.05e-7, acc=1]


Epoch 97: Loss=0.0000, Acc=1.0000


Epoch 98/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.55it/s, loss=3.73e-9, acc=1]


Epoch 98: Loss=0.0000, Acc=1.0000


Epoch 99/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 331.24it/s, loss=2.61e-8, acc=1]


Epoch 99: Loss=0.0000, Acc=1.0000


Epoch 100/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 331.86it/s, loss=2.42e-7, acc=1]


Epoch 100: Loss=0.0000, Acc=1.0000
Starting to train model for mult-hard with vocab size 45
Moving model to device:  cuda


Epoch 1/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.33it/s, loss=1.92, acc=0.246]


Epoch 1: Loss=1.9153, Acc=0.2460


Epoch 2/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 326.88it/s, loss=1.6, acc=0.295]


Epoch 2: Loss=1.7292, Acc=0.2951


Epoch 3/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 324.56it/s, loss=0.888, acc=0.528]


Epoch 3: Loss=1.0994, Acc=0.5282


Epoch 4/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 334.69it/s, loss=0.487, acc=0.668]


Epoch 4: Loss=0.7612, Acc=0.6685


Epoch 5/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 320.97it/s, loss=0.519, acc=0.707]


Epoch 5: Loss=0.6697, Acc=0.7067


Epoch 6/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.36it/s, loss=0.733, acc=0.734]


Epoch 6: Loss=0.6108, Acc=0.7337


Epoch 7/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 319.17it/s, loss=0.666, acc=0.747]


Epoch 7: Loss=0.5772, Acc=0.7474


Epoch 8/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.95it/s, loss=0.565, acc=0.76]


Epoch 8: Loss=0.5562, Acc=0.7603


Epoch 9/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 332.61it/s, loss=0.564, acc=0.773]


Epoch 9: Loss=0.5212, Acc=0.7728


Epoch 10/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 333.74it/s, loss=0.717, acc=0.788]


Epoch 10: Loss=0.4960, Acc=0.7880


Epoch 11/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 323.17it/s, loss=0.4, acc=0.792]


Epoch 11: Loss=0.4767, Acc=0.7915


Epoch 12/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 316.92it/s, loss=0.79, acc=0.796]


Epoch 12: Loss=0.4685, Acc=0.7957


Epoch 13/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 333.58it/s, loss=0.669, acc=0.804]


Epoch 13: Loss=0.4521, Acc=0.8043


Epoch 14/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 313.37it/s, loss=0.384, acc=0.804]


Epoch 14: Loss=0.4548, Acc=0.8036


Epoch 15/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 328.13it/s, loss=0.253, acc=0.816]


Epoch 15: Loss=0.4318, Acc=0.8159


Epoch 16/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 325.04it/s, loss=0.293, acc=0.822]


Epoch 16: Loss=0.4172, Acc=0.8216


Epoch 17/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 330.64it/s, loss=0.685, acc=0.831]


Epoch 17: Loss=0.4005, Acc=0.8313


Epoch 18/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 333.82it/s, loss=0.74, acc=0.833]


Epoch 18: Loss=0.3893, Acc=0.8327


Epoch 19/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 324.94it/s, loss=0.278, acc=0.843]


Epoch 19: Loss=0.3730, Acc=0.8429


Epoch 20/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 319.10it/s, loss=0.473, acc=0.848]


Epoch 20: Loss=0.3637, Acc=0.8484


Epoch 21/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 327.31it/s, loss=0.253, acc=0.846]


Epoch 21: Loss=0.3565, Acc=0.8458


Epoch 22/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 326.43it/s, loss=0.239, acc=0.858]


Epoch 22: Loss=0.3382, Acc=0.8583


Epoch 23/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 323.54it/s, loss=0.35, acc=0.865]


Epoch 23: Loss=0.3224, Acc=0.8647


Epoch 24/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 328.00it/s, loss=0.313, acc=0.86]


Epoch 24: Loss=0.3327, Acc=0.8596


Epoch 25/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 323.77it/s, loss=0.177, acc=0.875]


Epoch 25: Loss=0.2969, Acc=0.8755


Epoch 26/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 307.24it/s, loss=0.353, acc=0.881]


Epoch 26: Loss=0.2816, Acc=0.8814


Epoch 27/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 326.72it/s, loss=0.269, acc=0.884]


Epoch 27: Loss=0.2735, Acc=0.8839


Epoch 28/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 326.72it/s, loss=0.256, acc=0.893]


Epoch 28: Loss=0.2566, Acc=0.8932


Epoch 29/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 327.58it/s, loss=0.172, acc=0.888]


Epoch 29: Loss=0.2712, Acc=0.8880


Epoch 30/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 334.23it/s, loss=0.241, acc=0.904]


Epoch 30: Loss=0.2350, Acc=0.9042


Epoch 31/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 322.41it/s, loss=0.139, acc=0.906]


Epoch 31: Loss=0.2203, Acc=0.9064


Epoch 32/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 335.98it/s, loss=0.18, acc=0.911]


Epoch 32: Loss=0.2152, Acc=0.9105


Epoch 33/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 315.00it/s, loss=0.174, acc=0.917]


Epoch 33: Loss=0.2054, Acc=0.9173


Epoch 34/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 313.74it/s, loss=0.0886, acc=0.924]


Epoch 34: Loss=0.1882, Acc=0.9243


Epoch 35/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 329.67it/s, loss=0.311, acc=0.928]


Epoch 35: Loss=0.1756, Acc=0.9283


Epoch 36/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 339.12it/s, loss=0.374, acc=0.932]


Epoch 36: Loss=0.1689, Acc=0.9315


Epoch 37/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 325.77it/s, loss=0.324, acc=0.934]


Epoch 37: Loss=0.1652, Acc=0.9344


Epoch 38/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 331.01it/s, loss=0.153, acc=0.937]


Epoch 38: Loss=0.1573, Acc=0.9366


Epoch 39/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 321.06it/s, loss=0.264, acc=0.946]


Epoch 39: Loss=0.1344, Acc=0.9462


Epoch 40/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 328.02it/s, loss=0.0625, acc=0.947]


Epoch 40: Loss=0.1378, Acc=0.9469


Epoch 41/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 318.48it/s, loss=0.145, acc=0.951]


Epoch 41: Loss=0.1255, Acc=0.9509


Epoch 42/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 324.18it/s, loss=0.169, acc=0.955]


Epoch 42: Loss=0.1159, Acc=0.9547


Epoch 43/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 320.87it/s, loss=0.125, acc=0.958]


Epoch 43: Loss=0.1074, Acc=0.9580


Epoch 44/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 333.82it/s, loss=0.049, acc=0.964]


Epoch 44: Loss=0.0934, Acc=0.9643


Epoch 45/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 322.77it/s, loss=0.06, acc=0.962]


Epoch 45: Loss=0.0983, Acc=0.9621


Epoch 46/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 316.23it/s, loss=0.0271, acc=0.967]


Epoch 46: Loss=0.0848, Acc=0.9670


Epoch 47/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 325.25it/s, loss=0.0432, acc=0.972]


Epoch 47: Loss=0.0739, Acc=0.9718


Epoch 48/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 334.05it/s, loss=0.085, acc=0.978]


Epoch 48: Loss=0.0610, Acc=0.9780


Epoch 49/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 334.91it/s, loss=0.00928, acc=0.973]


Epoch 49: Loss=0.0704, Acc=0.9734


Epoch 50/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 318.41it/s, loss=0.154, acc=0.972]


Epoch 50: Loss=0.0729, Acc=0.9723


Epoch 51/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 320.41it/s, loss=0.0376, acc=0.981]


Epoch 51: Loss=0.0523, Acc=0.9807


Epoch 52/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 331.71it/s, loss=0.0189, acc=0.98]


Epoch 52: Loss=0.0522, Acc=0.9798


Epoch 53/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 332.35it/s, loss=0.363, acc=0.978]


Epoch 53: Loss=0.0617, Acc=0.9778


Epoch 54/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.82it/s, loss=0.01, acc=0.989]


Epoch 54: Loss=0.0334, Acc=0.9889


Epoch 55/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 325.06it/s, loss=0.0686, acc=0.987]


Epoch 55: Loss=0.0360, Acc=0.9871


Epoch 56/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 329.89it/s, loss=0.136, acc=0.988]


Epoch 56: Loss=0.0321, Acc=0.9885


Epoch 57/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 337.17it/s, loss=0.0956, acc=0.983]


Epoch 57: Loss=0.0481, Acc=0.9830


Epoch 58/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 332.09it/s, loss=0.00251, acc=0.991]


Epoch 58: Loss=0.0272, Acc=0.9910


Epoch 59/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 324.11it/s, loss=0.183, acc=0.987]


Epoch 59: Loss=0.0357, Acc=0.9873


Epoch 60/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 325.71it/s, loss=0.0498, acc=0.992]


Epoch 60: Loss=0.0233, Acc=0.9921


Epoch 61/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 328.34it/s, loss=0.00174, acc=0.992]


Epoch 61: Loss=0.0228, Acc=0.9920


Epoch 62/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 330.37it/s, loss=0.00493, acc=0.995]


Epoch 62: Loss=0.0180, Acc=0.9946


Epoch 63/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 331.40it/s, loss=0.00224, acc=0.996]


Epoch 63: Loss=0.0137, Acc=0.9962


Epoch 64/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 330.30it/s, loss=0.000759, acc=0.995]


Epoch 64: Loss=0.0175, Acc=0.9948


Epoch 65/100: 100%|████████████████████████████| 313/313 [00:00<00:00, 328.36it/s, loss=0.000695, acc=0.998]


Epoch 65: Loss=0.0093, Acc=0.9979


Epoch 66/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 329.07it/s, loss=0.00434, acc=1]


Epoch 66: Loss=0.0039, Acc=0.9996


Epoch 67/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.15it/s, loss=0.000746, acc=1]


Epoch 67: Loss=0.0022, Acc=0.9999


Epoch 68/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 327.31it/s, loss=0.00298, acc=1]


Epoch 68: Loss=0.0014, Acc=1.0000


Epoch 69/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.21it/s, loss=0.00535, acc=1]


Epoch 69: Loss=0.0011, Acc=1.0000


Epoch 70/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.21it/s, loss=0.00181, acc=1]


Epoch 70: Loss=0.0010, Acc=1.0000


Epoch 71/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 336.22it/s, loss=0.000899, acc=1]


Epoch 71: Loss=0.0011, Acc=1.0000


Epoch 72/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.09it/s, loss=9.18e-5, acc=1]


Epoch 72: Loss=0.0009, Acc=1.0000


Epoch 73/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 323.48it/s, loss=0.159, acc=0.991]


Epoch 73: Loss=0.0268, Acc=0.9911


Epoch 74/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 320.42it/s, loss=0.00158, acc=0.994]


Epoch 74: Loss=0.0198, Acc=0.9936


Epoch 75/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 326.23it/s, loss=0.00786, acc=0.999]


Epoch 75: Loss=0.0044, Acc=0.9991


Epoch 76/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 324.76it/s, loss=0.00121, acc=1]


Epoch 76: Loss=0.0018, Acc=0.9998


Epoch 77/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 329.54it/s, loss=0.000405, acc=1]


Epoch 77: Loss=0.0014, Acc=1.0000


Epoch 78/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 328.26it/s, loss=0.000558, acc=1]


Epoch 78: Loss=0.0009, Acc=1.0000


Epoch 79/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 331.72it/s, loss=0.000468, acc=1]


Epoch 79: Loss=0.0007, Acc=1.0000


Epoch 80/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 322.80it/s, loss=0.00103, acc=1]


Epoch 80: Loss=0.0007, Acc=1.0000


Epoch 81/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 330.19it/s, loss=0.000186, acc=1]


Epoch 81: Loss=0.0006, Acc=1.0000


Epoch 82/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 331.06it/s, loss=0.000241, acc=1]


Epoch 82: Loss=0.0005, Acc=1.0000


Epoch 83/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.68it/s, loss=0.000559, acc=1]


Epoch 83: Loss=0.0005, Acc=1.0000


Epoch 84/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 325.76it/s, loss=0.00134, acc=1]


Epoch 84: Loss=0.0005, Acc=1.0000


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 325.63it/s, loss=0.00026, acc=1]


Epoch 85: Loss=0.0004, Acc=1.0000


Epoch 86/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.33it/s, loss=0.000665, acc=1]


Epoch 86: Loss=0.0004, Acc=1.0000


Epoch 87/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 315.76it/s, loss=0.000268, acc=1]


Epoch 87: Loss=0.0004, Acc=1.0000


Epoch 88/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 316.29it/s, loss=0.000534, acc=1]


Epoch 88: Loss=0.0004, Acc=1.0000


Epoch 89/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 329.34it/s, loss=0.000141, acc=1]


Epoch 89: Loss=0.0003, Acc=1.0000


Epoch 90/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 329.11it/s, loss=0.000158, acc=1]


Epoch 90: Loss=0.0003, Acc=1.0000


Epoch 91/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 329.58it/s, loss=0.000374, acc=1]


Epoch 91: Loss=0.0003, Acc=1.0000


Epoch 92/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 332.44it/s, loss=0.000759, acc=1]


Epoch 92: Loss=0.0003, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 322.78it/s, loss=8.01e-5, acc=1]


Epoch 93: Loss=0.0003, Acc=1.0000


Epoch 94/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 323.30it/s, loss=0.000107, acc=1]


Epoch 94: Loss=0.0003, Acc=1.0000


Epoch 95/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 319.50it/s, loss=0.000362, acc=1]


Epoch 95: Loss=0.0002, Acc=1.0000


Epoch 96/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 328.33it/s, loss=6.53e-5, acc=1]


Epoch 96: Loss=0.0002, Acc=1.0000


Epoch 97/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 321.10it/s, loss=0.000422, acc=1]


Epoch 97: Loss=0.0002, Acc=1.0000


Epoch 98/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 334.31it/s, loss=5.07e-5, acc=1]


Epoch 98: Loss=0.0002, Acc=1.0000


Epoch 99/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 331.52it/s, loss=0.000655, acc=1]


Epoch 99: Loss=0.0002, Acc=1.0000


Epoch 100/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 327.54it/s, loss=0.000237, acc=1]


Epoch 100: Loss=0.0002, Acc=1.0000
Starting to train model for length-hard with vocab size 45
Moving model to device:  cuda


Epoch 1/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 302.89it/s, loss=0.688, acc=0.466]


Epoch 1: Loss=1.0529, Acc=0.4660


Epoch 2/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 302.37it/s, loss=1.04, acc=0.472]


Epoch 2: Loss=0.9138, Acc=0.4721


Epoch 3/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 306.20it/s, loss=0.968, acc=0.474]


Epoch 3: Loss=0.9094, Acc=0.4736


Epoch 4/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.53it/s, loss=0.71, acc=0.483]


Epoch 4: Loss=0.9055, Acc=0.4825


Epoch 5/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 307.69it/s, loss=0.872, acc=0.474]


Epoch 5: Loss=0.9063, Acc=0.4743


Epoch 6/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 304.23it/s, loss=0.962, acc=0.476]


Epoch 6: Loss=0.9042, Acc=0.4763


Epoch 7/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 301.38it/s, loss=1.03, acc=0.477]


Epoch 7: Loss=0.9027, Acc=0.4770


Epoch 8/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 300.34it/s, loss=0.694, acc=0.474]


Epoch 8: Loss=0.8995, Acc=0.4745


Epoch 9/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 304.73it/s, loss=0.932, acc=0.48]


Epoch 9: Loss=0.8979, Acc=0.4803


Epoch 10/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.32it/s, loss=0.867, acc=0.477]


Epoch 10: Loss=0.8957, Acc=0.4773


Epoch 11/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 299.67it/s, loss=0.758, acc=0.479]


Epoch 11: Loss=0.8919, Acc=0.4794


Epoch 12/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 305.93it/s, loss=0.736, acc=0.479]


Epoch 12: Loss=0.8871, Acc=0.4794


Epoch 13/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.87it/s, loss=0.958, acc=0.482]


Epoch 13: Loss=0.8842, Acc=0.4819


Epoch 14/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 302.78it/s, loss=0.791, acc=0.477]


Epoch 14: Loss=0.8786, Acc=0.4773


Epoch 15/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 300.20it/s, loss=0.817, acc=0.489]


Epoch 15: Loss=0.8683, Acc=0.4885


Epoch 16/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.26it/s, loss=0.756, acc=0.494]


Epoch 16: Loss=0.8569, Acc=0.4941


Epoch 17/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 307.09it/s, loss=0.959, acc=0.495]


Epoch 17: Loss=0.8456, Acc=0.4951


Epoch 18/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.86it/s, loss=1.08, acc=0.51]


Epoch 18: Loss=0.8292, Acc=0.5097


Epoch 19/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 314.48it/s, loss=0.731, acc=0.515]


Epoch 19: Loss=0.8087, Acc=0.5149


Epoch 20/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 302.36it/s, loss=0.732, acc=0.532]


Epoch 20: Loss=0.7877, Acc=0.5322


Epoch 21/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.11it/s, loss=0.896, acc=0.542]


Epoch 21: Loss=0.7665, Acc=0.5417


Epoch 22/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 293.47it/s, loss=0.802, acc=0.554]


Epoch 22: Loss=0.7442, Acc=0.5540


Epoch 23/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 302.65it/s, loss=0.706, acc=0.561]


Epoch 23: Loss=0.7234, Acc=0.5612


Epoch 24/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 302.00it/s, loss=0.761, acc=0.585]


Epoch 24: Loss=0.7002, Acc=0.5850


Epoch 25/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 312.08it/s, loss=0.693, acc=0.598]


Epoch 25: Loss=0.6840, Acc=0.5978


Epoch 26/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 305.58it/s, loss=0.574, acc=0.614]


Epoch 26: Loss=0.6562, Acc=0.6143


Epoch 27/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 297.93it/s, loss=0.664, acc=0.632]


Epoch 27: Loss=0.6425, Acc=0.6318


Epoch 28/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 308.42it/s, loss=0.664, acc=0.646]


Epoch 28: Loss=0.6238, Acc=0.6458


Epoch 29/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 307.59it/s, loss=0.639, acc=0.662]


Epoch 29: Loss=0.6098, Acc=0.6615


Epoch 30/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 304.49it/s, loss=0.537, acc=0.677]


Epoch 30: Loss=0.5922, Acc=0.6772


Epoch 31/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 305.59it/s, loss=0.508, acc=0.694]


Epoch 31: Loss=0.5732, Acc=0.6942


Epoch 32/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 308.82it/s, loss=0.681, acc=0.713]


Epoch 32: Loss=0.5513, Acc=0.7133


Epoch 33/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.99it/s, loss=0.567, acc=0.735]


Epoch 33: Loss=0.5231, Acc=0.7348


Epoch 34/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 301.12it/s, loss=0.544, acc=0.752]


Epoch 34: Loss=0.4968, Acc=0.7519


Epoch 35/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 304.47it/s, loss=0.494, acc=0.774]


Epoch 35: Loss=0.4644, Acc=0.7742


Epoch 36/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 305.92it/s, loss=0.433, acc=0.791]


Epoch 36: Loss=0.4362, Acc=0.7914


Epoch 37/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 306.69it/s, loss=0.46, acc=0.813]


Epoch 37: Loss=0.4038, Acc=0.8134


Epoch 38/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 303.21it/s, loss=0.471, acc=0.831]


Epoch 38: Loss=0.3717, Acc=0.8306


Epoch 39/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 300.57it/s, loss=0.409, acc=0.849]


Epoch 39: Loss=0.3391, Acc=0.8492


Epoch 40/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 308.73it/s, loss=0.459, acc=0.865]


Epoch 40: Loss=0.3052, Acc=0.8655


Epoch 41/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 313.39it/s, loss=0.24, acc=0.889]


Epoch 41: Loss=0.2663, Acc=0.8889


Epoch 42/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 310.09it/s, loss=0.306, acc=0.897]


Epoch 42: Loss=0.2456, Acc=0.8965


Epoch 43/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 307.62it/s, loss=0.185, acc=0.916]


Epoch 43: Loss=0.2110, Acc=0.9157


Epoch 44/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 308.25it/s, loss=0.23, acc=0.928]


Epoch 44: Loss=0.1758, Acc=0.9279


Epoch 45/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 310.28it/s, loss=0.205, acc=0.939]


Epoch 45: Loss=0.1583, Acc=0.9394


Epoch 46/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 307.56it/s, loss=0.281, acc=0.947]


Epoch 46: Loss=0.1380, Acc=0.9473


Epoch 47/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 309.70it/s, loss=0.202, acc=0.949]


Epoch 47: Loss=0.1282, Acc=0.9488


Epoch 48/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 310.55it/s, loss=0.162, acc=0.959]


Epoch 48: Loss=0.1073, Acc=0.9591


Epoch 49/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 312.46it/s, loss=0.095, acc=0.968]


Epoch 49: Loss=0.0884, Acc=0.9679


Epoch 50/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 303.29it/s, loss=0.286, acc=0.97]


Epoch 50: Loss=0.0840, Acc=0.9695


Epoch 51/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 304.89it/s, loss=0.0373, acc=0.978]


Epoch 51: Loss=0.0616, Acc=0.9784


Epoch 52/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 309.71it/s, loss=0.0787, acc=0.977]


Epoch 52: Loss=0.0647, Acc=0.9775


Epoch 53/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 307.03it/s, loss=0.0999, acc=0.982]


Epoch 53: Loss=0.0519, Acc=0.9820


Epoch 54/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 299.69it/s, loss=0.0308, acc=0.986]


Epoch 54: Loss=0.0440, Acc=0.9856


Epoch 55/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 297.31it/s, loss=0.0232, acc=0.982]


Epoch 55: Loss=0.0507, Acc=0.9822


Epoch 56/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 301.66it/s, loss=0.0412, acc=0.985]


Epoch 56: Loss=0.0447, Acc=0.9852


Epoch 57/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 306.42it/s, loss=0.00329, acc=0.996]


Epoch 57: Loss=0.0170, Acc=0.9956


Epoch 58/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 304.93it/s, loss=0.00666, acc=0.998]


Epoch 58: Loss=0.0105, Acc=0.9980


Epoch 59/100: 100%|███████████████████████████████████| 313/313 [00:01<00:00, 305.93it/s, loss=0.007, acc=1]


Epoch 59: Loss=0.0039, Acc=0.9996


Epoch 60/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.87it/s, loss=0.00072, acc=1]


Epoch 60: Loss=0.0013, Acc=1.0000


Epoch 61/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 302.79it/s, loss=0.00034, acc=1]


Epoch 61: Loss=0.0009, Acc=1.0000


Epoch 62/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 302.92it/s, loss=0.000423, acc=1]


Epoch 62: Loss=0.0007, Acc=1.0000


Epoch 63/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 308.53it/s, loss=0.00102, acc=1]


Epoch 63: Loss=0.0006, Acc=1.0000


Epoch 64/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 301.63it/s, loss=0.000334, acc=1]


Epoch 64: Loss=0.0006, Acc=1.0000


Epoch 65/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 298.52it/s, loss=0.000794, acc=1]


Epoch 65: Loss=0.0005, Acc=1.0000


Epoch 66/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 312.65it/s, loss=0.000114, acc=1]


Epoch 66: Loss=0.0004, Acc=1.0000


Epoch 67/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 309.15it/s, loss=0.000279, acc=1]


Epoch 67: Loss=0.0004, Acc=1.0000


Epoch 68/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 314.04it/s, loss=0.000228, acc=1]


Epoch 68: Loss=0.0003, Acc=1.0000


Epoch 69/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.48it/s, loss=0.00022, acc=1]


Epoch 69: Loss=0.0003, Acc=1.0000


Epoch 70/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 313.77it/s, loss=0.000199, acc=1]


Epoch 70: Loss=0.0003, Acc=1.0000


Epoch 71/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.53it/s, loss=0.00025, acc=1]


Epoch 71: Loss=0.0002, Acc=1.0000


Epoch 72/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 303.39it/s, loss=0.000153, acc=1]


Epoch 72: Loss=0.0002, Acc=1.0000


Epoch 73/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 303.65it/s, loss=0.000324, acc=1]


Epoch 73: Loss=0.0002, Acc=1.0000


Epoch 74/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 299.92it/s, loss=0.000216, acc=1]


Epoch 74: Loss=0.0002, Acc=1.0000


Epoch 75/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 299.28it/s, loss=5.36e-5, acc=1]


Epoch 75: Loss=0.0002, Acc=1.0000


Epoch 76/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.21it/s, loss=9.92e-5, acc=1]


Epoch 76: Loss=0.0001, Acc=1.0000


Epoch 77/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 310.09it/s, loss=0.000145, acc=1]


Epoch 77: Loss=0.0001, Acc=1.0000


Epoch 78/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.68it/s, loss=9.84e-5, acc=1]


Epoch 78: Loss=0.0001, Acc=1.0000


Epoch 79/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 304.56it/s, loss=0.000219, acc=1]


Epoch 79: Loss=0.0001, Acc=1.0000


Epoch 80/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 307.46it/s, loss=9.12e-5, acc=1]


Epoch 80: Loss=0.0001, Acc=1.0000


Epoch 81/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.58it/s, loss=9.23e-5, acc=1]


Epoch 81: Loss=0.0001, Acc=1.0000


Epoch 82/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 296.02it/s, loss=0.000121, acc=1]


Epoch 82: Loss=0.0001, Acc=1.0000


Epoch 83/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.31it/s, loss=2.51e-5, acc=1]


Epoch 83: Loss=0.0001, Acc=1.0000


Epoch 84/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.92it/s, loss=2.95e-5, acc=1]


Epoch 84: Loss=0.0001, Acc=1.0000


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 310.69it/s, loss=2.96e-5, acc=1]


Epoch 85: Loss=0.0001, Acc=1.0000


Epoch 86/100: 100%|████████████████████████████████████| 313/313 [00:00<00:00, 314.45it/s, loss=8e-5, acc=1]


Epoch 86: Loss=0.0001, Acc=1.0000


Epoch 87/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 313.65it/s, loss=5.96e-5, acc=1]


Epoch 87: Loss=0.0000, Acc=1.0000


Epoch 88/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 313.73it/s, loss=4.56e-5, acc=1]


Epoch 88: Loss=0.0000, Acc=1.0000


Epoch 89/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 307.21it/s, loss=1.61e-5, acc=1]


Epoch 89: Loss=0.0000, Acc=1.0000


Epoch 90/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.45it/s, loss=4.55e-5, acc=1]


Epoch 90: Loss=0.0000, Acc=1.0000


Epoch 91/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.91it/s, loss=4.44e-5, acc=1]


Epoch 91: Loss=0.0000, Acc=1.0000


Epoch 92/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.70it/s, loss=9.98e-6, acc=1]


Epoch 92: Loss=0.0000, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.82it/s, loss=2.52e-5, acc=1]


Epoch 93: Loss=0.0000, Acc=1.0000


Epoch 94/100: 100%|██████████████████████████████████| 313/313 [00:00<00:00, 313.54it/s, loss=1.7e-5, acc=1]


Epoch 94: Loss=0.0000, Acc=1.0000


Epoch 95/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 311.69it/s, loss=3.24e-5, acc=1]


Epoch 95: Loss=0.0000, Acc=1.0000


Epoch 96/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.02it/s, loss=1.65e-5, acc=1]


Epoch 96: Loss=0.0000, Acc=1.0000


Epoch 97/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.67it/s, loss=1.84e-5, acc=1]


Epoch 97: Loss=0.0000, Acc=1.0000


Epoch 98/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 308.99it/s, loss=3.58e-5, acc=1]


Epoch 98: Loss=0.0000, Acc=1.0000


Epoch 99/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 304.43it/s, loss=1.76e-5, acc=1]


Epoch 99: Loss=0.0000, Acc=1.0000


Epoch 100/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 294.74it/s, loss=1.44e-5, acc=1]


Epoch 100: Loss=0.0000, Acc=1.0000
Starting to train model for all-hard with vocab size 3588
Moving model to device:  cuda


Epoch 1/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 312.39it/s, loss=2.1, acc=0.133]


Epoch 1: Loss=2.3575, Acc=0.1331


Epoch 2/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 303.74it/s, loss=1.87, acc=0.189]


Epoch 2: Loss=2.0211, Acc=0.1895


Epoch 3/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 310.57it/s, loss=1.83, acc=0.26]


Epoch 3: Loss=1.8965, Acc=0.2596


Epoch 4/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 311.43it/s, loss=1.86, acc=0.325]


Epoch 4: Loss=1.7673, Acc=0.3246


Epoch 5/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.55it/s, loss=1.43, acc=0.404]


Epoch 5: Loss=1.5724, Acc=0.4037


Epoch 6/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 311.34it/s, loss=1.61, acc=0.479]


Epoch 6: Loss=1.3931, Acc=0.4792


Epoch 7/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 308.57it/s, loss=1.58, acc=0.539]


Epoch 7: Loss=1.2403, Acc=0.5394


Epoch 8/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 316.23it/s, loss=1.37, acc=0.591]


Epoch 8: Loss=1.1118, Acc=0.5907


Epoch 9/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.83it/s, loss=1.35, acc=0.634]


Epoch 9: Loss=0.9941, Acc=0.6344


Epoch 10/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 300.67it/s, loss=1.24, acc=0.682]


Epoch 10: Loss=0.8742, Acc=0.6822


Epoch 11/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 315.41it/s, loss=1.01, acc=0.726]


Epoch 11: Loss=0.7550, Acc=0.7264


Epoch 12/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 313.73it/s, loss=0.759, acc=0.768]


Epoch 12: Loss=0.6458, Acc=0.7684


Epoch 13/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 323.07it/s, loss=0.844, acc=0.814]


Epoch 13: Loss=0.5279, Acc=0.8145


Epoch 14/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 319.97it/s, loss=0.622, acc=0.848]


Epoch 14: Loss=0.4347, Acc=0.8478


Epoch 15/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 310.80it/s, loss=0.437, acc=0.88]


Epoch 15: Loss=0.3443, Acc=0.8800


Epoch 16/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 318.30it/s, loss=0.14, acc=0.904]


Epoch 16: Loss=0.2814, Acc=0.9042


Epoch 17/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 318.37it/s, loss=0.253, acc=0.926]


Epoch 17: Loss=0.2264, Acc=0.9262


Epoch 18/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 308.56it/s, loss=0.277, acc=0.94]


Epoch 18: Loss=0.1862, Acc=0.9402


Epoch 19/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 319.09it/s, loss=0.344, acc=0.947]


Epoch 19: Loss=0.1667, Acc=0.9470


Epoch 20/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 318.67it/s, loss=0.278, acc=0.955]


Epoch 20: Loss=0.1398, Acc=0.9548


Epoch 21/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 311.61it/s, loss=0.102, acc=0.956]


Epoch 21: Loss=0.1342, Acc=0.9562


Epoch 22/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 316.82it/s, loss=0.0726, acc=0.968]


Epoch 22: Loss=0.1059, Acc=0.9680


Epoch 23/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 318.86it/s, loss=0.0197, acc=0.975]


Epoch 23: Loss=0.0849, Acc=0.9747


Epoch 24/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 317.85it/s, loss=0.122, acc=0.972]


Epoch 24: Loss=0.0865, Acc=0.9719


Epoch 25/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 307.52it/s, loss=0.0395, acc=0.978]


Epoch 25: Loss=0.0716, Acc=0.9780


Epoch 26/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 308.68it/s, loss=0.625, acc=0.975]


Epoch 26: Loss=0.0829, Acc=0.9749


Epoch 27/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 304.93it/s, loss=0.0513, acc=0.978]


Epoch 27: Loss=0.0693, Acc=0.9784


Epoch 28/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 311.09it/s, loss=0.00372, acc=0.99]


Epoch 28: Loss=0.0343, Acc=0.9902


Epoch 29/100: 100%|███████████████████████████████| 313/313 [00:00<00:00, 321.55it/s, loss=0.233, acc=0.987]


Epoch 29: Loss=0.0454, Acc=0.9872


Epoch 30/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 320.37it/s, loss=0.0153, acc=0.975]


Epoch 30: Loss=0.0763, Acc=0.9755


Epoch 31/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 311.70it/s, loss=0.0929, acc=0.979]


Epoch 31: Loss=0.0659, Acc=0.9791


Epoch 32/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 304.49it/s, loss=0.00319, acc=0.984]


Epoch 32: Loss=0.0525, Acc=0.9838


Epoch 33/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 317.38it/s, loss=0.0119, acc=0.991]


Epoch 33: Loss=0.0333, Acc=0.9913


Epoch 34/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 304.15it/s, loss=0.0273, acc=0.993]


Epoch 34: Loss=0.0262, Acc=0.9929


Epoch 35/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 313.46it/s, loss=0.0192, acc=0.993]


Epoch 35: Loss=0.0210, Acc=0.9932


Epoch 36/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 315.29it/s, loss=0.0215, acc=0.984]


Epoch 36: Loss=0.0503, Acc=0.9841


Epoch 37/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 309.41it/s, loss=0.0208, acc=0.988]


Epoch 37: Loss=0.0363, Acc=0.9882


Epoch 38/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 313.14it/s, loss=0.0401, acc=0.989]


Epoch 38: Loss=0.0388, Acc=0.9886


Epoch 39/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 315.27it/s, loss=0.0633, acc=0.988]


Epoch 39: Loss=0.0388, Acc=0.9878


Epoch 40/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 319.30it/s, loss=0.00375, acc=0.993]


Epoch 40: Loss=0.0238, Acc=0.9931


Epoch 41/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 316.27it/s, loss=0.00591, acc=0.994]


Epoch 41: Loss=0.0187, Acc=0.9943


Epoch 42/100: 100%|██████████████████████████████| 313/313 [00:00<00:00, 315.17it/s, loss=0.0075, acc=0.995]


Epoch 42: Loss=0.0182, Acc=0.9951


Epoch 43/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 306.28it/s, loss=0.00503, acc=0.996]


Epoch 43: Loss=0.0156, Acc=0.9958


Epoch 44/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 308.83it/s, loss=0.0227, acc=0.991]


Epoch 44: Loss=0.0288, Acc=0.9908


Epoch 45/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 307.08it/s, loss=0.0284, acc=0.991]


Epoch 45: Loss=0.0276, Acc=0.9911


Epoch 46/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 311.57it/s, loss=0.00353, acc=0.995]


Epoch 46: Loss=0.0184, Acc=0.9950


Epoch 47/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 311.79it/s, loss=0.00622, acc=0.995]


Epoch 47: Loss=0.0166, Acc=0.9954


Epoch 48/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 312.16it/s, loss=0.0052, acc=0.997]


Epoch 48: Loss=0.0106, Acc=0.9969


Epoch 49/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 308.83it/s, loss=0.00693, acc=0.998]


Epoch 49: Loss=0.0083, Acc=0.9978


Epoch 50/100: 100%|█████████████████████████████| 313/313 [00:00<00:00, 318.82it/s, loss=0.00502, acc=0.999]


Epoch 50: Loss=0.0037, Acc=0.9991


Epoch 51/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 304.89it/s, loss=0.000633, acc=1]


Epoch 51: Loss=0.0012, Acc=0.9999


Epoch 52/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 306.00it/s, loss=0.000185, acc=1]


Epoch 52: Loss=0.0006, Acc=0.9999


Epoch 53/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 306.30it/s, loss=1.9e-5, acc=1]


Epoch 53: Loss=0.0002, Acc=1.0000


Epoch 54/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 315.70it/s, loss=0.000109, acc=1]


Epoch 54: Loss=0.0001, Acc=1.0000


Epoch 55/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 313.54it/s, loss=0.000146, acc=1]


Epoch 55: Loss=0.0001, Acc=1.0000


Epoch 56/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 311.72it/s, loss=7.93e-5, acc=1]


Epoch 56: Loss=0.0001, Acc=1.0000


Epoch 57/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 314.39it/s, loss=0.000108, acc=1]


Epoch 57: Loss=0.0001, Acc=1.0000


Epoch 58/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 316.89it/s, loss=7.46e-5, acc=1]


Epoch 58: Loss=0.0001, Acc=1.0000


Epoch 59/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 313.78it/s, loss=2.62e-5, acc=1]


Epoch 59: Loss=0.0001, Acc=1.0000


Epoch 60/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.21it/s, loss=3.02e-5, acc=1]


Epoch 60: Loss=0.0001, Acc=1.0000


Epoch 61/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.45it/s, loss=6.74e-5, acc=1]


Epoch 61: Loss=0.0001, Acc=1.0000


Epoch 62/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.91it/s, loss=6.26e-5, acc=1]


Epoch 62: Loss=0.0001, Acc=1.0000


Epoch 63/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 319.86it/s, loss=5.24e-5, acc=1]


Epoch 63: Loss=0.0000, Acc=1.0000


Epoch 64/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 315.18it/s, loss=1.72e-5, acc=1]


Epoch 64: Loss=0.0000, Acc=1.0000


Epoch 65/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 316.94it/s, loss=2.96e-5, acc=1]


Epoch 65: Loss=0.0000, Acc=1.0000


Epoch 66/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 309.29it/s, loss=6.14e-6, acc=1]


Epoch 66: Loss=0.0000, Acc=1.0000


Epoch 67/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.33it/s, loss=4.36e-6, acc=1]


Epoch 67: Loss=0.0000, Acc=1.0000


Epoch 68/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.77it/s, loss=5.78e-5, acc=1]


Epoch 68: Loss=0.0000, Acc=1.0000


Epoch 69/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 315.50it/s, loss=1.17e-5, acc=1]


Epoch 69: Loss=0.0000, Acc=1.0000


Epoch 70/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 320.66it/s, loss=2.82e-5, acc=1]


Epoch 70: Loss=0.0000, Acc=1.0000


Epoch 71/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 308.57it/s, loss=1.49e-5, acc=1]


Epoch 71: Loss=0.0000, Acc=1.0000


Epoch 72/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 313.20it/s, loss=3.04e-5, acc=1]


Epoch 72: Loss=0.0000, Acc=1.0000


Epoch 73/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 315.27it/s, loss=6.41e-6, acc=1]


Epoch 73: Loss=0.0000, Acc=1.0000


Epoch 74/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.43it/s, loss=1.72e-5, acc=1]


Epoch 74: Loss=0.0000, Acc=1.0000


Epoch 75/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 310.01it/s, loss=1.78e-5, acc=1]


Epoch 75: Loss=0.0000, Acc=1.0000


Epoch 76/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.73it/s, loss=6.35e-6, acc=1]


Epoch 76: Loss=0.0000, Acc=1.0000


Epoch 77/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 306.24it/s, loss=3.34e-6, acc=1]


Epoch 77: Loss=0.0000, Acc=1.0000


Epoch 78/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.31it/s, loss=5.77e-6, acc=1]


Epoch 78: Loss=0.0000, Acc=1.0000


Epoch 79/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.79it/s, loss=6.05e-6, acc=1]


Epoch 79: Loss=0.0000, Acc=1.0000


Epoch 80/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 316.47it/s, loss=8.21e-6, acc=1]


Epoch 80: Loss=0.0000, Acc=1.0000


Epoch 81/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.29it/s, loss=7.22e-6, acc=1]


Epoch 81: Loss=0.0000, Acc=1.0000


Epoch 82/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.37it/s, loss=2.63e-6, acc=1]


Epoch 82: Loss=0.0000, Acc=1.0000


Epoch 83/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 318.98it/s, loss=8.08e-6, acc=1]


Epoch 83: Loss=0.0000, Acc=1.0000


Epoch 84/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 319.31it/s, loss=8.18e-6, acc=1]


Epoch 84: Loss=0.0000, Acc=1.0000


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 308.73it/s, loss=4.12e-6, acc=1]


Epoch 85: Loss=0.0000, Acc=1.0000


Epoch 86/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 311.68it/s, loss=6.11e-6, acc=1]


Epoch 86: Loss=0.0000, Acc=1.0000


Epoch 87/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 310.14it/s, loss=1.12e-6, acc=1]


Epoch 87: Loss=0.0000, Acc=1.0000


Epoch 88/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.12it/s, loss=1.62e-6, acc=1]


Epoch 88: Loss=0.0000, Acc=1.0000


Epoch 89/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 314.23it/s, loss=1.23e-6, acc=1]


Epoch 89: Loss=0.0000, Acc=1.0000


Epoch 90/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 315.42it/s, loss=1.05e-6, acc=1]


Epoch 90: Loss=0.0000, Acc=1.0000


Epoch 91/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 318.82it/s, loss=1.84e-6, acc=1]


Epoch 91: Loss=0.0000, Acc=1.0000


Epoch 92/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 315.94it/s, loss=2.66e-6, acc=1]


Epoch 92: Loss=0.0000, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.98it/s, loss=1.31e-6, acc=1]


Epoch 93: Loss=0.0000, Acc=1.0000


Epoch 94/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 305.34it/s, loss=2.83e-6, acc=1]


Epoch 94: Loss=0.0000, Acc=1.0000


Epoch 95/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.67it/s, loss=9.46e-7, acc=1]


Epoch 95: Loss=0.0000, Acc=1.0000


Epoch 96/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 317.50it/s, loss=1.58e-6, acc=1]


Epoch 96: Loss=0.0000, Acc=1.0000


Epoch 97/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 312.90it/s, loss=2.23e-6, acc=1]


Epoch 97: Loss=0.0000, Acc=1.0000


Epoch 98/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.02it/s, loss=1.22e-6, acc=1]


Epoch 98: Loss=0.0000, Acc=1.0000


Epoch 99/100: 100%|█████████████████████████████████| 313/313 [00:00<00:00, 321.47it/s, loss=7.45e-7, acc=1]


Epoch 99: Loss=0.0000, Acc=1.0000


Epoch 100/100: 100%|████████████████████████████████| 313/313 [00:00<00:00, 318.74it/s, loss=8.34e-7, acc=1]


Epoch 100: Loss=0.0000, Acc=1.0000
Starting to train model for mixed with vocab size 3785
Moving model to device:  cuda


Epoch 1/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.71it/s, loss=2.09, acc=0.178]


Epoch 1: Loss=2.5483, Acc=0.1782


Epoch 2/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.85it/s, loss=2.17, acc=0.194]


Epoch 2: Loss=2.2205, Acc=0.1938


Epoch 3/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.67it/s, loss=2.13, acc=0.214]


Epoch 3: Loss=2.1530, Acc=0.2137


Epoch 4/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.80it/s, loss=1.85, acc=0.254]


Epoch 4: Loss=2.0493, Acc=0.2535


Epoch 5/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.15it/s, loss=1.96, acc=0.325]


Epoch 5: Loss=1.8809, Acc=0.3253


Epoch 6/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 234.99it/s, loss=1.58, acc=0.399]


Epoch 6: Loss=1.6855, Acc=0.3993


Epoch 7/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.12it/s, loss=1.23, acc=0.455]


Epoch 7: Loss=1.5264, Acc=0.4546


Epoch 8/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.97it/s, loss=1.46, acc=0.496]


Epoch 8: Loss=1.4027, Acc=0.4958


Epoch 9/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.24it/s, loss=1.65, acc=0.529]


Epoch 9: Loss=1.3166, Acc=0.5291


Epoch 10/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 235.57it/s, loss=1.51, acc=0.549]


Epoch 10: Loss=1.2586, Acc=0.5494


Epoch 11/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 236.50it/s, loss=1.39, acc=0.561]


Epoch 11: Loss=1.2112, Acc=0.5612


Epoch 12/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.68it/s, loss=1.38, acc=0.57]


Epoch 12: Loss=1.1877, Acc=0.5696


Epoch 13/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 234.31it/s, loss=1.21, acc=0.581]


Epoch 13: Loss=1.1508, Acc=0.5812


Epoch 14/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 233.70it/s, loss=1.17, acc=0.588]


Epoch 14: Loss=1.1281, Acc=0.5877


Epoch 15/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 236.84it/s, loss=1.33, acc=0.588]


Epoch 15: Loss=1.1202, Acc=0.5884


Epoch 16/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 233.71it/s, loss=1.4, acc=0.597]


Epoch 16: Loss=1.0881, Acc=0.5968


Epoch 17/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.81it/s, loss=0.814, acc=0.604]


Epoch 17: Loss=1.0661, Acc=0.6045


Epoch 18/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.41it/s, loss=0.904, acc=0.606]


Epoch 18: Loss=1.0529, Acc=0.6057


Epoch 19/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 235.04it/s, loss=1.32, acc=0.611]


Epoch 19: Loss=1.0416, Acc=0.6115


Epoch 20/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 234.85it/s, loss=1.27, acc=0.618]


Epoch 20: Loss=1.0136, Acc=0.6182


Epoch 21/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 236.25it/s, loss=1.34, acc=0.621]


Epoch 21: Loss=0.9937, Acc=0.6209


Epoch 22/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.86it/s, loss=0.899, acc=0.629]


Epoch 22: Loss=0.9760, Acc=0.6290


Epoch 23/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 233.67it/s, loss=0.999, acc=0.636]


Epoch 23: Loss=0.9444, Acc=0.6363


Epoch 24/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 235.52it/s, loss=1.04, acc=0.644]


Epoch 24: Loss=0.9241, Acc=0.6438


Epoch 25/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 238.12it/s, loss=0.958, acc=0.65]


Epoch 25: Loss=0.8972, Acc=0.6495


Epoch 26/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 236.43it/s, loss=1.15, acc=0.653]


Epoch 26: Loss=0.8912, Acc=0.6533


Epoch 27/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.09it/s, loss=0.624, acc=0.659]


Epoch 27: Loss=0.8760, Acc=0.6593


Epoch 28/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.82it/s, loss=0.932, acc=0.674]


Epoch 28: Loss=0.8391, Acc=0.6736


Epoch 29/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 237.32it/s, loss=0.986, acc=0.687]


Epoch 29: Loss=0.7995, Acc=0.6866


Epoch 30/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.93it/s, loss=0.698, acc=0.694]


Epoch 30: Loss=0.7786, Acc=0.6935


Epoch 31/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.39it/s, loss=0.924, acc=0.701]


Epoch 31: Loss=0.7537, Acc=0.7008


Epoch 32/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.76it/s, loss=0.787, acc=0.712]


Epoch 32: Loss=0.7178, Acc=0.7120


Epoch 33/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.76it/s, loss=0.63, acc=0.72]


Epoch 33: Loss=0.7021, Acc=0.7199


Epoch 34/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 237.01it/s, loss=0.471, acc=0.733]


Epoch 34: Loss=0.6632, Acc=0.7327


Epoch 35/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.60it/s, loss=0.481, acc=0.748]


Epoch 35: Loss=0.6291, Acc=0.7479


Epoch 36/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 235.83it/s, loss=0.61, acc=0.754]


Epoch 36: Loss=0.6062, Acc=0.7540


Epoch 37/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.37it/s, loss=0.796, acc=0.765]


Epoch 37: Loss=0.5731, Acc=0.7648


Epoch 38/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.26it/s, loss=0.606, acc=0.778]


Epoch 38: Loss=0.5482, Acc=0.7780


Epoch 39/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 233.60it/s, loss=0.314, acc=0.787]


Epoch 39: Loss=0.5244, Acc=0.7872


Epoch 40/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.74it/s, loss=0.361, acc=0.796]


Epoch 40: Loss=0.5097, Acc=0.7963


Epoch 41/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.80it/s, loss=0.327, acc=0.804]


Epoch 41: Loss=0.4847, Acc=0.8037


Epoch 42/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.31it/s, loss=0.441, acc=0.818]


Epoch 42: Loss=0.4447, Acc=0.8181


Epoch 43/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.86it/s, loss=0.377, acc=0.834]


Epoch 43: Loss=0.4072, Acc=0.8339


Epoch 44/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.99it/s, loss=0.381, acc=0.842]


Epoch 44: Loss=0.3904, Acc=0.8415


Epoch 45/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.10it/s, loss=0.548, acc=0.849]


Epoch 45: Loss=0.3716, Acc=0.8492


Epoch 46/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.77it/s, loss=0.197, acc=0.861]


Epoch 46: Loss=0.3484, Acc=0.8614


Epoch 47/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.01it/s, loss=0.222, acc=0.869]


Epoch 47: Loss=0.3297, Acc=0.8687


Epoch 48/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.31it/s, loss=0.373, acc=0.866]


Epoch 48: Loss=0.3292, Acc=0.8659


Epoch 49/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.93it/s, loss=0.315, acc=0.881]


Epoch 49: Loss=0.2980, Acc=0.8813


Epoch 50/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.07it/s, loss=0.244, acc=0.881]


Epoch 50: Loss=0.3026, Acc=0.8810


Epoch 51/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.19it/s, loss=0.166, acc=0.904]


Epoch 51: Loss=0.2492, Acc=0.9036


Epoch 52/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.27it/s, loss=0.171, acc=0.906]


Epoch 52: Loss=0.2455, Acc=0.9060


Epoch 53/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.15it/s, loss=0.194, acc=0.904]


Epoch 53: Loss=0.2472, Acc=0.9042


Epoch 54/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 236.43it/s, loss=0.352, acc=0.92]


Epoch 54: Loss=0.2098, Acc=0.9203


Epoch 55/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.67it/s, loss=0.207, acc=0.925]


Epoch 55: Loss=0.1971, Acc=0.9252


Epoch 56/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.09it/s, loss=0.216, acc=0.924]


Epoch 56: Loss=0.1981, Acc=0.9239


Epoch 57/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.12it/s, loss=0.209, acc=0.932]


Epoch 57: Loss=0.1770, Acc=0.9319


Epoch 58/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.83it/s, loss=0.189, acc=0.941]


Epoch 58: Loss=0.1610, Acc=0.9408


Epoch 59/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.55it/s, loss=0.0994, acc=0.94]


Epoch 59: Loss=0.1642, Acc=0.9402


Epoch 60/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.23it/s, loss=0.123, acc=0.946]


Epoch 60: Loss=0.1505, Acc=0.9457


Epoch 61/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 238.10it/s, loss=0.0297, acc=0.956]


Epoch 61: Loss=0.1271, Acc=0.9556


Epoch 62/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 237.23it/s, loss=0.0992, acc=0.957]


Epoch 62: Loss=0.1201, Acc=0.9574


Epoch 63/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.89it/s, loss=0.112, acc=0.963]


Epoch 63: Loss=0.1081, Acc=0.9634


Epoch 64/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 235.02it/s, loss=0.0565, acc=0.965]


Epoch 64: Loss=0.1013, Acc=0.9654


Epoch 65/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 234.89it/s, loss=0.133, acc=0.972]


Epoch 65: Loss=0.0854, Acc=0.9721


Epoch 66/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 235.43it/s, loss=0.0539, acc=0.971]


Epoch 66: Loss=0.0857, Acc=0.9710


Epoch 67/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 236.01it/s, loss=0.116, acc=0.971]


Epoch 67: Loss=0.0842, Acc=0.9708


Epoch 68/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 237.13it/s, loss=0.0595, acc=0.976]


Epoch 68: Loss=0.0748, Acc=0.9756


Epoch 69/100: 100%|███████████████████████████████| 313/313 [00:01<00:00, 235.46it/s, loss=0.132, acc=0.983]


Epoch 69: Loss=0.0606, Acc=0.9832


Epoch 70/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 235.85it/s, loss=0.0457, acc=0.986]


Epoch 70: Loss=0.0527, Acc=0.9860


Epoch 71/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 235.09it/s, loss=0.0406, acc=0.989]


Epoch 71: Loss=0.0438, Acc=0.9893


Epoch 72/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 237.47it/s, loss=0.0692, acc=0.987]


Epoch 72: Loss=0.0475, Acc=0.9869


Epoch 73/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 235.57it/s, loss=0.0306, acc=0.991]


Epoch 73: Loss=0.0388, Acc=0.9915


Epoch 74/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 236.33it/s, loss=0.00586, acc=0.995]


Epoch 74: Loss=0.0281, Acc=0.9950


Epoch 75/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 236.12it/s, loss=0.0967, acc=0.996]


Epoch 75: Loss=0.0255, Acc=0.9960


Epoch 76/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 234.79it/s, loss=0.0201, acc=0.996]


Epoch 76: Loss=0.0256, Acc=0.9956


Epoch 77/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 235.25it/s, loss=0.00901, acc=0.995]


Epoch 77: Loss=0.0263, Acc=0.9950


Epoch 78/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 236.09it/s, loss=0.0359, acc=0.998]


Epoch 78: Loss=0.0174, Acc=0.9981


Epoch 79/100: 100%|██████████████████████████████| 313/313 [00:01<00:00, 234.96it/s, loss=0.0122, acc=0.998]


Epoch 79: Loss=0.0154, Acc=0.9983


Epoch 80/100: 100%|█████████████████████████████| 313/313 [00:01<00:00, 235.98it/s, loss=0.00358, acc=0.999]


Epoch 80: Loss=0.0119, Acc=0.9994


Epoch 81/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 234.50it/s, loss=0.00277, acc=1]


Epoch 81: Loss=0.0101, Acc=0.9996


Epoch 82/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 233.83it/s, loss=0.0108, acc=1]


Epoch 82: Loss=0.0097, Acc=0.9997


Epoch 83/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 235.85it/s, loss=0.0103, acc=1]


Epoch 83: Loss=0.0089, Acc=0.9997


Epoch 84/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.84it/s, loss=0.00368, acc=1]


Epoch 84: Loss=0.0076, Acc=0.9999


Epoch 85/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.38it/s, loss=0.00954, acc=1]


Epoch 85: Loss=0.0066, Acc=1.0000


Epoch 86/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 235.29it/s, loss=0.0107, acc=1]


Epoch 86: Loss=0.0061, Acc=1.0000


Epoch 87/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.78it/s, loss=0.00315, acc=1]


Epoch 87: Loss=0.0054, Acc=1.0000


Epoch 88/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.10it/s, loss=0.00526, acc=1]


Epoch 88: Loss=0.0051, Acc=1.0000


Epoch 89/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.14it/s, loss=0.00337, acc=1]


Epoch 89: Loss=0.0047, Acc=1.0000


Epoch 90/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.57it/s, loss=0.00563, acc=1]


Epoch 90: Loss=0.0043, Acc=1.0000


Epoch 91/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.63it/s, loss=0.00448, acc=1]


Epoch 91: Loss=0.0041, Acc=1.0000


Epoch 92/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.52it/s, loss=0.00286, acc=1]


Epoch 92: Loss=0.0039, Acc=1.0000


Epoch 93/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.76it/s, loss=0.00448, acc=1]


Epoch 93: Loss=0.0037, Acc=1.0000


Epoch 94/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 233.73it/s, loss=0.00568, acc=1]


Epoch 94: Loss=0.0036, Acc=1.0000


Epoch 95/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.75it/s, loss=0.00506, acc=1]


Epoch 95: Loss=0.0034, Acc=1.0000


Epoch 96/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 235.53it/s, loss=0.000881, acc=1]


Epoch 96: Loss=0.0033, Acc=1.0000


Epoch 97/100: 100%|██████████████████████████████████| 313/313 [00:01<00:00, 235.95it/s, loss=0.0014, acc=1]


Epoch 97: Loss=0.0032, Acc=1.0000


Epoch 98/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 236.12it/s, loss=0.00134, acc=1]


Epoch 98: Loss=0.0032, Acc=1.0000


Epoch 99/100: 100%|█████████████████████████████████| 313/313 [00:01<00:00, 235.32it/s, loss=0.00315, acc=1]


Epoch 99: Loss=0.0031, Acc=1.0000


Epoch 100/100: 100%|████████████████████████████████| 313/313 [00:01<00:00, 234.52it/s, loss=0.00441, acc=1]


Epoch 100: Loss=0.0031, Acc=1.0000


In [None]:
def test_model(model, text, tokenizer, device='cuda'):
    """Quick inference test"""
    model.eval()
    tokens = tokenizer.encode(text)
    input_ids = torch.tensor([tokens]).to(device)
    
    with torch.no_grad():
        logits = model(input_ids)
        pred_token = logits[0, -1].argmax().item()
        pred_char = tokenizer.decode([pred_token])
    
    print(f"Input: {text}")
    print(f"Predicted: {pred_char}")
    return pred_char

# Test
for name in train_dataset_names:
    test_model(models[name], "Count the letter o in: johnon", train_tokenizers[name])