# Check device

In [1]:
######################################################

# Check device
!nvidia-smi

Thu Mar  3 19:10:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

# Package Install

In [2]:
# ====================================================
# Install
# ====================================================

!pip install -q datasets==1.18.3
!pip install -q sentencepiece==0.1.96
!pip install -q transformers==4.16.2

[K     |████████████████████████████████| 311 kB 13.7 MB/s 
[K     |████████████████████████████████| 67 kB 6.0 MB/s 
[K     |████████████████████████████████| 1.1 MB 61.0 MB/s 
[K     |████████████████████████████████| 134 kB 86.2 MB/s 
[K     |████████████████████████████████| 212 kB 88.3 MB/s 
[K     |████████████████████████████████| 94 kB 3.8 MB/s 
[K     |████████████████████████████████| 271 kB 68.0 MB/s 
[K     |████████████████████████████████| 144 kB 70.5 MB/s 
[K     |████████████████████████████████| 1.2 MB 15.0 MB/s 
[K     |████████████████████████████████| 3.5 MB 14.1 MB/s 
[K     |████████████████████████████████| 596 kB 76.8 MB/s 
[K     |████████████████████████████████| 895 kB 70.9 MB/s 
[K     |████████████████████████████████| 6.5 MB 77.9 MB/s 
[?25h

# Config

In [3]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    apex=True
    print_freq=100
    num_workers=4
    model="microsoft/deberta-v3-large"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=30
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=2
    fc_dropout=0.2
    max_len=450
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    # MLM setting
    mlm_probability=0.40 # 0.15
    max_seq_length=None
    seed=42
    n_fold=5
    trn_fold=[0, 1, 2, 3, 4]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [4]:
# ====================================================
# Define path
# ====================================================
import os

INPUT_DIR = '/content/drive/MyDrive/kaggle-nbme-score-clinical-patient-notes/input/'
OUTPUT_DIR = '/content/drive/MyDrive/kaggle-nbme-score-clinical-patient-notes/pretrained/{}/'.format(CFG.model.replace('/', '-'))
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Library

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForMaskedLM
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers.modeling_outputs import MaskedLMOutput
from transformers import DataCollatorForLanguageModeling
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.11.6
transformers.__version__: 4.16.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [6]:
# ====================================================
# Utils
# ===================================================
def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))

# Trainer

In [7]:
######################################################

# Trainer
def trainer(model, data_loader, optimizer, scheduler, CFG):
    model.train()

    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = []

    optimizer.zero_grad()

    for idx, batch in enumerate(data_loader):
        for k, v in batch.items():
            batch[k] = v.to(device, dtype=torch.long)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            outputs = model(**batch)

        loss = outputs.loss

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (idx + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            CFG.global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        
        losses.append(loss.detach().cpu().item())

        # if (CFG.global_step % CFG.save_step) == 0:
        #     LOGGER.info(
        #         "Epoch {} Step {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
        #             CFG.epoch + 1, CFG.global_step, np.mean(losses), time.time() - start)
        #         )
        #     torch.save(
        #         model.state_dict(),
        #         OUTPUT_DIR + '{}-mlm-step-{}.bin'.format(
        #             CFG.model.replace('/', '-'),
        #             CFG.global_step))
        
        if idx % 500 == 0 or idx == (len(data_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss:.4f} '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, idx, len(data_loader), 
                          remain=timeSince(start, float(idx+1)/len(data_loader)),
                          loss=np.mean(losses),
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
    
    return np.mean(losses)

# Model

In [8]:
def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_parameters = [
        {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': weight_decay},
        {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
         'lr': encoder_lr, 'weight_decay': 0.0},
        {'params': [p for n, p in model.named_parameters() if "model" not in n],
         'lr': decoder_lr, 'weight_decay': 0.0}
    ]
    return optimizer_parameters

In [9]:
def get_scheduler(cfg, optimizer, num_train_steps):
    if cfg.scheduler=='linear':
        scheduler = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
        )
    elif cfg.scheduler=='cosine':
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
    return scheduler

In [10]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.model,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.model, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

# Main

In [11]:
# ====================================================
# data loading
# ====================================================
train = pd.read_csv(os.path.join(INPUT_DIR, 'train.csv'))
features = pd.read_csv(os.path.join(INPUT_DIR, 'features.csv'))
patient_notes = pd.read_csv(os.path.join(INPUT_DIR, 'patient_notes.csv'))
display(patient_notes.head())
display(patient_notes['pn_history'].nunique(), len(patient_notes))

Unnamed: 0,pn_num,case_num,pn_history
0,0,0,"17-year-old male, has come to the student heal..."
1,1,0,17 yo male with recurrent palpitations for the...
2,2,0,Dillon Cleveland is a 17 y.o. male patient wit...
3,3,0,a 17 yo m c/o palpitation started 3 mos ago; \...
4,4,0,17yo male with no pmh here for evaluation of p...


42146

42146

In [12]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.35M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [13]:
mlm_data = patient_notes[['pn_history']].reset_index(drop=True)
mlm_data = mlm_data.rename(columns={'pn_history': 'text'})
csv_name = f'mlm_data.csv'
mlm_data.to_csv(OUTPUT_DIR+csv_name, index=False)
print(f"Saved mlm data: {csv_name}")
print(f"mlm data: {mlm_data.shape}")

Saved mlm data: mlm_data.csv
mlm data: (42146, 1)


In [14]:
#####################################################

# Training support

def tokenize_function(examples):
    return tokenizer(examples["text"], return_special_tokens_mask=True)

def group_texts(examples):
    concatenated_examples = {k: sum(examples[k], []) for k in examples.keys()}
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // CFG.max_len) * CFG.max_len
    result = {
        k: [t[i : i + CFG.max_len] for i in range(0, total_length, CFG.max_len)]
        for k, t in concatenated_examples.items()
    }
    return result

In [15]:
seed_everything(CFG.seed)

CFG.train_file = f"mlm_data.csv"
data_files = {'train': OUTPUT_DIR+CFG.train_file}
raw_datasets = load_dataset('csv', data_files=data_files)

Using custom data configuration default-2954b82d524aa7c6


Downloading and preparing dataset csv/default to /root/.cache/huggingface/datasets/csv/default-2954b82d524aa7c6/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e...


  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

Dataset csv downloaded and prepared to /root/.cache/huggingface/datasets/csv/default-2954b82d524aa7c6/0.0.0/6b9057d9e23d9d8a2f05b985917a0da84d70c5dae3d22ddd8a3f22fb01c69d9e. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

In [16]:
if CFG.max_seq_length is None:
    max_seq_length = tokenizer.model_max_length
else:
    if CFG.max_seq_length > tokenizer.model_max_length:
        max_seq_length = min(CFG.max_seq_length, tokenizer.model_max_length)

In [17]:
tokenized_datasets = raw_datasets.map(
    tokenize_function,
    batched=True,
    num_proc=4,
    remove_columns=["text"],
    load_from_cache_file=not True,
    )
LOGGER.info(f"tokenized_datasets: {tokenized_datasets}")

Setting TOKENIZERS_PARALLELISM=false for forked processes.
tokenized_datasets: DatasetDict({
    train: Dataset({
        features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
        num_rows: 42146
    })
})


In [18]:
tokenized_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    num_proc=4,
    load_from_cache_file=not True,
    )
train_dataset = tokenized_datasets["train"]
LOGGER.info(f"train_dataset: {train_dataset}")

Setting TOKENIZERS_PARALLELISM=false for forked processes.
train_dataset: Dataset({
    features: ['input_ids', 'token_type_ids', 'special_tokens_mask', 'attention_mask'],
    num_rows: 17763
})


In [19]:
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm_probability=CFG.mlm_probability
    )
train_loader = DataLoader(
    train_dataset,
    shuffle=True,
    collate_fn=data_collator,
    batch_size=CFG.batch_size
    )

In [20]:
model = CustomModel(CFG, config_path=None, pretrained=True)
model.to(device)
optimizer_parameters = get_optimizer_params(
    model,
    encoder_lr=CFG.encoder_lr, 
    decoder_lr=CFG.decoder_lr,
    weight_decay=CFG.weight_decay)
optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)

Downloading:   0%|          | 0.00/833M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.classifier.weight', 'mask_predictions.classifier.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.LayerNorm.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

In [21]:
num_train_steps = int(len(mlm_data) / CFG.batch_size * CFG.epochs)
scheduler = get_scheduler(CFG, optimizer, num_train_steps)

In [22]:
print(num_train_steps)

632190


In [None]:
CFG.global_step = 0
CFG.save_step = 25000

for epoch in range(CFG.epochs):
    CFG.epoch = epoch
    start = time.time()
        
    train_loss = trainer(model, train_loader, optimizer, scheduler, CFG)

    LOGGER.info(
        "Epoch {}: Train Loss {:.4f}, elapsed {:.4f}s".format(
            epoch + 1, train_loss, time.time() - start)
        )
    torch.save(
        model.state_dict(),
        OUTPUT_DIR + '{}-mlm-epoch-{}.bin'.format(
            CFG.model.replace('/', '-'),
            CFG.epoch + 1))

Epoch: [1][0/8882] Elapsed 0m 1s (remain 167m 25s) Loss: 12.2465 Grad: inf  LR: 0.00002000  
Epoch: [1][500/8882] Elapsed 2m 26s (remain 40m 58s) Loss: 5.5722 Grad: 56626.3945  LR: 0.00002000  
Epoch: [1][1000/8882] Elapsed 4m 53s (remain 38m 27s) Loss: 4.6518 Grad: 69817.8125  LR: 0.00002000  
Epoch: [1][1500/8882] Elapsed 7m 19s (remain 35m 58s) Loss: 4.1503 Grad: 54568.5820  LR: 0.00002000  
Epoch: [1][2000/8882] Elapsed 9m 45s (remain 33m 31s) Loss: 3.8168 Grad: 53982.0547  LR: 0.00002000  
Epoch: [1][2500/8882] Elapsed 12m 11s (remain 31m 5s) Loss: 3.5697 Grad: 110480.6406  LR: 0.00002000  
Epoch: [1][3000/8882] Elapsed 14m 37s (remain 28m 38s) Loss: 3.3927 Grad: 114175.9922  LR: 0.00002000  
Epoch: [1][3500/8882] Elapsed 17m 3s (remain 26m 12s) Loss: 3.2527 Grad: 111778.2344  LR: 0.00002000  
Epoch: [1][4000/8882] Elapsed 19m 30s (remain 23m 47s) Loss: 3.1380 Grad: 106983.9531  LR: 0.00002000  
Epoch: [1][4500/8882] Elapsed 21m 56s (remain 21m 21s) Loss: 3.0386 Grad: 240411.4688 

Epoch 1: Train Loss 2.5651, elapsed 2588.3083s


Epoch: [1][8881/8882] Elapsed 43m 8s (remain 0m 0s) Loss: 2.5651 Grad: 548954.3125  LR: 0.00001999  
Epoch: [2][0/8882] Elapsed 0m 0s (remain 50m 32s) Loss: 1.9792 Grad: 408525.0000  LR: 0.00001999  
Epoch: [2][500/8882] Elapsed 2m 25s (remain 40m 32s) Loss: 1.9540 Grad: 431627.7812  LR: 0.00001999  
Epoch: [2][1000/8882] Elapsed 4m 49s (remain 38m 1s) Loss: 1.9418 Grad: 440722.0000  LR: 0.00001999  
Epoch: [2][1500/8882] Elapsed 7m 14s (remain 35m 34s) Loss: 1.9275 Grad: 416848.3750  LR: 0.00001999  
Epoch: [2][2000/8882] Elapsed 9m 38s (remain 33m 9s) Loss: 1.9186 Grad: 862945.9375  LR: 0.00001999  
Epoch: [2][2500/8882] Elapsed 12m 2s (remain 30m 44s) Loss: 1.9113 Grad: 396378.0000  LR: 0.00001998  
Epoch: [2][3000/8882] Elapsed 14m 26s (remain 28m 18s) Loss: 1.9063 Grad: 440765.7500  LR: 0.00001998  
Epoch: [2][3500/8882] Elapsed 16m 50s (remain 25m 53s) Loss: 1.8990 Grad: 472494.3125  LR: 0.00001998  
Epoch: [2][4000/8882] Elapsed 19m 15s (remain 23m 29s) Loss: 1.8923 Grad: 249725

Epoch 2: Train Loss 1.8150, elapsed 2566.3125s


Epoch: [2][8881/8882] Elapsed 42m 46s (remain 0m 0s) Loss: 1.8150 Grad: 571499.0000  LR: 0.00001996  
Epoch: [3][0/8882] Elapsed 0m 0s (remain 52m 31s) Loss: 1.5482 Grad: 554923.5625  LR: 0.00001996  
Epoch: [3][500/8882] Elapsed 2m 26s (remain 40m 46s) Loss: 1.6868 Grad: 363727.4375  LR: 0.00001996  
Epoch: [3][1000/8882] Elapsed 4m 51s (remain 38m 13s) Loss: 1.6871 Grad: 475128.2500  LR: 0.00001996  
Epoch: [3][1500/8882] Elapsed 7m 16s (remain 35m 44s) Loss: 1.6788 Grad: 414488.5000  LR: 0.00001995  
Epoch: [3][2000/8882] Elapsed 9m 40s (remain 33m 17s) Loss: 1.6758 Grad: 791860.7500  LR: 0.00001995  
Epoch: [3][2500/8882] Elapsed 12m 5s (remain 30m 52s) Loss: 1.6666 Grad: 362081.9062  LR: 0.00001995  
Epoch: [3][3000/8882] Elapsed 14m 30s (remain 28m 26s) Loss: 1.6616 Grad: 397199.5000  LR: 0.00001995  
Epoch: [3][3500/8882] Elapsed 16m 55s (remain 26m 1s) Loss: 1.6580 Grad: 360379.3750  LR: 0.00001994  
Epoch: [3][4000/8882] Elapsed 19m 20s (remain 23m 36s) Loss: 1.6566 Grad: 2047

Epoch 3: Train Loss 1.6225, elapsed 2579.5576s


Epoch: [3][8881/8882] Elapsed 42m 59s (remain 0m 0s) Loss: 1.6225 Grad: 464544.1875  LR: 0.00001991  
Epoch: [4][0/8882] Elapsed 0m 0s (remain 49m 36s) Loss: 1.5398 Grad: 406116.3125  LR: 0.00001991  
Epoch: [4][500/8882] Elapsed 2m 26s (remain 40m 58s) Loss: 1.5651 Grad: 341840.2500  LR: 0.00001991  
Epoch: [4][1000/8882] Elapsed 4m 52s (remain 38m 25s) Loss: 1.5507 Grad: 352848.2188  LR: 0.00001991  
Epoch: [4][1500/8882] Elapsed 7m 18s (remain 35m 56s) Loss: 1.5401 Grad: 334462.8125  LR: 0.00001990  
Epoch: [4][2000/8882] Elapsed 9m 43s (remain 33m 28s) Loss: 1.5344 Grad: 892576.7500  LR: 0.00001990  
Epoch: [4][2500/8882] Elapsed 12m 8s (remain 30m 59s) Loss: 1.5319 Grad: 338804.3125  LR: 0.00001990  
Epoch: [4][3000/8882] Elapsed 14m 34s (remain 28m 33s) Loss: 1.5318 Grad: 394032.0312  LR: 0.00001989  
Epoch: [4][3500/8882] Elapsed 17m 0s (remain 26m 7s) Loss: 1.5296 Grad: 374960.9062  LR: 0.00001989  
Epoch: [4][4000/8882] Elapsed 19m 25s (remain 23m 41s) Loss: 1.5265 Grad: 41744

Epoch 4: Train Loss 1.5109, elapsed 2583.2482s


Epoch: [4][8881/8882] Elapsed 43m 3s (remain 0m 0s) Loss: 1.5109 Grad: 227665.8281  LR: 0.00001984  
Epoch: [5][0/8882] Elapsed 0m 0s (remain 50m 32s) Loss: 1.2817 Grad: 422108.5625  LR: 0.00001984  
Epoch: [5][500/8882] Elapsed 2m 27s (remain 41m 4s) Loss: 1.4573 Grad: 187744.7656  LR: 0.00001984  
Epoch: [5][1000/8882] Elapsed 4m 53s (remain 38m 30s) Loss: 1.4637 Grad: 176406.0781  LR: 0.00001984  
Epoch: [5][1500/8882] Elapsed 7m 18s (remain 35m 57s) Loss: 1.4714 Grad: 176276.4375  LR: 0.00001983  
Epoch: [5][2000/8882] Elapsed 9m 44s (remain 33m 29s) Loss: 1.4780 Grad: 178560.7812  LR: 0.00001983  
Epoch: [5][2500/8882] Elapsed 12m 9s (remain 31m 2s) Loss: 1.4737 Grad: 409448.2500  LR: 0.00001982  
Epoch: [5][3000/8882] Elapsed 14m 35s (remain 28m 36s) Loss: 1.4697 Grad: 322463.9062  LR: 0.00001982  
Epoch: [5][3500/8882] Elapsed 17m 1s (remain 26m 10s) Loss: 1.4664 Grad: 395902.9375  LR: 0.00001981  
Epoch: [5][4000/8882] Elapsed 19m 27s (remain 23m 43s) Loss: 1.4627 Grad: 350919.

Epoch 5: Train Loss 1.4402, elapsed 2587.6956s


Epoch: [5][8881/8882] Elapsed 43m 7s (remain 0m 0s) Loss: 1.4402 Grad: 473637.5312  LR: 0.00001976  
Epoch: [6][0/8882] Elapsed 0m 0s (remain 50m 55s) Loss: 1.4329 Grad: 316120.4062  LR: 0.00001976  
Epoch: [6][500/8882] Elapsed 2m 27s (remain 41m 3s) Loss: 1.4033 Grad: 388192.0312  LR: 0.00001975  
Epoch: [6][1000/8882] Elapsed 4m 52s (remain 38m 24s) Loss: 1.4020 Grad: 360658.1562  LR: 0.00001975  
Epoch: [6][1500/8882] Elapsed 7m 18s (remain 35m 58s) Loss: 1.4066 Grad: 167681.0781  LR: 0.00001974  
Epoch: [6][2000/8882] Elapsed 9m 44s (remain 33m 31s) Loss: 1.4114 Grad: 159072.2031  LR: 0.00001974  
Epoch: [6][2500/8882] Elapsed 12m 10s (remain 31m 4s) Loss: 1.4160 Grad: 159733.3438  LR: 0.00001973  
Epoch: [6][3000/8882] Elapsed 14m 36s (remain 28m 38s) Loss: 1.4144 Grad: 200220.5156  LR: 0.00001972  
Epoch: [6][3500/8882] Elapsed 17m 2s (remain 26m 11s) Loss: 1.4091 Grad: 338152.0312  LR: 0.00001972  
Epoch: [6][4000/8882] Elapsed 19m 28s (remain 23m 45s) Loss: 1.4078 Grad: 377361

Epoch 6: Train Loss 1.3898, elapsed 2590.6291s


Epoch: [6][8881/8882] Elapsed 43m 10s (remain 0m 0s) Loss: 1.3898 Grad: 473930.5000  LR: 0.00001965  
Epoch: [7][0/8882] Elapsed 0m 0s (remain 51m 12s) Loss: 1.1727 Grad: 288933.5000  LR: 0.00001965  
Epoch: [7][500/8882] Elapsed 2m 27s (remain 41m 6s) Loss: 1.3651 Grad: 315355.8438  LR: 0.00001964  
Epoch: [7][1000/8882] Elapsed 4m 53s (remain 38m 30s) Loss: 1.3535 Grad: 355561.5000  LR: 0.00001964  
Epoch: [7][1500/8882] Elapsed 7m 19s (remain 36m 0s) Loss: 1.3552 Grad: 350323.5625  LR: 0.00001963  
Epoch: [7][2000/8882] Elapsed 9m 44s (remain 33m 30s) Loss: 1.3531 Grad: 830923.8750  LR: 0.00001962  
Epoch: [7][2500/8882] Elapsed 12m 9s (remain 31m 2s) Loss: 1.3503 Grad: 406306.8750  LR: 0.00001962  
Epoch: [7][3000/8882] Elapsed 14m 34s (remain 28m 34s) Loss: 1.3494 Grad: 352435.0938  LR: 0.00001961  
Epoch: [7][3500/8882] Elapsed 17m 0s (remain 26m 8s) Loss: 1.3498 Grad: 351974.3438  LR: 0.00001960  
Epoch: [7][4000/8882] Elapsed 19m 26s (remain 23m 42s) Loss: 1.3494 Grad: 315999.1

Epoch 7: Train Loss 1.3557, elapsed 2581.2760s


Epoch: [7][8881/8882] Elapsed 43m 1s (remain 0m 0s) Loss: 1.3557 Grad: 109771.0703  LR: 0.00001953  
Epoch: [8][0/8882] Elapsed 0m 0s (remain 52m 51s) Loss: 1.1683 Grad: 290815.0000  LR: 0.00001953  
Epoch: [8][500/8882] Elapsed 2m 26s (remain 40m 47s) Loss: 1.3194 Grad: 297502.7500  LR: 0.00001952  
Epoch: [8][1000/8882] Elapsed 4m 51s (remain 38m 11s) Loss: 1.3189 Grad: 344959.1562  LR: 0.00001951  
Epoch: [8][1500/8882] Elapsed 7m 15s (remain 35m 42s) Loss: 1.3178 Grad: 331125.2500  LR: 0.00001950  
Epoch: [8][2000/8882] Elapsed 9m 40s (remain 33m 16s) Loss: 1.3132 Grad: 664868.2500  LR: 0.00001950  
Epoch: [8][2500/8882] Elapsed 12m 5s (remain 30m 50s) Loss: 1.3118 Grad: 627740.1875  LR: 0.00001949  
Epoch: [8][3000/8882] Elapsed 14m 30s (remain 28m 25s) Loss: 1.3093 Grad: 369661.0938  LR: 0.00001948  
Epoch: [8][3500/8882] Elapsed 16m 55s (remain 26m 0s) Loss: 1.3127 Grad: 169741.6562  LR: 0.00001947  
Epoch: [8][4000/8882] Elapsed 19m 20s (remain 23m 35s) Loss: 1.3158 Grad: 14963

Epoch 8: Train Loss 1.3126, elapsed 2573.5248s


Epoch: [8][8881/8882] Elapsed 42m 53s (remain 0m 0s) Loss: 1.3126 Grad: 815043.1250  LR: 0.00001938  
Epoch: [9][0/8882] Elapsed 0m 0s (remain 51m 12s) Loss: 1.1130 Grad: 289168.3750  LR: 0.00001938  
Epoch: [9][500/8882] Elapsed 2m 25s (remain 40m 40s) Loss: 1.2840 Grad: 323576.6250  LR: 0.00001937  
Epoch: [9][1000/8882] Elapsed 4m 50s (remain 38m 8s) Loss: 1.2891 Grad: 442619.9688  LR: 0.00001937  
Epoch: [9][1500/8882] Elapsed 7m 15s (remain 35m 41s) Loss: 1.2868 Grad: 284786.8438  LR: 0.00001936  
Epoch: [9][2000/8882] Elapsed 9m 39s (remain 33m 14s) Loss: 1.2867 Grad: 571627.3125  LR: 0.00001935  
Epoch: [9][2500/8882] Elapsed 12m 4s (remain 30m 48s) Loss: 1.2868 Grad: 825734.5625  LR: 0.00001934  
Epoch: [9][3000/8882] Elapsed 14m 29s (remain 28m 24s) Loss: 1.2818 Grad: 689734.2500  LR: 0.00001933  
Epoch: [9][3500/8882] Elapsed 16m 54s (remain 25m 59s) Loss: 1.2829 Grad: 628279.6250  LR: 0.00001932  
Epoch: [9][4000/8882] Elapsed 19m 19s (remain 23m 34s) Loss: 1.2829 Grad: 1252

Epoch 9: Train Loss 1.2761, elapsed 2573.9113s


Epoch: [9][8881/8882] Elapsed 42m 53s (remain 0m 0s) Loss: 1.2761 Grad: 418203.3438  LR: 0.00001922  
Epoch: [10][0/8882] Elapsed 0m 0s (remain 51m 26s) Loss: 1.6918 Grad: 395139.6562  LR: 0.00001922  
Epoch: [10][500/8882] Elapsed 2m 25s (remain 40m 40s) Loss: 1.2797 Grad: 141711.4375  LR: 0.00001921  
Epoch: [10][1000/8882] Elapsed 4m 50s (remain 38m 7s) Loss: 1.2862 Grad: 158539.3125  LR: 0.00001920  
Epoch: [10][1500/8882] Elapsed 7m 15s (remain 35m 40s) Loss: 1.2885 Grad: 154384.1719  LR: 0.00001919  
Epoch: [10][2000/8882] Elapsed 9m 39s (remain 33m 14s) Loss: 1.2892 Grad: 155494.0156  LR: 0.00001918  
Epoch: [10][2500/8882] Elapsed 12m 5s (remain 30m 50s) Loss: 1.2888 Grad: 341078.5000  LR: 0.00001917  
Epoch: [10][3000/8882] Elapsed 14m 31s (remain 28m 28s) Loss: 1.2839 Grad: 319997.0625  LR: 0.00001916  
Epoch: [10][3500/8882] Elapsed 16m 57s (remain 26m 3s) Loss: 1.2784 Grad: 324317.4062  LR: 0.00001915  
Epoch: [10][4000/8882] Elapsed 19m 22s (remain 23m 38s) Loss: 1.2743 Gr

Epoch 10: Train Loss 1.2624, elapsed 2576.3360s


Epoch: [10][8881/8882] Elapsed 42m 56s (remain 0m 0s) Loss: 1.2624 Grad: 902484.4375  LR: 0.00001904  
Epoch: [11][0/8882] Elapsed 0m 0s (remain 51m 46s) Loss: 1.2408 Grad: 603756.6250  LR: 0.00001904  
Epoch: [11][500/8882] Elapsed 2m 25s (remain 40m 37s) Loss: 1.2542 Grad: 345016.9062  LR: 0.00001903  
Epoch: [11][1000/8882] Elapsed 4m 50s (remain 38m 9s) Loss: 1.2649 Grad: 163390.1406  LR: 0.00001902  
Epoch: [11][1500/8882] Elapsed 7m 15s (remain 35m 42s) Loss: 1.2640 Grad: 174677.6719  LR: 0.00001901  
Epoch: [11][2000/8882] Elapsed 9m 40s (remain 33m 16s) Loss: 1.2644 Grad: 161327.7812  LR: 0.00001900  
Epoch: [11][2500/8882] Elapsed 12m 5s (remain 30m 51s) Loss: 1.2649 Grad: 81284.9922  LR: 0.00001899  
Epoch: [11][3000/8882] Elapsed 14m 30s (remain 28m 26s) Loss: 1.2720 Grad: 77672.9609  LR: 0.00001898  
Epoch: [11][3500/8882] Elapsed 16m 56s (remain 26m 2s) Loss: 1.2731 Grad: 88717.2188  LR: 0.00001897  
Epoch: [11][4000/8882] Elapsed 19m 22s (remain 23m 37s) Loss: 1.2740 Grad

Epoch 11: Train Loss 1.2723, elapsed 2581.0831s


Epoch: [11][8881/8882] Elapsed 43m 1s (remain 0m 0s) Loss: 1.2723 Grad: 196065.8906  LR: 0.00001884  
Epoch: [12][0/8882] Elapsed 0m 0s (remain 51m 59s) Loss: 1.2851 Grad: 303919.5312  LR: 0.00001884  
Epoch: [12][500/8882] Elapsed 2m 28s (remain 41m 31s) Loss: 1.2170 Grad: 384601.6875  LR: 0.00001883  
Epoch: [12][1000/8882] Elapsed 4m 56s (remain 38m 57s) Loss: 1.2171 Grad: 290830.0938  LR: 0.00001882  
Epoch: [12][1500/8882] Elapsed 7m 24s (remain 36m 26s) Loss: 1.2191 Grad: 394055.0625  LR: 0.00001881  
Epoch: [12][2000/8882] Elapsed 9m 52s (remain 33m 56s) Loss: 1.2156 Grad: 572362.4375  LR: 0.00001880  
Epoch: [12][2500/8882] Elapsed 12m 19s (remain 31m 27s) Loss: 1.2122 Grad: 710922.0000  LR: 0.00001879  
Epoch: [12][3000/8882] Elapsed 14m 47s (remain 28m 58s) Loss: 1.2125 Grad: 354235.0312  LR: 0.00001877  
Epoch: [12][3500/8882] Elapsed 17m 14s (remain 26m 30s) Loss: 1.2130 Grad: 377488.9062  LR: 0.00001876  
Epoch: [12][4000/8882] Elapsed 19m 42s (remain 24m 2s) Loss: 1.2158 

Epoch 12: Train Loss 1.2130, elapsed 2622.6098s


Epoch: [12][8881/8882] Elapsed 43m 42s (remain 0m 0s) Loss: 1.2130 Grad: 769568.3750  LR: 0.00001863  
Epoch: [13][0/8882] Elapsed 0m 0s (remain 51m 35s) Loss: 1.2336 Grad: 337916.0000  LR: 0.00001863  
Epoch: [13][500/8882] Elapsed 2m 28s (remain 41m 26s) Loss: 1.1990 Grad: 300857.3750  LR: 0.00001862  
Epoch: [13][1000/8882] Elapsed 4m 56s (remain 38m 52s) Loss: 1.2058 Grad: 339556.1250  LR: 0.00001860  
Epoch: [13][1500/8882] Elapsed 7m 24s (remain 36m 23s) Loss: 1.2053 Grad: 303512.2188  LR: 0.00001859  
Epoch: [13][2000/8882] Elapsed 9m 52s (remain 33m 56s) Loss: 1.2071 Grad: 531301.0625  LR: 0.00001858  
Epoch: [13][2500/8882] Elapsed 12m 19s (remain 31m 27s) Loss: 1.2064 Grad: 678254.5000  LR: 0.00001857  
Epoch: [13][3000/8882] Elapsed 14m 47s (remain 28m 59s) Loss: 1.2021 Grad: 623146.6250  LR: 0.00001855  
Epoch: [13][3500/8882] Elapsed 17m 15s (remain 26m 31s) Loss: 1.2011 Grad: 585860.6250  LR: 0.00001854  
Epoch: [13][4000/8882] Elapsed 19m 42s (remain 24m 3s) Loss: 1.2012

Epoch 13: Train Loss 1.1902, elapsed 2623.6771s


Epoch: [13][8881/8882] Elapsed 43m 43s (remain 0m 0s) Loss: 1.1902 Grad: 877334.6250  LR: 0.00001840  
Epoch: [14][0/8882] Elapsed 0m 0s (remain 50m 43s) Loss: 1.1328 Grad: 326619.7188  LR: 0.00001840  
Epoch: [14][500/8882] Elapsed 2m 29s (remain 41m 32s) Loss: 1.1849 Grad: 318648.4375  LR: 0.00001838  
Epoch: [14][1000/8882] Elapsed 4m 56s (remain 38m 56s) Loss: 1.1963 Grad: 334006.2812  LR: 0.00001837  
Epoch: [14][1500/8882] Elapsed 7m 25s (remain 36m 28s) Loss: 1.1933 Grad: 327130.6250  LR: 0.00001836  
Epoch: [14][2000/8882] Elapsed 9m 52s (remain 33m 58s) Loss: 1.1905 Grad: 489977.4062  LR: 0.00001834  
Epoch: [14][2500/8882] Elapsed 12m 20s (remain 31m 30s) Loss: 1.1877 Grad: 540610.3750  LR: 0.00001833  
Epoch: [14][3000/8882] Elapsed 14m 49s (remain 29m 2s) Loss: 1.1846 Grad: 280882.2500  LR: 0.00001832  
Epoch: [14][3500/8882] Elapsed 17m 16s (remain 26m 33s) Loss: 1.1838 Grad: 334849.9688  LR: 0.00001830  
Epoch: [14][4000/8882] Elapsed 19m 44s (remain 24m 5s) Loss: 1.1846 

Epoch 14: Train Loss 1.1970, elapsed 2625.6107s


Epoch: [14][8881/8882] Elapsed 43m 45s (remain 0m 0s) Loss: 1.1970 Grad: 53169.3555  LR: 0.00001815  
Epoch: [15][0/8882] Elapsed 0m 0s (remain 51m 56s) Loss: 0.9312 Grad: 438826.3125  LR: 0.00001815  
Epoch: [15][500/8882] Elapsed 2m 28s (remain 41m 25s) Loss: 1.2153 Grad: 160022.3906  LR: 0.00001814  
Epoch: [15][1000/8882] Elapsed 4m 56s (remain 38m 52s) Loss: 1.2035 Grad: 165752.0000  LR: 0.00001812  
Epoch: [15][1500/8882] Elapsed 7m 23s (remain 36m 22s) Loss: 1.1971 Grad: 137366.8125  LR: 0.00001811  
Epoch: [15][2000/8882] Elapsed 9m 51s (remain 33m 53s) Loss: 1.1929 Grad: 148110.6094  LR: 0.00001809  
Epoch: [15][2500/8882] Elapsed 12m 18s (remain 31m 25s) Loss: 1.1907 Grad: 279819.6562  LR: 0.00001808  
Epoch: [15][3000/8882] Elapsed 14m 46s (remain 28m 57s) Loss: 1.1851 Grad: 335362.6562  LR: 0.00001806  
Epoch: [15][3500/8882] Elapsed 17m 14s (remain 26m 30s) Loss: 1.1830 Grad: 340649.5938  LR: 0.00001805  
Epoch: [15][4000/8882] Elapsed 19m 41s (remain 24m 1s) Loss: 1.1816 

Epoch 15: Train Loss 1.1819, elapsed 2621.9764s


Epoch: [15][8881/8882] Elapsed 43m 41s (remain 0m 0s) Loss: 1.1819 Grad: 71992.0625  LR: 0.00001789  
Epoch: [16][0/8882] Elapsed 0m 0s (remain 50m 39s) Loss: 1.3426 Grad: 385775.4062  LR: 0.00001789  
Epoch: [16][500/8882] Elapsed 2m 29s (remain 41m 37s) Loss: 1.2143 Grad: 256627.6719  LR: 0.00001787  
Epoch: [16][1000/8882] Elapsed 4m 56s (remain 38m 57s) Loss: 1.1976 Grad: 327494.1875  LR: 0.00001786  
Epoch: [16][1500/8882] Elapsed 7m 24s (remain 36m 25s) Loss: 1.1901 Grad: 264515.0625  LR: 0.00001784  
Epoch: [16][2000/8882] Elapsed 9m 49s (remain 33m 46s) Loss: 1.1849 Grad: 566004.8125  LR: 0.00001783  
Epoch: [16][2500/8882] Elapsed 12m 13s (remain 31m 12s) Loss: 1.1776 Grad: 123886.2969  LR: 0.00001781  
Epoch: [16][3000/8882] Elapsed 14m 38s (remain 28m 41s) Loss: 1.1761 Grad: 121030.4844  LR: 0.00001779  
Epoch: [16][3500/8882] Elapsed 17m 2s (remain 26m 11s) Loss: 1.1771 Grad: 18113.8945  LR: 0.00001778  
Epoch: [16][4000/8882] Elapsed 19m 26s (remain 23m 43s) Loss: 1.1877 G

Epoch 16: Train Loss 1.1954, elapsed 2579.7637s


Epoch: [16][8881/8882] Elapsed 42m 59s (remain 0m 0s) Loss: 1.1954 Grad: 127853.8203  LR: 0.00001761  
Epoch: [17][0/8882] Elapsed 0m 0s (remain 52m 23s) Loss: 1.0759 Grad: 292213.2500  LR: 0.00001761  
Epoch: [17][500/8882] Elapsed 2m 25s (remain 40m 27s) Loss: 1.1691 Grad: 309415.5000  LR: 0.00001759  
Epoch: [17][1000/8882] Elapsed 4m 48s (remain 37m 55s) Loss: 1.1631 Grad: 334326.7500  LR: 0.00001758  
Epoch: [17][1500/8882] Elapsed 7m 13s (remain 35m 29s) Loss: 1.1601 Grad: 286887.2188  LR: 0.00001756  
Epoch: [17][2000/8882] Elapsed 9m 36s (remain 33m 3s) Loss: 1.1531 Grad: 476832.9062  LR: 0.00001754  
Epoch: [17][2500/8882] Elapsed 12m 1s (remain 30m 41s) Loss: 1.1522 Grad: 620267.0625  LR: 0.00001753  
Epoch: [17][3000/8882] Elapsed 14m 27s (remain 28m 19s) Loss: 1.1493 Grad: 676744.4375  LR: 0.00001751  
Epoch: [17][3500/8882] Elapsed 16m 53s (remain 25m 57s) Loss: 1.1448 Grad: 515922.0938  LR: 0.00001749  
Epoch: [17][4000/8882] Elapsed 19m 18s (remain 23m 33s) Loss: 1.1467 

Epoch 17: Train Loss 1.1483, elapsed 2575.5378s


Epoch: [17][8881/8882] Elapsed 42m 55s (remain 0m 0s) Loss: 1.1483 Grad: 202176.9219  LR: 0.00001731  
Epoch: [18][0/8882] Elapsed 0m 0s (remain 51m 19s) Loss: 1.1168 Grad: 284360.0938  LR: 0.00001731  
Epoch: [18][500/8882] Elapsed 2m 27s (remain 41m 2s) Loss: 1.1544 Grad: 255419.6094  LR: 0.00001730  
Epoch: [18][1000/8882] Elapsed 4m 53s (remain 38m 27s) Loss: 1.1503 Grad: 306276.3125  LR: 0.00001728  
Epoch: [18][1500/8882] Elapsed 7m 18s (remain 35m 55s) Loss: 1.1538 Grad: 160881.5781  LR: 0.00001726  
Epoch: [18][2000/8882] Elapsed 9m 43s (remain 33m 27s) Loss: 1.1536 Grad: 154126.6250  LR: 0.00001725  
Epoch: [18][2500/8882] Elapsed 12m 8s (remain 30m 59s) Loss: 1.1545 Grad: 146171.7812  LR: 0.00001723  
Epoch: [18][3000/8882] Elapsed 14m 34s (remain 28m 33s) Loss: 1.1560 Grad: 129444.5625  LR: 0.00001721  
Epoch: [18][3500/8882] Elapsed 16m 59s (remain 26m 6s) Loss: 1.1546 Grad: 354286.6250  LR: 0.00001719  
Epoch: [18][4000/8882] Elapsed 19m 24s (remain 23m 40s) Loss: 1.1561 G

Epoch 18: Train Loss 1.1519, elapsed 2574.6422s


Epoch: [18][8881/8882] Elapsed 42m 54s (remain 0m 0s) Loss: 1.1519 Grad: 194914.8125  LR: 0.00001701  
Epoch: [19][0/8882] Elapsed 0m 0s (remain 52m 46s) Loss: 1.0798 Grad: 281274.4688  LR: 0.00001701  
Epoch: [19][500/8882] Elapsed 2m 26s (remain 40m 46s) Loss: 1.1318 Grad: 319144.1875  LR: 0.00001699  
Epoch: [19][1000/8882] Elapsed 4m 51s (remain 38m 12s) Loss: 1.1293 Grad: 298405.8438  LR: 0.00001697  
Epoch: [19][1500/8882] Elapsed 7m 16s (remain 35m 44s) Loss: 1.1273 Grad: 286300.0938  LR: 0.00001695  
Epoch: [19][2000/8882] Elapsed 9m 41s (remain 33m 18s) Loss: 1.1288 Grad: 499392.5938  LR: 0.00001694  
Epoch: [19][2500/8882] Elapsed 12m 6s (remain 30m 53s) Loss: 1.1259 Grad: 296599.8125  LR: 0.00001692  
Epoch: [19][3000/8882] Elapsed 14m 31s (remain 28m 28s) Loss: 1.1249 Grad: 304593.6250  LR: 0.00001690  
Epoch: [19][3500/8882] Elapsed 16m 56s (remain 26m 2s) Loss: 1.1259 Grad: 122061.7031  LR: 0.00001688  
Epoch: [19][4000/8882] Elapsed 19m 19s (remain 23m 34s) Loss: 1.1276 

Epoch 19: Train Loss 1.1337, elapsed 2564.0619s


Epoch: [19][8881/8882] Elapsed 42m 44s (remain 0m 0s) Loss: 1.1337 Grad: 349459.1875  LR: 0.00001668  
Epoch: [20][0/8882] Elapsed 0m 0s (remain 50m 23s) Loss: 0.6760 Grad: 237296.2812  LR: 0.00001668  
Epoch: [20][500/8882] Elapsed 2m 25s (remain 40m 40s) Loss: 1.1074 Grad: 158810.5156  LR: 0.00001667  
Epoch: [20][1000/8882] Elapsed 4m 50s (remain 38m 4s) Loss: 1.1166 Grad: 140198.1250  LR: 0.00001665  
Epoch: [20][1500/8882] Elapsed 7m 14s (remain 35m 35s) Loss: 1.1193 Grad: 145014.8281  LR: 0.00001663  
Epoch: [20][2000/8882] Elapsed 9m 38s (remain 33m 8s) Loss: 1.1222 Grad: 124960.6406  LR: 0.00001661  
Epoch: [20][2500/8882] Elapsed 12m 2s (remain 30m 42s) Loss: 1.1258 Grad: 310623.0312  LR: 0.00001659  
Epoch: [20][3000/8882] Elapsed 14m 26s (remain 28m 17s) Loss: 1.1234 Grad: 362238.9688  LR: 0.00001657  
Epoch: [20][3500/8882] Elapsed 16m 49s (remain 25m 51s) Loss: 1.1196 Grad: 291116.1875  LR: 0.00001655  
Epoch: [20][4000/8882] Elapsed 19m 13s (remain 23m 26s) Loss: 1.1198 G

Epoch 20: Train Loss 1.1141, elapsed 2559.5946s


Epoch: [20][8881/8882] Elapsed 42m 39s (remain 0m 0s) Loss: 1.1141 Grad: 299666.0312  LR: 0.00001635  
Epoch: [21][0/8882] Elapsed 0m 0s (remain 50m 29s) Loss: 0.9312 Grad: 269729.9062  LR: 0.00001635  
Epoch: [21][500/8882] Elapsed 2m 26s (remain 40m 51s) Loss: 1.1071 Grad: 268848.8125  LR: 0.00001633  
Epoch: [21][1000/8882] Elapsed 4m 53s (remain 38m 29s) Loss: 1.1038 Grad: 275123.0312  LR: 0.00001631  
Epoch: [21][1500/8882] Elapsed 7m 18s (remain 35m 55s) Loss: 1.1023 Grad: 317983.3438  LR: 0.00001629  
Epoch: [21][2000/8882] Elapsed 9m 44s (remain 33m 28s) Loss: 1.1013 Grad: 586422.5000  LR: 0.00001627  
Epoch: [21][2500/8882] Elapsed 12m 8s (remain 30m 59s) Loss: 1.1005 Grad: 589802.3750  LR: 0.00001625  
Epoch: [21][3000/8882] Elapsed 14m 33s (remain 28m 31s) Loss: 1.1002 Grad: 730703.6875  LR: 0.00001623  
Epoch: [21][3500/8882] Elapsed 16m 57s (remain 26m 4s) Loss: 1.0982 Grad: 619287.5625  LR: 0.00001621  
Epoch: [21][4000/8882] Elapsed 19m 22s (remain 23m 38s) Loss: 1.0972 

Epoch 21: Train Loss 1.1033, elapsed 2567.1365s


Epoch: [21][8881/8882] Elapsed 42m 47s (remain 0m 0s) Loss: 1.1033 Grad: 398109.3438  LR: 0.00001600  
Epoch: [22][0/8882] Elapsed 0m 0s (remain 51m 5s) Loss: 0.7247 Grad: 232881.0312  LR: 0.00001600  
Epoch: [22][500/8882] Elapsed 2m 25s (remain 40m 30s) Loss: 1.0909 Grad: 297500.2188  LR: 0.00001598  
Epoch: [22][1000/8882] Elapsed 4m 49s (remain 37m 55s) Loss: 1.0926 Grad: 124888.3203  LR: 0.00001596  
Epoch: [22][1500/8882] Elapsed 7m 14s (remain 35m 35s) Loss: 1.0980 Grad: 69196.6016  LR: 0.00001594  
Epoch: [22][2000/8882] Elapsed 9m 39s (remain 33m 12s) Loss: 1.1083 Grad: 77476.5078  LR: 0.00001592  
Epoch: [22][2500/8882] Elapsed 12m 4s (remain 30m 48s) Loss: 1.1119 Grad: 68669.6328  LR: 0.00001590  
Epoch: [22][3000/8882] Elapsed 14m 29s (remain 28m 23s) Loss: 1.1106 Grad: 71075.5391  LR: 0.00001588  
Epoch: [22][3500/8882] Elapsed 16m 54s (remain 25m 59s) Loss: 1.1156 Grad: 44223.9688  LR: 0.00001586  
Epoch: [22][4000/8882] Elapsed 19m 19s (remain 23m 34s) Loss: 1.1159 Grad:

Epoch 22: Train Loss 1.1251, elapsed 2572.3112s


Epoch: [22][8881/8882] Elapsed 42m 52s (remain 0m 0s) Loss: 1.1251 Grad: 85824.1406  LR: 0.00001564  
Epoch: [23][0/8882] Elapsed 0m 0s (remain 52m 53s) Loss: 0.8431 Grad: 251446.9844  LR: 0.00001564  
Epoch: [23][500/8882] Elapsed 2m 25s (remain 40m 39s) Loss: 1.1023 Grad: 290779.3125  LR: 0.00001562  
Epoch: [23][1000/8882] Elapsed 4m 50s (remain 38m 3s) Loss: 1.0940 Grad: 301840.3438  LR: 0.00001560  
Epoch: [23][1500/8882] Elapsed 7m 14s (remain 35m 37s) Loss: 1.0919 Grad: 232985.8750  LR: 0.00001558  
Epoch: [23][2000/8882] Elapsed 9m 38s (remain 33m 11s) Loss: 1.0905 Grad: 485465.1875  LR: 0.00001556  
Epoch: [23][2500/8882] Elapsed 12m 4s (remain 30m 48s) Loss: 1.0906 Grad: 242799.2969  LR: 0.00001554  
Epoch: [23][3000/8882] Elapsed 14m 29s (remain 28m 24s) Loss: 1.0879 Grad: 272803.7500  LR: 0.00001552  
Epoch: [23][3500/8882] Elapsed 16m 54s (remain 25m 59s) Loss: 1.0894 Grad: 139269.7812  LR: 0.00001550  
Epoch: [23][4000/8882] Elapsed 19m 18s (remain 23m 33s) Loss: 1.0909 G

Epoch 23: Train Loss 1.1000, elapsed 2579.9142s


Epoch: [23][8881/8882] Elapsed 42m 59s (remain 0m 0s) Loss: 1.1000 Grad: 175999.1094  LR: 0.00001527  
Epoch: [24][0/8882] Elapsed 0m 0s (remain 51m 43s) Loss: 1.1286 Grad: 288720.8125  LR: 0.00001527  
Epoch: [24][500/8882] Elapsed 2m 27s (remain 41m 11s) Loss: 1.0784 Grad: 270819.8125  LR: 0.00001525  
Epoch: [24][1000/8882] Elapsed 4m 53s (remain 38m 33s) Loss: 1.0837 Grad: 300673.1562  LR: 0.00001523  
Epoch: [24][1500/8882] Elapsed 7m 19s (remain 36m 0s) Loss: 1.0788 Grad: 269994.3125  LR: 0.00001521  
Epoch: [24][2000/8882] Elapsed 9m 45s (remain 33m 33s) Loss: 1.0773 Grad: 647841.3750  LR: 0.00001519  
Epoch: [24][2500/8882] Elapsed 12m 11s (remain 31m 6s) Loss: 1.0755 Grad: 631090.8125  LR: 0.00001517  
Epoch: [24][3000/8882] Elapsed 14m 37s (remain 28m 40s) Loss: 1.0741 Grad: 558113.7500  LR: 0.00001515  
Epoch: [24][3500/8882] Elapsed 17m 3s (remain 26m 13s) Loss: 1.0728 Grad: 600946.4375  LR: 0.00001513  
Epoch: [24][4000/8882] Elapsed 19m 29s (remain 23m 46s) Loss: 1.0724 G

Epoch 24: Train Loss 1.0734, elapsed 2589.5395s


Epoch: [24][8881/8882] Elapsed 43m 9s (remain 0m 0s) Loss: 1.0734 Grad: 630925.3125  LR: 0.00001489  
Epoch: [25][0/8882] Elapsed 0m 0s (remain 54m 15s) Loss: 1.1779 Grad: 284410.9688  LR: 0.00001489  
Epoch: [25][500/8882] Elapsed 2m 26s (remain 40m 47s) Loss: 1.0763 Grad: 317287.3438  LR: 0.00001487  
Epoch: [25][1000/8882] Elapsed 4m 51s (remain 38m 12s) Loss: 1.0736 Grad: 268723.8750  LR: 0.00001485  
Epoch: [25][1500/8882] Elapsed 7m 15s (remain 35m 41s) Loss: 1.0735 Grad: 296154.9375  LR: 0.00001483  
Epoch: [25][2000/8882] Elapsed 9m 40s (remain 33m 14s) Loss: 1.0701 Grad: 547060.0625  LR: 0.00001481  
Epoch: [25][2500/8882] Elapsed 12m 4s (remain 30m 48s) Loss: 1.0703 Grad: 618426.1250  LR: 0.00001479  
Epoch: [25][3000/8882] Elapsed 14m 28s (remain 28m 22s) Loss: 1.0695 Grad: 820062.1875  LR: 0.00001476  
Epoch: [25][3500/8882] Elapsed 16m 53s (remain 25m 57s) Loss: 1.0660 Grad: 514933.1875  LR: 0.00001474  
Epoch: [25][4000/8882] Elapsed 19m 17s (remain 23m 32s) Loss: 1.0675 