## References

- https://www.kaggle.com/yasufuminakama/nbme-deberta-base-baseline-train

## Configurations

In [1]:
EXP_NAME = "nbme-exp093"
ENV = "local"
DEBUG_MODE = False
SUBMISSION_MODE = False

In [2]:
%env TOKENIZERS_PARALLELISM=true

env: TOKENIZERS_PARALLELISM=true


In [3]:
class CFG:
    env=ENV
    exp_name=EXP_NAME
    debug=DEBUG_MODE
    submission=SUBMISSION_MODE
    apex=True
    input_dir=None
    output_dir=None
    library="pytorch"  # ["tf", "pytorch"]
    device="GPU"  # ["GPU", "TPU"]
    competition_name="nbme-score-clinical-patient-notes"
    id_col="id"
    target_col="location"
    pretrained_model_name="microsoft/deberta-v3-large"
    tokenizer=None
    max_len=None
    pseudo_plain_path='../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl'
    #pseudo_plain_path="./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl"
    n_pseudo_labels=10725
    output_dim=1
    dropout=0.2
    num_workers=4
    batch_size=3
    lr=2e-5
    betas=(0.9, 0.98)
    weight_decay=0.1
    alpha=1
    gamma=2
    smoothing=0.0001
    num_warmup_steps_rate=0.1
    batch_scheduler=True
    epochs=5
    n_fold=4
    train_fold=[0,1,2,3]
    seed=71
    gradient_accumulation_steps=2
    max_grad_norm=1000
    print_freq=100
    train=True
    inference=True

In [4]:
if CFG.debug:
    CFG.epochs = 2
    CFG.train_fold = [0, 1]

if CFG.submission:
    CFG.train = False
    CFG.inference = True

## Directory Settings

In [5]:
import sys
from pathlib import Path


print(CFG.env)
if CFG.env == "colab":
    # colab環境
    from google.colab import drive
    drive.mount("/content/drive")
    CFG.input_dir = Path("./drive/MyDrive/00.kaggle/input") / CFG.competition_name
    CFG.output_dir = Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()
    # install packages
    !pip install transformers==4.16.2
    !pip install -q sentencepiece==0.1.96

elif CFG.env == "local":
    # ローカルサーバ
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("../output/") / CFG.competition_name / CFG.exp_name
    if not CFG.output_dir.exists():
        CFG.output_dir.mkdir()

elif CFG.env == "kaggle":
    # kaggle環境
    CFG.input_dir = Path("../input/") / CFG.competition_name
    CFG.output_dir = Path("./")

local


In [6]:
# The following is necessary if you want to use the fast tokenizer for deberta v2 or v3
# This must be done before importing transformers
import shutil
from pathlib import Path

if CFG.env == "colab":
    input_dir = Path("./drive/MyDrive/00.kaggle/input/deberta-v2-3-fast-tokenizer")
    transformers_path = Path("/usr/local/lib/python3.7/dist-packages/transformers")
else:
    input_dir = Path("../input/deberta-v2-3-fast-tokenizer")
    transformers_path = Path("/opt/conda/lib/python3.7/site-packages/transformers")

convert_file = input_dir / "convert_slow_tokenizer.py"
conversion_path = transformers_path/convert_file.name

if conversion_path.exists():
    conversion_path.unlink()

shutil.copy(convert_file, transformers_path)
deberta_v2_path = transformers_path / "models" / "deberta_v2"

for filename in ['tokenization_deberta_v2.py', 'tokenization_deberta_v2_fast.py']:
    filepath = deberta_v2_path/filename
    if filepath.exists():
        filepath.unlink()

    shutil.copy(input_dir/filename, filepath)
    
    
from transformers.models.deberta_v2.tokenization_deberta_v2_fast import DebertaV2TokenizerFast

In [7]:
import gc
import os
import ast
import time
import math
import random
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from tqdm.auto import tqdm
from scipy.optimize import minimize
from sklearn.metrics import roc_auc_score, mean_squared_error, f1_score
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import torchvision.transforms as T
from torchvision.io import read_image
from torch.utils.data import DataLoader, Dataset

from transformers import AutoModelForMaskedLM
from transformers import BartModel,BertModel,BertTokenizer
from transformers import DebertaModel,DebertaTokenizer
from transformers import RobertaModel,RobertaTokenizer
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoModel,AutoConfig
from transformers import AdamW, get_linear_schedule_with_warmup
from transformers import ElectraModel, ElectraTokenizer, ElectraForSequenceClassification

import warnings
warnings.filterwarnings("ignore")

## Utilities

In [8]:
def micro_f1(preds, truths):
    """
    Micro f1 on binary arrays.

    Args:
        preds (list of lists of ints): Predictions.
        truths (list of lists of ints): Ground truths.

    Returns:
        float: f1 score.
    """
    # Micro : aggregating over all instances
    preds = np.concatenate(preds)
    truths = np.concatenate(truths)
    return f1_score(truths, preds)


def spans_to_binary(spans, length=None):
    """
    Converts spans to a binary array indicating whether each character is in the span.

    Args:
        spans (list of lists of two ints): Spans.

    Returns:
        np array [length]: Binarized spans.
    """
    length = np.max(spans) if length is None else length
    binary = np.zeros(length)
    for start, end in spans:
        binary[start:end] = 1
    return binary


def span_micro_f1(preds, truths):
    """
    Micro f1 on spans.

    Args:
        preds (list of lists of two ints): Prediction spans.
        truths (list of lists of two ints): Ground truth spans.

    Returns:
        float: f1 score.
    """
    bin_preds = []
    bin_truths = []
    for pred, truth in zip(preds, truths):
        if not len(pred) and not len(truth):
            continue
        length = max(np.max(pred) if len(pred) else 0, np.max(truth) if len(truth) else 0)
        bin_preds.append(spans_to_binary(pred, length))
        bin_truths.append(spans_to_binary(truth, length))
    return micro_f1(bin_preds, bin_truths)


def get_score(y_true, y_pred):
    score = span_micro_f1(y_true, y_pred)
    return score

In [9]:
def create_labels_for_scoring(df):
    # example: ['48 61', '111 128'] -> [[48, 61], [111, 128]]
    df["location_for_create_labels"] = [ast.literal_eval(f"[]")] * len(df)
    for i in range(len(df)):
        lst = df.loc[i, "location"]
        if lst:
            new_lst = ";".join(lst)
            df.loc[i, "location_for_create_labels"] = ast.literal_eval(f"[['{new_lst}']]")

    # create labels
    truths = []
    for location_list in df["location_for_create_labels"].values:
        truth = []
        if len(location_list) > 0:
            location = location_list[0]
            for loc in [s.split() for s in location.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                truth.append([start, end])
        truths.append(truth)

    return truths


def get_char_probs(texts, token_probs, tokenizer):
    res = [np.zeros(len(t)) for t in texts]
    for i, (text, prediction) in enumerate(zip(texts, token_probs)):
        encoded = tokenizer(
            text=text,
            max_length=CFG.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        for (offset_mapping, pred) in zip(encoded["offset_mapping"], prediction):
            start, end = offset_mapping
            res[i][start:end] = pred
    return res


def get_predicted_location_str(char_probs, th=0.5):
    results = []
    for char_prob in char_probs:
        # result = np.where(char_prob >= th)[0] + 1
        result = np.where(char_prob >= th)[0]
        result = [list(g) for _, g in itertools.groupby(result, key=lambda n, c=itertools.count(): n - next(c))]
        # result = [f"{min(r)} {max(r)}" for r in result]
        result = [f"{min(r)} {max(r) + 1}" for r in result]
        result = ";".join(result)
        results.append(result)
    return results


def get_predictions(results):
    predictions = []
    for result in results:
        prediction = []
        if result != "":
            for loc in [s.split() for s in result.split(";")]:
                start, end = int(loc[0]), int(loc[1])
                prediction.append([start, end])
        predictions.append(prediction)
    return predictions


def scoring(df, th=0.5, use_token_prob=True):
    labels = create_labels_for_scoring(df)

    if use_token_prob:
        token_probs = df[[str(i) for i in range(CFG.max_len)]].values
        char_probs = get_char_probs(df["pn_history"].values, token_probs, CFG.tokenizer)
    else:
        char_probs = df[[str(i) for i in range(CFG.max_char_len)]].values
        char_probs = [char_probs[i] for i in range(len(char_probs))]

    predicted_location_str = get_predicted_location_str(char_probs, th=th)
    preds = get_predictions(predicted_location_str)

    score = get_score(labels, preds)
    return score


def get_best_thres(oof_df):
    def f1_opt(x):
        return -1 * scoring(oof_df, th=x)

    best_thres = minimize(f1_opt, x0=np.array([0.5]), method="Nelder-Mead")["x"].item()
    return best_thres

In [10]:
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return "%dm %ds" % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return "%s (remain %s)" % (asMinutes(s), asMinutes(rs))


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

In [11]:
seed_everything()

In [12]:
def postprocess(texts, preds):
    fix_tokenize_dict = {
        'heart': ['h', 'eart'],
        'hair': ['h', 'air'],
        'adderal': ['a', 'dderal'],
        'mother': ['m', 'other'],
        'intermittent': ['i', 'ntermittent'],
        'temperature': ['t', 'emperature'],
        'episodes': ['e', 'pisodes'],
        'no': ['n', 'o'],
        'has': ['h', 'as'],
        'LMP': ['L', 'MP'],
        '10': ['1', '0'],
        'blood': ['b', 'lood'],
        'recurrent': ['r', 'ecurrent'],
        'denies': ['d', 'enies'],
        'sudden': ['s', 'udden'],
        'Sexually': ['S', 'exually'],
        'up': ['u', 'p'],
        'wakes': ['w', 'akes'],
        'sweats': ['s', 'weats'],
        'hot': ['h', 'ot'],
        'drenched': ['d', 'renched'],
        'gnawing': ['g', 'nawing'],
        'Uses': ['U', 'ses'],
        'Begin': ['B', 'egin'],
        'Nausea': ['N', 'ausea'],
        'Burning': ['B', 'urning'],
        'Started': ['S', 'tarted'],
        'neurvousness': ['n', 'eurvousness'],
        'constipation': ['c', 'onstipation'],
        'nervousness': ['n', 'ervousness'],
        'cold': ['c', 'old'],
        'loss': ['l', 'oss'],
        'CBC': ['C', 'BC'],
        'Hx': ['H', 'x'],
        'tingling': ['t', 'ingling'],
        'feels': ['f', 'eels'],
        'Lost': ['L', 'ost'],
        'she': ['s', 'he'],
        'racing': ['r', 'acing'],
        'throat': ['t', 'hroat'],
        'PATIENT': ['P', 'ATIENT'],
        'recreational': ['r', 'ecreational'],
        'clammy': ['c', 'lammy'],
        'numbness': ['n', 'umbness'],
        'like': ['l', 'ike'],
        'reports': ['r', 'eports'],
        'exercise': ['e', 'xercise'],
        'started': ['s', 'tarted'],
        'brough': ['b', 'rough'],
        'Associated': ['A', 'ssociated'],
        'exacerbated': ['e', 'xacerbated'],
        'sharp': ['s', 'harp'],
        'cannot': ['c', 'annot'],
        'heavy': ['h', 'eavy'],
        'fatigue': ['f', 'atigue'],
        'trouble': ['t', 'rouble'],
        'hearing': ['h', 'earing'],
        'reduced': ['r', 'educed'],
        'lack': ['l', 'ack'],
        'vomiting': ['v', 'omiting'],
        'generalized': ['g', 'eneralized'],
        'body': ['b', 'ody'],
        'all': ['a', 'll'],
        'scratchy': ['s', 'cratchy'],
        'mom': ['m', 'om'],
        'discomfort': ['d', 'iscomfort'],
        'CAD': ['C', 'AD'],
        'Thyroid': ['T', 'hyroid'],
        'BLADDER': ['B', 'LADDER'],
        'diarrhea': ['d', 'iarrhea'],
        'Started': ['S', 'tarted'],
        'Vaginal': ['V', 'aginal'],
        'sleeping': ['s', 'leeping'],
        'UNCLE': ['U', 'NCLE'],
        'USING': ['U', 'SING'],
        'BURNING': ['B', 'URNING'],
        'GETTING': ['G', 'ETTING'],
        'ETOH': ['E', 'TOH'],
        'ON': ['O', 'N'],
        'INITIALLY': ['I', 'NITIALLY'],
        'epigastric': ['e', 'pigastric'],
        'occurs': ['o', 'ccurs'],
        'began': ['b', 'egan'],
        'alleviated': ['a', 'lleviated'],
        'overwhelmed': ['o', 'verwhelmed'],
        'clamminess': ['c', 'lamminess'],
        'strongly': ['s', 'trongly'],
        'lump': ['l', 'ump'],
        'drugs': ['d', 'rugs'],
        'chest': ['c', 'hest'],
        'stuffy': ['s', 'tuffy'],
        'changes': ['c', 'hanges'],
        'trouble': ['t', 'rouble'],
        'takes': ['t', 'akes'],
        'tossing': ['t', 'ossing'],
        'Fam': ['F', 'am'],
        'sweating': ['s', 'weating'],
        'dyspareunia': ['d', 'yspareunia'],
        'irregular': ['i', 'rregular'],
        'time': ['t', 'ime'],
        'unpredictable': ['u', 'npredictable'],
        'darkened': ['d', 'arkened'],
        'anxiety': ['a', 'nxiety'],
        'nervous': ['n', 'ervous'],
        'TAKING': ['T', 'AKING'],
        'losing': ['l', 'osing'],
        'Difficulyt': ['D', 'ifficulyt'],
        'Appetite': ['A', 'ppetite'],
        'increased': ['i', 'ncreased'],
        'fingers': ['f', 'ingers'],
        'illicit': ['i', 'llicit'],
        'claminess': ['c', 'laminess'],
        'clamy': ['c', 'lamy'],
        'Recently': ['R', 'ecently'],
        'feeling': ['f', 'eeling'],
        'aggrav': ['a', 'ggrav'],
        'changing': ['c', 'hanging'],
        'unable': ['u', 'nable'],
        'SEEING': ['S', 'EEING'],
        'staying': ['s', 'taying'],
        'lightheadedness': ['l', 'ightheadedness'],
        'lighheadeness': ['l', 'ighheadeness'],
        'nail': ['n', 'ail'],
        'pounding': ['p', 'ounding'],
        'My': ['M', 'y'],
        'Father': ['F', 'ather'],
        'urinary': ['u', 'rinary'],
        'pain': ['p', 'ain'],
        'not': ['n', 'ot'],
        'lower': ['l', 'ower'],
        'menses': ['m', 'enses'],
        'at': ['a', 't'],
        'takes': ['t', 'akes'],
        'initally': ['i', 'nitally'],
        'melena': ['m', 'elena'],
        'BOWEL': ['B', 'OWEL'],
        'WEIGHT': ['W', 'EIGHT'],
        'difficulty': ['d', 'ifficulty'],
        'condo': ['c', 'ondo'],
        'experiences': ['e', 'xperiences'],
        'stuffy': ['s', 'tuffy'],
        'rhinorrhea': ['r', 'hinorrhea'],
        'felt': ['f', 'elt'],
        'feverish': ['f', 'everish'],
        'CYCLE': ['C', 'YCLE'],
        'tampon': ['t', 'ampon'],
        'Last': ['L', 'ast'],
        'Son': ['S', 'on'],
        'saw': ['s', 'aw'],
        'tightness': ['t', 'ightness'],
        'rash': ['r', 'ash'],
        'ibuprofen': ['i', 'buprofen'],
        'SCRATHY': ['S', 'CRATHY'],
        'PHOTOPHOBIA': ['P', 'HOTOPHOBIA'],
    }
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        pred = preds[raw_idx]
        text = texts[raw_idx]
        if len(pred) != 0:
            # pp1: indexが1から始まる予測値は0から始まるように修正 ## 0.88579 -> 0.88702
            if pred[0][0] == 1:
                preds_pp[raw_idx][0][0] = 0
            for p_index, pp in enumerate(pred):
                start, end = pred[p_index]
                # pp2: startとendが同じ予測値はstartを前に１ずらす ## 0.88702 -> 0.88714
                if start == end:
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp3: 始点が改行の場合始点を1つ後ろにずらす ## 0.88714 -> 0.88746
                if text[start] == '\n':
                    preds_pp[raw_idx][p_index][0] = start + 1
                    start = start + 1
                # pp4: 1-2などは-2で予測されることがあるので修正 ## 0.88746 -> 0.88747
                if text[start-1].isdigit() and text[start] == '-' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-1].isdigit() and text[start] == '/' and text[start+1].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp5: 67などは7で予測されることがあるので修正 ## 0.88747 -> 0.88748
                if text[start-1].isdigit() and text[start].isdigit():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp6: 文頭が大文字で始まるものは大文字部分が除かれて予測されることがあるので修正 ## 0.88748 -> 0.88761
                if text[start-2] == '.' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ',' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == ':' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                if text[start-2] == '-' and text[start-1].isupper():
                    preds_pp[raw_idx][p_index][0] = start - 1
                    start = start - 1
                # pp7: heart -> h + eart となっているようなものを修正する ## 0.88761 -> 0.88806
                for key, fix_tokenize in fix_tokenize_dict.items():
                    _s, s = fix_tokenize[0], fix_tokenize[1]
                    if text[start-1].lower() == _s.lower() and text[start:start+len(s)].lower() == s.lower():
                        preds_pp[raw_idx][p_index][0] = start - 1
                        start = start - 1
    return preds_pp

In [13]:
def get_results_from_preds_list(preds):
    results = []
    for pred in preds:
        s = []
        for p in pred:
            s.append(' '.join(list(map(str, p))))
        s = ';'.join(s)
        results.append(s)
    return results

In [14]:
def trunc_pred(texts, preds):
    preds_pp = preds.copy()
    tk0 = tqdm(range(len(preds_pp)), total=len(preds_pp))
    for raw_idx in tk0:
        text = texts[raw_idx]
        num_text = len(text)
        preds_pp[raw_idx, num_text:] = 0
    return preds_pp

In [15]:
def create_label(pn_history, location_list, max_char_len):
    label = np.zeros(max_char_len)
    label[len(pn_history):] = -1
    if len(location_list) > 0:
        for location in location_list:
            start, end = int(location[0]), int(location[1])
            label[start:end] = 1
    return label

def get_preds_from_results(results, texts, max_char_len):
    labels = []
    for idx, result in enumerate(results):
        label = create_label(texts[idx], result, max_char_len)
        labels.append(label)
    labels = np.stack(labels)
    print(labels.shape)
    return labels

## Data Loading

In [16]:
train = pd.read_csv(CFG.input_dir / "train.csv")
features = pd.read_csv(CFG.input_dir / "features.csv")
patient_notes = pd.read_csv(CFG.input_dir / "patient_notes.csv")
test = pd.read_csv(CFG.input_dir / "test.csv")

train.shape, features.shape, patient_notes.shape, test.shape

((14300, 6), (143, 3), (42146, 3), (5, 4))

In [17]:
if CFG.debug:
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    print(train.shape)

## Preprocessing

In [18]:
def preprocess_features(features):
    features.loc[features["feature_text"] == "Last-Pap-smear-I-year-ago", "feature_text"] = "Last-Pap-smear-1-year-ago"
    return features


features = preprocess_features(features)

In [19]:
features['feature_text'] = features['feature_text'].str.lower()
patient_notes['pn_history'] = patient_notes['pn_history'].str.lower()

In [20]:
train = train.merge(features, on=["feature_num", "case_num"], how="left")
train = train.merge(patient_notes, on=["pn_num", "case_num"], how="left")
test = test.merge(features, on=["feature_num", "case_num"], how="left")
test = test.merge(patient_notes, on=["pn_num", "case_num"], how="left")

train.shape, test.shape

((14300, 8), (5, 6))

In [21]:
train["annotation"] = train["annotation"].apply(ast.literal_eval)
train["location"] = train["location"].apply(ast.literal_eval)

In [22]:
train["annotation_length"] = train["annotation"].apply(len)
display(train['annotation_length'].value_counts().sort_index())

0    4399
1    8181
2    1296
3     287
4      99
5      27
6       9
7       1
8       1
Name: annotation_length, dtype: int64

## CV split

In [23]:
Fold = GroupKFold(n_splits=CFG.n_fold)
groups = train['pn_num'].values
for n, (train_index, val_index) in enumerate(Fold.split(train, train['location'], groups)):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    3575
1    3575
2    3575
3    3575
dtype: int64

## Setup tokenizer

In [24]:
if CFG.submission:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(Path("../input/") / CFG.exp_name / "tokenizer/")
else:
    tokenizer = DebertaV2TokenizerFast.from_pretrained(CFG.pretrained_model_name)
    tokenizer.save_pretrained(CFG.output_dir / "tokenizer/")

CFG.tokenizer = tokenizer

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


## Create dataset

In [25]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    pn_history_lengths.append(length)

print("max length:", np.max(pn_history_lengths))

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 284


In [26]:
feature_text_lengths = []
tk0 = tqdm(features["feature_text"].fillna("").values, total=len(features))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)["input_ids"])
    feature_text_lengths.append(length)

print("max length:", np.max(feature_text_lengths))

  0%|          | 0/143 [00:00<?, ?it/s]

max length: 28


In [27]:
CFG.max_len = max(pn_history_lengths) + max(feature_text_lengths) + 3   # cls & sep & sep

print("max length:", CFG.max_len)

max length: 315


In [28]:
pn_history_lengths = []
tk0 = tqdm(patient_notes["pn_history"].fillna("").values, total=len(patient_notes))
for text in tk0:
    length = len(text)
    pn_history_lengths.append(length)

CFG.max_char_len = max(pn_history_lengths)

print("max length:", CFG.max_char_len)

  0%|          | 0/42146 [00:00<?, ?it/s]

max length: 950


In [29]:
class TrainingDataset(Dataset):
    def __init__(self, cfg, df, pseudo_label=None):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values
        self.annotation_lengths = self.df["annotation_length"].values
        self.locations = self.df["location"].values
        if "pseudo_idx" in df.columns:
            self.pseudo_idx = self.df["pseudo_idx"].values
            self.pseudo_label = pseudo_label

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def _create_label(self, pn_history, annotation_length, location_list):
        label = np.zeros(self.max_char_len)
        label[len(pn_history):] = -1
        if annotation_length > 0:
            for location in location_list:
                for loc in [s.split() for s in location.split(";")]:
                    start, end = int(loc[0]), int(loc[1])
                    label[start:end] = 1
        return torch.tensor(label, dtype=torch.float)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        if not np.isnan(self.annotation_lengths[idx]):
            label = self._create_label(self.pn_historys[idx], self.annotation_lengths[idx], self.locations[idx])
        else:
            p_idx = int(self.pseudo_idx[idx])
            label = torch.tensor(self.pseudo_label[p_idx], dtype=torch.float)
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, label, mapping_from_token_to_char

In [30]:
class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.df = df
        self.tokenizer = self.cfg.tokenizer
        self.max_len = self.cfg.max_len
        self.max_char_len = self.cfg.max_char_len
        self.feature_texts = self.df["feature_text"].values
        self.pn_historys = self.df["pn_history"].values

    def __len__(self):
        return len(self.df)

    def _create_input(self, pn_history, feature_text):
        encoded = self.tokenizer(
            text=pn_history,
            text_pair=feature_text,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=False,
        )
        for k, v in encoded.items():
            encoded[k] = torch.tensor(v, dtype=torch.long)
        return encoded

    def _create_mapping_from_token_to_char(self, pn_history):
        encoded = self.tokenizer(
            text=pn_history,
            max_length=self.max_len,
            padding="max_length",
            return_offsets_mapping=True,
        )
        mapping_from_token_to_char = np.zeros(self.max_char_len)
        offset_mapping = encoded["offset_mapping"]
        for i, offset in enumerate(offset_mapping):
            start_idx, end_idx = offset
            mapping_from_token_to_char[start_idx:end_idx] = i
        return torch.tensor(mapping_from_token_to_char, dtype=torch.long)

    def __getitem__(self, idx):
        input_ = self._create_input(self.pn_historys[idx], self.feature_texts[idx])
        mapping_from_token_to_char = self._create_mapping_from_token_to_char(self.pn_historys[idx])
        return input_, mapping_from_token_to_char

## Model

In [31]:
from transformers.modeling_outputs import MaskedLMOutput

class MaskedModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(
                cfg.pretrained_model_name,
                output_hidden_states=False
                )
        else:
            self.config = torch.load(config_path)
        
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.pretrained_model_name, config=self.config)
            self.lm_head = AutoModelForMaskedLM.from_pretrained(cfg.pretrained_model_name, config=self.config).cls # [cls, lm_head]
        else:
            self.model = AutoModel(self.config)
            self.lm_head = AutoModelForMaskedLM(self.config).cls # [cls, lm_head]
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(
            self, 
            input_ids=None,
            attention_mask=None,
            token_type_ids=None,
            #position_ids=None,
            inputs_embeds=None,
            labels=None,
            output_attentions=None,
            output_hidden_states=None,
            return_dict=None):
        
        outputs = self.model(
            input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            #position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,)
        
        sequence_output = outputs[0]
        prediction_scores = self.lm_head(sequence_output)

        masked_lm_loss = None
        if labels is not None:
            loss_fct = nn.CrossEntropyLoss()
            masked_lm_loss = loss_fct(prediction_scores.view(-1, self.config.vocab_size), labels.view(-1))

        return MaskedLMOutput(loss=masked_lm_loss,
                              logits=prediction_scores,
                              hidden_states=outputs.hidden_states,
                              attentions=outputs.attentions)

In [32]:
class CustomModel(nn.Module):
    def __init__(self, cfg, model_config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg

        if model_config_path is None:
            self.model_config = AutoConfig.from_pretrained(
                self.cfg.pretrained_model_name,
                output_hidden_states=True,
            )
        else:
            self.model_config = torch.load(model_config_path)

        if pretrained:
            self.backbone = AutoModel.from_pretrained(
                self.cfg.pretrained_model_name,
                config=self.model_config,
            )
            print(f"Load weight from pretrained")
        else:
            #self.backbone = AutoModel.from_config(self.model_config)
            # itpt = AutoModelForMaskedLM.from_config(self.model_config)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp010/checkpoint-130170/pytorch_model.bin")
            # path = "../output/nbme-score-clinical-patient-notes/nbme-exp010/checkpoint-130170/pytorch_model.bin"
            # state_dict = torch.load(path)
            # itpt.load_state_dict(state_dict)
            #path = str(Path("./drive/MyDrive/00.kaggle/output") / CFG.competition_name /  "nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin")
            path = str(Path("../output") / CFG.competition_name /  "nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin")
            masked_model = MaskedModel(CFG, config_path=None, pretrained=True)
            state = torch.load(path, map_location=torch.device("cpu"))
            masked_model.load_state_dict(state)
            self.backbone = masked_model.model
            print(f"Load weight from {path}")
            del state, masked_model; gc.collect()

        self.lstm = nn.GRU(
            input_size=self.model_config.hidden_size,
            bidirectional=True,
            hidden_size=self.model_config.hidden_size // 2,
            num_layers=4,
            dropout=self.cfg.dropout,
            batch_first=True,
        )
        self.fc = nn.Sequential(
            nn.Dropout(self.cfg.dropout),
            nn.Linear(self.model_config.hidden_size, self.cfg.output_dim),
        )

    def forward(self, inputs, mappings_from_token_to_char):
        h = self.backbone(**inputs)["last_hidden_state"]  # [batch, seq_len, d_model]
        mappings_from_token_to_char = mappings_from_token_to_char.unsqueeze(2).expand(-1, -1, self.model_config.hidden_size)
        h = torch.gather(h, 1, mappings_from_token_to_char)    # [batch, seq_len, d_model]
        h, _ = self.lstm(h)
        output = self.fc(h)

        return output

## Training

In [33]:
class FocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super().__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        bce_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-bce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * bce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss


class SmoothFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super().__init__()
        self.reduction = reduction
        self.focal_loss = FocalLoss(reduction='none', alpha=alpha, gamma=gamma)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        targets = SmoothFocalLoss._smooth(targets, self.smoothing)
        loss = self.focal_loss(inputs, targets)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class CEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2):
        super(CEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

    
class SmoothCEFocalLoss(nn.Module):
    def __init__(self, reduction='none', alpha=1, gamma=2, smoothing=0.0):
        super(SmoothCEFocalLoss, self).__init__()
        self.reduction = reduction
        self.alpha = alpha
        self.gamma = gamma
        self.smoothing = smoothing

    def forward(self, inputs, targets):
        ce_loss = F.cross_entropy(inputs, targets, reduction='none', label_smoothing=self.smoothing) # torch >= 1.10.0
        pt = torch.exp(-ce_loss)
        loss = self.alpha * (1. - pt)**self.gamma * ce_loss
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

In [34]:
def train_fn(
    train_dataloader,
    model,
    criterion,
    optimizer,
    epoch,
    scheduler,
    device,
):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(train_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()

        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)

        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()

        if CFG.batch_scheduler:
            scheduler.step()

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_dataloader)-1):
            print(
                "Epoch: [{0}][{1}/{2}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                "Grad: {grad_norm:.4f}  "
                "LR: {lr:.6f}  "
                .format(
                    epoch+1,
                    step,
                    len(train_dataloader),
                    remain=timeSince(start, float(step+1) / len(train_dataloader)),
                    loss=losses,
                     grad_norm=grad_norm,
                     lr=scheduler.get_lr()[0],
                )
            )
    del output, loss, inputs, labels, mappings_from_token_to_char, scaler, grad_norm; gc.collect()
    torch.cuda.empty_cache()
    return losses.avg

In [35]:
def valid_fn(
    val_dataloader,
    model,
    criterion,
    device,
):
    model.eval()
    preds = []
    losses = AverageMeter()
    start = time.time()
    for step, (inputs, labels, mappings_from_token_to_char) in enumerate(val_dataloader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device) 
        batch_size = labels.size(0)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.cuda.amp.autocast(enabled=CFG.apex):
            output = model(inputs, mappings_from_token_to_char)

        loss = criterion(output.view(-1, 1), labels.view(-1, 1))
        loss = torch.masked_select(loss, labels.view(-1, 1) != -1)
        loss = loss.mean()
    
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())

        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(val_dataloader)-1):
            print(
                "EVAL: [{0}/{1}] "
                "Elapsed {remain:s} "
                "Loss: {loss.val:.4f}({loss.avg:.4f}) "
                .format(
                    step, len(val_dataloader),
                    remain=timeSince(start, float(step+1) / len(val_dataloader)),
                    loss=losses,
                )
            )
    preds = np.concatenate(preds)
    return losses.avg, preds

In [36]:
def inference_fn(test_dataloader, model, device):
    model.eval()
    model.to(device)
    preds = []
    tk0 = tqdm(test_dataloader, total=len(test_dataloader))
    for (inputs, mappings_from_token_to_char) in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        mappings_from_token_to_char = mappings_from_token_to_char.to(device)

        with torch.no_grad():
            output = model(inputs, mappings_from_token_to_char)
        preds.append(output.sigmoid().squeeze(2).detach().cpu().numpy())
    preds = np.concatenate(preds)
    return preds

In [37]:
def train_loop(df, i_fold, device):
    print(f"========== fold: {i_fold} training ==========")
    train_idx = df[df["fold"] != i_fold].index
    val_idx = df[df["fold"] == i_fold].index

    train_folds = df.loc[train_idx].reset_index(drop=True)
    val_folds = df.loc[val_idx].reset_index(drop=True)

    if CFG.pseudo_plain_path is not None:
        pseudo_plain = pd.read_pickle(CFG.pseudo_plain_path)
        print(f"get pseudo plain from {CFG.pseudo_plain_path}")
        pseudo_label_list = []
        weights = [0.4433659049657008, 0.20859987143371844, 0.3480342236005807]
        for exp_name in ["nbme-exp060", "nbme-exp067", "nbme-exp083"]:
            #pseudo_label_path = f'./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
            pseudo_label_path = f'../output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
            pseudo_label = np.load(pseudo_label_path)
            print(f"get pseudo labels from {pseudo_label_path}")
            pseudo_label_list.append(pseudo_label)

        pseudo_label = weights[0] * pseudo_label_list[0] + weights[1] * pseudo_label_list[1] + weights[2] * pseudo_label_list[2]
        pseudo_label = trunc_pred(pseudo_plain["pn_history"].values, pseudo_label)
        predicted_location_str = get_predicted_location_str(pseudo_label, th=0.5)
        preds = get_predictions(predicted_location_str)
        results_postprocess = postprocess(pseudo_plain["pn_history"].values, preds)
        #results_postprocess = get_results_from_preds_list(results_postprocess)
        pseudo_label = get_preds_from_results(results_postprocess, pseudo_plain["pn_history"].values, pseudo_label.shape[1])
        print(pseudo_plain.shape, pseudo_label.shape)

        pseudo_plain['feature_text'] = pseudo_plain['feature_text'].str.lower()
        pseudo_plain['pn_history'] = pseudo_plain['pn_history'].str.lower()

        pseudo_plain["pseudo_idx"] = np.arange(len(pseudo_plain))
        pseudo_plain = pseudo_plain.sample(n=CFG.n_pseudo_labels)
        print(pseudo_plain.shape)
        train_folds = pd.concat([train_folds, pseudo_plain], axis=0, ignore_index=True)
        print(train_folds.shape)

    train_dataset = TrainingDataset(CFG, train_folds, pseudo_label)
    val_dataset = TrainingDataset(CFG, val_folds)

    train_dataloader = DataLoader(
        train_dataset,
        batch_size=CFG.batch_size,
        shuffle=True,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=True,
    )
    val_dataloader = DataLoader(
        val_dataset,
        batch_size=CFG.batch_size,
        shuffle=False,
        num_workers=CFG.num_workers,
        pin_memory=True,
        drop_last=False,
    )

    # model = CustomModel(CFG, model_config_path=None, pretrained=True)
    model = CustomModel(CFG, model_config_path=None, pretrained=False)   # itptを使うため
    torch.save(model.model_config, CFG.output_dir / "model_config.pth")
    model.to(device)

    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
    optimizer_grouped_parameters = [
        {"params": [p for n, p in param_optimizer if not any(
            nd in n for nd in no_decay)], "weight_decay": CFG.weight_decay},
        {"params": [p for n, p in param_optimizer if any(
            nd in n for nd in no_decay)], "weight_decay": 0.0}
    ]
    optimizer = AdamW(
        optimizer_grouped_parameters,
        lr=CFG.lr,
        betas=CFG.betas,
        weight_decay=CFG.weight_decay,
    )
    num_train_optimization_steps = int(len(train_dataloader) * CFG.epochs * 2.5)
    num_warmup_steps = int(num_train_optimization_steps * CFG.num_warmup_steps_rate)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=num_train_optimization_steps,
    )

    criterion = SmoothFocalLoss(reduction='none', alpha=CFG.alpha, gamma=CFG.gamma, smoothing=CFG.smoothing)
    #criterion = nn.BCEWithLogitsLoss(reduction="none")
    best_score = -1 * np.inf
    """

    for epoch in range(CFG.epochs):
        if CFG.pseudo_plain_path is not None:
            pseudo_plain = pd.read_pickle(CFG.pseudo_plain_path)
            print(f"get pseudo plain from {CFG.pseudo_plain_path}")
            pseudo_label_list = []
            weights = [0.4433659049657008, 0.20859987143371844, 0.3480342236005807]
            for exp_name in ["nbme-exp060", "nbme-exp067", "nbme-exp083"]:
                #pseudo_label_path = f'./drive/MyDrive/00.kaggle/output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
                pseudo_label_path = f'../output/nbme-score-clinical-patient-notes/{exp_name}/pseudo_labels_{i_fold}.npy'
                pseudo_label = np.load(pseudo_label_path)
                print(f"get pseudo labels from {pseudo_label_path}")
                pseudo_label_list.append(pseudo_label)

            pseudo_label = weights[0] * pseudo_label_list[0] + weights[1] * pseudo_label_list[1] + weights[2] * pseudo_label_list[2]
            pseudo_label = trunc_pred(pseudo_plain["pn_history"].values, pseudo_label)
            predicted_location_str = get_predicted_location_str(pseudo_label, th=0.5)
            preds = get_predictions(predicted_location_str)
            results_postprocess = postprocess(pseudo_plain["pn_history"].values, preds)
            #results_postprocess = get_results_from_preds_list(results_postprocess)
            pseudo_label = get_preds_from_results(results_postprocess, pseudo_plain["pn_history"].values, pseudo_label.shape[1])
            print(pseudo_plain.shape, pseudo_label.shape)

            pseudo_plain['feature_text'] = pseudo_plain['feature_text'].str.lower()
            pseudo_plain['pn_history'] = pseudo_plain['pn_history'].str.lower()

            pseudo_plain["pseudo_idx"] = np.arange(len(pseudo_plain))
            pseudo_plain = pseudo_plain.sample(n=CFG.n_pseudo_labels)
            print(pseudo_plain.shape)
            train_folds = pd.concat([train_folds, pseudo_plain], axis=0, ignore_index=True)
            print(train_folds.shape)

        train_dataset = TrainingDataset(CFG, train_folds, pseudo_label)
        val_dataset = TrainingDataset(CFG, val_folds)

        train_dataloader = DataLoader(
            train_dataset,
            batch_size=CFG.batch_size,
            shuffle=True,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=True,
        )
        val_dataloader = DataLoader(
            val_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )

        start_time = time.time()
        avg_loss = train_fn(
            train_dataloader,
            model,
            criterion,
            optimizer,
            epoch,
            scheduler,
            device,
        )
        avg_val_loss, val_preds = valid_fn(
            val_dataloader,
            model,
            criterion,
            device,
        )

        if isinstance(scheduler, optim.lr_scheduler.CosineAnnealingWarmRestarts):
            scheduler.step()

        # scoring
        val_folds[[str(i) for i in range(CFG.max_char_len)]] = val_preds
        score = scoring(val_folds, th=0.5, use_token_prob=False)

        elapsed = time.time() - start_time

        print(f"Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s")
        print(f"Epoch {epoch+1} - Score: {score:.4f}")
        if score > best_score:
            best_score = score
            print(f"Epoch {epoch+1} - Save Best Score: {score:.4f} Model")
            torch.save({
                "model": model.state_dict(),
                "predictions": val_preds,
                },
                CFG.output_dir / f"fold{i_fold}_best.pth",
            )
    """

    predictions = torch.load(
        CFG.output_dir / f"fold{i_fold}_best.pth",
        map_location=torch.device("cpu"),
    )["predictions"]
    val_folds[[str(i) for i in range(CFG.max_char_len)]] = predictions

    torch.cuda.empty_cache()
    gc.collect()

    return val_folds

## Main

In [38]:
def main():
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    if CFG.train:
        oof_df = pd.DataFrame()
        for i_fold in range(CFG.n_fold):
            if i_fold in CFG.train_fold:
                _oof_df = train_loop(train, i_fold, device)
                oof_df = pd.concat([oof_df, _oof_df], axis=0, ignore_index=True)
        oof_df.to_pickle(CFG.output_dir / "oof_df.pkl")

    if CFG.submission:
        oof_df = pd.read_pickle(Path("../input/") / CFG.exp_name / "oof_df.pkl")
    else:
        oof_df = pd.read_pickle(CFG.output_dir / "oof_df.pkl")

    best_thres = 0.5
    best_score = 0.
    for th in np.arange(0.45, 0.55, 0.01):
        th = np.round(th, 2)
        score = scoring(oof_df, th=th, use_token_prob=False)
        if best_score < score:
            best_thres = th
            best_score = score
    print(f"best_thres: {best_thres}  score: {best_score:.5f}")

    if CFG.inference:
        test_dataset = TestDataset(CFG, test)
        test_dataloader = DataLoader(
            test_dataset,
            batch_size=CFG.batch_size,
            shuffle=False,
            num_workers=CFG.num_workers,
            pin_memory=True,
            drop_last=False,
        )
        predictions = []
        for i_fold in CFG.train_fold:
            if CFG.submission:
                model = CustomModel(CFG, model_config_path=Path("../input/") / CFG.exp_name / "model_config.pth", pretrained=False)
                path = Path("../input/") / CFG.exp_name / f"fold{i_fold}_best.pth"
            else:
                model = CustomModel(CFG, model_config_path=None, pretrained=True)
                path = CFG.output_dir / f"fold{i_fold}_best.pth"

            state = torch.load(path, map_location=torch.device("cpu"))
            model.load_state_dict(state["model"])
            print(f"load weights from {path}")
            test_char_probs = inference_fn(test_dataloader, model, device)
            predictions.append(test_char_probs)

            del state, test_char_probs, model; gc.collect()
            torch.cuda.empty_cache()

        predictions = np.mean(predictions, axis=0)
        predicted_location_str = get_predicted_location_str(predictions, th=best_thres)
        test[CFG.target_col] = predicted_location_str
        test.to_csv(CFG.output_dir / "raw_submission.csv", index=False)
        test[[CFG.id_col, CFG.target_col]].to_csv(
            CFG.output_dir / "submission.csv", index=False
        )

In [39]:
if __name__ == "__main__":
    main()

get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_2.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_2.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_2.npy


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_3.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_3.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_3.npy


  0%|          | 0/612602 [00:00<?, ?it/s]

  0%|          | 0/612602 [00:00<?, ?it/s]

(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the 

Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
best_thres: 0.47  score: 0.89362


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.dense.weight', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'mask_predictions.classifier.weight', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp093/fold0_best.pth


  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiproc

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp093/fold1_best.pth


  0%|          | 0/2 [00:01<?, ?it/s]

Exception ignored in: <function _ConnectionBase.__del__ at 0x7f4ffca4b170>
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 132, in __del__
Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _fee

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp093/fold2_best.pth


  0%|          | 0/2 [00:00<?, ?it/s]

Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: 

Load weight from pretrained
load weights from ../output/nbme-score-clinical-patient-notes/nbme-exp093/fold3_best.pth


  0%|          | 0/2 [00:00<?, ?it/s]

Exception ignored in: Exception in thread QueueFeederThread:
Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 232, in _feed
    close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 177, in close
    self._close()
  File "/opt/conda/lib/python3.7/multiprocessing/connection.py", line 361, in _close
    _close(self._handle)
OSError: [Errno 9] Bad file descriptor

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/opt/conda/lib/python3.7/threading.py", line 926, in _bootstrap_inner
    self.run()
  File "/opt/conda/lib/python3.7/threading.py", line 870, in run
    self._target(*self._args, **self._kwargs)
  File "/opt/conda/lib/python3.7/multiprocessing/queues.py", line 263, in _feed
    queue_sem.release()
ValueError: semaphore or lock released too many times

<function _ConnectionBase.__del__ at 0x7f4ffca4b170>
Traceback (most recent call last):
  Fil

========== fold: 0 training ==========
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 584601.52it/s]
100%
612602/612602 [00:44<00:00, 15566.12it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 564935.46it/s]
100%
612602/612602 [00:44<00:00, 16035.62it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(32175, 11)
Epoch: [1][0/10725] Elapsed 0m 2s (remain 451m 48s) Loss: 0.0858(0.0858) Grad: 82686.9141  LR: 0.000000  
Epoch: [1][100/10725] Elapsed 1m 20s (remain 141m 48s) Loss: 0.0662(0.0807) Grad: 67109.2578  LR: 0.000001  
Epoch: [1][200/10725] Elapsed 2m 38s (remain 138m 24s) Loss: 0.0325(0.0657) Grad: 31623.5156  LR: 0.000001  
Epoch: [1][300/10725] Elapsed 3m 55s (remain 135m 53s) Loss: 0.0104(0.0505) Grad: 4259.9985  LR: 0.000002  
Epoch: [1][400/10725] Elapsed 5m 11s (remain 133m 28s) Loss: 0.0181(0.0412) Grad: 6045.1685  LR: 0.000002  
Epoch: [1][500/10725] Elapsed 6m 27s (remain 131m 51s) Loss: 0.0092(0.0354) Grad: 2830.6404  LR: 0.000003  
Epoch: [1][600/10725] Elapsed 7m 44s (remain 130m 24s) Loss: 0.0116(0.0313) Grad: 4078.9724  LR: 0.000003  
Epoch: [1][700/10725] Elapsed 9m 2s (remain 129m 12s) Loss: 0.0107(0.0286) Grad: 3505.4036  LR: 0.000004  
Epoch: [1][800/10725] Elapsed 10m 18s (remain 127m 47s) Loss: 0.0186(0.0265) Grad: 7944.4766  LR: 0.000004  
Epoch: [1][900/10725] Elapsed 11m 35s (remain 126m 27s) Loss: 0.0076(0.0248) Grad: 4796.7236  LR: 0.000005  
Epoch: [1][1000/10725] Elapsed 12m 53s (remain 125m 16s) Loss: 0.0090(0.0233) Grad: 4150.1299  LR: 0.000006  
Epoch: [1][1100/10725] Elapsed 14m 11s (remain 124m 0s) Loss: 0.0083(0.0222) Grad: 8220.5684  LR: 0.000006  
Epoch: [1][1200/10725] Elapsed 15m 28s (remain 122m 45s) Loss: 0.0054(0.0210) Grad: 5974.5332  LR: 0.000007  
Epoch: [1][1300/10725] Elapsed 16m 47s (remain 121m 37s) Loss: 0.0084(0.0198) Grad: 35930.1719  LR: 0.000007  
Epoch: [1][1400/10725] Elapsed 18m 6s (remain 120m 30s) Loss: 0.0026(0.0189) Grad: 10715.6104  LR: 0.000008  
Epoch: [1][1500/10725] Elapsed 19m 24s (remain 119m 16s) Loss: 0.0007(0.0179) Grad: 5286.9033  LR: 0.000008  
Epoch: [1][1600/10725] Elapsed 20m 42s (remain 118m 0s) Loss: 0.0091(0.0171) Grad: 10863.9043  LR: 0.000009  
Epoch: [1][1700/10725] Elapsed 21m 59s (remain 116m 38s) Loss: 0.0004(0.0163) Grad: 966.8531  LR: 0.000010  
Epoch: [1][1800/10725] Elapsed 23m 16s (remain 115m 18s) Loss: 0.0006(0.0156) Grad: 5405.3135  LR: 0.000010  
Epoch: [1][1900/10725] Elapsed 24m 33s (remain 114m 1s) Loss: 0.0002(0.0150) Grad: 3130.4922  LR: 0.000011  
Epoch: [1][2000/10725] Elapsed 25m 51s (remain 112m 43s) Loss: 0.0052(0.0145) Grad: 30547.7520  LR: 0.000011  
Epoch: [1][2100/10725] Elapsed 27m 8s (remain 111m 24s) Loss: 0.0002(0.0139) Grad: 577.3871  LR: 0.000012  
Epoch: [1][2200/10725] Elapsed 28m 26s (remain 110m 7s) Loss: 0.0008(0.0135) Grad: 2099.9797  LR: 0.000012  
Epoch: [1][2300/10725] Elapsed 29m 45s (remain 108m 56s) Loss: 0.0008(0.0130) Grad: 3609.4392  LR: 0.000013  
Epoch: [1][2400/10725] Elapsed 31m 2s (remain 107m 36s) Loss: 0.0071(0.0126) Grad: 14964.7354  LR: 0.000013  
Epoch: [1][2500/10725] Elapsed 32m 19s (remain 106m 18s) Loss: 0.0013(0.0123) Grad: 4783.9751  LR: 0.000014  
Epoch: [1][2600/10725] Elapsed 33m 36s (remain 104m 57s) Loss: 0.0003(0.0119) Grad: 517.4664  LR: 0.000015  
Epoch: [1][2700/10725] Elapsed 34m 53s (remain 103m 38s) Loss: 0.0007(0.0116) Grad: 3630.9780  LR: 0.000015  
Epoch: [1][2800/10725] Elapsed 36m 10s (remain 102m 20s) Loss: 0.0014(0.0112) Grad: 5090.0396  LR: 0.000016  
Epoch: [1][2900/10725] Elapsed 37m 27s (remain 101m 0s) Loss: 0.0011(0.0109) Grad: 3303.5439  LR: 0.000016  
Epoch: [1][3000/10725] Elapsed 38m 44s (remain 99m 43s) Loss: 0.0009(0.0107) Grad: 4445.2114  LR: 0.000017  
Epoch: [1][3100/10725] Elapsed 40m 1s (remain 98m 25s) Loss: 0.0015(0.0105) Grad: 7967.1519  LR: 0.000017  
Epoch: [1][3200/10725] Elapsed 41m 19s (remain 97m 7s) Loss: 0.0005(0.0102) Grad: 1101.0281  LR: 0.000018  
Epoch: [1][3300/10725] Elapsed 42m 36s (remain 95m 49s) Loss: 0.0013(0.0100) Grad: 2689.5977  LR: 0.000018  
Epoch: [1][3400/10725] Elapsed 43m 54s (remain 94m 32s) Loss: 0.0038(0.0098) Grad: 11662.3467  LR: 0.000019  
Epoch: [1][3500/10725] Elapsed 45m 12s (remain 93m 17s) Loss: 0.0012(0.0096) Grad: 9334.3740  LR: 0.000020  
Epoch: [1][3600/10725] Elapsed 46m 30s (remain 92m 1s) Loss: 0.0008(0.0094) Grad: 13355.4102  LR: 0.000020  
Epoch: [1][3700/10725] Elapsed 47m 47s (remain 90m 42s) Loss: 0.0007(0.0092) Grad: 4803.0005  LR: 0.000020  
Epoch: [1][3800/10725] Elapsed 49m 4s (remain 89m 23s) Loss: 0.0017(0.0090) Grad: 7089.7837  LR: 0.000020  
Epoch: [1][3900/10725] Elapsed 50m 21s (remain 88m 5s) Loss: 0.0016(0.0089) Grad: 3805.6160  LR: 0.000020  
Epoch: [1][4000/10725] Elapsed 51m 39s (remain 86m 48s) Loss: 0.0012(0.0087) Grad: 20088.4961  LR: 0.000020  
Epoch: [1][4100/10725] Elapsed 52m 57s (remain 85m 32s) Loss: 0.0052(0.0085) Grad: 78357.7578  LR: 0.000020  
Epoch: [1][4200/10725] Elapsed 54m 16s (remain 84m 16s) Loss: 0.0005(0.0084) Grad: 11089.3135  LR: 0.000020  
Epoch: [1][4300/10725] Elapsed 55m 34s (remain 83m 1s) Loss: 0.0028(0.0083) Grad: 17615.9219  LR: 0.000020  
Epoch: [1][4400/10725] Elapsed 56m 51s (remain 81m 42s) Loss: 0.0004(0.0081) Grad: 648.7901  LR: 0.000019  
Epoch: [1][4500/10725] Elapsed 58m 9s (remain 80m 25s) Loss: 0.0000(0.0080) Grad: 22.8469  LR: 0.000019  
Epoch: [1][4600/10725] Elapsed 59m 27s (remain 79m 7s) Loss: 0.0004(0.0079) Grad: 8893.1543  LR: 0.000019  
Epoch: [1][4700/10725] Elapsed 60m 43s (remain 77m 48s) Loss: 0.0019(0.0077) Grad: 11896.6592  LR: 0.000019  
Epoch: [1][4800/10725] Elapsed 62m 0s (remain 76m 30s) Loss: 0.0009(0.0076) Grad: 34727.6797  LR: 0.000019  
Epoch: [1][4900/10725] Elapsed 63m 17s (remain 75m 12s) Loss: 0.0003(0.0075) Grad: 1250.4675  LR: 0.000019  
Epoch: [1][5000/10725] Elapsed 64m 34s (remain 73m 54s) Loss: 0.0015(0.0074) Grad: 15043.7598  LR: 0.000019  
Epoch: [1][5100/10725] Elapsed 65m 52s (remain 72m 37s) Loss: 0.0018(0.0073) Grad: 19192.1836  LR: 0.000019  
Epoch: [1][5200/10725] Elapsed 67m 9s (remain 71m 19s) Loss: 0.0002(0.0072) Grad: 815.7847  LR: 0.000019  
Epoch: [1][5300/10725] Elapsed 68m 27s (remain 70m 2s) Loss: 0.0001(0.0071) Grad: 829.3785  LR: 0.000019  
Epoch: [1][5400/10725] Elapsed 69m 44s (remain 68m 44s) Loss: 0.0001(0.0070) Grad: 502.9657  LR: 0.000019  
Epoch: [1][5500/10725] Elapsed 71m 2s (remain 67m 27s) Loss: 0.0083(0.0069) Grad: 49128.3398  LR: 0.000019  
Epoch: [1][5600/10725] Elapsed 72m 19s (remain 66m 9s) Loss: 0.0000(0.0069) Grad: 98.0384  LR: 0.000019  
Epoch: [1][5700/10725] Elapsed 73m 35s (remain 64m 51s) Loss: 0.0027(0.0068) Grad: 26253.2500  LR: 0.000019  
Epoch: [1][5800/10725] Elapsed 74m 53s (remain 63m 34s) Loss: 0.0003(0.0067) Grad: 1002.4453  LR: 0.000019  
Epoch: [1][5900/10725] Elapsed 76m 11s (remain 62m 17s) Loss: 0.0003(0.0066) Grad: 594.8950  LR: 0.000019  
Epoch: [1][6000/10725] Elapsed 77m 30s (remain 61m 1s) Loss: 0.0071(0.0065) Grad: 69016.5703  LR: 0.000018  
Epoch: [1][6100/10725] Elapsed 78m 48s (remain 59m 44s) Loss: 0.0008(0.0064) Grad: 1521.1774  LR: 0.000018  
Epoch: [1][6200/10725] Elapsed 80m 7s (remain 58m 27s) Loss: 0.0004(0.0064) Grad: 7725.7559  LR: 0.000018  
Epoch: [1][6300/10725] Elapsed 81m 24s (remain 57m 9s) Loss: 0.0000(0.0063) Grad: 83.0081  LR: 0.000018  
Epoch: [1][6400/10725] Elapsed 82m 42s (remain 55m 52s) Loss: 0.0049(0.0062) Grad: 34083.2500  LR: 0.000018  
Epoch: [1][6500/10725] Elapsed 84m 0s (remain 54m 35s) Loss: 0.0001(0.0062) Grad: 446.7029  LR: 0.000018  
Epoch: [1][6600/10725] Elapsed 85m 18s (remain 53m 18s) Loss: 0.0001(0.0061) Grad: 1695.9106  LR: 0.000018  
Epoch: [1][6700/10725] Elapsed 86m 36s (remain 52m 0s) Loss: 0.0002(0.0060) Grad: 2978.1379  LR: 0.000018  
Epoch: [1][6800/10725] Elapsed 87m 52s (remain 50m 42s) Loss: 0.0004(0.0060) Grad: 3797.7410  LR: 0.000018  
Epoch: [1][6900/10725] Elapsed 89m 10s (remain 49m 24s) Loss: 0.0001(0.0059) Grad: 107.2918  LR: 0.000018  
Epoch: [1][7000/10725] Elapsed 90m 28s (remain 48m 7s) Loss: 0.0012(0.0059) Grad: 20759.1113  LR: 0.000018  
Epoch: [1][7100/10725] Elapsed 91m 45s (remain 46m 49s) Loss: 0.0001(0.0058) Grad: 300.0518  LR: 0.000018  
Epoch: [1][7200/10725] Elapsed 93m 3s (remain 45m 32s) Loss: 0.0000(0.0058) Grad: 58.5357  LR: 0.000018  
Epoch: [1][7300/10725] Elapsed 94m 20s (remain 44m 14s) Loss: 0.0004(0.0057) Grad: 3892.3501  LR: 0.000018  
Epoch: [1][7400/10725] Elapsed 95m 38s (remain 42m 57s) Loss: 0.0000(0.0057) Grad: 67.2781  LR: 0.000018  
Epoch: [1][7500/10725] Elapsed 96m 56s (remain 41m 39s) Loss: 0.0000(0.0056) Grad: 393.6308  LR: 0.000018  
Epoch: [1][7600/10725] Elapsed 98m 13s (remain 40m 22s) Loss: 0.0000(0.0056) Grad: 62.6034  LR: 0.000017  
Epoch: [1][7700/10725] Elapsed 99m 31s (remain 39m 4s) Loss: 0.0001(0.0055) Grad: 529.3019  LR: 0.000017  
Epoch: [1][7800/10725] Elapsed 100m 49s (remain 37m 47s) Loss: 0.0039(0.0054) Grad: 34608.6914  LR: 0.000017  
Epoch: [1][7900/10725] Elapsed 102m 7s (remain 36m 30s) Loss: 0.0001(0.0054) Grad: 227.4811  LR: 0.000017  
Epoch: [1][8000/10725] Elapsed 103m 26s (remain 35m 12s) Loss: 0.0000(0.0054) Grad: 217.0154  LR: 0.000017  
Epoch: [1][8100/10725] Elapsed 104m 42s (remain 33m 54s) Loss: 0.0001(0.0053) Grad: 194.0016  LR: 0.000017  
Epoch: [1][8200/10725] Elapsed 105m 59s (remain 32m 37s) Loss: 0.0013(0.0053) Grad: 14241.3643  LR: 0.000017  
Epoch: [1][8300/10725] Elapsed 107m 17s (remain 31m 19s) Loss: 0.0001(0.0052) Grad: 962.3940  LR: 0.000017  
Epoch: [1][8400/10725] Elapsed 108m 35s (remain 30m 2s) Loss: 0.0019(0.0052) Grad: 168310.9531  LR: 0.000017  
Epoch: [1][8500/10725] Elapsed 109m 53s (remain 28m 44s) Loss: 0.0001(0.0051) Grad: 2857.6924  LR: 0.000017  
Epoch: [1][8600/10725] Elapsed 111m 11s (remain 27m 27s) Loss: 0.0000(0.0051) Grad: 532.3784  LR: 0.000017  
Epoch: [1][8700/10725] Elapsed 112m 28s (remain 26m 9s) Loss: 0.0001(0.0051) Grad: 282.8375  LR: 0.000017  
Epoch: [1][8800/10725] Elapsed 113m 46s (remain 24m 52s) Loss: 0.0002(0.0050) Grad: 4623.1929  LR: 0.000017  
Epoch: [1][8900/10725] Elapsed 115m 4s (remain 23m 34s) Loss: 0.0002(0.0050) Grad: 1652.3335  LR: 0.000017  
Epoch: [1][9000/10725] Elapsed 116m 21s (remain 22m 17s) Loss: 0.0028(0.0050) Grad: 242628.9844  LR: 0.000017  
Epoch: [1][9100/10725] Elapsed 117m 38s (remain 20m 59s) Loss: 0.0000(0.0049) Grad: 111.3958  LR: 0.000017  
Epoch: [1][9200/10725] Elapsed 118m 56s (remain 19m 41s) Loss: 0.0000(0.0049) Grad: 280.3939  LR: 0.000017  
Epoch: [1][9300/10725] Elapsed 120m 13s (remain 18m 24s) Loss: 0.0001(0.0049) Grad: 3166.4629  LR: 0.000016  
Epoch: [1][9400/10725] Elapsed 121m 31s (remain 17m 6s) Loss: 0.0001(0.0048) Grad: 1985.8890  LR: 0.000016  
Epoch: [1][9500/10725] Elapsed 122m 50s (remain 15m 49s) Loss: 0.0020(0.0048) Grad: 38208.8398  LR: 0.000016  
Epoch: [1][9600/10725] Elapsed 124m 7s (remain 14m 31s) Loss: 0.0008(0.0048) Grad: 7750.8857  LR: 0.000016  
Epoch: [1][9700/10725] Elapsed 125m 26s (remain 13m 14s) Loss: 0.0017(0.0047) Grad: 31474.3770  LR: 0.000016  
Epoch: [1][9800/10725] Elapsed 126m 42s (remain 11m 56s) Loss: 0.0006(0.0047) Grad: 13691.7178  LR: 0.000016  
Epoch: [1][9900/10725] Elapsed 128m 0s (remain 10m 39s) Loss: 0.0000(0.0047) Grad: 11.8353  LR: 0.000016  
Epoch: [1][10000/10725] Elapsed 129m 17s (remain 9m 21s) Loss: 0.0036(0.0046) Grad: 45763.1172  LR: 0.000016  
Epoch: [1][10100/10725] Elapsed 130m 34s (remain 8m 3s) Loss: 0.0033(0.0046) Grad: 67066.0625  LR: 0.000016  
Epoch: [1][10200/10725] Elapsed 131m 53s (remain 6m 46s) Loss: 0.0001(0.0046) Grad: 838.7913  LR: 0.000016  
Epoch: [1][10300/10725] Elapsed 133m 12s (remain 5m 28s) Loss: 0.0002(0.0046) Grad: 5491.1567  LR: 0.000016  
Epoch: [1][10400/10725] Elapsed 134m 30s (remain 4m 11s) Loss: 0.0012(0.0045) Grad: 63802.3438  LR: 0.000016  
Epoch: [1][10500/10725] Elapsed 135m 48s (remain 2m 53s) Loss: 0.0000(0.0045) Grad: 398.9751  LR: 0.000016  
Epoch: [1][10600/10725] Elapsed 137m 5s (remain 1m 36s) Loss: 0.0000(0.0045) Grad: 295.4294  LR: 0.000016  
Epoch: [1][10700/10725] Elapsed 138m 24s (remain 0m 18s) Loss: 0.0024(0.0044) Grad: 53103.8008  LR: 0.000016  
Epoch: [1][10724/10725] Elapsed 138m 42s (remain 0m 0s) Loss: 0.0024(0.0044) Grad: 27795.6758  LR: 0.000016  
EVAL: [0/1192] Elapsed 0m 1s (remain 24m 38s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 31s (remain 5m 39s) Loss: 0.0031(0.0023) 
EVAL: [200/1192] Elapsed 1m 0s (remain 4m 59s) Loss: 0.0071(0.0029) 
EVAL: [300/1192] Elapsed 1m 31s (remain 4m 29s) Loss: 0.0028(0.0031) 
EVAL: [400/1192] Elapsed 2m 0s (remain 3m 57s) Loss: 0.0025(0.0032) 
EVAL: [500/1192] Elapsed 2m 30s (remain 3m 27s) Loss: 0.0017(0.0030) 
EVAL: [600/1192] Elapsed 3m 0s (remain 2m 57s) Loss: 0.0001(0.0033) 
EVAL: [700/1192] Elapsed 3m 29s (remain 2m 26s) Loss: 0.0305(0.0038) 
EVAL: [800/1192] Elapsed 3m 59s (remain 1m 57s) Loss: 0.0025(0.0039) 
EVAL: [900/1192] Elapsed 4m 29s (remain 1m 27s) Loss: 0.0024(0.0038) 
EVAL: [1000/1192] Elapsed 4m 59s (remain 0m 57s) Loss: 0.0000(0.0038) 
EVAL: [1100/1192] Elapsed 5m 28s (remain 0m 27s) Loss: 0.0000(0.0036) 
EVAL: [1191/1192] Elapsed 5m 56s (remain 0m 0s) Loss: 0.0000(0.0035) 
Epoch 1 - avg_train_loss: 0.0044  avg_val_loss: 0.0035  time: 8683s
Epoch 1 - Score: 0.8779
Epoch 1 - Save Best Score: 0.8779 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 558394.72it/s]
100%
612602/612602 [00:44<00:00, 15044.20it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(42900, 11)
Epoch: [2][0/14300] Elapsed 0m 2s (remain 679m 20s) Loss: 0.0000(0.0000) Grad: 999.3548  LR: 0.000016  
Epoch: [2][100/14300] Elapsed 1m 20s (remain 188m 37s) Loss: 0.0001(0.0013) Grad: 805.9419  LR: 0.000015  
Epoch: [2][200/14300] Elapsed 2m 39s (remain 185m 58s) Loss: 0.0010(0.0014) Grad: 7047.5142  LR: 0.000015  
Epoch: [2][300/14300] Elapsed 3m 56s (remain 183m 42s) Loss: 0.0001(0.0015) Grad: 526.9429  LR: 0.000015  
Epoch: [2][400/14300] Elapsed 5m 14s (remain 181m 57s) Loss: 0.0015(0.0014) Grad: 3014.4929  LR: 0.000015  
Epoch: [2][500/14300] Elapsed 6m 32s (remain 180m 22s) Loss: 0.0001(0.0014) Grad: 328.1581  LR: 0.000015  
Epoch: [2][600/14300] Elapsed 7m 50s (remain 178m 33s) Loss: 0.0001(0.0014) Grad: 105.9611  LR: 0.000015  
Epoch: [2][700/14300] Elapsed 9m 7s (remain 177m 4s) Loss: 0.0000(0.0014) Grad: 54.8676  LR: 0.000015  
Epoch: [2][800/14300] Elapsed 10m 25s (remain 175m 47s) Loss: 0.0058(0.0013) Grad: 5036.4478  LR: 0.000015  
Epoch: [2][900/14300] Elapsed 11m 43s (remain 174m 16s) Loss: 0.0002(0.0013) Grad: 833.3433  LR: 0.000015  
Epoch: [2][1000/14300] Elapsed 13m 0s (remain 172m 45s) Loss: 0.0002(0.0014) Grad: 621.0443  LR: 0.000015  
Epoch: [2][1100/14300] Elapsed 14m 17s (remain 171m 15s) Loss: 0.0000(0.0014) Grad: 46.7164  LR: 0.000015  
Epoch: [2][1200/14300] Elapsed 15m 34s (remain 169m 54s) Loss: 0.0000(0.0013) Grad: 10.7132  LR: 0.000015  
Epoch: [2][1300/14300] Elapsed 16m 52s (remain 168m 37s) Loss: 0.0004(0.0013) Grad: 1705.4581  LR: 0.000015  
Epoch: [2][1400/14300] Elapsed 18m 10s (remain 167m 21s) Loss: 0.0000(0.0013) Grad: 106.0165  LR: 0.000015  
Epoch: [2][1500/14300] Elapsed 19m 28s (remain 166m 2s) Loss: 0.0001(0.0013) Grad: 132.5398  LR: 0.000015  
Epoch: [2][1600/14300] Elapsed 20m 46s (remain 164m 49s) Loss: 0.0012(0.0013) Grad: 25062.2559  LR: 0.000015  
Epoch: [2][1700/14300] Elapsed 22m 4s (remain 163m 27s) Loss: 0.0000(0.0013) Grad: 26.1772  LR: 0.000014  
Epoch: [2][1800/14300] Elapsed 23m 22s (remain 162m 10s) Loss: 0.0000(0.0013) Grad: 131.9137  LR: 0.000014  
Epoch: [2][1900/14300] Elapsed 24m 38s (remain 160m 45s) Loss: 0.0000(0.0013) Grad: 178.7202  LR: 0.000014  
Epoch: [2][2000/14300] Elapsed 25m 56s (remain 159m 28s) Loss: 0.0000(0.0013) Grad: 38.4784  LR: 0.000014  
Epoch: [2][2100/14300] Elapsed 27m 14s (remain 158m 11s) Loss: 0.0049(0.0013) Grad: 30363.2832  LR: 0.000014  
Epoch: [2][2200/14300] Elapsed 28m 31s (remain 156m 49s) Loss: 0.0006(0.0013) Grad: 12987.7559  LR: 0.000014  
Epoch: [2][2300/14300] Elapsed 29m 47s (remain 155m 23s) Loss: 0.0000(0.0013) Grad: 1.6086  LR: 0.000014  
Epoch: [2][2400/14300] Elapsed 31m 4s (remain 154m 2s) Loss: 0.0000(0.0013) Grad: 41.2907  LR: 0.000014  
Epoch: [2][2500/14300] Elapsed 32m 21s (remain 152m 41s) Loss: 0.0028(0.0013) Grad: 20711.2246  LR: 0.000014  
Epoch: [2][2600/14300] Elapsed 33m 40s (remain 151m 26s) Loss: 0.0000(0.0013) Grad: 29.6805  LR: 0.000014  
Epoch: [2][2700/14300] Elapsed 34m 57s (remain 150m 8s) Loss: 0.0017(0.0013) Grad: 19120.0000  LR: 0.000014  
Epoch: [2][2800/14300] Elapsed 36m 14s (remain 148m 47s) Loss: 0.0011(0.0013) Grad: 3575.3276  LR: 0.000014  
Epoch: [2][2900/14300] Elapsed 37m 30s (remain 147m 24s) Loss: 0.0000(0.0013) Grad: 29.3902  LR: 0.000014  
Epoch: [2][3000/14300] Elapsed 38m 48s (remain 146m 5s) Loss: 0.0000(0.0013) Grad: 61.1992  LR: 0.000014  
Epoch: [2][3100/14300] Elapsed 40m 4s (remain 144m 43s) Loss: 0.0001(0.0013) Grad: 158.6386  LR: 0.000014  
Epoch: [2][3200/14300] Elapsed 41m 21s (remain 143m 24s) Loss: 0.0000(0.0013) Grad: 21.7196  LR: 0.000014  
Epoch: [2][3300/14300] Elapsed 42m 38s (remain 142m 3s) Loss: 0.0000(0.0013) Grad: 40.8739  LR: 0.000014  
Epoch: [2][3400/14300] Elapsed 43m 56s (remain 140m 48s) Loss: 0.0000(0.0013) Grad: 40.2240  LR: 0.000013  
Epoch: [2][3500/14300] Elapsed 45m 14s (remain 139m 31s) Loss: 0.0050(0.0013) Grad: 15599.8193  LR: 0.000013  
Epoch: [2][3600/14300] Elapsed 46m 32s (remain 138m 16s) Loss: 0.0002(0.0013) Grad: 466.0917  LR: 0.000013  
Epoch: [2][3700/14300] Elapsed 47m 49s (remain 136m 57s) Loss: 0.0017(0.0013) Grad: 5435.5986  LR: 0.000013  
Epoch: [2][3800/14300] Elapsed 49m 6s (remain 135m 39s) Loss: 0.0015(0.0013) Grad: 3535.2046  LR: 0.000013  
Epoch: [2][3900/14300] Elapsed 50m 23s (remain 134m 20s) Loss: 0.0009(0.0013) Grad: 2808.5183  LR: 0.000013  
Epoch: [2][4000/14300] Elapsed 51m 40s (remain 133m 0s) Loss: 0.0000(0.0013) Grad: 285.1458  LR: 0.000013  
Epoch: [2][4100/14300] Elapsed 52m 58s (remain 131m 45s) Loss: 0.0006(0.0013) Grad: 11613.0654  LR: 0.000013  
Epoch: [2][4200/14300] Elapsed 54m 15s (remain 130m 26s) Loss: 0.0020(0.0013) Grad: 15351.5039  LR: 0.000013  
Epoch: [2][4300/14300] Elapsed 55m 33s (remain 129m 10s) Loss: 0.0028(0.0013) Grad: 13779.0557  LR: 0.000013  
Epoch: [2][4400/14300] Elapsed 56m 50s (remain 127m 50s) Loss: 0.0000(0.0013) Grad: 109.0461  LR: 0.000013  
Epoch: [2][4500/14300] Elapsed 58m 6s (remain 126m 30s) Loss: 0.0020(0.0013) Grad: 8687.2236  LR: 0.000013  
Epoch: [2][4600/14300] Elapsed 59m 24s (remain 125m 14s) Loss: 0.0002(0.0013) Grad: 1939.3512  LR: 0.000013  
Epoch: [2][4700/14300] Elapsed 60m 40s (remain 123m 54s) Loss: 0.0000(0.0013) Grad: 347.0905  LR: 0.000013  
Epoch: [2][4800/14300] Elapsed 61m 58s (remain 122m 36s) Loss: 0.0045(0.0013) Grad: 42646.6641  LR: 0.000013  
Epoch: [2][4900/14300] Elapsed 63m 15s (remain 121m 19s) Loss: 0.0022(0.0013) Grad: 16272.3838  LR: 0.000013  
Epoch: [2][5000/14300] Elapsed 64m 32s (remain 120m 0s) Loss: 0.0000(0.0013) Grad: 56.5163  LR: 0.000012  
Epoch: [2][5100/14300] Elapsed 65m 49s (remain 118m 43s) Loss: 0.0000(0.0013) Grad: 24.9479  LR: 0.000012  
Epoch: [2][5200/14300] Elapsed 67m 8s (remain 117m 26s) Loss: 0.0000(0.0013) Grad: 14.4517  LR: 0.000012  
Epoch: [2][5300/14300] Elapsed 68m 25s (remain 116m 9s) Loss: 0.0000(0.0013) Grad: 122.4493  LR: 0.000012  
Epoch: [2][5400/14300] Elapsed 69m 42s (remain 114m 51s) Loss: 0.0001(0.0013) Grad: 2455.4392  LR: 0.000012  
Epoch: [2][5500/14300] Elapsed 70m 58s (remain 113m 32s) Loss: 0.0000(0.0013) Grad: 778.1210  LR: 0.000012  
Epoch: [2][5600/14300] Elapsed 72m 16s (remain 112m 14s) Loss: 0.0010(0.0012) Grad: 59485.2578  LR: 0.000012  
Epoch: [2][5700/14300] Elapsed 73m 33s (remain 110m 56s) Loss: 0.0014(0.0012) Grad: 9099.9014  LR: 0.000012  
Epoch: [2][5800/14300] Elapsed 74m 50s (remain 109m 38s) Loss: 0.0003(0.0013) Grad: 1361.8762  LR: 0.000012  
Epoch: [2][5900/14300] Elapsed 76m 7s (remain 108m 20s) Loss: 0.0001(0.0013) Grad: 1665.5115  LR: 0.000012  
Epoch: [2][6000/14300] Elapsed 77m 24s (remain 107m 3s) Loss: 0.0000(0.0013) Grad: 63.7077  LR: 0.000012  
Epoch: [2][6100/14300] Elapsed 78m 41s (remain 105m 45s) Loss: 0.0032(0.0013) Grad: 27270.9160  LR: 0.000012  
Epoch: [2][6200/14300] Elapsed 79m 58s (remain 104m 27s) Loss: 0.0010(0.0013) Grad: 8467.8213  LR: 0.000012  
Epoch: [2][6300/14300] Elapsed 81m 15s (remain 103m 9s) Loss: 0.0013(0.0013) Grad: 9886.6875  LR: 0.000012  
Epoch: [2][6400/14300] Elapsed 82m 32s (remain 101m 51s) Loss: 0.0000(0.0013) Grad: 94.9035  LR: 0.000012  
Epoch: [2][6500/14300] Elapsed 83m 49s (remain 100m 34s) Loss: 0.0015(0.0013) Grad: 20229.5176  LR: 0.000012  
Epoch: [2][6600/14300] Elapsed 85m 7s (remain 99m 16s) Loss: 0.0015(0.0013) Grad: 10639.3086  LR: 0.000011  
Epoch: [2][6700/14300] Elapsed 86m 24s (remain 97m 59s) Loss: 0.0038(0.0013) Grad: 358494.9688  LR: 0.000011  
Epoch: [2][6800/14300] Elapsed 87m 42s (remain 96m 42s) Loss: 0.0003(0.0012) Grad: 5452.6479  LR: 0.000011  
Epoch: [2][6900/14300] Elapsed 89m 0s (remain 95m 25s) Loss: 0.0000(0.0013) Grad: 145.4523  LR: 0.000011  
Epoch: [2][7000/14300] Elapsed 90m 18s (remain 94m 9s) Loss: 0.0000(0.0013) Grad: 10.0475  LR: 0.000011  
Epoch: [2][7100/14300] Elapsed 91m 36s (remain 92m 51s) Loss: 0.0000(0.0013) Grad: 43.9577  LR: 0.000011  
Epoch: [2][7200/14300] Elapsed 92m 52s (remain 91m 33s) Loss: 0.0000(0.0012) Grad: 304.5048  LR: 0.000011  
Epoch: [2][7300/14300] Elapsed 94m 10s (remain 90m 16s) Loss: 0.0014(0.0013) Grad: 11932.6621  LR: 0.000011  
Epoch: [2][7400/14300] Elapsed 95m 28s (remain 89m 0s) Loss: 0.0000(0.0012) Grad: 41.4779  LR: 0.000011  
Epoch: [2][7500/14300] Elapsed 96m 45s (remain 87m 41s) Loss: 0.0000(0.0012) Grad: 247.1004  LR: 0.000011  
Epoch: [2][7600/14300] Elapsed 98m 2s (remain 86m 24s) Loss: 0.0012(0.0012) Grad: 10299.0420  LR: 0.000011  
Epoch: [2][7700/14300] Elapsed 99m 20s (remain 85m 7s) Loss: 0.0000(0.0012) Grad: 57.8930  LR: 0.000011  
Epoch: [2][7800/14300] Elapsed 100m 38s (remain 83m 50s) Loss: 0.0000(0.0012) Grad: 196.7478  LR: 0.000011  
Epoch: [2][7900/14300] Elapsed 101m 57s (remain 82m 34s) Loss: 0.0002(0.0012) Grad: 2934.5339  LR: 0.000011  
Epoch: [2][8000/14300] Elapsed 103m 14s (remain 81m 16s) Loss: 0.0025(0.0012) Grad: 51090.5625  LR: 0.000011  
Epoch: [2][8100/14300] Elapsed 104m 31s (remain 79m 59s) Loss: 0.0000(0.0012) Grad: 177.0305  LR: 0.000011  
Epoch: [2][8200/14300] Elapsed 105m 50s (remain 78m 42s) Loss: 0.0000(0.0012) Grad: 236.1201  LR: 0.000010  
Epoch: [2][8300/14300] Elapsed 107m 8s (remain 77m 25s) Loss: 0.0000(0.0012) Grad: 717.9183  LR: 0.000010  
Epoch: [2][8400/14300] Elapsed 108m 26s (remain 76m 8s) Loss: 0.0000(0.0012) Grad: 19.1529  LR: 0.000010  
Epoch: [2][8500/14300] Elapsed 109m 44s (remain 74m 51s) Loss: 0.0000(0.0012) Grad: 41.9779  LR: 0.000010  
Epoch: [2][8600/14300] Elapsed 111m 1s (remain 73m 33s) Loss: 0.0000(0.0012) Grad: 108.8314  LR: 0.000010  
Epoch: [2][8700/14300] Elapsed 112m 17s (remain 72m 15s) Loss: 0.0008(0.0012) Grad: 19109.1758  LR: 0.000010  
Epoch: [2][8800/14300] Elapsed 113m 35s (remain 70m 58s) Loss: 0.0000(0.0012) Grad: 242.7456  LR: 0.000010  
Epoch: [2][8900/14300] Elapsed 114m 51s (remain 69m 39s) Loss: 0.0000(0.0012) Grad: 164.0207  LR: 0.000010  
Epoch: [2][9000/14300] Elapsed 116m 7s (remain 68m 21s) Loss: 0.0001(0.0012) Grad: 1340.1582  LR: 0.000010  
Epoch: [2][9100/14300] Elapsed 117m 24s (remain 67m 4s) Loss: 0.0002(0.0012) Grad: 7961.7754  LR: 0.000010  
Epoch: [2][9200/14300] Elapsed 118m 41s (remain 65m 46s) Loss: 0.0001(0.0012) Grad: 2998.6975  LR: 0.000010  
Epoch: [2][9300/14300] Elapsed 119m 57s (remain 64m 28s) Loss: 0.0016(0.0012) Grad: 37690.2422  LR: 0.000010  
Epoch: [2][9400/14300] Elapsed 121m 15s (remain 63m 11s) Loss: 0.0000(0.0012) Grad: 260.3946  LR: 0.000010  
Epoch: [2][9500/14300] Elapsed 122m 32s (remain 61m 53s) Loss: 0.0000(0.0012) Grad: 321.5284  LR: 0.000010  
Epoch: [2][9600/14300] Elapsed 123m 49s (remain 60m 36s) Loss: 0.0000(0.0012) Grad: 136.8410  LR: 0.000010  
Epoch: [2][9700/14300] Elapsed 125m 6s (remain 59m 18s) Loss: 0.0000(0.0012) Grad: 570.4905  LR: 0.000010  
Epoch: [2][9800/14300] Elapsed 126m 25s (remain 58m 2s) Loss: 0.0054(0.0012) Grad: 65604.7812  LR: 0.000009  
Epoch: [2][9900/14300] Elapsed 127m 42s (remain 56m 44s) Loss: 0.0019(0.0012) Grad: 29187.4531  LR: 0.000009  
Epoch: [2][10000/14300] Elapsed 128m 58s (remain 55m 26s) Loss: 0.0000(0.0012) Grad: 1076.0983  LR: 0.000009  
Epoch: [2][10100/14300] Elapsed 130m 16s (remain 54m 9s) Loss: 0.0000(0.0012) Grad: 55.1715  LR: 0.000009  
Epoch: [2][10200/14300] Elapsed 131m 33s (remain 52m 51s) Loss: 0.0032(0.0012) Grad: 20680.3516  LR: 0.000009  
Epoch: [2][10300/14300] Elapsed 132m 52s (remain 51m 34s) Loss: 0.0017(0.0012) Grad: 12153.5273  LR: 0.000009  
Epoch: [2][10400/14300] Elapsed 134m 9s (remain 50m 17s) Loss: 0.0000(0.0012) Grad: 57.0552  LR: 0.000009  
Epoch: [2][10500/14300] Elapsed 135m 27s (remain 49m 0s) Loss: 0.0000(0.0012) Grad: 878.9404  LR: 0.000009  
Epoch: [2][10600/14300] Elapsed 136m 45s (remain 47m 43s) Loss: 0.0000(0.0012) Grad: 88.9278  LR: 0.000009  
Epoch: [2][10700/14300] Elapsed 138m 3s (remain 46m 25s) Loss: 0.0001(0.0012) Grad: 7122.6724  LR: 0.000009  
Epoch: [2][10800/14300] Elapsed 139m 19s (remain 45m 8s) Loss: 0.0017(0.0012) Grad: 31056.4707  LR: 0.000009  
Epoch: [2][10900/14300] Elapsed 140m 36s (remain 43m 50s) Loss: 0.0063(0.0012) Grad: 15604.7471  LR: 0.000009  
Epoch: [2][11000/14300] Elapsed 141m 53s (remain 42m 33s) Loss: 0.0060(0.0012) Grad: 111880.1016  LR: 0.000009  
Epoch: [2][11100/14300] Elapsed 143m 11s (remain 41m 15s) Loss: 0.0004(0.0012) Grad: 11361.6885  LR: 0.000009  
Epoch: [2][11200/14300] Elapsed 144m 28s (remain 39m 58s) Loss: 0.0000(0.0012) Grad: 1404.2329  LR: 0.000009  
Epoch: [2][11300/14300] Elapsed 145m 44s (remain 38m 40s) Loss: 0.0035(0.0012) Grad: 16433.1777  LR: 0.000009  
Epoch: [2][11400/14300] Elapsed 147m 1s (remain 37m 23s) Loss: 0.0006(0.0012) Grad: 9867.7236  LR: 0.000008  
Epoch: [2][11500/14300] Elapsed 148m 19s (remain 36m 5s) Loss: 0.0001(0.0012) Grad: 7648.4663  LR: 0.000008  
Epoch: [2][11600/14300] Elapsed 149m 36s (remain 34m 48s) Loss: 0.0000(0.0012) Grad: 1613.8694  LR: 0.000008  
Epoch: [2][11700/14300] Elapsed 150m 53s (remain 33m 30s) Loss: 0.0000(0.0012) Grad: 23.6908  LR: 0.000008  
Epoch: [2][11800/14300] Elapsed 152m 11s (remain 32m 13s) Loss: 0.0000(0.0012) Grad: 132.0764  LR: 0.000008  
Epoch: [2][11900/14300] Elapsed 153m 28s (remain 30m 56s) Loss: 0.0011(0.0012) Grad: 27728.2129  LR: 0.000008  
Epoch: [2][12000/14300] Elapsed 154m 46s (remain 29m 39s) Loss: 0.0000(0.0012) Grad: 37.8382  LR: 0.000008  
Epoch: [2][12100/14300] Elapsed 156m 6s (remain 28m 22s) Loss: 0.0000(0.0012) Grad: 23.7116  LR: 0.000008  
Epoch: [2][12200/14300] Elapsed 157m 24s (remain 27m 4s) Loss: 0.0000(0.0012) Grad: 31.5641  LR: 0.000008  
Epoch: [2][12300/14300] Elapsed 158m 41s (remain 25m 47s) Loss: 0.0000(0.0012) Grad: 1874.1765  LR: 0.000008  
Epoch: [2][12400/14300] Elapsed 159m 58s (remain 24m 29s) Loss: 0.0000(0.0012) Grad: 1501.5129  LR: 0.000008  
Epoch: [2][12500/14300] Elapsed 161m 15s (remain 23m 12s) Loss: 0.0000(0.0012) Grad: 542.6184  LR: 0.000008  
Epoch: [2][12600/14300] Elapsed 162m 33s (remain 21m 55s) Loss: 0.0000(0.0012) Grad: 21.2918  LR: 0.000008  
Epoch: [2][12700/14300] Elapsed 163m 50s (remain 20m 37s) Loss: 0.0179(0.0012) Grad: 119083.8984  LR: 0.000008  
Epoch: [2][12800/14300] Elapsed 165m 8s (remain 19m 20s) Loss: 0.0005(0.0012) Grad: 3445.5852  LR: 0.000008  
Epoch: [2][12900/14300] Elapsed 166m 25s (remain 18m 2s) Loss: 0.0018(0.0012) Grad: 92222.3984  LR: 0.000008  
Epoch: [2][13000/14300] Elapsed 167m 43s (remain 16m 45s) Loss: 0.0000(0.0012) Grad: 80.7005  LR: 0.000007  
Epoch: [2][13100/14300] Elapsed 169m 2s (remain 15m 28s) Loss: 0.0046(0.0012) Grad: 68482.9453  LR: 0.000007  
Epoch: [2][13200/14300] Elapsed 170m 19s (remain 14m 10s) Loss: 0.0002(0.0012) Grad: 5904.2036  LR: 0.000007  
Epoch: [2][13300/14300] Elapsed 171m 36s (remain 12m 53s) Loss: 0.0000(0.0012) Grad: 36.9796  LR: 0.000007  
Epoch: [2][13400/14300] Elapsed 172m 54s (remain 11m 35s) Loss: 0.0008(0.0012) Grad: 84636.3516  LR: 0.000007  
Epoch: [2][13500/14300] Elapsed 174m 12s (remain 10m 18s) Loss: 0.0000(0.0012) Grad: 61.6109  LR: 0.000007  
Epoch: [2][13600/14300] Elapsed 175m 30s (remain 9m 1s) Loss: 0.0059(0.0012) Grad: 149311.5781  LR: 0.000007  
Epoch: [2][13700/14300] Elapsed 176m 47s (remain 7m 43s) Loss: 0.0033(0.0012) Grad: 353249.7188  LR: 0.000007  
Epoch: [2][13800/14300] Elapsed 178m 4s (remain 6m 26s) Loss: 0.0000(0.0012) Grad: 132.5412  LR: 0.000007  
Epoch: [2][13900/14300] Elapsed 179m 21s (remain 5m 8s) Loss: 0.0000(0.0012) Grad: 49.3077  LR: 0.000007  
Epoch: [2][14000/14300] Elapsed 180m 39s (remain 3m 51s) Loss: 0.0002(0.0012) Grad: 5627.3813  LR: 0.000007  
Epoch: [2][14100/14300] Elapsed 181m 56s (remain 2m 34s) Loss: 0.0000(0.0012) Grad: 408.6159  LR: 0.000007  
Epoch: [2][14200/14300] Elapsed 183m 13s (remain 1m 16s) Loss: 0.0000(0.0012) Grad: 1575.2722  LR: 0.000007  
Epoch: [2][14299/14300] Elapsed 184m 30s (remain 0m 0s) Loss: 0.0000(0.0012) Grad: 2237.8804  LR: 0.000007  
EVAL: [0/1192] Elapsed 0m 1s (remain 26m 2s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 30s (remain 5m 34s) Loss: 0.0064(0.0028) 
EVAL: [200/1192] Elapsed 1m 1s (remain 5m 1s) Loss: 0.0091(0.0033) 
EVAL: [300/1192] Elapsed 1m 31s (remain 4m 29s) Loss: 0.0019(0.0035) 
EVAL: [400/1192] Elapsed 2m 0s (remain 3m 58s) Loss: 0.0039(0.0038) 
EVAL: [500/1192] Elapsed 2m 30s (remain 3m 27s) Loss: 0.0040(0.0036) 
EVAL: [600/1192] Elapsed 3m 0s (remain 2m 57s) Loss: 0.0000(0.0037) 
EVAL: [700/1192] Elapsed 3m 30s (remain 2m 27s) Loss: 0.0435(0.0044) 
EVAL: [800/1192] Elapsed 4m 0s (remain 1m 57s) Loss: 0.0020(0.0045) 
EVAL: [900/1192] Elapsed 4m 30s (remain 1m 27s) Loss: 0.0007(0.0045) 
EVAL: [1000/1192] Elapsed 5m 0s (remain 0m 57s) Loss: 0.0000(0.0044) 
EVAL: [1100/1192] Elapsed 5m 30s (remain 0m 27s) Loss: 0.0000(0.0043) 
EVAL: [1191/1192] Elapsed 5m 57s (remain 0m 0s) Loss: 0.0000(0.0041) 
Epoch 2 - avg_train_loss: 0.0012  avg_val_loss: 0.0041  time: 11433s
Epoch 2 - Score: 0.8883
Epoch 2 - Save Best Score: 0.8883 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 606627.63it/s]
100%
612602/612602 [00:45<00:00, 18076.62it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(53625, 11)
Epoch: [3][0/17875] Elapsed 0m 2s (remain 701m 42s) Loss: 0.0000(0.0000) Grad: 12.6240  LR: 0.000007  
Epoch: [3][100/17875] Elapsed 1m 20s (remain 236m 4s) Loss: 0.0000(0.0012) Grad: 15.4504  LR: 0.000007  
Epoch: [3][200/17875] Elapsed 2m 38s (remain 232m 23s) Loss: 0.0067(0.0010) Grad: 8629.9863  LR: 0.000007  
Epoch: [3][300/17875] Elapsed 3m 56s (remain 230m 12s) Loss: 0.0131(0.0009) Grad: 8812.7695  LR: 0.000006  
Epoch: [3][400/17875] Elapsed 5m 15s (remain 228m 49s) Loss: 0.0014(0.0009) Grad: 4583.5625  LR: 0.000006  
Epoch: [3][500/17875] Elapsed 6m 31s (remain 226m 24s) Loss: 0.0000(0.0008) Grad: 75.5600  LR: 0.000006  
Epoch: [3][600/17875] Elapsed 7m 47s (remain 224m 8s) Loss: 0.0002(0.0008) Grad: 1200.8875  LR: 0.000006  
Epoch: [3][700/17875] Elapsed 9m 4s (remain 222m 14s) Loss: 0.0001(0.0008) Grad: 1796.4249  LR: 0.000006  
Epoch: [3][800/17875] Elapsed 10m 21s (remain 220m 41s) Loss: 0.0000(0.0008) Grad: 78.8701  LR: 0.000006  
Epoch: [3][900/17875] Elapsed 11m 38s (remain 219m 15s) Loss: 0.0005(0.0008) Grad: 915.4109  LR: 0.000006  
Epoch: [3][1000/17875] Elapsed 12m 56s (remain 218m 13s) Loss: 0.0004(0.0008) Grad: 5699.9888  LR: 0.000006  
Epoch: [3][1100/17875] Elapsed 14m 15s (remain 217m 11s) Loss: 0.0000(0.0008) Grad: 32.6844  LR: 0.000006  
Epoch: [3][1200/17875] Elapsed 15m 32s (remain 215m 47s) Loss: 0.0000(0.0008) Grad: 26.5585  LR: 0.000006  
Epoch: [3][1300/17875] Elapsed 16m 50s (remain 214m 35s) Loss: 0.0000(0.0007) Grad: 192.7609  LR: 0.000006  
Epoch: [3][1400/17875] Elapsed 18m 8s (remain 213m 16s) Loss: 0.0032(0.0007) Grad: 9659.2051  LR: 0.000006  
Epoch: [3][1500/17875] Elapsed 19m 26s (remain 212m 6s) Loss: 0.0000(0.0008) Grad: 24.1971  LR: 0.000006  
Epoch: [3][1600/17875] Elapsed 20m 43s (remain 210m 41s) Loss: 0.0000(0.0008) Grad: 29.3407  LR: 0.000006  
Epoch: [3][1700/17875] Elapsed 22m 0s (remain 209m 11s) Loss: 0.0006(0.0008) Grad: 3504.2964  LR: 0.000006  
Epoch: [3][1800/17875] Elapsed 23m 16s (remain 207m 47s) Loss: 0.0000(0.0008) Grad: 302.0636  LR: 0.000006  
Epoch: [3][1900/17875] Elapsed 24m 34s (remain 206m 28s) Loss: 0.0000(0.0008) Grad: 62.1767  LR: 0.000005  
Epoch: [3][2000/17875] Elapsed 25m 51s (remain 205m 6s) Loss: 0.0001(0.0008) Grad: 668.5645  LR: 0.000005  
Epoch: [3][2100/17875] Elapsed 27m 9s (remain 203m 50s) Loss: 0.0000(0.0008) Grad: 23.5194  LR: 0.000005  
Epoch: [3][2200/17875] Elapsed 28m 26s (remain 202m 30s) Loss: 0.0004(0.0007) Grad: 2222.6816  LR: 0.000005  
Epoch: [3][2300/17875] Elapsed 29m 43s (remain 201m 13s) Loss: 0.0000(0.0008) Grad: 62.4626  LR: 0.000005  
Epoch: [3][2400/17875] Elapsed 31m 1s (remain 199m 58s) Loss: 0.0001(0.0008) Grad: 917.3102  LR: 0.000005  
Epoch: [3][2500/17875] Elapsed 32m 19s (remain 198m 40s) Loss: 0.0000(0.0008) Grad: 27.0137  LR: 0.000005  
Epoch: [3][2600/17875] Elapsed 33m 37s (remain 197m 25s) Loss: 0.0000(0.0008) Grad: 10.2950  LR: 0.000005  
Epoch: [3][2700/17875] Elapsed 34m 54s (remain 196m 9s) Loss: 0.0010(0.0008) Grad: 6299.2563  LR: 0.000005  
Epoch: [3][2800/17875] Elapsed 36m 12s (remain 194m 53s) Loss: 0.0042(0.0008) Grad: 5027.1357  LR: 0.000005  
Epoch: [3][2900/17875] Elapsed 37m 28s (remain 193m 27s) Loss: 0.0034(0.0008) Grad: 6111.0894  LR: 0.000005  
Epoch: [3][3000/17875] Elapsed 38m 46s (remain 192m 9s) Loss: 0.0042(0.0008) Grad: 6277.4009  LR: 0.000005  
Epoch: [3][3100/17875] Elapsed 40m 3s (remain 190m 48s) Loss: 0.0011(0.0008) Grad: 6682.4395  LR: 0.000005  
Epoch: [3][3200/17875] Elapsed 41m 19s (remain 189m 25s) Loss: 0.0007(0.0008) Grad: 2197.2930  LR: 0.000005  
Epoch: [3][3300/17875] Elapsed 42m 35s (remain 188m 4s) Loss: 0.0000(0.0008) Grad: 8.4871  LR: 0.000005  
Epoch: [3][3400/17875] Elapsed 43m 52s (remain 186m 41s) Loss: 0.0004(0.0007) Grad: 2515.5862  LR: 0.000005  
Epoch: [3][3500/17875] Elapsed 45m 8s (remain 185m 19s) Loss: 0.0003(0.0008) Grad: 549.8937  LR: 0.000004  
Epoch: [3][3600/17875] Elapsed 46m 25s (remain 184m 2s) Loss: 0.0000(0.0008) Grad: 8.8373  LR: 0.000004  
Epoch: [3][3700/17875] Elapsed 47m 42s (remain 182m 41s) Loss: 0.0121(0.0008) Grad: 27727.3926  LR: 0.000004  
Epoch: [3][3800/17875] Elapsed 49m 1s (remain 181m 29s) Loss: 0.0000(0.0007) Grad: 10.6251  LR: 0.000004  
Epoch: [3][3900/17875] Elapsed 50m 19s (remain 180m 15s) Loss: 0.0020(0.0008) Grad: 3471.1006  LR: 0.000004  
Epoch: [3][4000/17875] Elapsed 51m 36s (remain 178m 57s) Loss: 0.0000(0.0007) Grad: 5.2646  LR: 0.000004  
Epoch: [3][4100/17875] Elapsed 52m 54s (remain 177m 42s) Loss: 0.0000(0.0007) Grad: 382.5060  LR: 0.000004  
Epoch: [3][4200/17875] Elapsed 54m 12s (remain 176m 26s) Loss: 0.0006(0.0008) Grad: 25814.9766  LR: 0.000004  
Epoch: [3][4300/17875] Elapsed 55m 30s (remain 175m 11s) Loss: 0.0000(0.0008) Grad: 5.7210  LR: 0.000004  
Epoch: [3][4400/17875] Elapsed 56m 48s (remain 173m 53s) Loss: 0.0004(0.0008) Grad: 5399.1768  LR: 0.000004  
Epoch: [3][4500/17875] Elapsed 58m 6s (remain 172m 38s) Loss: 0.0010(0.0008) Grad: 8672.3867  LR: 0.000004  
Epoch: [3][4600/17875] Elapsed 59m 24s (remain 171m 23s) Loss: 0.0027(0.0008) Grad: 72895.0625  LR: 0.000004  
Epoch: [3][4700/17875] Elapsed 60m 42s (remain 170m 8s) Loss: 0.0002(0.0008) Grad: 2805.5366  LR: 0.000004  
Epoch: [3][4800/17875] Elapsed 62m 0s (remain 168m 51s) Loss: 0.0008(0.0008) Grad: 5006.1211  LR: 0.000004  
Epoch: [3][4900/17875] Elapsed 63m 17s (remain 167m 32s) Loss: 0.0021(0.0008) Grad: 51215.3672  LR: 0.000004  
Epoch: [3][5000/17875] Elapsed 64m 34s (remain 166m 13s) Loss: 0.0001(0.0008) Grad: 752.7598  LR: 0.000004  
Epoch: [3][5100/17875] Elapsed 65m 52s (remain 164m 57s) Loss: 0.0000(0.0008) Grad: 13.6052  LR: 0.000003  
Epoch: [3][5200/17875] Elapsed 67m 9s (remain 163m 40s) Loss: 0.0005(0.0008) Grad: 3639.6147  LR: 0.000003  
Epoch: [3][5300/17875] Elapsed 68m 27s (remain 162m 23s) Loss: 0.0013(0.0008) Grad: 15238.5127  LR: 0.000003  
Epoch: [3][5400/17875] Elapsed 69m 44s (remain 161m 4s) Loss: 0.0021(0.0008) Grad: 11666.9814  LR: 0.000003  
Epoch: [3][5500/17875] Elapsed 71m 2s (remain 159m 47s) Loss: 0.0030(0.0008) Grad: 24838.1836  LR: 0.000003  
Epoch: [3][5600/17875] Elapsed 72m 19s (remain 158m 29s) Loss: 0.0005(0.0008) Grad: 6302.2261  LR: 0.000003  
Epoch: [3][5700/17875] Elapsed 73m 36s (remain 157m 10s) Loss: 0.0000(0.0008) Grad: 33.3240  LR: 0.000003  
Epoch: [3][5800/17875] Elapsed 74m 54s (remain 155m 54s) Loss: 0.0000(0.0008) Grad: 34.3693  LR: 0.000003  
Epoch: [3][5900/17875] Elapsed 76m 11s (remain 154m 35s) Loss: 0.0000(0.0008) Grad: 462.7500  LR: 0.000003  
Epoch: [3][6000/17875] Elapsed 77m 29s (remain 153m 19s) Loss: 0.0002(0.0008) Grad: 4561.1797  LR: 0.000003  
Epoch: [3][6100/17875] Elapsed 78m 46s (remain 152m 1s) Loss: 0.0000(0.0008) Grad: 27.5889  LR: 0.000003  
Epoch: [3][6200/17875] Elapsed 80m 3s (remain 150m 42s) Loss: 0.0000(0.0008) Grad: 9.0292  LR: 0.000003  
Epoch: [3][6300/17875] Elapsed 81m 21s (remain 149m 26s) Loss: 0.0000(0.0008) Grad: 62.2626  LR: 0.000003  
Epoch: [3][6400/17875] Elapsed 82m 37s (remain 148m 7s) Loss: 0.0046(0.0008) Grad: 41961.8164  LR: 0.000003  
Epoch: [3][6500/17875] Elapsed 83m 54s (remain 146m 49s) Loss: 0.0017(0.0008) Grad: 18122.6465  LR: 0.000003  
Epoch: [3][6600/17875] Elapsed 85m 13s (remain 145m 33s) Loss: 0.0000(0.0008) Grad: 75.1788  LR: 0.000003  
Epoch: [3][6700/17875] Elapsed 86m 30s (remain 144m 15s) Loss: 0.0004(0.0008) Grad: 1800.2516  LR: 0.000003  
Epoch: [3][6800/17875] Elapsed 87m 49s (remain 143m 0s) Loss: 0.0014(0.0008) Grad: 49241.9570  LR: 0.000002  
Epoch: [3][6900/17875] Elapsed 89m 7s (remain 141m 43s) Loss: 0.0027(0.0008) Grad: 23454.3145  LR: 0.000002  
Epoch: [3][7000/17875] Elapsed 90m 25s (remain 140m 27s) Loss: 0.0030(0.0008) Grad: 21140.1133  LR: 0.000002  
Epoch: [3][7100/17875] Elapsed 91m 43s (remain 139m 10s) Loss: 0.0000(0.0008) Grad: 18.8576  LR: 0.000002  
Epoch: [3][7200/17875] Elapsed 93m 1s (remain 137m 53s) Loss: 0.0000(0.0008) Grad: 25.6869  LR: 0.000002  
Epoch: [3][7300/17875] Elapsed 94m 19s (remain 136m 36s) Loss: 0.0000(0.0008) Grad: 56.4130  LR: 0.000002  
Epoch: [3][7400/17875] Elapsed 95m 37s (remain 135m 19s) Loss: 0.0008(0.0008) Grad: 6589.9233  LR: 0.000002  
Epoch: [3][7500/17875] Elapsed 96m 55s (remain 134m 2s) Loss: 0.0003(0.0008) Grad: 1104.5872  LR: 0.000002  
Epoch: [3][7600/17875] Elapsed 98m 13s (remain 132m 45s) Loss: 0.0001(0.0008) Grad: 880.0449  LR: 0.000002  
Epoch: [3][7700/17875] Elapsed 99m 30s (remain 131m 27s) Loss: 0.0000(0.0008) Grad: 259.1916  LR: 0.000002  
Epoch: [3][7800/17875] Elapsed 100m 48s (remain 130m 10s) Loss: 0.0000(0.0008) Grad: 418.0983  LR: 0.000002  
Epoch: [3][7900/17875] Elapsed 102m 6s (remain 128m 54s) Loss: 0.0006(0.0008) Grad: 4098.6812  LR: 0.000002  
Epoch: [3][8000/17875] Elapsed 103m 24s (remain 127m 37s) Loss: 0.0074(0.0008) Grad: 19624.1133  LR: 0.000002  
Epoch: [3][8100/17875] Elapsed 104m 42s (remain 126m 19s) Loss: 0.0000(0.0008) Grad: 432.7219  LR: 0.000002  
Epoch: [3][8200/17875] Elapsed 105m 58s (remain 125m 0s) Loss: 0.0005(0.0008) Grad: 20453.4727  LR: 0.000002  
Epoch: [3][8300/17875] Elapsed 107m 14s (remain 123m 41s) Loss: 0.0000(0.0008) Grad: 474.8054  LR: 0.000002  
Epoch: [3][8400/17875] Elapsed 108m 33s (remain 122m 25s) Loss: 0.0001(0.0008) Grad: 2699.7332  LR: 0.000001  
Epoch: [3][8500/17875] Elapsed 109m 51s (remain 121m 7s) Loss: 0.0003(0.0008) Grad: 2647.5510  LR: 0.000001  
Epoch: [3][8600/17875] Elapsed 111m 8s (remain 119m 50s) Loss: 0.0000(0.0008) Grad: 135.8916  LR: 0.000001  
Epoch: [3][8700/17875] Elapsed 112m 27s (remain 118m 33s) Loss: 0.0000(0.0008) Grad: 20.6810  LR: 0.000001  
Epoch: [3][8800/17875] Elapsed 113m 44s (remain 117m 16s) Loss: 0.0000(0.0008) Grad: 845.6979  LR: 0.000001  
Epoch: [3][8900/17875] Elapsed 115m 2s (remain 115m 59s) Loss: 0.0015(0.0008) Grad: 244270.4219  LR: 0.000001  
Epoch: [3][9000/17875] Elapsed 116m 21s (remain 114m 42s) Loss: 0.0000(0.0008) Grad: 111.0299  LR: 0.000001  
Epoch: [3][9100/17875] Elapsed 117m 39s (remain 113m 25s) Loss: 0.0000(0.0008) Grad: 2.5894  LR: 0.000001  
Epoch: [3][9200/17875] Elapsed 118m 57s (remain 112m 8s) Loss: 0.0038(0.0008) Grad: 19554.7148  LR: 0.000001  
Epoch: [3][9300/17875] Elapsed 120m 14s (remain 110m 50s) Loss: 0.0000(0.0008) Grad: 34.8175  LR: 0.000001  
Epoch: [3][9400/17875] Elapsed 121m 31s (remain 109m 32s) Loss: 0.0000(0.0008) Grad: 106.0372  LR: 0.000001  
Epoch: [3][9500/17875] Elapsed 122m 48s (remain 108m 14s) Loss: 0.0000(0.0008) Grad: 208.0924  LR: 0.000001  
Epoch: [3][9600/17875] Elapsed 124m 4s (remain 106m 55s) Loss: 0.0000(0.0008) Grad: 38.6846  LR: 0.000001  
Epoch: [3][9700/17875] Elapsed 125m 22s (remain 105m 38s) Loss: 0.0040(0.0008) Grad: 73093.9531  LR: 0.000001  
Epoch: [3][9800/17875] Elapsed 126m 41s (remain 104m 21s) Loss: 0.0006(0.0008) Grad: 5127.6396  LR: 0.000001  
Epoch: [3][9900/17875] Elapsed 127m 59s (remain 103m 5s) Loss: 0.0000(0.0008) Grad: 43.0812  LR: 0.000001  
Epoch: [3][10000/17875] Elapsed 129m 18s (remain 101m 48s) Loss: 0.0000(0.0008) Grad: 145.3232  LR: 0.000000  
Epoch: [3][10100/17875] Elapsed 130m 34s (remain 100m 30s) Loss: 0.0001(0.0008) Grad: 1263.8373  LR: 0.000000  
Epoch: [3][10200/17875] Elapsed 131m 52s (remain 99m 12s) Loss: 0.0000(0.0008) Grad: 3269.2180  LR: 0.000000  
Epoch: [3][10300/17875] Elapsed 133m 8s (remain 97m 53s) Loss: 0.0000(0.0008) Grad: 14.2937  LR: 0.000000  
Epoch: [3][10400/17875] Elapsed 134m 25s (remain 96m 35s) Loss: 0.0000(0.0008) Grad: 349.1661  LR: 0.000000  
Epoch: [3][10500/17875] Elapsed 135m 42s (remain 95m 17s) Loss: 0.0000(0.0008) Grad: 1759.8888  LR: 0.000000  
Epoch: [3][10600/17875] Elapsed 136m 57s (remain 93m 58s) Loss: 0.0000(0.0008) Grad: 35.8366  LR: 0.000000  
Epoch: [3][10700/17875] Elapsed 138m 15s (remain 92m 41s) Loss: 0.0000(0.0008) Grad: 70.5426  LR: 0.000000  
Epoch: [3][10800/17875] Elapsed 139m 31s (remain 91m 22s) Loss: 0.0067(0.0008) Grad: 156425.6250  LR: 0.000000  
Epoch: [3][10900/17875] Elapsed 140m 48s (remain 90m 5s) Loss: 0.0007(0.0008) Grad: 100067.3438  LR: 0.000000  
Epoch: [3][11000/17875] Elapsed 142m 6s (remain 88m 47s) Loss: 0.0003(0.0008) Grad: 4418.7061  LR: 0.000000  
Epoch: [3][11100/17875] Elapsed 143m 22s (remain 87m 29s) Loss: 0.0020(0.0008) Grad: 266682.6562  LR: 0.000000  
Epoch: [3][11200/17875] Elapsed 144m 40s (remain 86m 11s) Loss: 0.0130(0.0008) Grad: 145932.0781  LR: 0.000000  
Epoch: [3][11300/17875] Elapsed 145m 58s (remain 84m 55s) Loss: 0.0002(0.0008) Grad: 1365.5415  LR: 0.000000  
Epoch: [3][11400/17875] Elapsed 147m 17s (remain 83m 38s) Loss: 0.0000(0.0008) Grad: 17.8591  LR: 0.000000  
Epoch: [3][11500/17875] Elapsed 148m 34s (remain 82m 20s) Loss: 0.0002(0.0008) Grad: 21854.5352  LR: 0.000000  
Epoch: [3][11600/17875] Elapsed 149m 51s (remain 81m 2s) Loss: 0.0000(0.0008) Grad: 342.1613  LR: 0.000000  
Epoch: [3][11700/17875] Elapsed 151m 8s (remain 79m 44s) Loss: 0.0000(0.0008) Grad: 1087.9688  LR: 0.000000  
Epoch: [3][11800/17875] Elapsed 152m 26s (remain 78m 27s) Loss: 0.0033(0.0008) Grad: 30595.7422  LR: 0.000000  
Epoch: [3][11900/17875] Elapsed 153m 43s (remain 77m 9s) Loss: 0.0011(0.0008) Grad: 48177.5547  LR: 0.000000  
Epoch: [3][12000/17875] Elapsed 155m 0s (remain 75m 52s) Loss: 0.0010(0.0008) Grad: 52647.3516  LR: 0.000000  
Epoch: [3][12100/17875] Elapsed 156m 17s (remain 74m 34s) Loss: 0.0003(0.0008) Grad: 21889.5176  LR: 0.000000  
Epoch: [3][12200/17875] Elapsed 157m 34s (remain 73m 16s) Loss: 0.0000(0.0008) Grad: 1047.8787  LR: 0.000000  
Epoch: [3][12300/17875] Elapsed 158m 53s (remain 71m 59s) Loss: 0.0000(0.0008) Grad: 148.4660  LR: 0.000000  
Epoch: [3][12400/17875] Elapsed 160m 10s (remain 70m 42s) Loss: 0.0022(0.0008) Grad: 100703.9531  LR: 0.000000  
Epoch: [3][12500/17875] Elapsed 161m 29s (remain 69m 25s) Loss: 0.0034(0.0008) Grad: 57195.9766  LR: 0.000000  
Epoch: [3][12600/17875] Elapsed 162m 47s (remain 68m 8s) Loss: 0.0000(0.0008) Grad: 152.3024  LR: 0.000000  
Epoch: [3][12700/17875] Elapsed 164m 6s (remain 66m 51s) Loss: 0.0007(0.0008) Grad: 49730.2070  LR: 0.000000  
Epoch: [3][12800/17875] Elapsed 165m 24s (remain 65m 33s) Loss: 0.0000(0.0008) Grad: 417.0174  LR: 0.000000  
Epoch: [3][12900/17875] Elapsed 166m 42s (remain 64m 16s) Loss: 0.0000(0.0008) Grad: 53.0906  LR: 0.000000  
Epoch: [3][13000/17875] Elapsed 168m 0s (remain 62m 59s) Loss: 0.0001(0.0008) Grad: 2753.7532  LR: 0.000000  
Epoch: [3][13100/17875] Elapsed 169m 16s (remain 61m 41s) Loss: 0.0016(0.0008) Grad: 35551.0430  LR: 0.000000  
Epoch: [3][13200/17875] Elapsed 170m 33s (remain 60m 23s) Loss: 0.0000(0.0008) Grad: 503.5760  LR: 0.000000  
Epoch: [3][13300/17875] Elapsed 171m 50s (remain 59m 5s) Loss: 0.0000(0.0008) Grad: 56.4577  LR: 0.000000  
Epoch: [3][13400/17875] Elapsed 173m 6s (remain 57m 47s) Loss: 0.0002(0.0008) Grad: 8693.1953  LR: 0.000000  
Epoch: [3][13500/17875] Elapsed 174m 24s (remain 56m 30s) Loss: 0.0004(0.0008) Grad: 39083.4336  LR: 0.000000  
Epoch: [3][13600/17875] Elapsed 175m 43s (remain 55m 13s) Loss: 0.0003(0.0008) Grad: 5287.2075  LR: 0.000000  
Epoch: [3][13700/17875] Elapsed 177m 1s (remain 53m 55s) Loss: 0.0000(0.0008) Grad: 652.1322  LR: 0.000000  
Epoch: [3][13800/17875] Elapsed 178m 19s (remain 52m 38s) Loss: 0.0005(0.0008) Grad: 37484.5703  LR: 0.000000  
Epoch: [3][13900/17875] Elapsed 179m 38s (remain 51m 21s) Loss: 0.0000(0.0008) Grad: 199.0616  LR: 0.000000  
Epoch: [3][14000/17875] Elapsed 180m 55s (remain 50m 3s) Loss: 0.0000(0.0008) Grad: 21.7892  LR: 0.000000  
Epoch: [3][14100/17875] Elapsed 182m 13s (remain 48m 46s) Loss: 0.0000(0.0008) Grad: 115.4860  LR: 0.000000  
Epoch: [3][14200/17875] Elapsed 183m 30s (remain 47m 28s) Loss: 0.0001(0.0008) Grad: 4155.3965  LR: 0.000000  
Epoch: [3][14300/17875] Elapsed 184m 47s (remain 46m 10s) Loss: 0.0000(0.0008) Grad: 53.4247  LR: 0.000000  
Epoch: [3][14400/17875] Elapsed 186m 4s (remain 44m 53s) Loss: 0.0000(0.0008) Grad: 297.0735  LR: 0.000000  
Epoch: [3][14500/17875] Elapsed 187m 21s (remain 43m 35s) Loss: 0.0000(0.0008) Grad: 40.2501  LR: 0.000000  
Epoch: [3][14600/17875] Elapsed 188m 39s (remain 42m 18s) Loss: 0.0000(0.0008) Grad: 38.5596  LR: 0.000000  
Epoch: [3][14700/17875] Elapsed 189m 57s (remain 41m 0s) Loss: 0.0000(0.0008) Grad: 232.9427  LR: 0.000000  
Epoch: [3][14800/17875] Elapsed 191m 15s (remain 39m 43s) Loss: 0.0000(0.0008) Grad: 93.4405  LR: 0.000000  
Epoch: [3][14900/17875] Elapsed 192m 34s (remain 38m 26s) Loss: 0.0000(0.0008) Grad: 54.0621  LR: 0.000000  
Epoch: [3][15000/17875] Elapsed 193m 52s (remain 37m 8s) Loss: 0.0015(0.0008) Grad: 62969.2695  LR: 0.000000  
Epoch: [3][15100/17875] Elapsed 195m 10s (remain 35m 51s) Loss: 0.0000(0.0008) Grad: 56.7936  LR: 0.000000  
Epoch: [3][15200/17875] Elapsed 196m 29s (remain 34m 33s) Loss: 0.0003(0.0008) Grad: 26643.5039  LR: 0.000000  
Epoch: [3][15300/17875] Elapsed 197m 47s (remain 33m 16s) Loss: 0.0000(0.0008) Grad: 109.3466  LR: 0.000000  
Epoch: [3][15400/17875] Elapsed 199m 5s (remain 31m 58s) Loss: 0.0001(0.0008) Grad: 3215.4199  LR: 0.000000  
Epoch: [3][15500/17875] Elapsed 200m 22s (remain 30m 41s) Loss: 0.0000(0.0008) Grad: 1708.1169  LR: 0.000000  
Epoch: [3][15600/17875] Elapsed 201m 39s (remain 29m 23s) Loss: 0.0000(0.0008) Grad: 55.8370  LR: 0.000000  
Epoch: [3][15700/17875] Elapsed 202m 56s (remain 28m 6s) Loss: 0.0000(0.0008) Grad: 126.7213  LR: 0.000000  
Epoch: [3][15800/17875] Elapsed 204m 14s (remain 26m 48s) Loss: 0.0028(0.0008) Grad: 404972.5938  LR: 0.000000  
Epoch: [3][15900/17875] Elapsed 205m 31s (remain 25m 30s) Loss: 0.0000(0.0008) Grad: 2355.5662  LR: 0.000000  
Epoch: [3][16000/17875] Elapsed 206m 47s (remain 24m 13s) Loss: 0.0000(0.0008) Grad: 106.1371  LR: 0.000000  
Epoch: [3][16100/17875] Elapsed 208m 4s (remain 22m 55s) Loss: 0.0000(0.0008) Grad: 144.0557  LR: 0.000000  
Epoch: [3][16200/17875] Elapsed 209m 22s (remain 21m 37s) Loss: 0.0054(0.0008) Grad: 1396683.1250  LR: 0.000000  
Epoch: [3][16300/17875] Elapsed 210m 38s (remain 20m 20s) Loss: 0.0000(0.0008) Grad: 116.9399  LR: 0.000000  
Epoch: [3][16400/17875] Elapsed 211m 55s (remain 19m 2s) Loss: 0.0000(0.0008) Grad: 13.7363  LR: 0.000000  
Epoch: [3][16500/17875] Elapsed 213m 12s (remain 17m 45s) Loss: 0.0001(0.0008) Grad: 5740.5889  LR: 0.000000  
Epoch: [3][16600/17875] Elapsed 214m 31s (remain 16m 27s) Loss: 0.0013(0.0008) Grad: 96143.2188  LR: 0.000000  
Epoch: [3][16700/17875] Elapsed 215m 48s (remain 15m 10s) Loss: 0.0000(0.0008) Grad: 36.9837  LR: 0.000000  
Epoch: [3][16800/17875] Elapsed 217m 4s (remain 13m 52s) Loss: 0.0068(0.0008) Grad: 709440.6250  LR: 0.000000  
Epoch: [3][16900/17875] Elapsed 218m 22s (remain 12m 35s) Loss: 0.0000(0.0008) Grad: 58.6580  LR: 0.000000  
Epoch: [3][17000/17875] Elapsed 219m 39s (remain 11m 17s) Loss: 0.0000(0.0008) Grad: 2093.7842  LR: 0.000000  
Epoch: [3][17100/17875] Elapsed 220m 56s (remain 9m 59s) Loss: 0.0004(0.0008) Grad: 43091.9062  LR: 0.000000  
Epoch: [3][17200/17875] Elapsed 222m 13s (remain 8m 42s) Loss: 0.0003(0.0008) Grad: 24305.4062  LR: 0.000000  
Epoch: [3][17300/17875] Elapsed 223m 30s (remain 7m 24s) Loss: 0.0081(0.0008) Grad: 443091.2500  LR: 0.000000  
Epoch: [3][17400/17875] Elapsed 224m 47s (remain 6m 7s) Loss: 0.0000(0.0008) Grad: 74.4774  LR: 0.000000  
Epoch: [3][17500/17875] Elapsed 226m 4s (remain 4m 49s) Loss: 0.0000(0.0008) Grad: 71.5859  LR: 0.000000  
Epoch: [3][17600/17875] Elapsed 227m 23s (remain 3m 32s) Loss: 0.0003(0.0008) Grad: 18716.7949  LR: 0.000000  
Epoch: [3][17700/17875] Elapsed 228m 40s (remain 2m 14s) Loss: 0.0002(0.0008) Grad: 28309.9922  LR: 0.000000  
Epoch: [3][17800/17875] Elapsed 229m 57s (remain 0m 57s) Loss: 0.0000(0.0008) Grad: 208.4890  LR: 0.000000  
Epoch: [3][17874/17875] Elapsed 230m 55s (remain 0m 0s) Loss: 0.0000(0.0008) Grad: 81.5451  LR: 0.000000  
EVAL: [0/1192] Elapsed 0m 1s (remain 24m 38s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 31s (remain 5m 35s) Loss: 0.0082(0.0027) 
EVAL: [200/1192] Elapsed 1m 1s (remain 5m 1s) Loss: 0.0063(0.0032) 
EVAL: [300/1192] Elapsed 1m 31s (remain 4m 30s) Loss: 0.0010(0.0034) 
EVAL: [400/1192] Elapsed 2m 1s (remain 4m 0s) Loss: 0.0038(0.0036) 
EVAL: [500/1192] Elapsed 2m 31s (remain 3m 28s) Loss: 0.0040(0.0034) 
EVAL: [600/1192] Elapsed 3m 1s (remain 2m 58s) Loss: 0.0000(0.0036) 
EVAL: [700/1192] Elapsed 3m 31s (remain 2m 28s) Loss: 0.0439(0.0042) 
EVAL: [800/1192] Elapsed 4m 2s (remain 1m 58s) Loss: 0.0029(0.0044) 
EVAL: [900/1192] Elapsed 4m 32s (remain 1m 27s) Loss: 0.0003(0.0044) 
EVAL: [1000/1192] Elapsed 5m 1s (remain 0m 57s) Loss: 0.0000(0.0043) 
EVAL: [1100/1192] Elapsed 5m 32s (remain 0m 27s) Loss: 0.0000(0.0041) 
EVAL: [1191/1192] Elapsed 5m 59s (remain 0m 0s) Loss: 0.0000(0.0040) 
Epoch 3 - avg_train_loss: 0.0008  avg_val_loss: 0.0040  time: 14219s
Epoch 3 - Score: 0.8900
Epoch 3 - Save Best Score: 0.8900 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 579619.24it/s]
100%
612602/612602 [00:44<00:00, 15695.61it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(64350, 11)
Epoch: [4][0/21450] Elapsed 0m 2s (remain 819m 30s) Loss: 0.0000(0.0000) Grad: 130.4985  LR: 0.000000  
Epoch: [4][100/21450] Elapsed 1m 18s (remain 277m 56s) Loss: 0.0000(0.0007) Grad: 192.7923  LR: 0.000000  
Epoch: [4][200/21450] Elapsed 2m 35s (remain 274m 14s) Loss: 0.0000(0.0008) Grad: 136.7366  LR: 0.000000  
Epoch: [4][300/21450] Elapsed 3m 52s (remain 272m 3s) Loss: 0.0067(0.0008) Grad: 8584.9678  LR: 0.000000  
Epoch: [4][400/21450] Elapsed 5m 10s (remain 271m 44s) Loss: 0.0000(0.0009) Grad: 7.8528  LR: 0.000000  
Epoch: [4][500/21450] Elapsed 6m 28s (remain 270m 57s) Loss: 0.0000(0.0009) Grad: 1.9857  LR: 0.000000  
Epoch: [4][600/21450] Elapsed 7m 47s (remain 270m 1s) Loss: 0.0001(0.0009) Grad: 1004.7427  LR: 0.000000  
Epoch: [4][700/21450] Elapsed 9m 5s (remain 268m 59s) Loss: 0.0000(0.0008) Grad: 9.2285  LR: 0.000000  
Epoch: [4][800/21450] Elapsed 10m 22s (remain 267m 34s) Loss: 0.0002(0.0009) Grad: 426.9514  LR: 0.000000  
Epoch: [4][900/21450] Elapsed 11m 41s (remain 266m 39s) Loss: 0.0000(0.0009) Grad: 3.7322  LR: 0.000000  
Epoch: [4][1000/21450] Elapsed 12m 59s (remain 265m 18s) Loss: 0.0000(0.0008) Grad: 77.4158  LR: 0.000000  
Epoch: [4][1100/21450] Elapsed 14m 17s (remain 264m 9s) Loss: 0.0000(0.0008) Grad: 32.1235  LR: 0.000000  
Epoch: [4][1200/21450] Elapsed 15m 34s (remain 262m 40s) Loss: 0.0000(0.0008) Grad: 26.9726  LR: 0.000000  
Epoch: [4][1300/21450] Elapsed 16m 51s (remain 261m 10s) Loss: 0.0002(0.0008) Grad: 1149.4115  LR: 0.000000  
Epoch: [4][1400/21450] Elapsed 18m 8s (remain 259m 33s) Loss: 0.0000(0.0008) Grad: 70.2000  LR: 0.000000  
Epoch: [4][1500/21450] Elapsed 19m 27s (remain 258m 33s) Loss: 0.0035(0.0008) Grad: 23827.8164  LR: 0.000000  
Epoch: [4][1600/21450] Elapsed 20m 45s (remain 257m 21s) Loss: 0.0000(0.0008) Grad: 476.1524  LR: 0.000000  
Epoch: [4][1700/21450] Elapsed 22m 3s (remain 256m 6s) Loss: 0.0000(0.0008) Grad: 34.0231  LR: 0.000000  
Epoch: [4][1800/21450] Elapsed 23m 21s (remain 254m 52s) Loss: 0.0000(0.0008) Grad: 4.7349  LR: 0.000000  
Epoch: [4][1900/21450] Elapsed 24m 40s (remain 253m 41s) Loss: 0.0000(0.0008) Grad: 0.8111  LR: 0.000000  
Epoch: [4][2000/21450] Elapsed 25m 57s (remain 252m 21s) Loss: 0.0013(0.0008) Grad: 5295.5664  LR: 0.000000  
Epoch: [4][2100/21450] Elapsed 27m 16s (remain 251m 11s) Loss: 0.0000(0.0008) Grad: 228.0097  LR: 0.000000  
Epoch: [4][2200/21450] Elapsed 28m 34s (remain 249m 50s) Loss: 0.0000(0.0008) Grad: 507.0571  LR: 0.000000  
Epoch: [4][2300/21450] Elapsed 29m 51s (remain 248m 29s) Loss: 0.0005(0.0008) Grad: 1375.2433  LR: 0.000000  
Epoch: [4][2400/21450] Elapsed 31m 9s (remain 247m 12s) Loss: 0.0000(0.0008) Grad: 7.8153  LR: 0.000000  
Epoch: [4][2500/21450] Elapsed 32m 26s (remain 245m 49s) Loss: 0.0000(0.0008) Grad: 4.0632  LR: 0.000000  
Epoch: [4][2600/21450] Elapsed 33m 43s (remain 244m 21s) Loss: 0.0002(0.0008) Grad: 1934.3264  LR: 0.000000  
Epoch: [4][2700/21450] Elapsed 34m 59s (remain 242m 51s) Loss: 0.0000(0.0008) Grad: 15.3009  LR: 0.000000  
Epoch: [4][2800/21450] Elapsed 36m 15s (remain 241m 26s) Loss: 0.0000(0.0008) Grad: 428.2397  LR: 0.000000  
Epoch: [4][2900/21450] Elapsed 37m 33s (remain 240m 9s) Loss: 0.0000(0.0008) Grad: 6.8104  LR: 0.000000  
Epoch: [4][3000/21450] Elapsed 38m 51s (remain 238m 52s) Loss: 0.0005(0.0008) Grad: 1520.9482  LR: 0.000000  
Epoch: [4][3100/21450] Elapsed 40m 9s (remain 237m 36s) Loss: 0.0000(0.0008) Grad: 9.0411  LR: 0.000000  
Epoch: [4][3200/21450] Elapsed 41m 28s (remain 236m 24s) Loss: 0.0000(0.0008) Grad: 21.6279  LR: 0.000000  
Epoch: [4][3300/21450] Elapsed 42m 46s (remain 235m 13s) Loss: 0.0000(0.0008) Grad: 10.0688  LR: 0.000000  
Epoch: [4][3400/21450] Elapsed 44m 3s (remain 233m 49s) Loss: 0.0000(0.0007) Grad: 121.7858  LR: 0.000000  
Epoch: [4][3500/21450] Elapsed 45m 21s (remain 232m 32s) Loss: 0.0000(0.0007) Grad: 10.2706  LR: 0.000000  
Epoch: [4][3600/21450] Elapsed 46m 37s (remain 231m 7s) Loss: 0.0012(0.0008) Grad: 6449.7275  LR: 0.000000  
Epoch: [4][3700/21450] Elapsed 47m 53s (remain 229m 42s) Loss: 0.0000(0.0007) Grad: 164.5433  LR: 0.000000  
Epoch: [4][3800/21450] Elapsed 49m 11s (remain 228m 26s) Loss: 0.0000(0.0007) Grad: 244.3186  LR: 0.000000  
Epoch: [4][3900/21450] Elapsed 50m 28s (remain 227m 4s) Loss: 0.0003(0.0007) Grad: 474.5417  LR: 0.000000  
Epoch: [4][4000/21450] Elapsed 51m 46s (remain 225m 49s) Loss: 0.0000(0.0008) Grad: 5.9556  LR: 0.000000  
Epoch: [4][4100/21450] Elapsed 53m 4s (remain 224m 33s) Loss: 0.0001(0.0007) Grad: 1698.8973  LR: 0.000000  
Epoch: [4][4200/21450] Elapsed 54m 21s (remain 223m 11s) Loss: 0.0000(0.0008) Grad: 1320.3151  LR: 0.000000  
Epoch: [4][4300/21450] Elapsed 55m 39s (remain 221m 55s) Loss: 0.0008(0.0008) Grad: 6147.1377  LR: 0.000000  
Epoch: [4][4400/21450] Elapsed 56m 56s (remain 220m 34s) Loss: 0.0029(0.0008) Grad: 12827.6709  LR: 0.000000  
Epoch: [4][4500/21450] Elapsed 58m 13s (remain 219m 14s) Loss: 0.0008(0.0008) Grad: 12153.7207  LR: 0.000000  
Epoch: [4][4600/21450] Elapsed 59m 29s (remain 217m 52s) Loss: 0.0000(0.0008) Grad: 12.2381  LR: 0.000000  
Epoch: [4][4700/21450] Elapsed 60m 46s (remain 216m 32s) Loss: 0.0022(0.0007) Grad: 35527.1484  LR: 0.000000  
Epoch: [4][4800/21450] Elapsed 62m 4s (remain 215m 17s) Loss: 0.0000(0.0007) Grad: 18.2446  LR: 0.000000  
Epoch: [4][4900/21450] Elapsed 63m 22s (remain 214m 0s) Loss: 0.0000(0.0007) Grad: 176.9759  LR: 0.000000  
Epoch: [4][5000/21450] Elapsed 64m 40s (remain 212m 44s) Loss: 0.0000(0.0007) Grad: 102.5447  LR: 0.000000  
Epoch: [4][5100/21450] Elapsed 65m 57s (remain 211m 23s) Loss: 0.0000(0.0007) Grad: 12.9389  LR: 0.000000  
Epoch: [4][5200/21450] Elapsed 67m 14s (remain 210m 4s) Loss: 0.0000(0.0007) Grad: 7.2241  LR: 0.000000  
Epoch: [4][5300/21450] Elapsed 68m 32s (remain 208m 49s) Loss: 0.0015(0.0007) Grad: 9046.7041  LR: 0.000000  
Epoch: [4][5400/21450] Elapsed 69m 49s (remain 207m 27s) Loss: 0.0000(0.0007) Grad: 210.6129  LR: 0.000000  
Epoch: [4][5500/21450] Elapsed 71m 5s (remain 206m 8s) Loss: 0.0000(0.0007) Grad: 102.2477  LR: 0.000000  
Epoch: [4][5600/21450] Elapsed 72m 22s (remain 204m 48s) Loss: 0.0000(0.0007) Grad: 22.9715  LR: 0.000000  
Epoch: [4][5700/21450] Elapsed 73m 40s (remain 203m 31s) Loss: 0.0000(0.0007) Grad: 192.1424  LR: 0.000000  
Epoch: [4][5800/21450] Elapsed 74m 58s (remain 202m 14s) Loss: 0.0002(0.0007) Grad: 757.9785  LR: 0.000000  
Epoch: [4][5900/21450] Elapsed 76m 16s (remain 200m 58s) Loss: 0.0000(0.0008) Grad: 9.9277  LR: 0.000000  
Epoch: [4][6000/21450] Elapsed 77m 34s (remain 199m 43s) Loss: 0.0017(0.0008) Grad: 30547.6973  LR: 0.000000  
Epoch: [4][6100/21450] Elapsed 78m 52s (remain 198m 26s) Loss: 0.0000(0.0007) Grad: 87.6435  LR: 0.000000  
Epoch: [4][6200/21450] Elapsed 80m 10s (remain 197m 9s) Loss: 0.0002(0.0007) Grad: 586.2679  LR: 0.000000  
Epoch: [4][6300/21450] Elapsed 81m 27s (remain 195m 50s) Loss: 0.0000(0.0007) Grad: 19.0886  LR: 0.000000  
Epoch: [4][6400/21450] Elapsed 82m 44s (remain 194m 32s) Loss: 0.0000(0.0007) Grad: 5.8749  LR: 0.000000  
Epoch: [4][6500/21450] Elapsed 84m 2s (remain 193m 14s) Loss: 0.0000(0.0007) Grad: 13.2197  LR: 0.000000  
Epoch: [4][6600/21450] Elapsed 85m 19s (remain 191m 57s) Loss: 0.0000(0.0007) Grad: 273.4685  LR: 0.000000  
Epoch: [4][6700/21450] Elapsed 86m 38s (remain 190m 41s) Loss: 0.0074(0.0007) Grad: 10386.1826  LR: 0.000000  
Epoch: [4][6800/21450] Elapsed 87m 55s (remain 189m 23s) Loss: 0.0005(0.0007) Grad: 9336.2715  LR: 0.000000  
Epoch: [4][6900/21450] Elapsed 89m 12s (remain 188m 3s) Loss: 0.0059(0.0007) Grad: 56463.8555  LR: 0.000000  
Epoch: [4][7000/21450] Elapsed 90m 29s (remain 186m 44s) Loss: 0.0000(0.0007) Grad: 175.6184  LR: 0.000000  
Epoch: [4][7100/21450] Elapsed 91m 46s (remain 185m 26s) Loss: 0.0000(0.0008) Grad: 88.4686  LR: 0.000000  
Epoch: [4][7200/21450] Elapsed 93m 3s (remain 184m 8s) Loss: 0.0000(0.0008) Grad: 417.6689  LR: 0.000000  
Epoch: [4][7300/21450] Elapsed 94m 22s (remain 182m 53s) Loss: 0.0000(0.0008) Grad: 208.4493  LR: 0.000000  
Epoch: [4][7400/21450] Elapsed 95m 40s (remain 181m 36s) Loss: 0.0003(0.0008) Grad: 2905.6067  LR: 0.000000  
Epoch: [4][7500/21450] Elapsed 96m 57s (remain 180m 19s) Loss: 0.0002(0.0008) Grad: 9444.1992  LR: 0.000000  
Epoch: [4][7600/21450] Elapsed 98m 16s (remain 179m 2s) Loss: 0.0046(0.0008) Grad: 41333.3164  LR: 0.000000  
Epoch: [4][7700/21450] Elapsed 99m 33s (remain 177m 45s) Loss: 0.0000(0.0008) Grad: 82.5277  LR: 0.000000  
Epoch: [4][7800/21450] Elapsed 100m 52s (remain 176m 28s) Loss: 0.0000(0.0008) Grad: 8.9556  LR: 0.000000  
Epoch: [4][7900/21450] Elapsed 102m 10s (remain 175m 12s) Loss: 0.0003(0.0007) Grad: 3167.5818  LR: 0.000000  
Epoch: [4][8000/21450] Elapsed 103m 26s (remain 173m 52s) Loss: 0.0015(0.0007) Grad: 17490.4746  LR: 0.000000  
Epoch: [4][8100/21450] Elapsed 104m 44s (remain 172m 35s) Loss: 0.0001(0.0008) Grad: 1517.5447  LR: 0.000000  
Epoch: [4][8200/21450] Elapsed 106m 1s (remain 171m 18s) Loss: 0.0001(0.0008) Grad: 1396.6794  LR: 0.000000  
Epoch: [4][8300/21450] Elapsed 107m 18s (remain 169m 59s) Loss: 0.0057(0.0008) Grad: 34509.4648  LR: 0.000000  
Epoch: [4][8400/21450] Elapsed 108m 37s (remain 168m 43s) Loss: 0.0031(0.0008) Grad: 36348.5039  LR: 0.000000  
Epoch: [4][8500/21450] Elapsed 109m 54s (remain 167m 25s) Loss: 0.0000(0.0007) Grad: 161.3740  LR: 0.000000  
Epoch: [4][8600/21450] Elapsed 111m 12s (remain 166m 7s) Loss: 0.0002(0.0007) Grad: 3106.3264  LR: 0.000000  
Epoch: [4][8700/21450] Elapsed 112m 29s (remain 164m 49s) Loss: 0.0000(0.0007) Grad: 386.6935  LR: 0.000000  
Epoch: [4][8800/21450] Elapsed 113m 46s (remain 163m 30s) Loss: 0.0003(0.0007) Grad: 10531.2510  LR: 0.000000  
Epoch: [4][8900/21450] Elapsed 115m 3s (remain 162m 13s) Loss: 0.0040(0.0007) Grad: 52492.9180  LR: 0.000000  
Epoch: [4][9000/21450] Elapsed 116m 21s (remain 160m 56s) Loss: 0.0000(0.0007) Grad: 909.3931  LR: 0.000000  
Epoch: [4][9100/21450] Elapsed 117m 39s (remain 159m 38s) Loss: 0.0006(0.0007) Grad: 2350.8730  LR: 0.000000  
Epoch: [4][9200/21450] Elapsed 118m 56s (remain 158m 20s) Loss: 0.0000(0.0007) Grad: 17.7887  LR: 0.000000  
Epoch: [4][9300/21450] Elapsed 120m 14s (remain 157m 3s) Loss: 0.0000(0.0007) Grad: 14.0381  LR: 0.000000  
Epoch: [4][9400/21450] Elapsed 121m 32s (remain 155m 46s) Loss: 0.0028(0.0007) Grad: 9133.3896  LR: 0.000000  
Epoch: [4][9500/21450] Elapsed 122m 51s (remain 154m 30s) Loss: 0.0001(0.0007) Grad: 1186.7297  LR: 0.000000  
Epoch: [4][9600/21450] Elapsed 124m 10s (remain 153m 14s) Loss: 0.0002(0.0007) Grad: 13528.9814  LR: 0.000000  
Epoch: [4][9700/21450] Elapsed 125m 27s (remain 151m 56s) Loss: 0.0000(0.0007) Grad: 429.6886  LR: 0.000000  
Epoch: [4][9800/21450] Elapsed 126m 45s (remain 150m 39s) Loss: 0.0020(0.0007) Grad: 52960.6367  LR: 0.000000  
Epoch: [4][9900/21450] Elapsed 128m 2s (remain 149m 20s) Loss: 0.0000(0.0007) Grad: 1262.0208  LR: 0.000000  
Epoch: [4][10000/21450] Elapsed 129m 20s (remain 148m 3s) Loss: 0.0000(0.0007) Grad: 109.1262  LR: 0.000000  
Epoch: [4][10100/21450] Elapsed 130m 37s (remain 146m 45s) Loss: 0.0004(0.0007) Grad: 6791.2764  LR: 0.000000  
Epoch: [4][10200/21450] Elapsed 131m 55s (remain 145m 28s) Loss: 0.0000(0.0007) Grad: 279.9324  LR: 0.000000  
Epoch: [4][10300/21450] Elapsed 133m 12s (remain 144m 10s) Loss: 0.0000(0.0007) Grad: 1926.8011  LR: 0.000000  
Epoch: [4][10400/21450] Elapsed 134m 29s (remain 142m 51s) Loss: 0.0000(0.0007) Grad: 18.7093  LR: 0.000000  
Epoch: [4][10500/21450] Elapsed 135m 46s (remain 141m 34s) Loss: 0.0000(0.0007) Grad: 29.7202  LR: 0.000000  
Epoch: [4][10600/21450] Elapsed 137m 3s (remain 140m 15s) Loss: 0.0000(0.0007) Grad: 21.9371  LR: 0.000000  
Epoch: [4][10700/21450] Elapsed 138m 21s (remain 138m 58s) Loss: 0.0006(0.0007) Grad: 33499.5156  LR: 0.000000  
Epoch: [4][10800/21450] Elapsed 139m 39s (remain 137m 41s) Loss: 0.0010(0.0007) Grad: 20544.9844  LR: 0.000000  
Epoch: [4][10900/21450] Elapsed 140m 58s (remain 136m 25s) Loss: 0.0217(0.0007) Grad: 323393.9688  LR: 0.000000  
Epoch: [4][11000/21450] Elapsed 142m 16s (remain 135m 8s) Loss: 0.0002(0.0007) Grad: 9033.2568  LR: 0.000000  
Epoch: [4][11100/21450] Elapsed 143m 33s (remain 133m 49s) Loss: 0.0001(0.0007) Grad: 1166.8229  LR: 0.000000  
Epoch: [4][11200/21450] Elapsed 144m 51s (remain 132m 32s) Loss: 0.0198(0.0007) Grad: 118205.3438  LR: 0.000000  
Epoch: [4][11300/21450] Elapsed 146m 8s (remain 131m 14s) Loss: 0.0017(0.0007) Grad: 47773.5508  LR: 0.000000  
Epoch: [4][11400/21450] Elapsed 147m 25s (remain 129m 56s) Loss: 0.0003(0.0007) Grad: 9253.3086  LR: 0.000000  
Epoch: [4][11500/21450] Elapsed 148m 43s (remain 128m 39s) Loss: 0.0000(0.0007) Grad: 22.6999  LR: 0.000000  
Epoch: [4][11600/21450] Elapsed 150m 0s (remain 127m 21s) Loss: 0.0047(0.0007) Grad: 17858.7949  LR: 0.000000  
Epoch: [4][11700/21450] Elapsed 151m 18s (remain 126m 3s) Loss: 0.0001(0.0007) Grad: 1762.9871  LR: 0.000000  
Epoch: [4][11800/21450] Elapsed 152m 35s (remain 124m 45s) Loss: 0.0000(0.0007) Grad: 45.2227  LR: 0.000000  
Epoch: [4][11900/21450] Elapsed 153m 52s (remain 123m 28s) Loss: 0.0000(0.0007) Grad: 682.1640  LR: 0.000000  
Epoch: [4][12000/21450] Elapsed 155m 9s (remain 122m 10s) Loss: 0.0000(0.0007) Grad: 104.1736  LR: 0.000000  
Epoch: [4][12100/21450] Elapsed 156m 26s (remain 120m 52s) Loss: 0.0002(0.0007) Grad: 2244.7576  LR: 0.000000  
Epoch: [4][12200/21450] Elapsed 157m 45s (remain 119m 35s) Loss: 0.0000(0.0007) Grad: 34.7643  LR: 0.000000  
Epoch: [4][12300/21450] Elapsed 159m 3s (remain 118m 17s) Loss: 0.0000(0.0007) Grad: 101.4036  LR: 0.000000  
Epoch: [4][12400/21450] Elapsed 160m 21s (remain 117m 0s) Loss: 0.0006(0.0007) Grad: 14490.4453  LR: 0.000000  
Epoch: [4][12500/21450] Elapsed 161m 39s (remain 115m 43s) Loss: 0.0012(0.0007) Grad: 122271.6953  LR: 0.000000  
Epoch: [4][12600/21450] Elapsed 162m 57s (remain 114m 26s) Loss: 0.0050(0.0007) Grad: 77743.5000  LR: 0.000000  
Epoch: [4][12700/21450] Elapsed 164m 14s (remain 113m 8s) Loss: 0.0010(0.0007) Grad: 70036.8125  LR: 0.000000  
Epoch: [4][12800/21450] Elapsed 165m 32s (remain 111m 50s) Loss: 0.0226(0.0007) Grad: 160888.4844  LR: 0.000000  
Epoch: [4][12900/21450] Elapsed 166m 48s (remain 110m 32s) Loss: 0.0092(0.0007) Grad: 131648.3906  LR: 0.000000  
Epoch: [4][13000/21450] Elapsed 168m 6s (remain 109m 14s) Loss: 0.0000(0.0007) Grad: 57.2520  LR: 0.000000  
Epoch: [4][13100/21450] Elapsed 169m 24s (remain 107m 57s) Loss: 0.0000(0.0007) Grad: 327.7594  LR: 0.000000  
Epoch: [4][13200/21450] Elapsed 170m 42s (remain 106m 40s) Loss: 0.0000(0.0007) Grad: 66.6325  LR: 0.000000  
Epoch: [4][13300/21450] Elapsed 171m 59s (remain 105m 22s) Loss: 0.0000(0.0007) Grad: 29.8549  LR: 0.000000  
Epoch: [4][13400/21450] Elapsed 173m 16s (remain 104m 4s) Loss: 0.0000(0.0007) Grad: 51.6258  LR: 0.000000  
Epoch: [4][13500/21450] Elapsed 174m 34s (remain 102m 46s) Loss: 0.0000(0.0007) Grad: 556.2509  LR: 0.000000  
Epoch: [4][13600/21450] Elapsed 175m 51s (remain 101m 28s) Loss: 0.0000(0.0007) Grad: 34.0636  LR: 0.000000  
Epoch: [4][13700/21450] Elapsed 177m 8s (remain 100m 11s) Loss: 0.0034(0.0007) Grad: 19182.3125  LR: 0.000000  
Epoch: [4][13800/21450] Elapsed 178m 25s (remain 98m 53s) Loss: 0.0000(0.0007) Grad: 3422.3420  LR: 0.000000  
Epoch: [4][13900/21450] Elapsed 179m 43s (remain 97m 35s) Loss: 0.0004(0.0007) Grad: 3884.5222  LR: 0.000000  
Epoch: [4][14000/21450] Elapsed 181m 0s (remain 96m 18s) Loss: 0.0002(0.0007) Grad: 13695.7266  LR: 0.000000  
Epoch: [4][14100/21450] Elapsed 182m 17s (remain 95m 0s) Loss: 0.0000(0.0007) Grad: 197.6814  LR: 0.000000  
Epoch: [4][14200/21450] Elapsed 183m 37s (remain 93m 43s) Loss: 0.0005(0.0007) Grad: 38098.8945  LR: 0.000000  
Epoch: [4][14300/21450] Elapsed 184m 55s (remain 92m 26s) Loss: 0.0000(0.0007) Grad: 6.9976  LR: 0.000000  
Epoch: [4][14400/21450] Elapsed 186m 13s (remain 91m 9s) Loss: 0.0004(0.0007) Grad: 30694.2988  LR: 0.000000  
Epoch: [4][14500/21450] Elapsed 187m 31s (remain 89m 51s) Loss: 0.0007(0.0007) Grad: 30455.3809  LR: 0.000000  
Epoch: [4][14600/21450] Elapsed 188m 48s (remain 88m 33s) Loss: 0.0000(0.0007) Grad: 20.5560  LR: 0.000000  
Epoch: [4][14700/21450] Elapsed 190m 6s (remain 87m 16s) Loss: 0.0006(0.0007) Grad: 47630.1523  LR: 0.000000  
Epoch: [4][14800/21450] Elapsed 191m 23s (remain 85m 58s) Loss: 0.0000(0.0007) Grad: 2034.7029  LR: 0.000000  
Epoch: [4][14900/21450] Elapsed 192m 40s (remain 84m 41s) Loss: 0.0000(0.0007) Grad: 20.3940  LR: 0.000000  
Epoch: [4][15000/21450] Elapsed 193m 57s (remain 83m 23s) Loss: 0.0001(0.0007) Grad: 3824.1240  LR: 0.000000  
Epoch: [4][15100/21450] Elapsed 195m 15s (remain 82m 5s) Loss: 0.0000(0.0007) Grad: 51.3948  LR: 0.000000  
Epoch: [4][15200/21450] Elapsed 196m 33s (remain 80m 48s) Loss: 0.0000(0.0007) Grad: 177.9961  LR: 0.000000  
Epoch: [4][15300/21450] Elapsed 197m 51s (remain 79m 30s) Loss: 0.0002(0.0007) Grad: 27316.6016  LR: 0.000000  
Epoch: [4][15400/21450] Elapsed 199m 9s (remain 78m 13s) Loss: 0.0000(0.0007) Grad: 170.1923  LR: 0.000000  
Epoch: [4][15500/21450] Elapsed 200m 26s (remain 76m 55s) Loss: 0.0000(0.0007) Grad: 104.8265  LR: 0.000000  
Epoch: [4][15600/21450] Elapsed 201m 45s (remain 75m 38s) Loss: 0.0001(0.0007) Grad: 8510.6406  LR: 0.000000  
Epoch: [4][15700/21450] Elapsed 203m 2s (remain 74m 20s) Loss: 0.0000(0.0007) Grad: 63.3470  LR: 0.000000  
Epoch: [4][15800/21450] Elapsed 204m 19s (remain 73m 2s) Loss: 0.0000(0.0007) Grad: 59.8440  LR: 0.000000  
Epoch: [4][15900/21450] Elapsed 205m 36s (remain 71m 45s) Loss: 0.0011(0.0007) Grad: 71626.5625  LR: 0.000000  
Epoch: [4][16000/21450] Elapsed 206m 54s (remain 70m 27s) Loss: 0.0005(0.0007) Grad: 205111.4219  LR: 0.000000  
Epoch: [4][16100/21450] Elapsed 208m 12s (remain 69m 10s) Loss: 0.0003(0.0007) Grad: 43676.2109  LR: 0.000000  
Epoch: [4][16200/21450] Elapsed 209m 29s (remain 67m 52s) Loss: 0.0000(0.0007) Grad: 1838.0155  LR: 0.000000  
Epoch: [4][16300/21450] Elapsed 210m 47s (remain 66m 35s) Loss: 0.0000(0.0007) Grad: 35.7423  LR: 0.000000  
Epoch: [4][16400/21450] Elapsed 212m 5s (remain 65m 17s) Loss: 0.0042(0.0007) Grad: 477553.7500  LR: 0.000000  
Epoch: [4][16500/21450] Elapsed 213m 22s (remain 63m 59s) Loss: 0.0000(0.0007) Grad: 212.8207  LR: 0.000000  
Epoch: [4][16600/21450] Elapsed 214m 41s (remain 62m 42s) Loss: 0.0004(0.0007) Grad: 50929.7266  LR: 0.000000  
Epoch: [4][16700/21450] Elapsed 215m 58s (remain 61m 24s) Loss: 0.0004(0.0007) Grad: 77407.6641  LR: 0.000000  
Epoch: [4][16800/21450] Elapsed 217m 17s (remain 60m 7s) Loss: 0.0000(0.0007) Grad: 5354.3115  LR: 0.000000  
Epoch: [4][16900/21450] Elapsed 218m 34s (remain 58m 49s) Loss: 0.0007(0.0007) Grad: 102130.7656  LR: 0.000000  
Epoch: [4][17000/21450] Elapsed 219m 52s (remain 57m 32s) Loss: 0.0010(0.0007) Grad: 38591.8281  LR: 0.000000  
Epoch: [4][17100/21450] Elapsed 221m 10s (remain 56m 14s) Loss: 0.0000(0.0007) Grad: 66.5112  LR: 0.000000  
Epoch: [4][17200/21450] Elapsed 222m 28s (remain 54m 57s) Loss: 0.0000(0.0007) Grad: 1216.1365  LR: 0.000000  
Epoch: [4][17300/21450] Elapsed 223m 46s (remain 53m 39s) Loss: 0.0000(0.0007) Grad: 54.3840  LR: 0.000000  
Epoch: [4][17400/21450] Elapsed 225m 4s (remain 52m 22s) Loss: 0.0000(0.0007) Grad: 182.2460  LR: 0.000000  
Epoch: [4][17500/21450] Elapsed 226m 22s (remain 51m 4s) Loss: 0.0008(0.0007) Grad: 163234.0781  LR: 0.000000  
Epoch: [4][17600/21450] Elapsed 227m 41s (remain 49m 47s) Loss: 0.0000(0.0007) Grad: 64.2097  LR: 0.000000  
Epoch: [4][17700/21450] Elapsed 228m 59s (remain 48m 29s) Loss: 0.0000(0.0007) Grad: 153.5677  LR: 0.000000  
Epoch: [4][17800/21450] Elapsed 230m 17s (remain 47m 12s) Loss: 0.0000(0.0007) Grad: 42.1741  LR: 0.000000  
Epoch: [4][17900/21450] Elapsed 231m 34s (remain 45m 54s) Loss: 0.0002(0.0007) Grad: 28098.7754  LR: 0.000000  
Epoch: [4][18000/21450] Elapsed 232m 51s (remain 44m 37s) Loss: 0.0000(0.0007) Grad: 339.8792  LR: 0.000000  
Epoch: [4][18100/21450] Elapsed 234m 8s (remain 43m 19s) Loss: 0.0003(0.0007) Grad: 15866.1641  LR: 0.000000  
Epoch: [4][18200/21450] Elapsed 235m 25s (remain 42m 1s) Loss: 0.0000(0.0007) Grad: 268.0234  LR: 0.000000  
Epoch: [4][18300/21450] Elapsed 236m 43s (remain 40m 44s) Loss: 0.0041(0.0007) Grad: 202603.5156  LR: 0.000000  
Epoch: [4][18400/21450] Elapsed 238m 2s (remain 39m 26s) Loss: 0.0056(0.0007) Grad: 106306.6797  LR: 0.000000  
Epoch: [4][18500/21450] Elapsed 239m 20s (remain 38m 8s) Loss: 0.0000(0.0007) Grad: 19.0533  LR: 0.000000  
Epoch: [4][18600/21450] Elapsed 240m 38s (remain 36m 51s) Loss: 0.0000(0.0007) Grad: 53.8682  LR: 0.000000  
Epoch: [4][18700/21450] Elapsed 241m 56s (remain 35m 33s) Loss: 0.0000(0.0007) Grad: 469.4153  LR: 0.000000  
Epoch: [4][18800/21450] Elapsed 243m 14s (remain 34m 16s) Loss: 0.0035(0.0007) Grad: 322381.8750  LR: 0.000000  
Epoch: [4][18900/21450] Elapsed 244m 31s (remain 32m 58s) Loss: 0.0000(0.0007) Grad: 327.9095  LR: 0.000000  
Epoch: [4][19000/21450] Elapsed 245m 48s (remain 31m 40s) Loss: 0.0000(0.0007) Grad: 101.1666  LR: 0.000000  
Epoch: [4][19100/21450] Elapsed 247m 6s (remain 30m 23s) Loss: 0.0000(0.0007) Grad: 46.8763  LR: 0.000000  
Epoch: [4][19200/21450] Elapsed 248m 24s (remain 29m 5s) Loss: 0.0012(0.0007) Grad: 56789.9648  LR: 0.000000  
Epoch: [4][19300/21450] Elapsed 249m 41s (remain 27m 48s) Loss: 0.0001(0.0007) Grad: 9229.4248  LR: 0.000000  
Epoch: [4][19400/21450] Elapsed 250m 58s (remain 26m 30s) Loss: 0.0000(0.0007) Grad: 35.2031  LR: 0.000000  
Epoch: [4][19500/21450] Elapsed 252m 15s (remain 25m 12s) Loss: 0.0032(0.0007) Grad: 107756.2109  LR: 0.000000  
Epoch: [4][19600/21450] Elapsed 253m 33s (remain 23m 55s) Loss: 0.0000(0.0007) Grad: 997.9017  LR: 0.000000  
Epoch: [4][19700/21450] Elapsed 254m 50s (remain 22m 37s) Loss: 0.0000(0.0007) Grad: 72.9106  LR: 0.000000  
Epoch: [4][19800/21450] Elapsed 256m 7s (remain 21m 19s) Loss: 0.0008(0.0007) Grad: 74003.1875  LR: 0.000000  
Epoch: [4][19900/21450] Elapsed 257m 26s (remain 20m 2s) Loss: 0.0000(0.0007) Grad: 94.7285  LR: 0.000000  
Epoch: [4][20000/21450] Elapsed 258m 44s (remain 18m 44s) Loss: 0.0000(0.0007) Grad: 503.7823  LR: 0.000000  
Epoch: [4][20100/21450] Elapsed 260m 3s (remain 17m 27s) Loss: 0.0000(0.0007) Grad: 355.4509  LR: 0.000000  
Epoch: [4][20200/21450] Elapsed 261m 23s (remain 16m 9s) Loss: 0.0000(0.0007) Grad: 1781.8806  LR: 0.000000  
Epoch: [4][20300/21450] Elapsed 262m 42s (remain 14m 52s) Loss: 0.0070(0.0007) Grad: 81071.5000  LR: 0.000000  
Epoch: [4][20400/21450] Elapsed 263m 58s (remain 13m 34s) Loss: 0.0000(0.0007) Grad: 3453.9456  LR: 0.000000  
Epoch: [4][20500/21450] Elapsed 265m 16s (remain 12m 16s) Loss: 0.0001(0.0007) Grad: 1169.8729  LR: 0.000000  
Epoch: [4][20600/21450] Elapsed 266m 33s (remain 10m 59s) Loss: 0.0126(0.0007) Grad: 96687.1016  LR: 0.000000  
Epoch: [4][20700/21450] Elapsed 267m 51s (remain 9m 41s) Loss: 0.0001(0.0007) Grad: 6688.5635  LR: 0.000000  
Epoch: [4][20800/21450] Elapsed 269m 8s (remain 8m 23s) Loss: 0.0001(0.0007) Grad: 15191.7471  LR: 0.000000  
Epoch: [4][20900/21450] Elapsed 270m 25s (remain 7m 6s) Loss: 0.0000(0.0007) Grad: 250.4998  LR: 0.000000  
Epoch: [4][21000/21450] Elapsed 271m 41s (remain 5m 48s) Loss: 0.0000(0.0007) Grad: 53.8025  LR: 0.000000  
Epoch: [4][21100/21450] Elapsed 272m 59s (remain 4m 30s) Loss: 0.0000(0.0007) Grad: 26.3797  LR: 0.000000  
Epoch: [4][21200/21450] Elapsed 274m 16s (remain 3m 13s) Loss: 0.0000(0.0007) Grad: 90.7588  LR: 0.000000  
Epoch: [4][21300/21450] Elapsed 275m 33s (remain 1m 55s) Loss: 0.0003(0.0007) Grad: 11670.3828  LR: 0.000000  
Epoch: [4][21400/21450] Elapsed 276m 51s (remain 0m 38s) Loss: 0.0000(0.0007) Grad: 433.7635  LR: 0.000000  
Epoch: [4][21449/21450] Elapsed 277m 29s (remain 0m 0s) Loss: 0.0000(0.0007) Grad: 1021.2480  LR: 0.000000  
EVAL: [0/1192] Elapsed 0m 1s (remain 22m 4s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 30s (remain 5m 30s) Loss: 0.0082(0.0027) 
EVAL: [200/1192] Elapsed 1m 0s (remain 4m 58s) Loss: 0.0063(0.0032) 
EVAL: [300/1192] Elapsed 1m 30s (remain 4m 26s) Loss: 0.0010(0.0034) 
EVAL: [400/1192] Elapsed 1m 59s (remain 3m 55s) Loss: 0.0038(0.0036) 
EVAL: [500/1192] Elapsed 2m 29s (remain 3m 26s) Loss: 0.0040(0.0034) 
EVAL: [600/1192] Elapsed 2m 59s (remain 2m 56s) Loss: 0.0000(0.0036) 
EVAL: [700/1192] Elapsed 3m 29s (remain 2m 26s) Loss: 0.0439(0.0042) 
EVAL: [800/1192] Elapsed 3m 59s (remain 1m 56s) Loss: 0.0029(0.0044) 
EVAL: [900/1192] Elapsed 4m 28s (remain 1m 26s) Loss: 0.0003(0.0044) 
EVAL: [1000/1192] Elapsed 4m 58s (remain 0m 57s) Loss: 0.0000(0.0043) 
EVAL: [1100/1192] Elapsed 5m 28s (remain 0m 27s) Loss: 0.0000(0.0041) 
EVAL: [1191/1192] Elapsed 5m 55s (remain 0m 0s) Loss: 0.0000(0.0040) 
Epoch 4 - avg_train_loss: 0.0007  avg_val_loss: 0.0040  time: 17009s
Epoch 4 - Score: 0.8900
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_0.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_0.npy
100%
612602/612602 [00:01<00:00, 612172.50it/s]
100%
612602/612602 [00:43<00:00, 15180.39it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(75075, 11)
Epoch: [5][0/25025] Elapsed 0m 2s (remain 936m 2s) Loss: 0.0000(0.0000) Grad: 43.4833  LR: 0.000000  
Epoch: [5][100/25025] Elapsed 1m 18s (remain 323m 28s) Loss: 0.0000(0.0007) Grad: 4.1378  LR: 0.000000  
Epoch: [5][200/25025] Elapsed 2m 35s (remain 319m 45s) Loss: 0.0000(0.0007) Grad: 2.5292  LR: 0.000000  
Epoch: [5][300/25025] Elapsed 3m 53s (remain 319m 56s) Loss: 0.0000(0.0008) Grad: 0.6729  LR: 0.000000  
Epoch: [5][400/25025] Elapsed 5m 12s (remain 319m 50s) Loss: 0.0000(0.0007) Grad: 7.3249  LR: 0.000000  
Epoch: [5][500/25025] Elapsed 6m 30s (remain 318m 28s) Loss: 0.0030(0.0007) Grad: 24662.5371  LR: 0.000000  
Epoch: [5][600/25025] Elapsed 7m 47s (remain 316m 53s) Loss: 0.0001(0.0007) Grad: 2200.0884  LR: 0.000000  
Epoch: [5][700/25025] Elapsed 9m 4s (remain 315m 1s) Loss: 0.0000(0.0008) Grad: 65.5107  LR: 0.000000  
Epoch: [5][800/25025] Elapsed 10m 22s (remain 313m 37s) Loss: 0.0001(0.0008) Grad: 309.4701  LR: 0.000000  
Epoch: [5][900/25025] Elapsed 11m 39s (remain 312m 14s) Loss: 0.0000(0.0007) Grad: 13.2807  LR: 0.000000  
Epoch: [5][1000/25025] Elapsed 12m 57s (remain 311m 6s) Loss: 0.0020(0.0007) Grad: 7144.6606  LR: 0.000000  
Epoch: [5][1100/25025] Elapsed 14m 14s (remain 309m 29s) Loss: 0.0000(0.0007) Grad: 7.2384  LR: 0.000000  
Epoch: [5][1200/25025] Elapsed 15m 31s (remain 308m 1s) Loss: 0.0007(0.0007) Grad: 6010.6846  LR: 0.000000  
Epoch: [5][1300/25025] Elapsed 16m 47s (remain 306m 19s) Loss: 0.0000(0.0007) Grad: 20.1090  LR: 0.000000  
Epoch: [5][1400/25025] Elapsed 18m 4s (remain 304m 55s) Loss: 0.0000(0.0007) Grad: 28.4624  LR: 0.000000  
Epoch: [5][1500/25025] Elapsed 19m 23s (remain 303m 49s) Loss: 0.0000(0.0007) Grad: 1.9529  LR: 0.000000  
Epoch: [5][1600/25025] Elapsed 20m 40s (remain 302m 32s) Loss: 0.0082(0.0007) Grad: 7217.2021  LR: 0.000000  
Epoch: [5][1700/25025] Elapsed 21m 57s (remain 301m 4s) Loss: 0.0001(0.0007) Grad: 2042.3608  LR: 0.000000  
Epoch: [5][1800/25025] Elapsed 23m 13s (remain 299m 33s) Loss: 0.0007(0.0007) Grad: 7698.8901  LR: 0.000000  
Epoch: [5][1900/25025] Elapsed 24m 31s (remain 298m 15s) Loss: 0.0000(0.0007) Grad: 13.7512  LR: 0.000000  
Epoch: [5][2000/25025] Elapsed 25m 48s (remain 296m 58s) Loss: 0.0000(0.0007) Grad: 54.8488  LR: 0.000000  
Epoch: [5][2100/25025] Elapsed 27m 6s (remain 295m 48s) Loss: 0.0000(0.0007) Grad: 21.4730  LR: 0.000000  
Epoch: [5][2200/25025] Elapsed 28m 23s (remain 294m 27s) Loss: 0.0022(0.0007) Grad: 24179.1836  LR: 0.000000  
Epoch: [5][2300/25025] Elapsed 29m 40s (remain 293m 1s) Loss: 0.0001(0.0007) Grad: 723.5687  LR: 0.000000  
Epoch: [5][2400/25025] Elapsed 30m 58s (remain 291m 52s) Loss: 0.0056(0.0007) Grad: 47920.4961  LR: 0.000000  
Epoch: [5][2500/25025] Elapsed 32m 16s (remain 290m 39s) Loss: 0.0000(0.0007) Grad: 30.3348  LR: 0.000000  
Epoch: [5][2600/25025] Elapsed 33m 34s (remain 289m 25s) Loss: 0.0000(0.0007) Grad: 28.4856  LR: 0.000000  
Epoch: [5][2700/25025] Elapsed 34m 52s (remain 288m 12s) Loss: 0.0000(0.0007) Grad: 56.9761  LR: 0.000000  
Epoch: [5][2800/25025] Elapsed 36m 9s (remain 286m 55s) Loss: 0.0000(0.0007) Grad: 441.3403  LR: 0.000000  
Epoch: [5][2900/25025] Elapsed 37m 27s (remain 285m 42s) Loss: 0.0011(0.0007) Grad: 1852.5024  LR: 0.000000  
Epoch: [5][3000/25025] Elapsed 38m 46s (remain 284m 33s) Loss: 0.0002(0.0007) Grad: 3066.1211  LR: 0.000000  
Epoch: [5][3100/25025] Elapsed 40m 4s (remain 283m 23s) Loss: 0.0010(0.0007) Grad: 13417.5029  LR: 0.000000  
Epoch: [5][3200/25025] Elapsed 41m 22s (remain 282m 5s) Loss: 0.0000(0.0007) Grad: 24.2391  LR: 0.000000  
Epoch: [5][3300/25025] Elapsed 42m 38s (remain 280m 39s) Loss: 0.0000(0.0007) Grad: 4.3312  LR: 0.000000  
Epoch: [5][3400/25025] Elapsed 43m 55s (remain 279m 16s) Loss: 0.0000(0.0007) Grad: 0.5686  LR: 0.000000  
Epoch: [5][3500/25025] Elapsed 45m 11s (remain 277m 52s) Loss: 0.0114(0.0007) Grad: 12907.7119  LR: 0.000000  
Epoch: [5][3600/25025] Elapsed 46m 28s (remain 276m 29s) Loss: 0.0000(0.0007) Grad: 11.9636  LR: 0.000000  
Epoch: [5][3700/25025] Elapsed 47m 45s (remain 275m 7s) Loss: 0.0000(0.0007) Grad: 12.8407  LR: 0.000000  
Epoch: [5][3800/25025] Elapsed 49m 1s (remain 273m 45s) Loss: 0.0012(0.0007) Grad: 7599.5830  LR: 0.000000  
Epoch: [5][3900/25025] Elapsed 50m 17s (remain 272m 22s) Loss: 0.0008(0.0007) Grad: 2191.8201  LR: 0.000000  
Epoch: [5][4000/25025] Elapsed 51m 34s (remain 271m 2s) Loss: 0.0001(0.0007) Grad: 455.5618  LR: 0.000000  
Epoch: [5][4100/25025] Elapsed 52m 51s (remain 269m 43s) Loss: 0.0006(0.0007) Grad: 5984.0659  LR: 0.000000  
Epoch: [5][4200/25025] Elapsed 54m 8s (remain 268m 21s) Loss: 0.0000(0.0007) Grad: 47.0098  LR: 0.000000  
Epoch: [5][4300/25025] Elapsed 55m 23s (remain 266m 52s) Loss: 0.0002(0.0007) Grad: 6911.5215  LR: 0.000000  
Epoch: [5][4400/25025] Elapsed 56m 40s (remain 265m 37s) Loss: 0.0009(0.0007) Grad: 6812.3403  LR: 0.000000  
Epoch: [5][4500/25025] Elapsed 57m 57s (remain 264m 15s) Loss: 0.0000(0.0007) Grad: 169.6458  LR: 0.000000  
Epoch: [5][4600/25025] Elapsed 59m 13s (remain 262m 56s) Loss: 0.0002(0.0007) Grad: 2310.1719  LR: 0.000000  
Epoch: [5][4700/25025] Elapsed 60m 31s (remain 261m 39s) Loss: 0.0071(0.0007) Grad: 163688.6562  LR: 0.000000  
Epoch: [5][4800/25025] Elapsed 61m 48s (remain 260m 23s) Loss: 0.0000(0.0007) Grad: 341.1954  LR: 0.000000  
Epoch: [5][4900/25025] Elapsed 63m 6s (remain 259m 6s) Loss: 0.0007(0.0007) Grad: 6330.4785  LR: 0.000000  
Epoch: [5][5000/25025] Elapsed 64m 23s (remain 257m 49s) Loss: 0.0000(0.0007) Grad: 59.5521  LR: 0.000000  
Epoch: [5][5100/25025] Elapsed 65m 41s (remain 256m 34s) Loss: 0.0002(0.0007) Grad: 520.5857  LR: 0.000000  
Epoch: [5][5200/25025] Elapsed 66m 58s (remain 255m 16s) Loss: 0.0000(0.0007) Grad: 126.5924  LR: 0.000000  
Epoch: [5][5300/25025] Elapsed 68m 15s (remain 253m 58s) Loss: 0.0000(0.0007) Grad: 28.8682  LR: 0.000000  
Epoch: [5][5400/25025] Elapsed 69m 31s (remain 252m 38s) Loss: 0.0000(0.0007) Grad: 26.7194  LR: 0.000000  
Epoch: [5][5500/25025] Elapsed 70m 48s (remain 251m 19s) Loss: 0.0000(0.0007) Grad: 8.4843  LR: 0.000000  
Epoch: [5][5600/25025] Elapsed 72m 7s (remain 250m 7s) Loss: 0.0000(0.0007) Grad: 27.0610  LR: 0.000000  
Epoch: [5][5700/25025] Elapsed 73m 25s (remain 248m 51s) Loss: 0.0004(0.0007) Grad: 4521.1387  LR: 0.000000  
Epoch: [5][5800/25025] Elapsed 74m 43s (remain 247m 36s) Loss: 0.0000(0.0007) Grad: 13.4220  LR: 0.000000  
Epoch: [5][5900/25025] Elapsed 76m 0s (remain 246m 19s) Loss: 0.0000(0.0007) Grad: 53.8542  LR: 0.000000  
Epoch: [5][6000/25025] Elapsed 77m 16s (remain 244m 59s) Loss: 0.0000(0.0007) Grad: 12.7966  LR: 0.000000  
Epoch: [5][6100/25025] Elapsed 78m 33s (remain 243m 41s) Loss: 0.0000(0.0007) Grad: 22.5459  LR: 0.000000  
Epoch: [5][6200/25025] Elapsed 79m 52s (remain 242m 27s) Loss: 0.0003(0.0007) Grad: 1391.7233  LR: 0.000000  
Epoch: [5][6300/25025] Elapsed 81m 10s (remain 241m 12s) Loss: 0.0000(0.0007) Grad: 345.6735  LR: 0.000000  
Epoch: [5][6400/25025] Elapsed 82m 28s (remain 239m 59s) Loss: 0.0000(0.0007) Grad: 11.8377  LR: 0.000000  
Epoch: [5][6500/25025] Elapsed 83m 47s (remain 238m 45s) Loss: 0.0024(0.0007) Grad: 14816.3525  LR: 0.000000  
Epoch: [5][6600/25025] Elapsed 85m 5s (remain 237m 30s) Loss: 0.0161(0.0007) Grad: 34638.7422  LR: 0.000000  
Epoch: [5][6700/25025] Elapsed 86m 23s (remain 236m 13s) Loss: 0.0000(0.0007) Grad: 66.2619  LR: 0.000000  
Epoch: [5][6800/25025] Elapsed 87m 40s (remain 234m 57s) Loss: 0.0000(0.0007) Grad: 42.6357  LR: 0.000000  
Epoch: [5][6900/25025] Elapsed 88m 57s (remain 233m 37s) Loss: 0.0000(0.0007) Grad: 160.9540  LR: 0.000000  
Epoch: [5][7000/25025] Elapsed 90m 13s (remain 232m 17s) Loss: 0.0075(0.0007) Grad: 8826.5400  LR: 0.000000  
Epoch: [5][7100/25025] Elapsed 91m 29s (remain 230m 55s) Loss: 0.0005(0.0007) Grad: 2522.0146  LR: 0.000000  
Epoch: [5][7200/25025] Elapsed 92m 46s (remain 229m 38s) Loss: 0.0045(0.0007) Grad: 137346.2344  LR: 0.000000  
Epoch: [5][7300/25025] Elapsed 94m 3s (remain 228m 21s) Loss: 0.0000(0.0007) Grad: 17.6222  LR: 0.000000  
Epoch: [5][7400/25025] Elapsed 95m 22s (remain 227m 6s) Loss: 0.0000(0.0007) Grad: 17.3497  LR: 0.000000  
Epoch: [5][7500/25025] Elapsed 96m 40s (remain 225m 50s) Loss: 0.0002(0.0007) Grad: 13025.0947  LR: 0.000000  
Epoch: [5][7600/25025] Elapsed 97m 57s (remain 224m 34s) Loss: 0.0000(0.0007) Grad: 12.9464  LR: 0.000000  
Epoch: [5][7700/25025] Elapsed 99m 15s (remain 223m 17s) Loss: 0.0000(0.0007) Grad: 6.6502  LR: 0.000000  
Epoch: [5][7800/25025] Elapsed 100m 32s (remain 222m 0s) Loss: 0.0000(0.0007) Grad: 1.4819  LR: 0.000000  
Epoch: [5][7900/25025] Elapsed 101m 50s (remain 220m 43s) Loss: 0.0000(0.0007) Grad: 499.1166  LR: 0.000000  
Epoch: [5][8000/25025] Elapsed 103m 8s (remain 219m 27s) Loss: 0.0019(0.0007) Grad: 65209.4961  LR: 0.000000  
Epoch: [5][8100/25025] Elapsed 104m 25s (remain 218m 8s) Loss: 0.0000(0.0007) Grad: 195.0323  LR: 0.000000  
Epoch: [5][8200/25025] Elapsed 105m 43s (remain 216m 52s) Loss: 0.0000(0.0007) Grad: 13.9470  LR: 0.000000  
Epoch: [5][8300/25025] Elapsed 107m 0s (remain 215m 34s) Loss: 0.0000(0.0007) Grad: 112.8262  LR: 0.000000  
Epoch: [5][8400/25025] Elapsed 108m 18s (remain 214m 18s) Loss: 0.0000(0.0007) Grad: 197.9868  LR: 0.000000  
Epoch: [5][8500/25025] Elapsed 109m 36s (remain 213m 3s) Loss: 0.0065(0.0007) Grad: 28886.9707  LR: 0.000000  
Epoch: [5][8600/25025] Elapsed 110m 54s (remain 211m 46s) Loss: 0.0000(0.0007) Grad: 8.7555  LR: 0.000000  
Epoch: [5][8700/25025] Elapsed 112m 13s (remain 210m 32s) Loss: 0.0001(0.0007) Grad: 2425.4644  LR: 0.000000  
Epoch: [5][8800/25025] Elapsed 113m 31s (remain 209m 15s) Loss: 0.0000(0.0007) Grad: 107.5461  LR: 0.000000  
Epoch: [5][8900/25025] Elapsed 114m 48s (remain 207m 58s) Loss: 0.0000(0.0007) Grad: 37.9211  LR: 0.000000  
Epoch: [5][9000/25025] Elapsed 116m 4s (remain 206m 39s) Loss: 0.0000(0.0007) Grad: 26.1125  LR: 0.000000  
Epoch: [5][9100/25025] Elapsed 117m 22s (remain 205m 22s) Loss: 0.0000(0.0007) Grad: 157.9989  LR: 0.000000  
Epoch: [5][9200/25025] Elapsed 118m 39s (remain 204m 3s) Loss: 0.0020(0.0007) Grad: 66917.7031  LR: 0.000000  
Epoch: [5][9300/25025] Elapsed 119m 56s (remain 202m 46s) Loss: 0.0000(0.0007) Grad: 13.6661  LR: 0.000000  
Epoch: [5][9400/25025] Elapsed 121m 15s (remain 201m 31s) Loss: 0.0010(0.0007) Grad: 52417.0898  LR: 0.000000  
Epoch: [5][9500/25025] Elapsed 122m 34s (remain 200m 17s) Loss: 0.0000(0.0007) Grad: 260.0587  LR: 0.000000  
Epoch: [5][9600/25025] Elapsed 123m 52s (remain 199m 0s) Loss: 0.0000(0.0007) Grad: 1738.1484  LR: 0.000000  
Epoch: [5][9700/25025] Elapsed 125m 10s (remain 197m 43s) Loss: 0.0098(0.0007) Grad: 87674.7578  LR: 0.000000  
Epoch: [5][9800/25025] Elapsed 126m 28s (remain 196m 27s) Loss: 0.0000(0.0007) Grad: 21.9282  LR: 0.000000  
Epoch: [5][9900/25025] Elapsed 127m 46s (remain 195m 10s) Loss: 0.0029(0.0007) Grad: 56512.7461  LR: 0.000000  
Epoch: [5][10000/25025] Elapsed 129m 4s (remain 193m 54s) Loss: 0.0000(0.0007) Grad: 45.1907  LR: 0.000000  
Epoch: [5][10100/25025] Elapsed 130m 22s (remain 192m 38s) Loss: 0.0000(0.0007) Grad: 9.5001  LR: 0.000000  
Epoch: [5][10200/25025] Elapsed 131m 40s (remain 191m 21s) Loss: 0.0009(0.0007) Grad: 25627.9141  LR: 0.000000  
Epoch: [5][10300/25025] Elapsed 132m 58s (remain 190m 4s) Loss: 0.0108(0.0007) Grad: 187492.5781  LR: 0.000000  
Epoch: [5][10400/25025] Elapsed 134m 16s (remain 188m 47s) Loss: 0.0000(0.0007) Grad: 26.8073  LR: 0.000000  
Epoch: [5][10500/25025] Elapsed 135m 33s (remain 187m 29s) Loss: 0.0004(0.0007) Grad: 6146.3730  LR: 0.000000  
Epoch: [5][10600/25025] Elapsed 136m 50s (remain 186m 11s) Loss: 0.0007(0.0007) Grad: 11441.9639  LR: 0.000000  
Epoch: [5][10700/25025] Elapsed 138m 7s (remain 184m 53s) Loss: 0.0002(0.0007) Grad: 2431.6096  LR: 0.000000  
Epoch: [5][10800/25025] Elapsed 139m 25s (remain 183m 36s) Loss: 0.0001(0.0007) Grad: 451.5365  LR: 0.000000  
Epoch: [5][10900/25025] Elapsed 140m 42s (remain 182m 18s) Loss: 0.0000(0.0007) Grad: 659.0914  LR: 0.000000  
Epoch: [5][11000/25025] Elapsed 141m 59s (remain 181m 0s) Loss: 0.0000(0.0007) Grad: 19.0376  LR: 0.000000  
Epoch: [5][11100/25025] Elapsed 143m 16s (remain 179m 42s) Loss: 0.0000(0.0007) Grad: 48.2455  LR: 0.000000  
Epoch: [5][11200/25025] Elapsed 144m 32s (remain 178m 23s) Loss: 0.0003(0.0007) Grad: 1568.9900  LR: 0.000000  
Epoch: [5][11300/25025] Elapsed 145m 50s (remain 177m 6s) Loss: 0.0000(0.0007) Grad: 138.7672  LR: 0.000000  
Epoch: [5][11400/25025] Elapsed 147m 7s (remain 175m 48s) Loss: 0.0000(0.0007) Grad: 13.9083  LR: 0.000000  
Epoch: [5][11500/25025] Elapsed 148m 24s (remain 174m 30s) Loss: 0.0001(0.0007) Grad: 4910.3940  LR: 0.000000  
Epoch: [5][11600/25025] Elapsed 149m 43s (remain 173m 14s) Loss: 0.0000(0.0007) Grad: 45.5443  LR: 0.000000  
Epoch: [5][11700/25025] Elapsed 151m 1s (remain 171m 58s) Loss: 0.0004(0.0007) Grad: 5805.0010  LR: 0.000000  
Epoch: [5][11800/25025] Elapsed 152m 20s (remain 170m 42s) Loss: 0.0000(0.0007) Grad: 253.9794  LR: 0.000000  
Epoch: [5][11900/25025] Elapsed 153m 38s (remain 169m 25s) Loss: 0.0006(0.0007) Grad: 15234.0439  LR: 0.000000  
Epoch: [5][12000/25025] Elapsed 154m 57s (remain 168m 9s) Loss: 0.0003(0.0007) Grad: 4549.6929  LR: 0.000000  
Epoch: [5][12100/25025] Elapsed 156m 15s (remain 166m 52s) Loss: 0.0001(0.0007) Grad: 10711.5527  LR: 0.000000  
Epoch: [5][12200/25025] Elapsed 157m 34s (remain 165m 36s) Loss: 0.0000(0.0007) Grad: 843.5600  LR: 0.000000  
Epoch: [5][12300/25025] Elapsed 158m 51s (remain 164m 19s) Loss: 0.0002(0.0007) Grad: 5862.2134  LR: 0.000000  
Epoch: [5][12400/25025] Elapsed 160m 9s (remain 163m 1s) Loss: 0.0000(0.0007) Grad: 21.9237  LR: 0.000000  
Epoch: [5][12500/25025] Elapsed 161m 25s (remain 161m 43s) Loss: 0.0000(0.0007) Grad: 16.9176  LR: 0.000000  
Epoch: [5][12600/25025] Elapsed 162m 43s (remain 160m 26s) Loss: 0.0000(0.0007) Grad: 1198.1567  LR: 0.000000  
Epoch: [5][12700/25025] Elapsed 163m 59s (remain 159m 7s) Loss: 0.0000(0.0007) Grad: 61.8688  LR: 0.000000  
Epoch: [5][12800/25025] Elapsed 165m 17s (remain 157m 50s) Loss: 0.0000(0.0007) Grad: 546.7509  LR: 0.000000  
Epoch: [5][12900/25025] Elapsed 166m 35s (remain 156m 33s) Loss: 0.0037(0.0007) Grad: 77858.3750  LR: 0.000000  
Epoch: [5][13000/25025] Elapsed 167m 51s (remain 155m 14s) Loss: 0.0020(0.0007) Grad: 136265.3438  LR: 0.000000  
Epoch: [5][13100/25025] Elapsed 169m 8s (remain 153m 57s) Loss: 0.0000(0.0007) Grad: 212.0974  LR: 0.000000  
Epoch: [5][13200/25025] Elapsed 170m 25s (remain 152m 39s) Loss: 0.0004(0.0007) Grad: 56700.0781  LR: 0.000000  
Epoch: [5][13300/25025] Elapsed 171m 44s (remain 151m 22s) Loss: 0.0000(0.0007) Grad: 41.3281  LR: 0.000000  
Epoch: [5][13400/25025] Elapsed 173m 2s (remain 150m 5s) Loss: 0.0033(0.0007) Grad: 306504.7188  LR: 0.000000  
Epoch: [5][13500/25025] Elapsed 174m 21s (remain 148m 49s) Loss: 0.0000(0.0007) Grad: 27.0102  LR: 0.000000  
Epoch: [5][13600/25025] Elapsed 175m 38s (remain 147m 31s) Loss: 0.0000(0.0007) Grad: 99.2291  LR: 0.000000  
Epoch: [5][13700/25025] Elapsed 176m 57s (remain 146m 15s) Loss: 0.0000(0.0007) Grad: 662.9437  LR: 0.000000  
Epoch: [5][13800/25025] Elapsed 178m 15s (remain 144m 58s) Loss: 0.0000(0.0007) Grad: 42.8758  LR: 0.000000  
Epoch: [5][13900/25025] Elapsed 179m 33s (remain 143m 41s) Loss: 0.0001(0.0007) Grad: 4092.2444  LR: 0.000000  
Epoch: [5][14000/25025] Elapsed 180m 52s (remain 142m 24s) Loss: 0.0000(0.0007) Grad: 65.1627  LR: 0.000000  
Epoch: [5][14100/25025] Elapsed 182m 9s (remain 141m 6s) Loss: 0.0000(0.0007) Grad: 46.8310  LR: 0.000000  
Epoch: [5][14200/25025] Elapsed 183m 27s (remain 139m 49s) Loss: 0.0000(0.0007) Grad: 29.3535  LR: 0.000000  
Epoch: [5][14300/25025] Elapsed 184m 44s (remain 138m 32s) Loss: 0.0000(0.0007) Grad: 860.7486  LR: 0.000000  
Epoch: [5][14400/25025] Elapsed 186m 1s (remain 137m 14s) Loss: 0.0000(0.0007) Grad: 866.7969  LR: 0.000000  
Epoch: [5][14500/25025] Elapsed 187m 18s (remain 135m 56s) Loss: 0.0007(0.0007) Grad: 14166.9033  LR: 0.000000  
Epoch: [5][14600/25025] Elapsed 188m 36s (remain 134m 39s) Loss: 0.0001(0.0007) Grad: 16232.0117  LR: 0.000000  
Epoch: [5][14700/25025] Elapsed 189m 55s (remain 133m 22s) Loss: 0.0047(0.0007) Grad: 110604.3125  LR: 0.000000  
Epoch: [5][14800/25025] Elapsed 191m 13s (remain 132m 5s) Loss: 0.0000(0.0007) Grad: 71.0603  LR: 0.000000  
Epoch: [5][14900/25025] Elapsed 192m 30s (remain 130m 47s) Loss: 0.0000(0.0007) Grad: 45.6943  LR: 0.000000  
Epoch: [5][15000/25025] Elapsed 193m 48s (remain 129m 30s) Loss: 0.0000(0.0007) Grad: 32.2394  LR: 0.000000  
Epoch: [5][15100/25025] Elapsed 195m 4s (remain 128m 12s) Loss: 0.0003(0.0007) Grad: 15576.6689  LR: 0.000000  
Epoch: [5][15200/25025] Elapsed 196m 22s (remain 126m 54s) Loss: 0.0000(0.0007) Grad: 88.4470  LR: 0.000000  
Epoch: [5][15300/25025] Elapsed 197m 39s (remain 125m 37s) Loss: 0.0000(0.0007) Grad: 195.4974  LR: 0.000000  
Epoch: [5][15400/25025] Elapsed 198m 57s (remain 124m 19s) Loss: 0.0000(0.0007) Grad: 17.7573  LR: 0.000000  
Epoch: [5][15500/25025] Elapsed 200m 14s (remain 123m 1s) Loss: 0.0000(0.0007) Grad: 225.3969  LR: 0.000000  
Epoch: [5][15600/25025] Elapsed 201m 32s (remain 121m 44s) Loss: 0.0001(0.0007) Grad: 33803.9766  LR: 0.000000  
Epoch: [5][15700/25025] Elapsed 202m 49s (remain 120m 26s) Loss: 0.0035(0.0007) Grad: 472911.5312  LR: 0.000000  
Epoch: [5][15800/25025] Elapsed 204m 6s (remain 119m 8s) Loss: 0.0008(0.0007) Grad: 37858.1211  LR: 0.000000  
Epoch: [5][15900/25025] Elapsed 205m 23s (remain 117m 51s) Loss: 0.0000(0.0007) Grad: 41.6622  LR: 0.000000  
Epoch: [5][16000/25025] Elapsed 206m 40s (remain 116m 33s) Loss: 0.0000(0.0007) Grad: 129.5607  LR: 0.000000  
Epoch: [5][16100/25025] Elapsed 207m 57s (remain 115m 15s) Loss: 0.0000(0.0007) Grad: 79.7244  LR: 0.000000  
Epoch: [5][16200/25025] Elapsed 209m 13s (remain 113m 57s) Loss: 0.0017(0.0007) Grad: 156483.5469  LR: 0.000000  
Epoch: [5][16300/25025] Elapsed 210m 30s (remain 112m 39s) Loss: 0.0000(0.0007) Grad: 73.7162  LR: 0.000000  
Epoch: [5][16400/25025] Elapsed 211m 47s (remain 111m 21s) Loss: 0.0000(0.0007) Grad: 442.5600  LR: 0.000000  
Epoch: [5][16500/25025] Elapsed 213m 5s (remain 110m 4s) Loss: 0.0000(0.0007) Grad: 69.2273  LR: 0.000000  
Epoch: [5][16600/25025] Elapsed 214m 23s (remain 108m 47s) Loss: 0.0000(0.0007) Grad: 37413.0273  LR: 0.000000  
Epoch: [5][16700/25025] Elapsed 215m 40s (remain 107m 29s) Loss: 0.0000(0.0007) Grad: 3056.8789  LR: 0.000000  
Epoch: [5][16800/25025] Elapsed 216m 58s (remain 106m 12s) Loss: 0.0000(0.0007) Grad: 446.2744  LR: 0.000000  
Epoch: [5][16900/25025] Elapsed 218m 16s (remain 104m 55s) Loss: 0.0000(0.0007) Grad: 92.4454  LR: 0.000000  
Epoch: [5][17000/25025] Elapsed 219m 35s (remain 103m 38s) Loss: 0.0000(0.0007) Grad: 182.5719  LR: 0.000000  
Epoch: [5][17100/25025] Elapsed 220m 51s (remain 102m 20s) Loss: 0.0000(0.0007) Grad: 253.4339  LR: 0.000000  
Epoch: [5][17200/25025] Elapsed 222m 8s (remain 101m 2s) Loss: 0.0002(0.0007) Grad: 20435.0234  LR: 0.000000  
Epoch: [5][17300/25025] Elapsed 223m 25s (remain 99m 44s) Loss: 0.0002(0.0007) Grad: 5233.3350  LR: 0.000000  
Epoch: [5][17400/25025] Elapsed 224m 42s (remain 98m 27s) Loss: 0.0001(0.0007) Grad: 5033.2305  LR: 0.000000  
Epoch: [5][17500/25025] Elapsed 226m 0s (remain 97m 9s) Loss: 0.0000(0.0007) Grad: 1316.5248  LR: 0.000000  
Epoch: [5][17600/25025] Elapsed 227m 17s (remain 95m 52s) Loss: 0.0013(0.0007) Grad: 115255.9297  LR: 0.000000  
Epoch: [5][17700/25025] Elapsed 228m 34s (remain 94m 34s) Loss: 0.0023(0.0007) Grad: 235791.5625  LR: 0.000000  
Epoch: [5][17800/25025] Elapsed 229m 50s (remain 93m 16s) Loss: 0.0003(0.0007) Grad: 45696.0742  LR: 0.000000  
Epoch: [5][17900/25025] Elapsed 231m 8s (remain 91m 59s) Loss: 0.0000(0.0007) Grad: 158.2856  LR: 0.000000  
Epoch: [5][18000/25025] Elapsed 232m 25s (remain 90m 41s) Loss: 0.0014(0.0007) Grad: 158447.7344  LR: 0.000000  
Epoch: [5][18100/25025] Elapsed 233m 42s (remain 89m 24s) Loss: 0.0023(0.0007) Grad: 51518.6484  LR: 0.000000  
Epoch: [5][18200/25025] Elapsed 234m 59s (remain 88m 6s) Loss: 0.0000(0.0007) Grad: 313.3334  LR: 0.000000  
Epoch: [5][18300/25025] Elapsed 236m 17s (remain 86m 48s) Loss: 0.0003(0.0007) Grad: 26677.8848  LR: 0.000000  
Epoch: [5][18400/25025] Elapsed 237m 34s (remain 85m 31s) Loss: 0.0002(0.0007) Grad: 7029.7451  LR: 0.000000  
Epoch: [5][18500/25025] Elapsed 238m 51s (remain 84m 13s) Loss: 0.0044(0.0007) Grad: 110261.3672  LR: 0.000000  
Epoch: [5][18600/25025] Elapsed 240m 8s (remain 82m 56s) Loss: 0.0000(0.0007) Grad: 141.4320  LR: 0.000000  
Epoch: [5][18700/25025] Elapsed 241m 25s (remain 81m 38s) Loss: 0.0000(0.0007) Grad: 44.3990  LR: 0.000000  
Epoch: [5][18800/25025] Elapsed 242m 43s (remain 80m 21s) Loss: 0.0001(0.0007) Grad: 6646.6309  LR: 0.000000  
Epoch: [5][18900/25025] Elapsed 244m 0s (remain 79m 3s) Loss: 0.0000(0.0007) Grad: 2457.8396  LR: 0.000000  
Epoch: [5][19000/25025] Elapsed 245m 18s (remain 77m 46s) Loss: 0.0002(0.0007) Grad: 13872.6328  LR: 0.000000  
Epoch: [5][19100/25025] Elapsed 246m 36s (remain 76m 29s) Loss: 0.0000(0.0007) Grad: 119.9530  LR: 0.000000  
Epoch: [5][19200/25025] Elapsed 247m 55s (remain 75m 11s) Loss: 0.0003(0.0007) Grad: 12318.6094  LR: 0.000000  
Epoch: [5][19300/25025] Elapsed 249m 13s (remain 73m 54s) Loss: 0.0000(0.0007) Grad: 740.5104  LR: 0.000000  
Epoch: [5][19400/25025] Elapsed 250m 31s (remain 72m 37s) Loss: 0.0000(0.0007) Grad: 231.5039  LR: 0.000000  
Epoch: [5][19500/25025] Elapsed 251m 49s (remain 71m 19s) Loss: 0.0000(0.0007) Grad: 5.9741  LR: 0.000000  
Epoch: [5][19600/25025] Elapsed 253m 6s (remain 70m 2s) Loss: 0.0000(0.0007) Grad: 491.8301  LR: 0.000000  
Epoch: [5][19700/25025] Elapsed 254m 25s (remain 68m 45s) Loss: 0.0002(0.0007) Grad: 3302.2900  LR: 0.000000  
Epoch: [5][19800/25025] Elapsed 255m 43s (remain 67m 27s) Loss: 0.0000(0.0007) Grad: 2154.5864  LR: 0.000000  
Epoch: [5][19900/25025] Elapsed 257m 1s (remain 66m 10s) Loss: 0.0000(0.0007) Grad: 4.9259  LR: 0.000000  
Epoch: [5][20000/25025] Elapsed 258m 19s (remain 64m 53s) Loss: 0.0001(0.0007) Grad: 10515.4746  LR: 0.000000  
Epoch: [5][20100/25025] Elapsed 259m 37s (remain 63m 35s) Loss: 0.0002(0.0007) Grad: 2720.5842  LR: 0.000000  
Epoch: [5][20200/25025] Elapsed 260m 53s (remain 62m 18s) Loss: 0.0002(0.0007) Grad: 5255.3081  LR: 0.000000  
Epoch: [5][20300/25025] Elapsed 262m 11s (remain 61m 0s) Loss: 0.0000(0.0007) Grad: 284.6826  LR: 0.000000  
Epoch: [5][20400/25025] Elapsed 263m 29s (remain 59m 43s) Loss: 0.0000(0.0007) Grad: 2792.3940  LR: 0.000000  
Epoch: [5][20500/25025] Elapsed 264m 48s (remain 58m 26s) Loss: 0.0000(0.0007) Grad: 177.1451  LR: 0.000000  
Epoch: [5][20600/25025] Elapsed 266m 6s (remain 57m 8s) Loss: 0.0020(0.0007) Grad: 56759.7891  LR: 0.000000  
Epoch: [5][20700/25025] Elapsed 267m 25s (remain 55m 51s) Loss: 0.0006(0.0007) Grad: 50707.3711  LR: 0.000000  
Epoch: [5][20800/25025] Elapsed 268m 42s (remain 54m 33s) Loss: 0.0000(0.0007) Grad: 100.0278  LR: 0.000000  
Epoch: [5][20900/25025] Elapsed 270m 0s (remain 53m 16s) Loss: 0.0000(0.0007) Grad: 65.7478  LR: 0.000000  
Epoch: [5][21000/25025] Elapsed 271m 18s (remain 51m 59s) Loss: 0.0000(0.0007) Grad: 125.8941  LR: 0.000000  
Epoch: [5][21100/25025] Elapsed 272m 35s (remain 50m 41s) Loss: 0.0000(0.0007) Grad: 183.2680  LR: 0.000000  
Epoch: [5][21200/25025] Elapsed 273m 51s (remain 49m 23s) Loss: 0.0001(0.0007) Grad: 25855.6699  LR: 0.000000  
Epoch: [5][21300/25025] Elapsed 275m 9s (remain 48m 6s) Loss: 0.0000(0.0007) Grad: 49.5011  LR: 0.000000  
Epoch: [5][21400/25025] Elapsed 276m 26s (remain 46m 48s) Loss: 0.0000(0.0007) Grad: 538.2349  LR: 0.000000  
Epoch: [5][21500/25025] Elapsed 277m 44s (remain 45m 31s) Loss: 0.0002(0.0007) Grad: 52869.3672  LR: 0.000000  
Epoch: [5][21600/25025] Elapsed 279m 1s (remain 44m 13s) Loss: 0.0003(0.0007) Grad: 10434.2412  LR: 0.000000  
Epoch: [5][21700/25025] Elapsed 280m 18s (remain 42m 56s) Loss: 0.0000(0.0007) Grad: 2009.4403  LR: 0.000000  
Epoch: [5][21800/25025] Elapsed 281m 36s (remain 41m 38s) Loss: 0.0040(0.0007) Grad: 220815.2500  LR: 0.000000  
Epoch: [5][21900/25025] Elapsed 282m 52s (remain 40m 21s) Loss: 0.0015(0.0007) Grad: 48474.4258  LR: 0.000000  
Epoch: [5][22000/25025] Elapsed 284m 11s (remain 39m 3s) Loss: 0.0000(0.0007) Grad: 112.9697  LR: 0.000000  
Epoch: [5][22100/25025] Elapsed 285m 29s (remain 37m 46s) Loss: 0.0000(0.0007) Grad: 5301.2461  LR: 0.000000  
Epoch: [5][22200/25025] Elapsed 286m 48s (remain 36m 28s) Loss: 0.0001(0.0007) Grad: 19508.2070  LR: 0.000000  
Epoch: [5][22300/25025] Elapsed 288m 5s (remain 35m 11s) Loss: 0.0000(0.0007) Grad: 1960.3149  LR: 0.000000  
Epoch: [5][22400/25025] Elapsed 289m 22s (remain 33m 53s) Loss: 0.0000(0.0007) Grad: 392.7830  LR: 0.000000  
Epoch: [5][22500/25025] Elapsed 290m 40s (remain 32m 36s) Loss: 0.0008(0.0007) Grad: 71100.7031  LR: 0.000000  
Epoch: [5][22600/25025] Elapsed 291m 58s (remain 31m 18s) Loss: 0.0012(0.0007) Grad: 52345.9102  LR: 0.000000  
Epoch: [5][22700/25025] Elapsed 293m 17s (remain 30m 1s) Loss: 0.0001(0.0007) Grad: 4315.7612  LR: 0.000000  
Epoch: [5][22800/25025] Elapsed 294m 35s (remain 28m 44s) Loss: 0.0001(0.0007) Grad: 5117.0952  LR: 0.000000  
Epoch: [5][22900/25025] Elapsed 295m 53s (remain 27m 26s) Loss: 0.0000(0.0007) Grad: 478.0417  LR: 0.000000  
Epoch: [5][23000/25025] Elapsed 297m 11s (remain 26m 9s) Loss: 0.0000(0.0007) Grad: 256.8927  LR: 0.000000  
Epoch: [5][23100/25025] Elapsed 298m 29s (remain 24m 51s) Loss: 0.0033(0.0007) Grad: 40708.1992  LR: 0.000000  
Epoch: [5][23200/25025] Elapsed 299m 45s (remain 23m 33s) Loss: 0.0002(0.0007) Grad: 4690.7896  LR: 0.000000  
Epoch: [5][23300/25025] Elapsed 301m 3s (remain 22m 16s) Loss: 0.0000(0.0007) Grad: 196.9349  LR: 0.000000  
Epoch: [5][23400/25025] Elapsed 302m 22s (remain 20m 59s) Loss: 0.0000(0.0007) Grad: 298.3185  LR: 0.000000  
Epoch: [5][23500/25025] Elapsed 303m 40s (remain 19m 41s) Loss: 0.0000(0.0007) Grad: 98.2065  LR: 0.000000  
Epoch: [5][23600/25025] Elapsed 304m 58s (remain 18m 24s) Loss: 0.0000(0.0007) Grad: 319.3592  LR: 0.000000  
Epoch: [5][23700/25025] Elapsed 306m 15s (remain 17m 6s) Loss: 0.0000(0.0007) Grad: 266.0325  LR: 0.000000  
Epoch: [5][23800/25025] Elapsed 307m 33s (remain 15m 48s) Loss: 0.0000(0.0007) Grad: 97.1441  LR: 0.000000  
Epoch: [5][23900/25025] Elapsed 308m 51s (remain 14m 31s) Loss: 0.0000(0.0007) Grad: 1456.4114  LR: 0.000000  
Epoch: [5][24000/25025] Elapsed 310m 10s (remain 13m 14s) Loss: 0.0000(0.0007) Grad: 3442.1250  LR: 0.000000  
Epoch: [5][24100/25025] Elapsed 311m 28s (remain 11m 56s) Loss: 0.0000(0.0007) Grad: 8394.6152  LR: 0.000000  
Epoch: [5][24200/25025] Elapsed 312m 46s (remain 10m 38s) Loss: 0.0000(0.0007) Grad: 176.7087  LR: 0.000000  
Epoch: [5][24300/25025] Elapsed 314m 3s (remain 9m 21s) Loss: 0.0000(0.0007) Grad: 508.1189  LR: 0.000000  
Epoch: [5][24400/25025] Elapsed 315m 21s (remain 8m 3s) Loss: 0.0003(0.0007) Grad: 34546.4023  LR: 0.000000  
Epoch: [5][24500/25025] Elapsed 316m 39s (remain 6m 46s) Loss: 0.0000(0.0007) Grad: 51.6761  LR: 0.000000  
Epoch: [5][24600/25025] Elapsed 317m 57s (remain 5m 28s) Loss: 0.0005(0.0007) Grad: 536717.4375  LR: 0.000000  
Epoch: [5][24700/25025] Elapsed 319m 15s (remain 4m 11s) Loss: 0.0001(0.0007) Grad: 3595.6921  LR: 0.000000  
Epoch: [5][24800/25025] Elapsed 320m 31s (remain 2m 53s) Loss: 0.0000(0.0007) Grad: 64.3584  LR: 0.000000  
Epoch: [5][24900/25025] Elapsed 321m 49s (remain 1m 36s) Loss: 0.0002(0.0007) Grad: 5808.8071  LR: 0.000000  
Epoch: [5][25000/25025] Elapsed 323m 6s (remain 0m 18s) Loss: 0.0000(0.0007) Grad: 7.1822  LR: 0.000000  
Epoch: [5][25024/25025] Elapsed 323m 25s (remain 0m 0s) Loss: 0.0048(0.0007) Grad: 36090.8398  LR: 0.000000  
EVAL: [0/1192] Elapsed 0m 1s (remain 22m 38s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 31s (remain 5m 43s) Loss: 0.0082(0.0027) 
EVAL: [200/1192] Elapsed 1m 2s (remain 5m 7s) Loss: 0.0063(0.0032) 
EVAL: [300/1192] Elapsed 1m 31s (remain 4m 32s) Loss: 0.0010(0.0034) 
EVAL: [400/1192] Elapsed 2m 2s (remain 4m 0s) Loss: 0.0038(0.0036) 
EVAL: [500/1192] Elapsed 2m 31s (remain 3m 29s) Loss: 0.0040(0.0034) 
EVAL: [600/1192] Elapsed 3m 1s (remain 2m 58s) Loss: 0.0000(0.0036) 
EVAL: [700/1192] Elapsed 3m 32s (remain 2m 28s) Loss: 0.0439(0.0042) 
EVAL: [800/1192] Elapsed 4m 2s (remain 1m 58s) Loss: 0.0029(0.0044) 
EVAL: [900/1192] Elapsed 4m 32s (remain 1m 28s) Loss: 0.0003(0.0044) 
EVAL: [1000/1192] Elapsed 5m 1s (remain 0m 57s) Loss: 0.0000(0.0043) 
EVAL: [1100/1192] Elapsed 5m 31s (remain 0m 27s) Loss: 0.0000(0.0041) 
EVAL: [1191/1192] Elapsed 5m 59s (remain 0m 0s) Loss: 0.0000(0.0040) 
Epoch 5 - avg_train_loss: 0.0007  avg_val_loss: 0.0040  time: 19769s
Epoch 5 - Score: 0.8900
========== fold: 1 training ==========
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy
100%
612602/612602 [00:01<00:00, 637924.77it/s]
100%
612602/612602 [00:44<00:00, 15790.91it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(21450, 11)
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2ForMaskedLM: ['mask_predictions.dense.bias', 'mask_predictions.LayerNorm.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'deberta.embeddings.position_embeddings.weight', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2ForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaV2ForMaskedLM were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Load weight from ../output/nbme-score-clinical-patient-notes/nbme-exp073/microsoft-deberta-v3-large-mlm-epoch-12.bin
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy
100%
612602/612602 [00:01<00:00, 570012.10it/s]
100%
612602/612602 [00:45<00:00, 15978.17it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(32175, 11)
Epoch: [1][0/10725] Elapsed 0m 2s (remain 428m 3s) Loss: 0.0960(0.0960) Grad: 91485.9297  LR: 0.000000  
Epoch: [1][100/10725] Elapsed 1m 21s (remain 142m 38s) Loss: 0.0761(0.0880) Grad: 71912.8516  LR: 0.000001  
Epoch: [1][200/10725] Elapsed 2m 38s (remain 138m 34s) Loss: 0.0386(0.0721) Grad: 33519.8359  LR: 0.000001  
Epoch: [1][300/10725] Elapsed 3m 56s (remain 136m 29s) Loss: 0.0106(0.0555) Grad: 6288.7441  LR: 0.000002  
Epoch: [1][400/10725] Elapsed 5m 15s (remain 135m 18s) Loss: 0.0196(0.0450) Grad: 6712.0024  LR: 0.000002  
Epoch: [1][500/10725] Elapsed 6m 33s (remain 133m 46s) Loss: 0.0143(0.0384) Grad: 4303.4780  LR: 0.000003  
Epoch: [1][600/10725] Elapsed 7m 50s (remain 132m 6s) Loss: 0.0113(0.0340) Grad: 4471.0605  LR: 0.000003  
Epoch: [1][700/10725] Elapsed 9m 8s (remain 130m 42s) Loss: 0.0131(0.0307) Grad: 4125.2749  LR: 0.000004  
Epoch: [1][800/10725] Elapsed 10m 25s (remain 129m 13s) Loss: 0.0018(0.0282) Grad: 3454.3457  LR: 0.000004  
Epoch: [1][900/10725] Elapsed 11m 43s (remain 127m 49s) Loss: 0.0130(0.0263) Grad: 4570.2749  LR: 0.000005  
Epoch: [1][1000/10725] Elapsed 13m 0s (remain 126m 25s) Loss: 0.0077(0.0247) Grad: 4318.2578  LR: 0.000006  
Epoch: [1][1100/10725] Elapsed 14m 19s (remain 125m 13s) Loss: 0.0030(0.0233) Grad: 2589.3179  LR: 0.000006  
Epoch: [1][1200/10725] Elapsed 15m 37s (remain 123m 50s) Loss: 0.0034(0.0219) Grad: 6991.3101  LR: 0.000007  
Epoch: [1][1300/10725] Elapsed 16m 54s (remain 122m 31s) Loss: 0.0089(0.0207) Grad: 12597.5107  LR: 0.000007  
Epoch: [1][1400/10725] Elapsed 18m 12s (remain 121m 13s) Loss: 0.0038(0.0196) Grad: 11916.3418  LR: 0.000008  
Epoch: [1][1500/10725] Elapsed 19m 31s (remain 120m 0s) Loss: 0.0039(0.0185) Grad: 7356.5928  LR: 0.000008  
Epoch: [1][1600/10725] Elapsed 20m 51s (remain 118m 52s) Loss: 0.0254(0.0177) Grad: 49467.4219  LR: 0.000009  
Epoch: [1][1700/10725] Elapsed 22m 11s (remain 117m 41s) Loss: 0.0009(0.0168) Grad: 5767.0986  LR: 0.000010  
Epoch: [1][1800/10725] Elapsed 23m 29s (remain 116m 24s) Loss: 0.0190(0.0161) Grad: 9527.6348  LR: 0.000010  
Epoch: [1][1900/10725] Elapsed 24m 49s (remain 115m 12s) Loss: 0.0020(0.0155) Grad: 7136.9961  LR: 0.000011  
Epoch: [1][2000/10725] Elapsed 26m 6s (remain 113m 51s) Loss: 0.0004(0.0148) Grad: 1013.1073  LR: 0.000011  
Epoch: [1][2100/10725] Elapsed 27m 25s (remain 112m 36s) Loss: 0.0009(0.0143) Grad: 2706.7686  LR: 0.000012  
Epoch: [1][2200/10725] Elapsed 28m 44s (remain 111m 17s) Loss: 0.0048(0.0137) Grad: 15553.0547  LR: 0.000012  
Epoch: [1][2300/10725] Elapsed 30m 1s (remain 109m 54s) Loss: 0.0011(0.0133) Grad: 4056.6584  LR: 0.000013  
Epoch: [1][2400/10725] Elapsed 31m 20s (remain 108m 38s) Loss: 0.0010(0.0128) Grad: 3517.1995  LR: 0.000013  
Epoch: [1][2500/10725] Elapsed 32m 37s (remain 107m 18s) Loss: 0.0036(0.0125) Grad: 32045.5234  LR: 0.000014  
Epoch: [1][2600/10725] Elapsed 33m 57s (remain 106m 3s) Loss: 0.0007(0.0121) Grad: 1801.1404  LR: 0.000015  
Epoch: [1][2700/10725] Elapsed 35m 17s (remain 104m 50s) Loss: 0.0005(0.0117) Grad: 1637.6732  LR: 0.000015  
Epoch: [1][2800/10725] Elapsed 36m 36s (remain 103m 32s) Loss: 0.0083(0.0114) Grad: 47191.8867  LR: 0.000016  
Epoch: [1][2900/10725] Elapsed 37m 54s (remain 102m 14s) Loss: 0.0036(0.0111) Grad: 11375.3242  LR: 0.000016  
Epoch: [1][3000/10725] Elapsed 39m 13s (remain 100m 58s) Loss: 0.0029(0.0108) Grad: 15806.9951  LR: 0.000017  
Epoch: [1][3100/10725] Elapsed 40m 33s (remain 99m 41s) Loss: 0.0010(0.0106) Grad: 2986.7532  LR: 0.000017  
Epoch: [1][3200/10725] Elapsed 41m 51s (remain 98m 23s) Loss: 0.0103(0.0103) Grad: 30802.4746  LR: 0.000018  
Epoch: [1][3300/10725] Elapsed 43m 10s (remain 97m 6s) Loss: 0.0035(0.0101) Grad: 9580.5498  LR: 0.000018  
Epoch: [1][3400/10725] Elapsed 44m 28s (remain 95m 47s) Loss: 0.0004(0.0099) Grad: 1155.3033  LR: 0.000019  
Epoch: [1][3500/10725] Elapsed 45m 47s (remain 94m 29s) Loss: 0.0013(0.0097) Grad: 6109.9673  LR: 0.000020  
Epoch: [1][3600/10725] Elapsed 47m 7s (remain 93m 13s) Loss: 0.0004(0.0095) Grad: 990.5145  LR: 0.000020  
Epoch: [1][3700/10725] Elapsed 48m 25s (remain 91m 54s) Loss: 0.0238(0.0093) Grad: 33925.8047  LR: 0.000020  
Epoch: [1][3800/10725] Elapsed 49m 44s (remain 90m 36s) Loss: 0.0019(0.0091) Grad: 6211.4165  LR: 0.000020  
Epoch: [1][3900/10725] Elapsed 51m 2s (remain 89m 17s) Loss: 0.0046(0.0089) Grad: 12396.3379  LR: 0.000020  
Epoch: [1][4000/10725] Elapsed 52m 20s (remain 87m 57s) Loss: 0.0006(0.0088) Grad: 5429.6372  LR: 0.000020  
Epoch: [1][4100/10725] Elapsed 53m 38s (remain 86m 37s) Loss: 0.0001(0.0086) Grad: 157.9724  LR: 0.000020  
Epoch: [1][4200/10725] Elapsed 54m 56s (remain 85m 18s) Loss: 0.0109(0.0085) Grad: 35695.6641  LR: 0.000020  
Epoch: [1][4300/10725] Elapsed 56m 15s (remain 84m 1s) Loss: 0.0005(0.0083) Grad: 11120.7637  LR: 0.000020  
Epoch: [1][4400/10725] Elapsed 57m 33s (remain 82m 43s) Loss: 0.0011(0.0082) Grad: 11922.1807  LR: 0.000019  
Epoch: [1][4500/10725] Elapsed 58m 52s (remain 81m 24s) Loss: 0.0028(0.0081) Grad: 22513.1406  LR: 0.000019  
Epoch: [1][4600/10725] Elapsed 60m 11s (remain 80m 7s) Loss: 0.0004(0.0079) Grad: 15846.6738  LR: 0.000019  
Epoch: [1][4700/10725] Elapsed 61m 29s (remain 78m 48s) Loss: 0.0021(0.0078) Grad: 9569.3574  LR: 0.000019  
Epoch: [1][4800/10725] Elapsed 62m 48s (remain 77m 30s) Loss: 0.0005(0.0077) Grad: 4112.3115  LR: 0.000019  
Epoch: [1][4900/10725] Elapsed 64m 7s (remain 76m 12s) Loss: 0.0016(0.0076) Grad: 7047.2178  LR: 0.000019  
Epoch: [1][5000/10725] Elapsed 65m 25s (remain 74m 53s) Loss: 0.0048(0.0074) Grad: 11473.4004  LR: 0.000019  
Epoch: [1][5100/10725] Elapsed 66m 43s (remain 73m 33s) Loss: 0.0019(0.0073) Grad: 10736.6152  LR: 0.000019  
Epoch: [1][5200/10725] Elapsed 68m 2s (remain 72m 15s) Loss: 0.0004(0.0072) Grad: 4873.6543  LR: 0.000019  
Epoch: [1][5300/10725] Elapsed 69m 20s (remain 70m 57s) Loss: 0.0019(0.0071) Grad: 6159.7915  LR: 0.000019  
Epoch: [1][5400/10725] Elapsed 70m 39s (remain 69m 39s) Loss: 0.0050(0.0070) Grad: 13819.2275  LR: 0.000019  
Epoch: [1][5500/10725] Elapsed 71m 58s (remain 68m 21s) Loss: 0.0000(0.0069) Grad: 55.9221  LR: 0.000019  
Epoch: [1][5600/10725] Elapsed 73m 17s (remain 67m 3s) Loss: 0.0003(0.0068) Grad: 8122.9854  LR: 0.000019  
Epoch: [1][5700/10725] Elapsed 74m 35s (remain 65m 44s) Loss: 0.0002(0.0068) Grad: 282.9332  LR: 0.000019  
Epoch: [1][5800/10725] Elapsed 75m 54s (remain 64m 26s) Loss: 0.0001(0.0067) Grad: 3327.1494  LR: 0.000019  
Epoch: [1][5900/10725] Elapsed 77m 14s (remain 63m 8s) Loss: 0.0003(0.0066) Grad: 1180.1998  LR: 0.000019  
Epoch: [1][6000/10725] Elapsed 78m 31s (remain 61m 49s) Loss: 0.0005(0.0065) Grad: 3677.9929  LR: 0.000018  
Epoch: [1][6100/10725] Elapsed 79m 50s (remain 60m 30s) Loss: 0.0001(0.0065) Grad: 217.9258  LR: 0.000018  
Epoch: [1][6200/10725] Elapsed 81m 8s (remain 59m 11s) Loss: 0.0002(0.0064) Grad: 1728.5325  LR: 0.000018  
Epoch: [1][6300/10725] Elapsed 82m 26s (remain 57m 52s) Loss: 0.0012(0.0063) Grad: 7783.2979  LR: 0.000018  
Epoch: [1][6400/10725] Elapsed 83m 44s (remain 56m 33s) Loss: 0.0000(0.0062) Grad: 77.0162  LR: 0.000018  
Epoch: [1][6500/10725] Elapsed 85m 2s (remain 55m 15s) Loss: 0.0078(0.0062) Grad: 40717.1250  LR: 0.000018  
Epoch: [1][6600/10725] Elapsed 86m 19s (remain 53m 55s) Loss: 0.0149(0.0061) Grad: 20021.3223  LR: 0.000018  
Epoch: [1][6700/10725] Elapsed 87m 37s (remain 52m 37s) Loss: 0.0000(0.0061) Grad: 102.2815  LR: 0.000018  
Epoch: [1][6800/10725] Elapsed 88m 55s (remain 51m 18s) Loss: 0.0001(0.0060) Grad: 462.8588  LR: 0.000018  
Epoch: [1][6900/10725] Elapsed 90m 13s (remain 49m 59s) Loss: 0.0038(0.0059) Grad: 10764.5430  LR: 0.000018  
Epoch: [1][7000/10725] Elapsed 91m 32s (remain 48m 41s) Loss: 0.0001(0.0059) Grad: 498.4266  LR: 0.000018  
Epoch: [1][7100/10725] Elapsed 92m 51s (remain 47m 23s) Loss: 0.0021(0.0058) Grad: 19751.7227  LR: 0.000018  
Epoch: [1][7200/10725] Elapsed 94m 9s (remain 46m 4s) Loss: 0.0145(0.0058) Grad: 35044.4375  LR: 0.000018  
Epoch: [1][7300/10725] Elapsed 95m 26s (remain 44m 45s) Loss: 0.0019(0.0057) Grad: 14327.2441  LR: 0.000018  
Epoch: [1][7400/10725] Elapsed 96m 45s (remain 43m 27s) Loss: 0.0003(0.0057) Grad: 8403.8701  LR: 0.000018  
Epoch: [1][7500/10725] Elapsed 98m 3s (remain 42m 8s) Loss: 0.0005(0.0056) Grad: 3937.2476  LR: 0.000018  
Epoch: [1][7600/10725] Elapsed 99m 21s (remain 40m 50s) Loss: 0.0000(0.0055) Grad: 63.1579  LR: 0.000017  
Epoch: [1][7700/10725] Elapsed 100m 39s (remain 39m 31s) Loss: 0.0009(0.0055) Grad: 65604.2031  LR: 0.000017  
Epoch: [1][7800/10725] Elapsed 101m 59s (remain 38m 13s) Loss: 0.0000(0.0055) Grad: 49.9067  LR: 0.000017  
Epoch: [1][7900/10725] Elapsed 103m 18s (remain 36m 55s) Loss: 0.0007(0.0054) Grad: 12430.2881  LR: 0.000017  
Epoch: [1][8000/10725] Elapsed 104m 36s (remain 35m 37s) Loss: 0.0011(0.0054) Grad: 15036.2334  LR: 0.000017  
Epoch: [1][8100/10725] Elapsed 105m 55s (remain 34m 18s) Loss: 0.0020(0.0053) Grad: 19436.0605  LR: 0.000017  
Epoch: [1][8200/10725] Elapsed 107m 13s (remain 33m 0s) Loss: 0.0000(0.0053) Grad: 170.9880  LR: 0.000017  
Epoch: [1][8300/10725] Elapsed 108m 31s (remain 31m 41s) Loss: 0.0005(0.0052) Grad: 30005.6719  LR: 0.000017  
Epoch: [1][8400/10725] Elapsed 109m 49s (remain 30m 23s) Loss: 0.0039(0.0052) Grad: 83709.7969  LR: 0.000017  
Epoch: [1][8500/10725] Elapsed 111m 7s (remain 29m 4s) Loss: 0.0030(0.0051) Grad: 15506.3232  LR: 0.000017  
Epoch: [1][8600/10725] Elapsed 112m 26s (remain 27m 46s) Loss: 0.0001(0.0051) Grad: 973.5686  LR: 0.000017  
Epoch: [1][8700/10725] Elapsed 113m 43s (remain 26m 27s) Loss: 0.0043(0.0051) Grad: 25020.6348  LR: 0.000017  
Epoch: [1][8800/10725] Elapsed 115m 1s (remain 25m 8s) Loss: 0.0000(0.0050) Grad: 129.7269  LR: 0.000017  
Epoch: [1][8900/10725] Elapsed 116m 20s (remain 23m 50s) Loss: 0.0001(0.0050) Grad: 3157.2842  LR: 0.000017  
Epoch: [1][9000/10725] Elapsed 117m 38s (remain 22m 31s) Loss: 0.0002(0.0050) Grad: 3165.0405  LR: 0.000017  
Epoch: [1][9100/10725] Elapsed 118m 55s (remain 21m 13s) Loss: 0.0000(0.0049) Grad: 80.1489  LR: 0.000017  
Epoch: [1][9200/10725] Elapsed 120m 14s (remain 19m 54s) Loss: 0.0000(0.0049) Grad: 198.6439  LR: 0.000017  
Epoch: [1][9300/10725] Elapsed 121m 33s (remain 18m 36s) Loss: 0.0000(0.0048) Grad: 229.1442  LR: 0.000016  
Epoch: [1][9400/10725] Elapsed 122m 51s (remain 17m 18s) Loss: 0.0000(0.0048) Grad: 18.4620  LR: 0.000016  
Epoch: [1][9500/10725] Elapsed 124m 9s (remain 15m 59s) Loss: 0.0000(0.0048) Grad: 96.5625  LR: 0.000016  
Epoch: [1][9600/10725] Elapsed 125m 27s (remain 14m 41s) Loss: 0.0000(0.0047) Grad: 115.0107  LR: 0.000016  
Epoch: [1][9700/10725] Elapsed 126m 44s (remain 13m 22s) Loss: 0.0007(0.0047) Grad: 23155.3047  LR: 0.000016  
Epoch: [1][9800/10725] Elapsed 128m 2s (remain 12m 4s) Loss: 0.0065(0.0047) Grad: 13480.9482  LR: 0.000016  
Epoch: [1][9900/10725] Elapsed 129m 18s (remain 10m 45s) Loss: 0.0005(0.0046) Grad: 16049.7832  LR: 0.000016  
Epoch: [1][10000/10725] Elapsed 130m 36s (remain 9m 27s) Loss: 0.0001(0.0046) Grad: 345.7596  LR: 0.000016  
Epoch: [1][10100/10725] Elapsed 131m 54s (remain 8m 8s) Loss: 0.0000(0.0046) Grad: 24.2811  LR: 0.000016  
Epoch: [1][10200/10725] Elapsed 133m 12s (remain 6m 50s) Loss: 0.0000(0.0046) Grad: 69.1768  LR: 0.000016  
Epoch: [1][10300/10725] Elapsed 134m 30s (remain 5m 32s) Loss: 0.0002(0.0045) Grad: 6769.7075  LR: 0.000016  
Epoch: [1][10400/10725] Elapsed 135m 47s (remain 4m 13s) Loss: 0.0000(0.0045) Grad: 139.4797  LR: 0.000016  
Epoch: [1][10500/10725] Elapsed 137m 5s (remain 2m 55s) Loss: 0.0019(0.0045) Grad: 23112.8984  LR: 0.000016  
Epoch: [1][10600/10725] Elapsed 138m 23s (remain 1m 37s) Loss: 0.0000(0.0044) Grad: 147.0250  LR: 0.000016  
Epoch: [1][10700/10725] Elapsed 139m 43s (remain 0m 18s) Loss: 0.0022(0.0044) Grad: 23051.1152  LR: 0.000016  
Epoch: [1][10724/10725] Elapsed 140m 2s (remain 0m 0s) Loss: 0.0002(0.0044) Grad: 4673.7603  LR: 0.000016  
EVAL: [0/1192] Elapsed 0m 1s (remain 26m 56s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 31s (remain 5m 44s) Loss: 0.0001(0.0016) 
EVAL: [200/1192] Elapsed 1m 2s (remain 5m 6s) Loss: 0.0001(0.0024) 
EVAL: [300/1192] Elapsed 1m 32s (remain 4m 33s) Loss: 0.0004(0.0039) 
EVAL: [400/1192] Elapsed 2m 3s (remain 4m 3s) Loss: 0.0069(0.0040) 
EVAL: [500/1192] Elapsed 2m 34s (remain 3m 32s) Loss: 0.0121(0.0036) 
EVAL: [600/1192] Elapsed 3m 4s (remain 3m 1s) Loss: 0.0582(0.0036) 
EVAL: [700/1192] Elapsed 3m 34s (remain 2m 30s) Loss: 0.0024(0.0041) 
EVAL: [800/1192] Elapsed 4m 4s (remain 1m 59s) Loss: 0.0049(0.0040) 
EVAL: [900/1192] Elapsed 4m 34s (remain 1m 28s) Loss: 0.0003(0.0038) 
EVAL: [1000/1192] Elapsed 5m 3s (remain 0m 57s) Loss: 0.0000(0.0037) 
EVAL: [1100/1192] Elapsed 5m 33s (remain 0m 27s) Loss: 0.0017(0.0035) 
EVAL: [1191/1192] Elapsed 6m 1s (remain 0m 0s) Loss: 0.0020(0.0033) 
Epoch 1 - avg_train_loss: 0.0044  avg_val_loss: 0.0033  time: 8768s
Epoch 1 - Score: 0.8735
Epoch 1 - Save Best Score: 0.8735 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy
100%
612602/612602 [00:01<00:00, 603160.07it/s]
100%
612602/612602 [00:43<00:00, 16492.57it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(42900, 11)
Epoch: [2][0/14300] Elapsed 0m 2s (remain 608m 32s) Loss: 0.0000(0.0000) Grad: 1003.2454  LR: 0.000016  
Epoch: [2][100/14300] Elapsed 1m 21s (remain 189m 52s) Loss: 0.0003(0.0016) Grad: 3334.9070  LR: 0.000015  
Epoch: [2][200/14300] Elapsed 2m 40s (remain 187m 3s) Loss: 0.0001(0.0016) Grad: 390.6207  LR: 0.000015  
Epoch: [2][300/14300] Elapsed 3m 58s (remain 185m 2s) Loss: 0.0004(0.0014) Grad: 4347.4834  LR: 0.000015  
Epoch: [2][400/14300] Elapsed 5m 16s (remain 182m 38s) Loss: 0.0000(0.0013) Grad: 26.7060  LR: 0.000015  
Epoch: [2][500/14300] Elapsed 6m 33s (remain 180m 48s) Loss: 0.0003(0.0013) Grad: 1122.2559  LR: 0.000015  
Epoch: [2][600/14300] Elapsed 7m 51s (remain 179m 6s) Loss: 0.0000(0.0012) Grad: 74.0411  LR: 0.000015  
Epoch: [2][700/14300] Elapsed 9m 8s (remain 177m 17s) Loss: 0.0016(0.0013) Grad: 9572.7266  LR: 0.000015  
Epoch: [2][800/14300] Elapsed 10m 26s (remain 175m 51s) Loss: 0.0002(0.0013) Grad: 1013.4861  LR: 0.000015  
Epoch: [2][900/14300] Elapsed 11m 44s (remain 174m 31s) Loss: 0.0048(0.0012) Grad: 39223.5703  LR: 0.000015  
Epoch: [2][1000/14300] Elapsed 13m 3s (remain 173m 23s) Loss: 0.0001(0.0012) Grad: 178.8746  LR: 0.000015  
Epoch: [2][1100/14300] Elapsed 14m 21s (remain 172m 4s) Loss: 0.0000(0.0012) Grad: 71.6916  LR: 0.000015  
Epoch: [2][1200/14300] Elapsed 15m 40s (remain 170m 55s) Loss: 0.0000(0.0012) Grad: 18.8930  LR: 0.000015  
Epoch: [2][1300/14300] Elapsed 16m 57s (remain 169m 31s) Loss: 0.0005(0.0012) Grad: 6606.3813  LR: 0.000015  
Epoch: [2][1400/14300] Elapsed 18m 16s (remain 168m 14s) Loss: 0.0007(0.0012) Grad: 2596.3323  LR: 0.000015  
Epoch: [2][1500/14300] Elapsed 19m 33s (remain 166m 47s) Loss: 0.0000(0.0012) Grad: 62.6965  LR: 0.000015  
Epoch: [2][1600/14300] Elapsed 20m 50s (remain 165m 21s) Loss: 0.0001(0.0012) Grad: 119.8864  LR: 0.000015  
Epoch: [2][1700/14300] Elapsed 22m 9s (remain 164m 10s) Loss: 0.0002(0.0012) Grad: 411.4431  LR: 0.000014  
Epoch: [2][1800/14300] Elapsed 23m 28s (remain 162m 55s) Loss: 0.0000(0.0012) Grad: 159.1267  LR: 0.000014  
Epoch: [2][1900/14300] Elapsed 24m 46s (remain 161m 33s) Loss: 0.0008(0.0012) Grad: 2344.0781  LR: 0.000014  
Epoch: [2][2000/14300] Elapsed 26m 4s (remain 160m 15s) Loss: 0.0000(0.0012) Grad: 5.2840  LR: 0.000014  
Epoch: [2][2100/14300] Elapsed 27m 21s (remain 158m 52s) Loss: 0.0003(0.0012) Grad: 1779.8521  LR: 0.000014  
Epoch: [2][2200/14300] Elapsed 28m 39s (remain 157m 30s) Loss: 0.0002(0.0012) Grad: 971.1131  LR: 0.000014  
Epoch: [2][2300/14300] Elapsed 29m 57s (remain 156m 11s) Loss: 0.0004(0.0012) Grad: 14280.6162  LR: 0.000014  
Epoch: [2][2400/14300] Elapsed 31m 14s (remain 154m 51s) Loss: 0.0004(0.0012) Grad: 1496.6321  LR: 0.000014  
Epoch: [2][2500/14300] Elapsed 32m 32s (remain 153m 30s) Loss: 0.0001(0.0012) Grad: 339.3765  LR: 0.000014  
Epoch: [2][2600/14300] Elapsed 33m 49s (remain 152m 9s) Loss: 0.0000(0.0012) Grad: 84.7093  LR: 0.000014  
Epoch: [2][2700/14300] Elapsed 35m 7s (remain 150m 50s) Loss: 0.0001(0.0012) Grad: 229.4699  LR: 0.000014  
Epoch: [2][2800/14300] Elapsed 36m 26s (remain 149m 36s) Loss: 0.0001(0.0012) Grad: 262.8103  LR: 0.000014  
Epoch: [2][2900/14300] Elapsed 37m 45s (remain 148m 20s) Loss: 0.0005(0.0012) Grad: 4281.6440  LR: 0.000014  
Epoch: [2][3000/14300] Elapsed 39m 3s (remain 147m 1s) Loss: 0.0058(0.0012) Grad: 16840.6055  LR: 0.000014  
Epoch: [2][3100/14300] Elapsed 40m 21s (remain 145m 45s) Loss: 0.0009(0.0012) Grad: 7366.1719  LR: 0.000014  
Epoch: [2][3200/14300] Elapsed 41m 40s (remain 144m 31s) Loss: 0.0057(0.0011) Grad: 13801.0381  LR: 0.000014  
Epoch: [2][3300/14300] Elapsed 42m 58s (remain 143m 11s) Loss: 0.0000(0.0012) Grad: 64.6432  LR: 0.000014  
Epoch: [2][3400/14300] Elapsed 44m 16s (remain 141m 51s) Loss: 0.0018(0.0012) Grad: 3332.6238  LR: 0.000013  
Epoch: [2][3500/14300] Elapsed 45m 33s (remain 140m 32s) Loss: 0.0000(0.0012) Grad: 30.0791  LR: 0.000013  
Epoch: [2][3600/14300] Elapsed 46m 51s (remain 139m 13s) Loss: 0.0002(0.0012) Grad: 1083.9282  LR: 0.000013  
Epoch: [2][3700/14300] Elapsed 48m 9s (remain 137m 54s) Loss: 0.0000(0.0012) Grad: 12.9365  LR: 0.000013  
Epoch: [2][3800/14300] Elapsed 49m 26s (remain 136m 34s) Loss: 0.0000(0.0012) Grad: 30.0483  LR: 0.000013  
Epoch: [2][3900/14300] Elapsed 50m 45s (remain 135m 17s) Loss: 0.0001(0.0012) Grad: 860.9508  LR: 0.000013  
Epoch: [2][4000/14300] Elapsed 52m 2s (remain 133m 57s) Loss: 0.0000(0.0012) Grad: 38.6226  LR: 0.000013  
Epoch: [2][4100/14300] Elapsed 53m 20s (remain 132m 39s) Loss: 0.0012(0.0012) Grad: 22697.8750  LR: 0.000013  
Epoch: [2][4200/14300] Elapsed 54m 38s (remain 131m 20s) Loss: 0.0013(0.0011) Grad: 7809.5293  LR: 0.000013  
Epoch: [2][4300/14300] Elapsed 55m 56s (remain 130m 3s) Loss: 0.0004(0.0011) Grad: 1061.7340  LR: 0.000013  
Epoch: [2][4400/14300] Elapsed 57m 13s (remain 128m 43s) Loss: 0.0005(0.0011) Grad: 3420.1021  LR: 0.000013  
Epoch: [2][4500/14300] Elapsed 58m 30s (remain 127m 22s) Loss: 0.0000(0.0011) Grad: 135.0235  LR: 0.000013  
Epoch: [2][4600/14300] Elapsed 59m 47s (remain 126m 2s) Loss: 0.0000(0.0011) Grad: 52.8142  LR: 0.000013  
Epoch: [2][4700/14300] Elapsed 61m 4s (remain 124m 42s) Loss: 0.0000(0.0011) Grad: 78.7428  LR: 0.000013  
Epoch: [2][4800/14300] Elapsed 62m 21s (remain 123m 21s) Loss: 0.0033(0.0011) Grad: 16642.8633  LR: 0.000013  
Epoch: [2][4900/14300] Elapsed 63m 39s (remain 122m 4s) Loss: 0.0003(0.0011) Grad: 6588.2871  LR: 0.000013  
Epoch: [2][5000/14300] Elapsed 64m 57s (remain 120m 46s) Loss: 0.0000(0.0011) Grad: 57.8923  LR: 0.000012  
Epoch: [2][5100/14300] Elapsed 66m 14s (remain 119m 27s) Loss: 0.0001(0.0011) Grad: 4287.9692  LR: 0.000012  
Epoch: [2][5200/14300] Elapsed 67m 32s (remain 118m 9s) Loss: 0.0167(0.0011) Grad: 99562.5234  LR: 0.000012  
Epoch: [2][5300/14300] Elapsed 68m 48s (remain 116m 49s) Loss: 0.0000(0.0011) Grad: 41.1792  LR: 0.000012  
Epoch: [2][5400/14300] Elapsed 70m 6s (remain 115m 30s) Loss: 0.0000(0.0011) Grad: 94.2439  LR: 0.000012  
Epoch: [2][5500/14300] Elapsed 71m 23s (remain 114m 11s) Loss: 0.0000(0.0011) Grad: 95.7402  LR: 0.000012  
Epoch: [2][5600/14300] Elapsed 72m 40s (remain 112m 52s) Loss: 0.0000(0.0011) Grad: 44.3106  LR: 0.000012  
Epoch: [2][5700/14300] Elapsed 73m 58s (remain 111m 34s) Loss: 0.0015(0.0011) Grad: 3991.1353  LR: 0.000012  
Epoch: [2][5800/14300] Elapsed 75m 17s (remain 110m 19s) Loss: 0.0001(0.0011) Grad: 321.2121  LR: 0.000012  
Epoch: [2][5900/14300] Elapsed 76m 36s (remain 109m 2s) Loss: 0.0000(0.0011) Grad: 304.2308  LR: 0.000012  
Epoch: [2][6000/14300] Elapsed 77m 55s (remain 107m 46s) Loss: 0.0000(0.0011) Grad: 1862.1935  LR: 0.000012  
Epoch: [2][6100/14300] Elapsed 79m 13s (remain 106m 28s) Loss: 0.0000(0.0011) Grad: 74.0747  LR: 0.000012  
Epoch: [2][6200/14300] Elapsed 80m 32s (remain 105m 11s) Loss: 0.0002(0.0011) Grad: 3573.6660  LR: 0.000012  
Epoch: [2][6300/14300] Elapsed 81m 49s (remain 103m 52s) Loss: 0.0001(0.0011) Grad: 1098.3691  LR: 0.000012  
Epoch: [2][6400/14300] Elapsed 83m 6s (remain 102m 33s) Loss: 0.0000(0.0011) Grad: 75.4951  LR: 0.000012  
Epoch: [2][6500/14300] Elapsed 84m 24s (remain 101m 15s) Loss: 0.0103(0.0011) Grad: 33145.4805  LR: 0.000012  
Epoch: [2][6600/14300] Elapsed 85m 42s (remain 99m 58s) Loss: 0.0003(0.0011) Grad: 2596.6453  LR: 0.000011  
Epoch: [2][6700/14300] Elapsed 87m 0s (remain 98m 40s) Loss: 0.0014(0.0011) Grad: 8485.6777  LR: 0.000011  
Epoch: [2][6800/14300] Elapsed 88m 19s (remain 97m 22s) Loss: 0.0013(0.0011) Grad: 13498.0762  LR: 0.000011  
Epoch: [2][6900/14300] Elapsed 89m 37s (remain 96m 6s) Loss: 0.0002(0.0011) Grad: 1475.3358  LR: 0.000011  
Epoch: [2][7000/14300] Elapsed 90m 57s (remain 94m 49s) Loss: 0.0000(0.0011) Grad: 26.9246  LR: 0.000011  
Epoch: [2][7100/14300] Elapsed 92m 16s (remain 93m 32s) Loss: 0.0000(0.0011) Grad: 7.0819  LR: 0.000011  
Epoch: [2][7200/14300] Elapsed 93m 35s (remain 92m 15s) Loss: 0.0006(0.0011) Grad: 4448.2700  LR: 0.000011  
Epoch: [2][7300/14300] Elapsed 94m 53s (remain 90m 57s) Loss: 0.0000(0.0011) Grad: 143.6774  LR: 0.000011  
Epoch: [2][7400/14300] Elapsed 96m 11s (remain 89m 40s) Loss: 0.0003(0.0011) Grad: 4577.4829  LR: 0.000011  
Epoch: [2][7500/14300] Elapsed 97m 29s (remain 88m 22s) Loss: 0.0029(0.0011) Grad: 9118.8135  LR: 0.000011  
Epoch: [2][7600/14300] Elapsed 98m 47s (remain 87m 4s) Loss: 0.0001(0.0011) Grad: 1904.3202  LR: 0.000011  
Epoch: [2][7700/14300] Elapsed 100m 5s (remain 85m 46s) Loss: 0.0000(0.0011) Grad: 28.2094  LR: 0.000011  
Epoch: [2][7800/14300] Elapsed 101m 23s (remain 84m 27s) Loss: 0.0008(0.0011) Grad: 4450.4985  LR: 0.000011  
Epoch: [2][7900/14300] Elapsed 102m 40s (remain 83m 9s) Loss: 0.0029(0.0011) Grad: 12112.3516  LR: 0.000011  
Epoch: [2][8000/14300] Elapsed 103m 57s (remain 81m 50s) Loss: 0.0000(0.0011) Grad: 328.1860  LR: 0.000011  
Epoch: [2][8100/14300] Elapsed 105m 15s (remain 80m 32s) Loss: 0.0001(0.0011) Grad: 6348.1055  LR: 0.000011  
Epoch: [2][8200/14300] Elapsed 106m 32s (remain 79m 13s) Loss: 0.0000(0.0011) Grad: 249.5174  LR: 0.000010  
Epoch: [2][8300/14300] Elapsed 107m 50s (remain 77m 56s) Loss: 0.0000(0.0011) Grad: 1877.5913  LR: 0.000010  
Epoch: [2][8400/14300] Elapsed 109m 9s (remain 76m 38s) Loss: 0.0017(0.0011) Grad: 199912.0156  LR: 0.000010  
Epoch: [2][8500/14300] Elapsed 110m 27s (remain 75m 21s) Loss: 0.0000(0.0011) Grad: 252.9138  LR: 0.000010  
Epoch: [2][8600/14300] Elapsed 111m 44s (remain 74m 2s) Loss: 0.0005(0.0011) Grad: 10390.6787  LR: 0.000010  
Epoch: [2][8700/14300] Elapsed 113m 2s (remain 72m 44s) Loss: 0.0012(0.0011) Grad: 16340.8271  LR: 0.000010  
Epoch: [2][8800/14300] Elapsed 114m 19s (remain 71m 25s) Loss: 0.0001(0.0011) Grad: 4309.6035  LR: 0.000010  
Epoch: [2][8900/14300] Elapsed 115m 37s (remain 70m 8s) Loss: 0.0000(0.0011) Grad: 954.1835  LR: 0.000010  
Epoch: [2][9000/14300] Elapsed 116m 55s (remain 68m 49s) Loss: 0.0017(0.0011) Grad: 14509.5117  LR: 0.000010  
Epoch: [2][9100/14300] Elapsed 118m 14s (remain 67m 33s) Loss: 0.0004(0.0011) Grad: 7757.4941  LR: 0.000010  
Epoch: [2][9200/14300] Elapsed 119m 33s (remain 66m 15s) Loss: 0.0000(0.0011) Grad: 202.3801  LR: 0.000010  
Epoch: [2][9300/14300] Elapsed 120m 52s (remain 64m 57s) Loss: 0.0009(0.0011) Grad: 17756.6777  LR: 0.000010  
Epoch: [2][9400/14300] Elapsed 122m 11s (remain 63m 40s) Loss: 0.0066(0.0011) Grad: 99620.5625  LR: 0.000010  
Epoch: [2][9500/14300] Elapsed 123m 30s (remain 62m 22s) Loss: 0.0000(0.0011) Grad: 414.3385  LR: 0.000010  
Epoch: [2][9600/14300] Elapsed 124m 49s (remain 61m 5s) Loss: 0.0000(0.0011) Grad: 556.0711  LR: 0.000010  
Epoch: [2][9700/14300] Elapsed 126m 7s (remain 59m 47s) Loss: 0.0007(0.0011) Grad: 23973.5820  LR: 0.000010  
Epoch: [2][9800/14300] Elapsed 127m 24s (remain 58m 29s) Loss: 0.0033(0.0011) Grad: 49105.9766  LR: 0.000009  
Epoch: [2][9900/14300] Elapsed 128m 43s (remain 57m 11s) Loss: 0.0009(0.0011) Grad: 10174.1953  LR: 0.000009  
Epoch: [2][10000/14300] Elapsed 130m 1s (remain 55m 53s) Loss: 0.0007(0.0011) Grad: 7608.7808  LR: 0.000009  
Epoch: [2][10100/14300] Elapsed 131m 19s (remain 54m 35s) Loss: 0.0062(0.0011) Grad: 44582.5664  LR: 0.000009  
Epoch: [2][10200/14300] Elapsed 132m 38s (remain 53m 17s) Loss: 0.0008(0.0011) Grad: 46384.1016  LR: 0.000009  
Epoch: [2][10300/14300] Elapsed 133m 57s (remain 52m 0s) Loss: 0.0000(0.0011) Grad: 26.1196  LR: 0.000009  
Epoch: [2][10400/14300] Elapsed 135m 16s (remain 50m 42s) Loss: 0.0003(0.0011) Grad: 7052.0972  LR: 0.000009  
Epoch: [2][10500/14300] Elapsed 136m 34s (remain 49m 24s) Loss: 0.0002(0.0011) Grad: 3556.7646  LR: 0.000009  
Epoch: [2][10600/14300] Elapsed 137m 53s (remain 48m 6s) Loss: 0.0000(0.0011) Grad: 28.7248  LR: 0.000009  
Epoch: [2][10700/14300] Elapsed 139m 10s (remain 46m 48s) Loss: 0.0004(0.0011) Grad: 8234.4912  LR: 0.000009  
Epoch: [2][10800/14300] Elapsed 140m 28s (remain 45m 30s) Loss: 0.0000(0.0011) Grad: 86.7450  LR: 0.000009  
Epoch: [2][10900/14300] Elapsed 141m 45s (remain 44m 12s) Loss: 0.0012(0.0011) Grad: 22042.1641  LR: 0.000009  
Epoch: [2][11000/14300] Elapsed 143m 3s (remain 42m 54s) Loss: 0.0019(0.0011) Grad: 32816.3633  LR: 0.000009  
Epoch: [2][11100/14300] Elapsed 144m 21s (remain 41m 35s) Loss: 0.0000(0.0011) Grad: 20.0722  LR: 0.000009  
Epoch: [2][11200/14300] Elapsed 145m 39s (remain 40m 17s) Loss: 0.0000(0.0011) Grad: 99.3855  LR: 0.000009  
Epoch: [2][11300/14300] Elapsed 146m 57s (remain 38m 59s) Loss: 0.0000(0.0011) Grad: 60.3720  LR: 0.000009  
Epoch: [2][11400/14300] Elapsed 148m 14s (remain 37m 41s) Loss: 0.0000(0.0011) Grad: 363.4564  LR: 0.000008  
Epoch: [2][11500/14300] Elapsed 149m 33s (remain 36m 23s) Loss: 0.0000(0.0011) Grad: 23.7852  LR: 0.000008  
Epoch: [2][11600/14300] Elapsed 150m 52s (remain 35m 6s) Loss: 0.0000(0.0011) Grad: 232.6448  LR: 0.000008  
Epoch: [2][11700/14300] Elapsed 152m 9s (remain 33m 47s) Loss: 0.0001(0.0011) Grad: 5606.1069  LR: 0.000008  
Epoch: [2][11800/14300] Elapsed 153m 27s (remain 32m 29s) Loss: 0.0000(0.0011) Grad: 22.7065  LR: 0.000008  
Epoch: [2][11900/14300] Elapsed 154m 45s (remain 31m 11s) Loss: 0.0000(0.0011) Grad: 508.4689  LR: 0.000008  
Epoch: [2][12000/14300] Elapsed 156m 2s (remain 29m 53s) Loss: 0.0000(0.0011) Grad: 317.3105  LR: 0.000008  
Epoch: [2][12100/14300] Elapsed 157m 21s (remain 28m 35s) Loss: 0.0000(0.0011) Grad: 348.0130  LR: 0.000008  
Epoch: [2][12200/14300] Elapsed 158m 39s (remain 27m 17s) Loss: 0.0000(0.0011) Grad: 186.8627  LR: 0.000008  
Epoch: [2][12300/14300] Elapsed 159m 58s (remain 25m 59s) Loss: 0.0000(0.0011) Grad: 39.5977  LR: 0.000008  
Epoch: [2][12400/14300] Elapsed 161m 16s (remain 24m 41s) Loss: 0.0071(0.0011) Grad: 84266.0625  LR: 0.000008  
Epoch: [2][12500/14300] Elapsed 162m 34s (remain 23m 23s) Loss: 0.0002(0.0011) Grad: 26008.5195  LR: 0.000008  
Epoch: [2][12600/14300] Elapsed 163m 53s (remain 22m 5s) Loss: 0.0003(0.0011) Grad: 26619.8984  LR: 0.000008  
Epoch: [2][12700/14300] Elapsed 165m 11s (remain 20m 47s) Loss: 0.0004(0.0011) Grad: 14442.9121  LR: 0.000008  
Epoch: [2][12800/14300] Elapsed 166m 29s (remain 19m 29s) Loss: 0.0038(0.0011) Grad: 160060.6875  LR: 0.000008  
Epoch: [2][12900/14300] Elapsed 167m 47s (remain 18m 11s) Loss: 0.0000(0.0011) Grad: 2979.4668  LR: 0.000008  
Epoch: [2][13000/14300] Elapsed 169m 5s (remain 16m 53s) Loss: 0.0000(0.0011) Grad: 33.7280  LR: 0.000007  
Epoch: [2][13100/14300] Elapsed 170m 22s (remain 15m 35s) Loss: 0.0000(0.0011) Grad: 1667.4327  LR: 0.000007  
Epoch: [2][13200/14300] Elapsed 171m 39s (remain 14m 17s) Loss: 0.0000(0.0011) Grad: 253.9574  LR: 0.000007  
Epoch: [2][13300/14300] Elapsed 172m 57s (remain 12m 59s) Loss: 0.0000(0.0011) Grad: 110.6088  LR: 0.000007  
Epoch: [2][13400/14300] Elapsed 174m 15s (remain 11m 41s) Loss: 0.0034(0.0011) Grad: 104954.0469  LR: 0.000007  
Epoch: [2][13500/14300] Elapsed 175m 33s (remain 10m 23s) Loss: 0.0000(0.0011) Grad: 32.7572  LR: 0.000007  
Epoch: [2][13600/14300] Elapsed 176m 51s (remain 9m 5s) Loss: 0.0140(0.0011) Grad: 56597.5156  LR: 0.000007  
Epoch: [2][13700/14300] Elapsed 178m 9s (remain 7m 47s) Loss: 0.0000(0.0011) Grad: 49.4389  LR: 0.000007  
Epoch: [2][13800/14300] Elapsed 179m 27s (remain 6m 29s) Loss: 0.0044(0.0011) Grad: 220763.1250  LR: 0.000007  
Epoch: [2][13900/14300] Elapsed 180m 44s (remain 5m 11s) Loss: 0.0000(0.0011) Grad: 31.0945  LR: 0.000007  
Epoch: [2][14000/14300] Elapsed 182m 1s (remain 3m 53s) Loss: 0.0000(0.0011) Grad: 162.6362  LR: 0.000007  
Epoch: [2][14100/14300] Elapsed 183m 19s (remain 2m 35s) Loss: 0.0000(0.0011) Grad: 558.5020  LR: 0.000007  
Epoch: [2][14200/14300] Elapsed 184m 37s (remain 1m 17s) Loss: 0.0000(0.0011) Grad: 102.4542  LR: 0.000007  
Epoch: [2][14299/14300] Elapsed 185m 54s (remain 0m 0s) Loss: 0.0004(0.0011) Grad: 22336.7324  LR: 0.000007  
EVAL: [0/1192] Elapsed 0m 1s (remain 29m 17s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 32s (remain 5m 49s) Loss: 0.0002(0.0027) 
EVAL: [200/1192] Elapsed 1m 2s (remain 5m 9s) Loss: 0.0000(0.0030) 
EVAL: [300/1192] Elapsed 1m 33s (remain 4m 36s) Loss: 0.0006(0.0048) 
EVAL: [400/1192] Elapsed 2m 4s (remain 4m 5s) Loss: 0.0148(0.0049) 
EVAL: [500/1192] Elapsed 2m 34s (remain 3m 32s) Loss: 0.0163(0.0045) 
EVAL: [600/1192] Elapsed 3m 4s (remain 3m 1s) Loss: 0.0743(0.0045) 
EVAL: [700/1192] Elapsed 3m 35s (remain 2m 30s) Loss: 0.0037(0.0052) 
EVAL: [800/1192] Elapsed 4m 5s (remain 1m 59s) Loss: 0.0029(0.0050) 
EVAL: [900/1192] Elapsed 4m 36s (remain 1m 29s) Loss: 0.0000(0.0048) 
EVAL: [1000/1192] Elapsed 5m 6s (remain 0m 58s) Loss: 0.0000(0.0046) 
EVAL: [1100/1192] Elapsed 5m 37s (remain 0m 27s) Loss: 0.0032(0.0044) 
EVAL: [1191/1192] Elapsed 6m 4s (remain 0m 0s) Loss: 0.0052(0.0042) 
Epoch 2 - avg_train_loss: 0.0011  avg_val_loss: 0.0042  time: 11523s
Epoch 2 - Score: 0.8876
Epoch 2 - Save Best Score: 0.8876 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy
100%
612602/612602 [00:01<00:00, 588009.03it/s]
100%
612602/612602 [00:43<00:00, 17360.82it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(53625, 11)
Epoch: [3][0/17875] Elapsed 0m 2s (remain 783m 44s) Loss: 0.0004(0.0004) Grad: 2663.9712  LR: 0.000007  
Epoch: [3][100/17875] Elapsed 1m 19s (remain 233m 13s) Loss: 0.0000(0.0011) Grad: 18.4184  LR: 0.000007  
Epoch: [3][200/17875] Elapsed 2m 36s (remain 229m 28s) Loss: 0.0012(0.0008) Grad: 10123.7871  LR: 0.000007  
Epoch: [3][300/17875] Elapsed 3m 54s (remain 228m 21s) Loss: 0.0000(0.0008) Grad: 206.4765  LR: 0.000006  
Epoch: [3][400/17875] Elapsed 5m 12s (remain 227m 18s) Loss: 0.0008(0.0008) Grad: 9341.0273  LR: 0.000006  
Epoch: [3][500/17875] Elapsed 6m 31s (remain 226m 1s) Loss: 0.0000(0.0007) Grad: 16.4064  LR: 0.000006  
Epoch: [3][600/17875] Elapsed 7m 48s (remain 224m 24s) Loss: 0.0004(0.0007) Grad: 1254.3105  LR: 0.000006  
Epoch: [3][700/17875] Elapsed 9m 6s (remain 223m 5s) Loss: 0.0001(0.0007) Grad: 271.5234  LR: 0.000006  
Epoch: [3][800/17875] Elapsed 10m 23s (remain 221m 24s) Loss: 0.0001(0.0008) Grad: 369.3791  LR: 0.000006  
Epoch: [3][900/17875] Elapsed 11m 39s (remain 219m 42s) Loss: 0.0000(0.0008) Grad: 31.0956  LR: 0.000006  
Epoch: [3][1000/17875] Elapsed 12m 57s (remain 218m 27s) Loss: 0.0001(0.0008) Grad: 394.9518  LR: 0.000006  
Epoch: [3][1100/17875] Elapsed 14m 14s (remain 217m 0s) Loss: 0.0000(0.0007) Grad: 22.0032  LR: 0.000006  
Epoch: [3][1200/17875] Elapsed 15m 32s (remain 215m 43s) Loss: 0.0016(0.0007) Grad: 7631.0752  LR: 0.000006  
Epoch: [3][1300/17875] Elapsed 16m 48s (remain 214m 12s) Loss: 0.0003(0.0007) Grad: 1450.2117  LR: 0.000006  
Epoch: [3][1400/17875] Elapsed 18m 5s (remain 212m 49s) Loss: 0.0000(0.0007) Grad: 50.1405  LR: 0.000006  
Epoch: [3][1500/17875] Elapsed 19m 23s (remain 211m 30s) Loss: 0.0010(0.0007) Grad: 10152.3711  LR: 0.000006  
Epoch: [3][1600/17875] Elapsed 20m 40s (remain 210m 4s) Loss: 0.0001(0.0008) Grad: 240.5393  LR: 0.000006  
Epoch: [3][1700/17875] Elapsed 21m 57s (remain 208m 45s) Loss: 0.0001(0.0007) Grad: 556.0814  LR: 0.000006  
Epoch: [3][1800/17875] Elapsed 23m 14s (remain 207m 22s) Loss: 0.0000(0.0008) Grad: 354.1158  LR: 0.000006  
Epoch: [3][1900/17875] Elapsed 24m 31s (remain 206m 5s) Loss: 0.0000(0.0008) Grad: 25.3620  LR: 0.000005  
Epoch: [3][2000/17875] Elapsed 25m 49s (remain 204m 54s) Loss: 0.0005(0.0008) Grad: 3832.3850  LR: 0.000005  
Epoch: [3][2100/17875] Elapsed 27m 6s (remain 203m 34s) Loss: 0.0000(0.0008) Grad: 232.6588  LR: 0.000005  
Epoch: [3][2200/17875] Elapsed 28m 25s (remain 202m 23s) Loss: 0.0026(0.0007) Grad: 3405.1001  LR: 0.000005  
Epoch: [3][2300/17875] Elapsed 29m 42s (remain 201m 3s) Loss: 0.0000(0.0007) Grad: 40.8416  LR: 0.000005  
Epoch: [3][2400/17875] Elapsed 30m 59s (remain 199m 42s) Loss: 0.0001(0.0007) Grad: 225.0918  LR: 0.000005  
Epoch: [3][2500/17875] Elapsed 32m 16s (remain 198m 25s) Loss: 0.0000(0.0007) Grad: 19.2576  LR: 0.000005  
Epoch: [3][2600/17875] Elapsed 33m 35s (remain 197m 15s) Loss: 0.0000(0.0007) Grad: 19.6271  LR: 0.000005  
Epoch: [3][2700/17875] Elapsed 34m 53s (remain 196m 0s) Loss: 0.0003(0.0007) Grad: 2166.3811  LR: 0.000005  
Epoch: [3][2800/17875] Elapsed 36m 11s (remain 194m 44s) Loss: 0.0000(0.0007) Grad: 7.1162  LR: 0.000005  
Epoch: [3][2900/17875] Elapsed 37m 30s (remain 193m 34s) Loss: 0.0000(0.0007) Grad: 162.8934  LR: 0.000005  
Epoch: [3][3000/17875] Elapsed 38m 48s (remain 192m 19s) Loss: 0.0003(0.0007) Grad: 975.8208  LR: 0.000005  
Epoch: [3][3100/17875] Elapsed 40m 6s (remain 191m 5s) Loss: 0.0016(0.0007) Grad: 4262.3584  LR: 0.000005  
Epoch: [3][3200/17875] Elapsed 41m 23s (remain 189m 46s) Loss: 0.0004(0.0007) Grad: 5396.0859  LR: 0.000005  
Epoch: [3][3300/17875] Elapsed 42m 40s (remain 188m 24s) Loss: 0.0046(0.0007) Grad: 9713.1553  LR: 0.000005  
Epoch: [3][3400/17875] Elapsed 43m 57s (remain 187m 4s) Loss: 0.0000(0.0007) Grad: 67.9863  LR: 0.000005  
Epoch: [3][3500/17875] Elapsed 45m 14s (remain 185m 42s) Loss: 0.0000(0.0007) Grad: 81.6024  LR: 0.000004  
Epoch: [3][3600/17875] Elapsed 46m 31s (remain 184m 26s) Loss: 0.0000(0.0007) Grad: 9.0314  LR: 0.000004  
Epoch: [3][3700/17875] Elapsed 47m 49s (remain 183m 8s) Loss: 0.0001(0.0007) Grad: 1115.3649  LR: 0.000004  
Epoch: [3][3800/17875] Elapsed 49m 5s (remain 181m 47s) Loss: 0.0003(0.0007) Grad: 4155.3545  LR: 0.000004  
Epoch: [3][3900/17875] Elapsed 50m 23s (remain 180m 31s) Loss: 0.0000(0.0007) Grad: 6.3610  LR: 0.000004  
Epoch: [3][4000/17875] Elapsed 51m 41s (remain 179m 14s) Loss: 0.0000(0.0007) Grad: 945.0820  LR: 0.000004  
Epoch: [3][4100/17875] Elapsed 52m 58s (remain 177m 56s) Loss: 0.0001(0.0007) Grad: 2494.6685  LR: 0.000004  
Epoch: [3][4200/17875] Elapsed 54m 15s (remain 176m 35s) Loss: 0.0000(0.0007) Grad: 170.7771  LR: 0.000004  
Epoch: [3][4300/17875] Elapsed 55m 33s (remain 175m 19s) Loss: 0.0016(0.0007) Grad: 17172.0508  LR: 0.000004  
Epoch: [3][4400/17875] Elapsed 56m 51s (remain 174m 4s) Loss: 0.0006(0.0007) Grad: 3368.2810  LR: 0.000004  
Epoch: [3][4500/17875] Elapsed 58m 8s (remain 172m 46s) Loss: 0.0000(0.0007) Grad: 107.8307  LR: 0.000004  
Epoch: [3][4600/17875] Elapsed 59m 25s (remain 171m 26s) Loss: 0.0000(0.0007) Grad: 14.6389  LR: 0.000004  
Epoch: [3][4700/17875] Elapsed 60m 43s (remain 170m 9s) Loss: 0.0000(0.0007) Grad: 748.2932  LR: 0.000004  
Epoch: [3][4800/17875] Elapsed 61m 59s (remain 168m 48s) Loss: 0.0000(0.0007) Grad: 588.0742  LR: 0.000004  
Epoch: [3][4900/17875] Elapsed 63m 16s (remain 167m 31s) Loss: 0.0000(0.0007) Grad: 14.2279  LR: 0.000004  
Epoch: [3][5000/17875] Elapsed 64m 35s (remain 166m 15s) Loss: 0.0000(0.0007) Grad: 27.1205  LR: 0.000004  
Epoch: [3][5100/17875] Elapsed 65m 53s (remain 165m 1s) Loss: 0.0004(0.0007) Grad: 3713.6289  LR: 0.000003  
Epoch: [3][5200/17875] Elapsed 67m 11s (remain 163m 43s) Loss: 0.0000(0.0007) Grad: 129.4379  LR: 0.000003  
Epoch: [3][5300/17875] Elapsed 68m 29s (remain 162m 27s) Loss: 0.0000(0.0007) Grad: 11.7740  LR: 0.000003  
Epoch: [3][5400/17875] Elapsed 69m 47s (remain 161m 11s) Loss: 0.0001(0.0007) Grad: 586.7022  LR: 0.000003  
Epoch: [3][5500/17875] Elapsed 71m 4s (remain 159m 51s) Loss: 0.0004(0.0007) Grad: 3603.1155  LR: 0.000003  
Epoch: [3][5600/17875] Elapsed 72m 22s (remain 158m 35s) Loss: 0.0027(0.0007) Grad: 42053.6758  LR: 0.000003  
Epoch: [3][5700/17875] Elapsed 73m 39s (remain 157m 16s) Loss: 0.0001(0.0007) Grad: 1606.7424  LR: 0.000003  
Epoch: [3][5800/17875] Elapsed 74m 56s (remain 155m 59s) Loss: 0.0000(0.0007) Grad: 35.6881  LR: 0.000003  
Epoch: [3][5900/17875] Elapsed 76m 14s (remain 154m 41s) Loss: 0.0023(0.0007) Grad: 21228.9258  LR: 0.000003  
Epoch: [3][6000/17875] Elapsed 77m 30s (remain 153m 22s) Loss: 0.0000(0.0007) Grad: 19.2288  LR: 0.000003  
Epoch: [3][6100/17875] Elapsed 78m 47s (remain 152m 2s) Loss: 0.0006(0.0007) Grad: 8244.6602  LR: 0.000003  
Epoch: [3][6200/17875] Elapsed 80m 4s (remain 150m 45s) Loss: 0.0000(0.0007) Grad: 12.1654  LR: 0.000003  
Epoch: [3][6300/17875] Elapsed 81m 22s (remain 149m 28s) Loss: 0.0001(0.0007) Grad: 4529.7803  LR: 0.000003  
Epoch: [3][6400/17875] Elapsed 82m 39s (remain 148m 10s) Loss: 0.0005(0.0007) Grad: 11602.3623  LR: 0.000003  
Epoch: [3][6500/17875] Elapsed 83m 56s (remain 146m 52s) Loss: 0.0000(0.0007) Grad: 136.8745  LR: 0.000003  
Epoch: [3][6600/17875] Elapsed 85m 14s (remain 145m 35s) Loss: 0.0008(0.0007) Grad: 9374.5889  LR: 0.000003  
Epoch: [3][6700/17875] Elapsed 86m 32s (remain 144m 17s) Loss: 0.0000(0.0007) Grad: 50.4008  LR: 0.000003  
Epoch: [3][6800/17875] Elapsed 87m 49s (remain 142m 59s) Loss: 0.0000(0.0007) Grad: 35.5686  LR: 0.000002  
Epoch: [3][6900/17875] Elapsed 89m 6s (remain 141m 41s) Loss: 0.0000(0.0007) Grad: 22.2397  LR: 0.000002  
Epoch: [3][7000/17875] Elapsed 90m 24s (remain 140m 26s) Loss: 0.0000(0.0007) Grad: 573.9084  LR: 0.000002  
Epoch: [3][7100/17875] Elapsed 91m 42s (remain 139m 8s) Loss: 0.0000(0.0007) Grad: 291.0385  LR: 0.000002  
Epoch: [3][7200/17875] Elapsed 92m 59s (remain 137m 51s) Loss: 0.0002(0.0007) Grad: 2069.4939  LR: 0.000002  
Epoch: [3][7300/17875] Elapsed 94m 17s (remain 136m 34s) Loss: 0.0022(0.0007) Grad: 8005.8135  LR: 0.000002  
Epoch: [3][7400/17875] Elapsed 95m 35s (remain 135m 16s) Loss: 0.0069(0.0007) Grad: 29265.7832  LR: 0.000002  
Epoch: [3][7500/17875] Elapsed 96m 54s (remain 134m 1s) Loss: 0.0000(0.0007) Grad: 262.8211  LR: 0.000002  
Epoch: [3][7600/17875] Elapsed 98m 11s (remain 132m 42s) Loss: 0.0000(0.0007) Grad: 14.7341  LR: 0.000002  
Epoch: [3][7700/17875] Elapsed 99m 28s (remain 131m 25s) Loss: 0.0121(0.0007) Grad: 14294.3203  LR: 0.000002  
Epoch: [3][7800/17875] Elapsed 100m 47s (remain 130m 9s) Loss: 0.0008(0.0007) Grad: 5795.3164  LR: 0.000002  
Epoch: [3][7900/17875] Elapsed 102m 5s (remain 128m 52s) Loss: 0.0004(0.0007) Grad: 6315.4731  LR: 0.000002  
Epoch: [3][8000/17875] Elapsed 103m 24s (remain 127m 36s) Loss: 0.0001(0.0007) Grad: 3206.4534  LR: 0.000002  
Epoch: [3][8100/17875] Elapsed 104m 42s (remain 126m 19s) Loss: 0.0002(0.0007) Grad: 12580.7842  LR: 0.000002  
Epoch: [3][8200/17875] Elapsed 106m 0s (remain 125m 2s) Loss: 0.0001(0.0007) Grad: 1220.3496  LR: 0.000002  
Epoch: [3][8300/17875] Elapsed 107m 19s (remain 123m 46s) Loss: 0.0000(0.0007) Grad: 35.4701  LR: 0.000002  
Epoch: [3][8400/17875] Elapsed 108m 37s (remain 122m 29s) Loss: 0.0028(0.0007) Grad: 7747.3335  LR: 0.000001  
Epoch: [3][8500/17875] Elapsed 109m 56s (remain 121m 13s) Loss: 0.0025(0.0007) Grad: 279438.5000  LR: 0.000001  
Epoch: [3][8600/17875] Elapsed 111m 14s (remain 119m 56s) Loss: 0.0005(0.0007) Grad: 21761.3633  LR: 0.000001  
Epoch: [3][8700/17875] Elapsed 112m 33s (remain 118m 40s) Loss: 0.0000(0.0007) Grad: 1271.5438  LR: 0.000001  
Epoch: [3][8800/17875] Elapsed 113m 51s (remain 117m 23s) Loss: 0.0001(0.0007) Grad: 5982.4546  LR: 0.000001  
Epoch: [3][8900/17875] Elapsed 115m 9s (remain 116m 6s) Loss: 0.0048(0.0007) Grad: 26794.2734  LR: 0.000001  
Epoch: [3][9000/17875] Elapsed 116m 26s (remain 114m 48s) Loss: 0.0000(0.0007) Grad: 20.4699  LR: 0.000001  
Epoch: [3][9100/17875] Elapsed 117m 43s (remain 113m 29s) Loss: 0.0000(0.0007) Grad: 1722.3552  LR: 0.000001  
Epoch: [3][9200/17875] Elapsed 119m 1s (remain 112m 12s) Loss: 0.0001(0.0007) Grad: 1063.6488  LR: 0.000001  
Epoch: [3][9300/17875] Elapsed 120m 20s (remain 110m 55s) Loss: 0.0000(0.0007) Grad: 102.2658  LR: 0.000001  
Epoch: [3][9400/17875] Elapsed 121m 37s (remain 109m 37s) Loss: 0.0055(0.0007) Grad: 215850.0312  LR: 0.000001  
Epoch: [3][9500/17875] Elapsed 122m 55s (remain 108m 20s) Loss: 0.0019(0.0008) Grad: 65357.2656  LR: 0.000001  
Epoch: [3][9600/17875] Elapsed 124m 13s (remain 107m 3s) Loss: 0.0000(0.0008) Grad: 143.4496  LR: 0.000001  
Epoch: [3][9700/17875] Elapsed 125m 31s (remain 105m 45s) Loss: 0.0000(0.0008) Grad: 70.1006  LR: 0.000001  
Epoch: [3][9800/17875] Elapsed 126m 49s (remain 104m 28s) Loss: 0.0000(0.0007) Grad: 27.6126  LR: 0.000001  
Epoch: [3][9900/17875] Elapsed 128m 6s (remain 103m 10s) Loss: 0.0001(0.0007) Grad: 3292.0972  LR: 0.000001  
Epoch: [3][10000/17875] Elapsed 129m 24s (remain 101m 53s) Loss: 0.0000(0.0007) Grad: 389.9382  LR: 0.000000  
Epoch: [3][10100/17875] Elapsed 130m 43s (remain 100m 36s) Loss: 0.0000(0.0007) Grad: 57.2838  LR: 0.000000  
Epoch: [3][10200/17875] Elapsed 132m 0s (remain 99m 18s) Loss: 0.0000(0.0007) Grad: 8.7488  LR: 0.000000  
Epoch: [3][10300/17875] Elapsed 133m 18s (remain 98m 0s) Loss: 0.0000(0.0007) Grad: 71.0738  LR: 0.000000  
Epoch: [3][10400/17875] Elapsed 134m 36s (remain 96m 43s) Loss: 0.0001(0.0007) Grad: 1753.6801  LR: 0.000000  
Epoch: [3][10500/17875] Elapsed 135m 55s (remain 95m 26s) Loss: 0.0007(0.0007) Grad: 26816.5059  LR: 0.000000  
Epoch: [3][10600/17875] Elapsed 137m 13s (remain 94m 9s) Loss: 0.0000(0.0007) Grad: 12.3174  LR: 0.000000  
Epoch: [3][10700/17875] Elapsed 138m 32s (remain 92m 52s) Loss: 0.0000(0.0007) Grad: 36.5569  LR: 0.000000  
Epoch: [3][10800/17875] Elapsed 139m 52s (remain 91m 36s) Loss: 0.0001(0.0007) Grad: 575.9795  LR: 0.000000  
Epoch: [3][10900/17875] Elapsed 141m 11s (remain 90m 19s) Loss: 0.0001(0.0007) Grad: 1962.4121  LR: 0.000000  
Epoch: [3][11000/17875] Elapsed 142m 29s (remain 89m 2s) Loss: 0.0000(0.0008) Grad: 804.1597  LR: 0.000000  
Epoch: [3][11100/17875] Elapsed 143m 47s (remain 87m 44s) Loss: 0.0091(0.0008) Grad: 107754.8594  LR: 0.000000  
Epoch: [3][11200/17875] Elapsed 145m 5s (remain 86m 27s) Loss: 0.0000(0.0008) Grad: 1247.3391  LR: 0.000000  
Epoch: [3][11300/17875] Elapsed 146m 22s (remain 85m 8s) Loss: 0.0000(0.0008) Grad: 87.9536  LR: 0.000000  
Epoch: [3][11400/17875] Elapsed 147m 40s (remain 83m 51s) Loss: 0.0000(0.0008) Grad: 29.4608  LR: 0.000000  
Epoch: [3][11500/17875] Elapsed 148m 57s (remain 82m 33s) Loss: 0.0000(0.0008) Grad: 16.2180  LR: 0.000000  
Epoch: [3][11600/17875] Elapsed 150m 15s (remain 81m 15s) Loss: 0.0044(0.0008) Grad: 81509.1016  LR: 0.000000  
Epoch: [3][11700/17875] Elapsed 151m 32s (remain 79m 57s) Loss: 0.0001(0.0008) Grad: 1338.0421  LR: 0.000000  
Epoch: [3][11800/17875] Elapsed 152m 50s (remain 78m 39s) Loss: 0.0000(0.0008) Grad: 102.6737  LR: 0.000000  
Epoch: [3][11900/17875] Elapsed 154m 8s (remain 77m 22s) Loss: 0.0000(0.0008) Grad: 86.1767  LR: 0.000000  
Epoch: [3][12000/17875] Elapsed 155m 24s (remain 76m 4s) Loss: 0.0003(0.0008) Grad: 20861.6699  LR: 0.000000  
Epoch: [3][12100/17875] Elapsed 156m 43s (remain 74m 46s) Loss: 0.0002(0.0008) Grad: 6383.9683  LR: 0.000000  
Epoch: [3][12200/17875] Elapsed 158m 0s (remain 73m 28s) Loss: 0.0001(0.0008) Grad: 697.1808  LR: 0.000000  
Epoch: [3][12300/17875] Elapsed 159m 18s (remain 72m 11s) Loss: 0.0016(0.0008) Grad: 165443.7188  LR: 0.000000  
Epoch: [3][12400/17875] Elapsed 160m 36s (remain 70m 53s) Loss: 0.0000(0.0008) Grad: 190.8864  LR: 0.000000  
Epoch: [3][12500/17875] Elapsed 161m 54s (remain 69m 36s) Loss: 0.0000(0.0008) Grad: 18.7516  LR: 0.000000  
Epoch: [3][12600/17875] Elapsed 163m 13s (remain 68m 18s) Loss: 0.0000(0.0008) Grad: 144.9736  LR: 0.000000  
Epoch: [3][12700/17875] Elapsed 164m 32s (remain 67m 1s) Loss: 0.0003(0.0008) Grad: 17145.2207  LR: 0.000000  
Epoch: [3][12800/17875] Elapsed 165m 50s (remain 65m 43s) Loss: 0.0001(0.0008) Grad: 1270.0243  LR: 0.000000  
Epoch: [3][12900/17875] Elapsed 167m 8s (remain 64m 26s) Loss: 0.0002(0.0008) Grad: 4995.1782  LR: 0.000000  
Epoch: [3][13000/17875] Elapsed 168m 26s (remain 63m 8s) Loss: 0.0003(0.0008) Grad: 22332.8301  LR: 0.000000  
Epoch: [3][13100/17875] Elapsed 169m 45s (remain 61m 51s) Loss: 0.0002(0.0008) Grad: 9925.7656  LR: 0.000000  
Epoch: [3][13200/17875] Elapsed 171m 3s (remain 60m 34s) Loss: 0.0000(0.0008) Grad: 583.6879  LR: 0.000000  
Epoch: [3][13300/17875] Elapsed 172m 21s (remain 59m 16s) Loss: 0.0000(0.0008) Grad: 981.2326  LR: 0.000000  
Epoch: [3][13400/17875] Elapsed 173m 39s (remain 57m 58s) Loss: 0.0000(0.0008) Grad: 60.0499  LR: 0.000000  
Epoch: [3][13500/17875] Elapsed 174m 56s (remain 56m 40s) Loss: 0.0151(0.0008) Grad: 398144.0625  LR: 0.000000  
Epoch: [3][13600/17875] Elapsed 176m 14s (remain 55m 22s) Loss: 0.0001(0.0008) Grad: 2401.0591  LR: 0.000000  
Epoch: [3][13700/17875] Elapsed 177m 31s (remain 54m 4s) Loss: 0.0000(0.0008) Grad: 12.8753  LR: 0.000000  
Epoch: [3][13800/17875] Elapsed 178m 50s (remain 52m 47s) Loss: 0.0000(0.0008) Grad: 435.4350  LR: 0.000000  
Epoch: [3][13900/17875] Elapsed 180m 7s (remain 51m 29s) Loss: 0.0000(0.0008) Grad: 1486.1135  LR: 0.000000  
Epoch: [3][14000/17875] Elapsed 181m 24s (remain 50m 11s) Loss: 0.0010(0.0008) Grad: 29665.8105  LR: 0.000000  
Epoch: [3][14100/17875] Elapsed 182m 42s (remain 48m 53s) Loss: 0.0062(0.0008) Grad: 180152.0156  LR: 0.000000  
Epoch: [3][14200/17875] Elapsed 183m 59s (remain 47m 35s) Loss: 0.0000(0.0008) Grad: 157.6270  LR: 0.000000  
Epoch: [3][14300/17875] Elapsed 185m 17s (remain 46m 18s) Loss: 0.0000(0.0008) Grad: 46.2629  LR: 0.000000  
Epoch: [3][14400/17875] Elapsed 186m 34s (remain 45m 0s) Loss: 0.0049(0.0008) Grad: 99658.6406  LR: 0.000000  
Epoch: [3][14500/17875] Elapsed 187m 51s (remain 43m 42s) Loss: 0.0002(0.0008) Grad: 7774.6089  LR: 0.000000  
Epoch: [3][14600/17875] Elapsed 189m 10s (remain 42m 25s) Loss: 0.0000(0.0008) Grad: 1386.0570  LR: 0.000000  
Epoch: [3][14700/17875] Elapsed 190m 26s (remain 41m 7s) Loss: 0.0030(0.0008) Grad: 254367.8906  LR: 0.000000  
Epoch: [3][14800/17875] Elapsed 191m 45s (remain 39m 49s) Loss: 0.0011(0.0008) Grad: 73893.5938  LR: 0.000000  
Epoch: [3][14900/17875] Elapsed 193m 2s (remain 38m 31s) Loss: 0.0012(0.0008) Grad: 17114.2949  LR: 0.000000  
Epoch: [3][15000/17875] Elapsed 194m 20s (remain 37m 14s) Loss: 0.0000(0.0008) Grad: 34.1061  LR: 0.000000  
Epoch: [3][15100/17875] Elapsed 195m 38s (remain 35m 56s) Loss: 0.0000(0.0008) Grad: 1387.6862  LR: 0.000000  
Epoch: [3][15200/17875] Elapsed 196m 55s (remain 34m 38s) Loss: 0.0095(0.0008) Grad: 60673.4102  LR: 0.000000  
Epoch: [3][15300/17875] Elapsed 198m 14s (remain 33m 20s) Loss: 0.0000(0.0008) Grad: 399.4679  LR: 0.000000  
Epoch: [3][15400/17875] Elapsed 199m 31s (remain 32m 3s) Loss: 0.0001(0.0008) Grad: 3218.2310  LR: 0.000000  
Epoch: [3][15500/17875] Elapsed 200m 49s (remain 30m 45s) Loss: 0.0000(0.0008) Grad: 180.7403  LR: 0.000000  
Epoch: [3][15600/17875] Elapsed 202m 7s (remain 29m 27s) Loss: 0.0012(0.0008) Grad: 57534.6133  LR: 0.000000  
Epoch: [3][15700/17875] Elapsed 203m 24s (remain 28m 9s) Loss: 0.0000(0.0008) Grad: 441.8329  LR: 0.000000  
Epoch: [3][15800/17875] Elapsed 204m 42s (remain 26m 52s) Loss: 0.0011(0.0008) Grad: 43283.4766  LR: 0.000000  
Epoch: [3][15900/17875] Elapsed 206m 0s (remain 25m 34s) Loss: 0.0000(0.0008) Grad: 622.4111  LR: 0.000000  
Epoch: [3][16000/17875] Elapsed 207m 17s (remain 24m 16s) Loss: 0.0001(0.0008) Grad: 5564.8101  LR: 0.000000  
Epoch: [3][16100/17875] Elapsed 208m 36s (remain 22m 59s) Loss: 0.0014(0.0008) Grad: 101150.1406  LR: 0.000000  
Epoch: [3][16200/17875] Elapsed 209m 54s (remain 21m 41s) Loss: 0.0038(0.0008) Grad: 261645.9844  LR: 0.000000  
Epoch: [3][16300/17875] Elapsed 211m 12s (remain 20m 23s) Loss: 0.0000(0.0008) Grad: 1846.0256  LR: 0.000000  
Epoch: [3][16400/17875] Elapsed 212m 29s (remain 19m 5s) Loss: 0.0000(0.0008) Grad: 2281.7229  LR: 0.000000  
Epoch: [3][16500/17875] Elapsed 213m 47s (remain 17m 48s) Loss: 0.0000(0.0008) Grad: 75.5523  LR: 0.000000  
Epoch: [3][16600/17875] Elapsed 215m 4s (remain 16m 30s) Loss: 0.0094(0.0008) Grad: 99118.5234  LR: 0.000000  
Epoch: [3][16700/17875] Elapsed 216m 22s (remain 15m 12s) Loss: 0.0000(0.0008) Grad: 991.4399  LR: 0.000000  
Epoch: [3][16800/17875] Elapsed 217m 41s (remain 13m 54s) Loss: 0.0000(0.0008) Grad: 1901.7327  LR: 0.000000  
Epoch: [3][16900/17875] Elapsed 219m 0s (remain 12m 37s) Loss: 0.0000(0.0008) Grad: 711.7406  LR: 0.000000  
Epoch: [3][17000/17875] Elapsed 220m 18s (remain 11m 19s) Loss: 0.0000(0.0008) Grad: 323.2853  LR: 0.000000  
Epoch: [3][17100/17875] Elapsed 221m 36s (remain 10m 1s) Loss: 0.0006(0.0008) Grad: 46519.2617  LR: 0.000000  
Epoch: [3][17200/17875] Elapsed 222m 54s (remain 8m 44s) Loss: 0.0000(0.0008) Grad: 114.4622  LR: 0.000000  
Epoch: [3][17300/17875] Elapsed 224m 13s (remain 7m 26s) Loss: 0.0000(0.0008) Grad: 269.1502  LR: 0.000000  
Epoch: [3][17400/17875] Elapsed 225m 30s (remain 6m 8s) Loss: 0.0001(0.0008) Grad: 7663.4551  LR: 0.000000  
Epoch: [3][17500/17875] Elapsed 226m 48s (remain 4m 50s) Loss: 0.0015(0.0008) Grad: 211025.1250  LR: 0.000000  
Epoch: [3][17600/17875] Elapsed 228m 6s (remain 3m 33s) Loss: 0.0062(0.0008) Grad: 78944.1016  LR: 0.000000  
Epoch: [3][17700/17875] Elapsed 229m 24s (remain 2m 15s) Loss: 0.0000(0.0008) Grad: 99.1477  LR: 0.000000  
Epoch: [3][17800/17875] Elapsed 230m 43s (remain 0m 57s) Loss: 0.0000(0.0008) Grad: 122.2502  LR: 0.000000  
Epoch: [3][17874/17875] Elapsed 231m 42s (remain 0m 0s) Loss: 0.0000(0.0008) Grad: 613.0089  LR: 0.000000  
EVAL: [0/1192] Elapsed 0m 1s (remain 28m 25s) Loss: 0.0000(0.0000) 
EVAL: [100/1192] Elapsed 0m 32s (remain 5m 46s) Loss: 0.0002(0.0028) 
EVAL: [200/1192] Elapsed 1m 1s (remain 5m 5s) Loss: 0.0000(0.0031) 
EVAL: [300/1192] Elapsed 1m 32s (remain 4m 33s) Loss: 0.0005(0.0048) 
EVAL: [400/1192] Elapsed 2m 2s (remain 4m 1s) Loss: 0.0153(0.0050) 
EVAL: [500/1192] Elapsed 2m 32s (remain 3m 30s) Loss: 0.0148(0.0045) 
EVAL: [600/1192] Elapsed 3m 3s (remain 3m 0s) Loss: 0.0699(0.0045) 
EVAL: [700/1192] Elapsed 3m 34s (remain 2m 29s) Loss: 0.0036(0.0052) 
EVAL: [800/1192] Elapsed 4m 5s (remain 1m 59s) Loss: 0.0025(0.0051) 
EVAL: [900/1192] Elapsed 4m 35s (remain 1m 29s) Loss: 0.0001(0.0049) 
EVAL: [1000/1192] Elapsed 5m 5s (remain 0m 58s) Loss: 0.0000(0.0047) 
EVAL: [1100/1192] Elapsed 5m 36s (remain 0m 27s) Loss: 0.0031(0.0045) 
EVAL: [1191/1192] Elapsed 6m 4s (remain 0m 0s) Loss: 0.0051(0.0042) 
Epoch 3 - avg_train_loss: 0.0008  avg_val_loss: 0.0042  time: 14271s
Epoch 3 - Score: 0.8879
Epoch 3 - Save Best Score: 0.8879 Model
get pseudo plain from ../output/nbme-score-clinical-patient-notes/make_pseudo_dataset/pseudo_plain.pkl
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp060/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp067/pseudo_labels_1.npy
get pseudo labels from ../output/nbme-score-clinical-patient-notes/nbme-exp083/pseudo_labels_1.npy
100%
612602/612602 [00:01<00:00, 600404.36it/s]
100%
612602/612602 [00:44<00:00, 16055.40it/s]
(612602, 950)
(612602, 6) (612602, 950)
(10725, 7)
(64350, 11)
Epoch: [4][0/21450] Elapsed 0m 3s (remain 1123m 26s) Loss: 0.0000(0.0000) Grad: 633.2291  LR: 0.000000  
Epoch: [4][100/21450] Elapsed 1m 21s (remain 285m 56s) Loss: 0.0000(0.0004) Grad: 0.4898  LR: 0.000000  
Epoch: [4][200/21450] Elapsed 2m 37s (remain 278m 0s) Loss: 0.0000(0.0006) Grad: 102.6198  LR: 0.000000  
Epoch: [4][300/21450] Elapsed 3m 55s (remain 275m 46s) Loss: 0.0000(0.0006) Grad: 46.1649  LR: 0.000000  
Epoch: [4][400/21450] Elapsed 5m 12s (remain 273m 41s) Loss: 0.0000(0.0007) Grad: 867.5959  LR: 0.000000  
Epoch: [4][500/21450] Elapsed 6m 30s (remain 272m 3s) Loss: 0.0000(0.0008) Grad: 16.6813  LR: 0.000000  
Epoch: [4][600/21450] Elapsed 7m 47s (remain 270m 25s) Loss: 0.0000(0.0008) Grad: 30.6760  LR: 0.000000  
Epoch: [4][700/21450] Elapsed 9m 6s (remain 269m 37s) Loss: 0.0000(0.0008) Grad: 38.1466  LR: 0.000000  
Epoch: [4][800/21450] Elapsed 10m 24s (remain 268m 30s) Loss: 0.0028(0.0008) Grad: 13783.3281  LR: 0.000000  
Epoch: [4][900/21450] Elapsed 11m 43s (remain 267m 33s) Loss: 0.0000(0.0008) Grad: 8.6973  LR: 0.000000  
Epoch: [4][1000/21450] Elapsed 13m 1s (remain 266m 3s) Loss: 0.0003(0.0007) Grad: 383.3264  LR: 0.000000  
Epoch: [4][1100/21450] Elapsed 14m 18s (remain 264m 29s) Loss: 0.0001(0.0007) Grad: 591.2455  LR: 0.000000  
Epoch: [4][1200/21450] Elapsed 15m 36s (remain 263m 17s) Loss: 0.0004(0.0007) Grad: 1188.1915  LR: 0.000000  
Epoch: [4][1300/21450] Elapsed 16m 54s (remain 261m 56s) Loss: 0.0004(0.0007) Grad: 729.6371  LR: 0.000000  
Epoch: [4][1400/21450] Elapsed 18m 13s (remain 260m 48s) Loss: 0.0022(0.0007) Grad: 4462.2168  LR: 0.000000  
1
4
Error retrieving VM Details
Git: idle
Python 3 | Busy
