In [1]:
# =========================================================================================
# Libraries
# =========================================================================================
import os
import gc
import time
import math
import random
import warnings
warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
import torch
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from torch.utils.checkpoint import checkpoint
import tokenizers
import transformers
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_cosine_schedule_with_warmup, DataCollatorWithPadding
from sklearn.model_selection import StratifiedGroupKFold
#%env TOKENIZERS_PARALLELISM=true
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Option string read by PyTorch's CUDA caching allocator (split size given in MB).
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "max_split_size_mb:21"
# Explicitly opt in to parallelism inside the Hugging Face `tokenizers` library.
os.environ["TOKENIZERS_PARALLELISM"] = "true"
# =========================================================================================
# Configurations
# =========================================================================================
class CFG:
    # Hyperparameters and shared resources for the whole script. The class is
    # used as a namespace (never instantiated) and passed around as `cfg`.

    # Training/validation progress is printed every `print_freq` steps.
    print_freq = 1000
    # Worker processes used by both DataLoaders.
    num_workers = 24
    # Path of the pretrained checkpoint; used for the tokenizer, the config and the encoder.
    model = 'model/paraphrase-multilingual-mpnet-base-v2-epochs-30-tuned'
    # NOTE: the tokenizer is loaded from disk when this class body executes.
    tokenizer = AutoTokenizer.from_pretrained(model)
    # When True, the encoder enables gradient checkpointing.
    gradient_checkpointing = False
    # Cosine schedule parameters: number of cycles and warmup fraction of all steps.
    num_cycles = 0.5
    warmup_ratio = 0.1
    epochs = 6
    # Learning rates: `encoder_lr` for the pretrained encoder, `decoder_lr` for the head.
    encoder_lr = 1e-5
    decoder_lr = 1e-4
    # AdamW epsilon and betas.
    eps = 1e-6
    betas = (0.9, 0.999)
    batch_size = 32
    # Weight decay applied to encoder weights except biases and LayerNorm parameters.
    weight_decay = 0.01
    # Maximum gradient norm used for clipping in the training loop.
    max_grad_norm = 0.012
    #max_len = 512
    # Sequences are padded/truncated to this many tokens; get_max_length() may overwrite it.
    max_len = 128
    # Number of folds; not referenced by the code visible in this file.
    n_folds = 5
    # Seed consumed by seed_everything().
    seed = 42
    
# =========================================================================================
# Seed everything for deterministic results
# =========================================================================================
def seed_everything(cfg):
    """Seed every random number generator used by the script from ``cfg.seed``.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy, and PyTorch (CPU and CUDA), then asks cuDNN for deterministic
    kernels.

    Args:
        cfg: any object exposing an integer ``seed`` attribute.
    """
    seed = cfg.seed
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Each of these takes the seed as its single positional argument.
    for seeder in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
        seeder(seed)
    torch.backends.cudnn.deterministic = True
    
# =========================================================================================
# F2 score metric
# =========================================================================================
def f2_score(y_true, y_pred):
    """Return the mean per-row F2 score, rounded to 4 decimals.

    Each element of ``y_true`` / ``y_pred`` is a whitespace-separated string of
    ids; the two inputs are compared row by row as sets.

    Args:
        y_true: iterable (e.g. pandas Series) of ground-truth id strings.
        y_pred: iterable of predicted id strings, aligned with ``y_true``.

    Returns:
        Mean of ``tp / (tp + 0.2 * fp + 0.8 * fn)`` over all rows. A row whose
        truth and prediction are both empty contributes 0.0 instead of NaN.
    """
    true_sets = [set(ids.split()) for ids in y_true]
    pred_sets = [set(ids.split()) for ids in y_pred]
    pairs = list(zip(true_sets, pred_sets))
    tp = np.array([len(t & p) for t, p in pairs], dtype = float)
    fp = np.array([len(p - t) for t, p in pairs], dtype = float)
    fn = np.array([len(t - p) for t, p in pairs], dtype = float)
    # F-beta with beta = 2: (1 + 4) * tp / (5 * tp + 4 * fn + fp).
    denominator = tp + 0.2 * fp + 0.8 * fn
    # Guard the 0 / 0 case (both sets empty) so one row cannot turn the mean into NaN.
    f2 = np.divide(tp, denominator, out = np.zeros_like(denominator), where = denominator > 0)
    return round(f2.mean(), 4)

# =========================================================================================
# Data Loading
# =========================================================================================
def read_data(cfg):
    """Load the candidate pairs and the ground-truth correlations from disk.

    Missing titles are replaced by the literal "no title", and a ``text``
    column is built as ``title1 + '[SEP]' + title2``.

    Args:
        cfg: configuration object (currently unused; kept for a uniform signature).

    Returns:
        Tuple ``(train, correlations)`` of pandas DataFrames.
    """
    train = pd.read_parquet('data/candidates_50_train_79927.parquet')
    # Assign the result back instead of calling fillna(inplace=True) on a column
    # selection: the chained in-place form is deprecated and does not modify the
    # frame once pandas Copy-on-Write is active.
    for column in ('title1', 'title2'):
        train[column] = train[column].fillna("no title")

    correlations = pd.read_csv('data/kfold_correlations.csv')

    # Create feature column
    train['text'] = train['title1'] + '[SEP]' + train['title2']
    print(' ')
    print('-' * 50)
    print(f"train.shape: {train.shape}")
    print(f"correlations.shape: {correlations.shape}")
    return train, correlations


# =========================================================================================
# Get max length
# =========================================================================================
def get_max_length(train, cfg):
    """Set ``cfg.max_len`` to the longest tokenized ``text`` plus two special tokens.

    Args:
        train: DataFrame with a ``text`` column (missing values count as "").
        cfg: configuration object providing ``tokenizer``; its ``max_len``
            attribute is overwritten.
    """
    texts = train['text'].fillna("").values
    token_counts = [
        len(cfg.tokenizer(text, add_special_tokens = False)['input_ids'])
        for text in tqdm(texts, total = len(train))
    ]
    cfg.max_len = max(token_counts) + 2 # cls & sep
    print(f"max_len: {cfg.max_len}")

# =========================================================================================
# Prepare input, tokenize
# =========================================================================================
def prepare_input(text, cfg):
    """Tokenize ``text`` into fixed-length ``torch.long`` tensors.

    Args:
        text: the string to encode.
        cfg: configuration object providing ``tokenizer`` and ``max_len``.

    Returns:
        The tokenizer's output mapping, with every value converted to a
        ``torch.long`` tensor. Sequences are truncated and padded to
        ``cfg.max_len`` tokens.
    """
    # `padding='max_length'` replaces the deprecated `pad_to_max_length=True`,
    # and the tokenizer is called directly, as get_max_length() already does.
    inputs = cfg.tokenizer(
        text,
        return_tensors = None,
        add_special_tokens = True,
        max_length = cfg.max_len,
        padding = 'max_length',
        truncation = True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype = torch.long)
    return inputs

# =========================================================================================
# Custom dataset
# =========================================================================================
class custom_dataset(Dataset):
    """Dataset yielding ``(tokenized text, float label)`` pairs.

    Args:
        df: DataFrame with ``text`` and ``target`` columns.
        cfg: configuration object forwarded to ``prepare_input``.
    """

    def __init__(self, df, cfg):
        self.cfg = cfg
        # Keep raw NumPy arrays so that indexing in __getitem__ stays cheap.
        self.texts = df['text'].values
        self.labels = df['target'].values

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.texts)

    def __getitem__(self, item):
        """Return the encoded inputs and the label tensor for row ``item``."""
        encoded = prepare_input(self.texts[item], self.cfg)
        target = torch.tensor(self.labels[item], dtype = torch.float)
        return encoded, target
    
# =========================================================================================
# Collate function for training
# =========================================================================================
def collate(inputs):
    """Trim a padded batch to the longest real sequence it contains.

    Args:
        inputs: mapping of 2-D tensors that includes ``attention_mask``.

    Returns:
        The same mapping (modified in place), with each tensor cut along its
        second dimension to the largest attention-mask row sum in the batch.
    """
    longest = int(inputs["attention_mask"].sum(dim=1).max())
    for key, tensor in inputs.items():
        inputs[key] = tensor[:, :longest]
    return inputs

# =========================================================================================
# Mean pooling class
# =========================================================================================
class MeanPooling(nn.Module):
    """Average token embeddings over the positions selected by the attention mask."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the masked mean of ``last_hidden_state`` along dimension 1.

        The mask is broadcast to the shape of ``last_hidden_state``; the
        divisor is clamped to at least 1e-9 so a fully masked row yields zeros
        rather than a division by zero.
        """
        mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * mask).sum(dim=1)
        token_count = mask.sum(dim=1).clamp(min=1e-9)
        return masked_total / token_count
    
# =========================================================================================
# Model
# =========================================================================================
class custom_model(nn.Module):
    """Pretrained encoder + mean pooling + a single-logit linear head.

    Args:
        cfg: configuration object providing ``model`` (checkpoint path) and
            ``gradient_checkpointing``.
    """

    def __init__(self, cfg):
        super().__init__()
        self.cfg = cfg
        self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states = True)
        # Zero every dropout field on the config before the encoder is built.
        for dropout_field in (
            'hidden_dropout',
            'hidden_dropout_prob',
            'attention_dropout',
            'attention_probs_dropout_prob',
        ):
            setattr(self.config, dropout_field, 0.0)
        self.model = AutoModel.from_pretrained(cfg.model, config = self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 1)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise ``module`` using the encoder config's ``initializer_range``.

        Linear and Embedding weights are drawn from N(0, initializer_range);
        Linear biases and the Embedding padding row are zeroed. LayerNorm is
        reset to bias 0 / weight 1. Other module types are left untouched.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
            return
        if isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
            return
        if isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def feature(self, inputs):
        """Encode ``inputs`` and mean-pool the last hidden state over the attention mask."""
        hidden = self.model(**inputs).last_hidden_state
        return self.pool(hidden, inputs['attention_mask'])

    def forward(self, inputs):
        """Return the head's raw output (logits) for the pooled features."""
        return self.fc(self.feature(inputs))
    
# =========================================================================================
# Helper functions
# =========================================================================================
class AverageMeter:
    """Track the latest value and the running weighted average of a metric."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Clear the latest value, the average, the weighted sum and the count."""
        self.val = self.avg = self.sum = self.count = 0

    def update(self, val, n=1):
        """Record ``val`` observed ``n`` times and refresh the running average."""
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count

def asMinutes(s):
    """Format a duration of ``s`` seconds as ``'<minutes>m <seconds>s'``."""
    whole_minutes = math.floor(s / 60)
    leftover_seconds = s - whole_minutes * 60
    return '%dm %ds' % (whole_minutes, leftover_seconds)

def timeSince(since, percent):
    """Describe elapsed and estimated remaining time.

    Args:
        since: start timestamp as returned by ``time.time()``.
        percent: fraction of the work completed so far (must be non-zero).

    Returns:
        A string of the form ``'<elapsed> (remain <remaining>)'``.
    """
    elapsed = time.time() - since
    # Linear extrapolation: total time = elapsed / fraction done.
    projected_total = elapsed / (percent)
    remaining = projected_total - elapsed
    return '%s (remain %s)' % (asMinutes(elapsed), asMinutes(remaining))

# =========================================================================================
# Train function loop
# =========================================================================================
def train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg):
    """Run one training epoch with mixed precision and gradient clipping.

    Args:
        train_loader: DataLoader yielding ``(inputs, target)`` batches.
        model: the network to train; called as ``model(inputs)``.
        criterion: loss applied to the flattened predictions and the targets.
        optimizer: optimizer stepped once per batch.
        epoch: zero-based epoch index (used for logging only).
        scheduler: learning-rate scheduler stepped once per batch.
        device: device the batch tensors are moved to.
        cfg: configuration object providing ``max_grad_norm`` and ``print_freq``.

    Returns:
        The sample-weighted average training loss over the epoch.
    """
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled = True)
    losses = AverageMeter()
    start = time.time()
    num_steps = len(train_loader)
    for step, (inputs, target) in enumerate(train_loader):
        # Drop padding columns that no sample in this batch uses.
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        target = target.to(device)
        batch_size = target.size(0)
        with torch.cuda.amp.autocast(enabled = True):
            y_preds = model(inputs)
            loss = criterion(y_preds.view(-1), target)
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        # Unscale before clipping so the norm is measured on the true gradients.
        scaler.unscale_(optimizer)
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), cfg.max_grad_norm)
        scaler.step(optimizer)
        scaler.update()
        optimizer.zero_grad()
        scheduler.step()
        if step % cfg.print_freq == 0 or step == (num_steps - 1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch + 1, 
                          step, 
                          num_steps, 
                          remain = timeSince(start, float(step + 1) / num_steps),
                          loss = losses,
                          grad_norm = grad_norm,
                          # get_last_lr() is the supported accessor outside scheduler.step().
                          lr = scheduler.get_last_lr()[0]))
    return losses.avg

# =========================================================================================
# Valid function loop
# =========================================================================================
def valid_fn(valid_loader, model, criterion, device, cfg):
    """Evaluate ``model`` on ``valid_loader`` without updating any weights.

    Args:
        valid_loader: DataLoader yielding ``(inputs, target)`` batches.
        model: the network to evaluate; called as ``model(inputs)``.
        criterion: loss applied to the flattened predictions and the targets.
        device: device the batch tensors are moved to.
        cfg: configuration object providing ``print_freq``.

    Returns:
        Tuple ``(average loss, predictions)`` where ``predictions`` is a 1-D
        NumPy array of sigmoid outputs in loader order.
    """
    model.eval()
    meter = AverageMeter()
    batch_probs = []
    started = time.time()
    total_steps = len(valid_loader)
    for step, (inputs, target) in enumerate(valid_loader):
        inputs = collate(inputs)
        for key, tensor in inputs.items():
            inputs[key] = tensor.to(device)
        target = target.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1), target)
        meter.update(loss.item(), target.size(0))
        # reshape(-1) keeps the result 1-D even when squeeze() collapses a batch of one.
        batch_probs.append(y_preds.sigmoid().squeeze().to('cpu').numpy().reshape(-1))
        is_last_step = step == (total_steps - 1)
        if step % cfg.print_freq == 0 or is_last_step:
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, 
                          total_steps,
                          loss = meter,
                          remain = timeSince(started, float(step + 1) / total_steps)))
    predictions = np.concatenate(batch_probs, axis = 0)
    return meter.avg, predictions

# =========================================================================================
# Get best threshold
# =========================================================================================
def get_best_threshold(x_val, val_predictions, correlations):
    """Grid-search the probability threshold that maximises the F2 score.

    Args:
        x_val: DataFrame with ``topics_ids`` and ``content_ids`` columns, one
            row per candidate pair. It is not modified.
        val_predictions: 1-D array of probabilities aligned positionally with
            the rows of ``x_val``.
        correlations: DataFrame with ``topic_id`` and ground-truth
            ``content_ids`` (space-separated) columns.

    Returns:
        Tuple ``(best_score, best_threshold)``. ``best_threshold`` is ``None``
        when no threshold yields a score above 0.
    """
    best_score = 0
    best_threshold = None
    probabilities = np.asarray(val_predictions)
    # Work on a two-column view and a boolean mask instead of writing a
    # 'predictions' column into the caller's frame on every iteration.
    candidates = x_val[['topics_ids', 'content_ids']]
    # Loop-invariant: every topic present in the validation fold.
    all_topics = pd.Series(x_val['topics_ids'].unique())
    for thres in np.arange(0.001, 0.99, 0.001):
        kept = candidates[probabilities > thres]
        x_val1 = kept.groupby(['topics_ids'])['content_ids'].unique().reset_index()
        x_val1['content_ids'] = x_val1['content_ids'].apply(lambda x: ' '.join(x))
        x_val1.columns = ['topic_id', 'predictions']
        # Topics with no candidate above the threshold get an empty prediction.
        x_val0 = all_topics[~all_topics.isin(x_val1['topic_id'])]
        x_val0 = pd.DataFrame({'topic_id': x_val0.values, 'predictions': ""})
        x_val_r = pd.concat([x_val1, x_val0], axis = 0, ignore_index = True)
        x_val_r = x_val_r.merge(correlations, how = 'left', on = 'topic_id')
        score = f2_score(x_val_r['content_ids'], x_val_r['predictions'])
        if score > best_score:
            best_score = score
            best_threshold = thres
    return best_score, best_threshold
    
# =========================================================================================
# Train & Evaluate
# =========================================================================================
def train_and_evaluate_one_fold(train, correlations, fold, cfg):
    """Train on every fold except ``fold`` and validate on ``fold``.

    After each epoch the validation F2 score is computed at its best
    threshold; whenever it improves, the model weights and the validation
    predictions are saved to a ``.pth`` file in the working directory.

    Args:
        train: DataFrame with ``text``, ``target``, ``fold``, ``topics_ids``
            and ``content_ids`` columns.
        correlations: ground-truth DataFrame passed to ``get_best_threshold``.
        fold: fold id used for validation.
        cfg: configuration object with the training hyperparameters.
    """
    print(' ')
    print(f"========== fold: {fold} training ==========")
    # Split train & validation
    x_train = train[train['fold'] != fold]
    x_val = train[train['fold'] == fold]
    train_dataset = custom_dataset(x_train, cfg)
    valid_dataset = custom_dataset(x_val, cfg)
    train_loader = DataLoader(
        train_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = True, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = True
    )
    valid_loader = DataLoader(
        valid_dataset, 
        batch_size = cfg.batch_size, 
        shuffle = False, 
        num_workers = cfg.num_workers, 
        pin_memory = True, 
        drop_last = False
    )
    # Get model
    model = custom_model(cfg)
    model.to(device)
    # Optimizer
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay = 0.0):
        """Build parameter groups: decayed encoder, undecayed encoder, head."""
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
            'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
            'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters
    optimizer_parameters = get_optimizer_params(
        model, 
        encoder_lr = cfg.encoder_lr, 
        decoder_lr = cfg.decoder_lr,
        weight_decay = cfg.weight_decay
    )
    optimizer = AdamW(
        optimizer_parameters, 
        lr = cfg.encoder_lr, 
        eps = cfg.eps, 
        betas = cfg.betas
    )
    # The loader drops the last partial batch, so count the steps it really
    # yields; the scheduler is stepped exactly once per batch in train_fn.
    num_train_steps = len(train_loader) * cfg.epochs
    num_warmup_steps = int(num_train_steps * cfg.warmup_ratio)
    # Scheduler
    scheduler = get_cosine_schedule_with_warmup(
        optimizer, 
        num_warmup_steps = num_warmup_steps, 
        num_training_steps = num_train_steps, 
        num_cycles = cfg.num_cycles
        )
    # Training & Validation loop
    criterion = nn.BCEWithLogitsLoss(reduction = "mean")
    best_score = 0
    # Stays None until an epoch beats the initial best_score of 0.
    val_predictions = None
    for epoch in range(cfg.epochs):
        start_time = time.time()
        # Train
        avg_loss = train_fn(train_loader, model, criterion, optimizer, epoch, scheduler, device, cfg)
        # Validation
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device, cfg)
        # Compute f2_score
        score, threshold = get_best_threshold(x_val, predictions, correlations)
        elapsed = time.time() - start_time
        print(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        # get_best_threshold returns None as threshold when every score is 0.
        threshold_text = 'n/a' if threshold is None else f'{threshold:.5f}'
        print(f'Epoch {epoch+1} - Score: {score:.4f} - Threshold: {threshold_text}')
        if score > best_score:
            best_score = score
            print(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save(
                {'model': model.state_dict(), 'predictions': predictions}, 
                f"{cfg.model.replace('/', '-')}_fold{fold}_{cfg.seed}_30_{epoch}.pth"
                )
            val_predictions = predictions
    torch.cuda.empty_cache()
    gc.collect()
    if val_predictions is None:
        # No epoch improved on a score of 0, so there are no saved predictions to re-score.
        print('No epoch achieved a score above 0; no checkpoint was saved.')
        return
    # Get best threshold
    best_score, best_threshold = get_best_threshold(x_val, val_predictions, correlations)
    print(f'Our CV score is {best_score} using a threshold of {best_threshold}')
    

In [2]:
# Seed everything (Python, NumPy and PyTorch RNGs) from CFG.seed
seed_everything(CFG)
# Read data: candidate pairs (`train`) and ground-truth `correlations`
train, correlations = read_data(CFG)

 
--------------------------------------------------
train.shape: (3075850, 7)
correlations.shape: (61517, 3)


In [3]:
train

Unnamed: 0,topics_ids,content_ids,title1,title2,target,fold,text
0,t_00004da3a1b2,c_0feaaa5dc39d,Откриването на резисторите. Language_bg. Descr...,Успоредно свързани резистори. Language_bg. Des...,0.0,1,Откриването на резисторите. Language_bg. Descr...
1,t_09ad67f245fc,c_0feaaa5dc39d,Електрични заряди и електрично поле. Language_...,Успоредно свързани резистори. Language_bg. Des...,0.0,1,Електрични заряди и електрично поле. Language_...
2,t_261fb7043ad1,c_0feaaa5dc39d,Електричен ток и електрично напрежение. Langua...,Успоредно свързани резистори. Language_bg. Des...,0.0,3,Електричен ток и електрично напрежение. Langua...
3,t_2b1b6dfd096b,c_0feaaa5dc39d,Електрично поле. Language_bg. Description: no ...,Успоредно свързани резистори. Language_bg. Des...,0.0,0,Електрично поле. Language_bg. Description: no ...
4,t_3a1f5ae9f991,c_0feaaa5dc39d,Вериги с кондензатори. Language_bg. Descriptio...,Успоредно свързани резистори. Language_bg. Des...,0.0,0,Вериги с кондензатори. Language_bg. Descriptio...
...,...,...,...,...,...,...,...
3075845,t_fea53cc2a5bb,c_c4c2b22ec356,Suma y Resta de Fracciones. Language_es. Descr...,Calcula Expresiones con Números Mixtos. Langua...,1.0,2,Suma y Resta de Fracciones. Language_es. Descr...
3075846,t_ff0a0977e1fc,c_b49da9b18f9e,يتعرف التمثيل البياني للدوال المثلثية (جـا س ،...,يتعرف التمثيل البياني للدوال المثلثية (جـا س ،...,1.0,3,يتعرف التمثيل البياني للدوال المثلثية (جـا س ،...
3075847,t_fff9e5407d13,c_20de77522603,NA_U06 - El periódico. Language_es. Descriptio...,Resumen: El periódico. Language_es. Descriptio...,1.0,4,NA_U06 - El periódico. Language_es. Descriptio...
3075848,t_fff9e5407d13,c_d64037a72376,NA_U06 - El periódico. Language_es. Descriptio...,Introducción: El periódico. Language_es. Descr...,1.0,4,NA_U06 - El periódico. Language_es. Descriptio...


In [4]:
correlations

Unnamed: 0,topic_id,content_ids,fold
0,t_00004da3a1b2,c_1108dd0c7a5d c_376c5a8eb028 c_5bc0e1e2cba0 c...,1
1,t_00068291e9a4,c_639ea2ef9c95 c_89ce9367be10 c_ac1672cdcd2c c...,1
2,t_00069b63a70a,c_11a1dc0bfb99,4
3,t_0006d41a73a8,c_0c6473c3480d c_1c57a1316568 c_5e375cf14c47 c...,2
4,t_0008768bdee6,c_34e1424229b4 c_7d1a964d66d5 c_aab93ee667f4,0
...,...,...,...
61512,t_fff830472691,c_61fb63326e5d c_8f224e321c87,1
61513,t_fff9e5407d13,c_026db653a269 c_0fb048a6412c c_20de77522603 c...,4
61514,t_fffbe1d5d43c,c_46f852a49c08 c_6659207b25d5,2
61515,t_fffe14f1be1e,c_cece166bad6a,2


In [5]:
# Get max length
#get_max_length(train, CFG)
#2321

In [6]:
CFG.max_len

128

In [7]:
# Train and evaluate one fold (fold 0 is held out for validation)
# Stage 1 Max Recall: 78403
train_and_evaluate_one_fold(train, correlations, 0, CFG)

 
Epoch: [1][0/76895] Elapsed 0m 1s (remain 2482m 15s) Loss: 0.7196(0.7196) Grad: 1.2228  LR: 0.00000000  
Epoch: [1][1000/76895] Elapsed 2m 32s (remain 193m 5s) Loss: 0.5699(0.6673) Grad: 1.1713  LR: 0.00000022  
Epoch: [1][2000/76895] Elapsed 5m 10s (remain 193m 48s) Loss: 0.2935(0.5236) Grad: 0.5687  LR: 0.00000043  
Epoch: [1][3000/76895] Elapsed 7m 50s (remain 193m 2s) Loss: 0.2428(0.4421) Grad: 0.3255  LR: 0.00000065  
Epoch: [1][4000/76895] Elapsed 10m 31s (remain 191m 52s) Loss: 0.3126(0.4017) Grad: 0.6944  LR: 0.00000087  
Epoch: [1][5000/76895] Elapsed 13m 12s (remain 189m 59s) Loss: 0.2343(0.3758) Grad: 0.5461  LR: 0.00000108  
Epoch: [1][6000/76895] Elapsed 15m 28s (remain 182m 43s) Loss: 0.0680(0.3580) Grad: 0.9157  LR: 0.00000130  
Epoch: [1][7000/76895] Elapsed 18m 0s (remain 179m 43s) Loss: 0.2476(0.3454) Grad: 0.7912  LR: 0.00000152  
Epoch: [1][8000/76895] Elapsed 20m 38s (remain 177m 45s) Loss: 0.2605(0.3361) Grad: 1.4408  LR: 0.00000173  
Epoch: [1][9000/76895] Elap

Epoch: [1][75000/76895] Elapsed 196m 24s (remain 4m 57s) Loss: 0.3326(0.2231) Grad: 3.1371  LR: 0.00000988  
Epoch: [1][76000/76895] Elapsed 199m 4s (remain 2m 20s) Loss: 0.0960(0.2225) Grad: 1.7521  LR: 0.00000987  
Epoch: [1][76894/76895] Elapsed 201m 26s (remain 0m 0s) Loss: 0.1927(0.2220) Grad: 4.0354  LR: 0.00000987  
EVAL: [0/19225] Elapsed 0m 1s (remain 415m 37s) Loss: 0.1352(0.1352) 
EVAL: [1000/19225] Elapsed 0m 49s (remain 15m 7s) Loss: 0.2280(0.1109) 
EVAL: [2000/19225] Elapsed 1m 33s (remain 13m 23s) Loss: 0.0363(0.1082) 
EVAL: [3000/19225] Elapsed 2m 20s (remain 12m 38s) Loss: 0.0642(0.1146) 
EVAL: [4000/19225] Elapsed 3m 11s (remain 12m 7s) Loss: 0.3070(0.1171) 
EVAL: [5000/19225] Elapsed 3m 59s (remain 11m 21s) Loss: 0.1035(0.1211) 
EVAL: [6000/19225] Elapsed 4m 53s (remain 10m 45s) Loss: 0.0338(0.1252) 
EVAL: [7000/19225] Elapsed 5m 39s (remain 9m 52s) Loss: 0.2100(0.1255) 
EVAL: [8000/19225] Elapsed 6m 26s (remain 9m 2s) Loss: 0.3318(0.1282) 
EVAL: [9000/19225] Elapsed

Epoch: [2][57000/76895] Elapsed 149m 56s (remain 52m 20s) Loss: 0.2104(0.1598) Grad: 2.1619  LR: 0.00000894  
Epoch: [2][58000/76895] Elapsed 152m 36s (remain 49m 42s) Loss: 0.0224(0.1596) Grad: 0.7820  LR: 0.00000891  
Epoch: [2][59000/76895] Elapsed 155m 14s (remain 47m 4s) Loss: 0.1711(0.1595) Grad: 2.0286  LR: 0.00000889  
Epoch: [2][60000/76895] Elapsed 157m 54s (remain 44m 27s) Loss: 0.2138(0.1594) Grad: 3.8880  LR: 0.00000887  
Epoch: [2][61000/76895] Elapsed 160m 33s (remain 41m 49s) Loss: 0.2329(0.1592) Grad: 2.4660  LR: 0.00000884  
Epoch: [2][62000/76895] Elapsed 163m 14s (remain 39m 12s) Loss: 0.0474(0.1591) Grad: 1.9785  LR: 0.00000882  
Epoch: [2][63000/76895] Elapsed 165m 52s (remain 36m 34s) Loss: 0.0532(0.1590) Grad: 3.5187  LR: 0.00000879  
Epoch: [2][64000/76895] Elapsed 168m 20s (remain 33m 54s) Loss: 0.0823(0.1589) Grad: 2.1266  LR: 0.00000877  
Epoch: [2][65000/76895] Elapsed 171m 0s (remain 31m 17s) Loss: 0.2636(0.1589) Grad: 3.8442  LR: 0.00000874  
Epoch: [2][6

Epoch: [3][39000/76895] Elapsed 102m 39s (remain 99m 44s) Loss: 0.2730(0.1289) Grad: 2.2165  LR: 0.00000723  
Epoch: [3][40000/76895] Elapsed 105m 18s (remain 97m 8s) Loss: 0.0781(0.1288) Grad: 3.5879  LR: 0.00000719  
Epoch: [3][41000/76895] Elapsed 108m 2s (remain 94m 35s) Loss: 0.1305(0.1288) Grad: 7.5995  LR: 0.00000716  
Epoch: [3][42000/76895] Elapsed 110m 44s (remain 92m 0s) Loss: 0.3052(0.1288) Grad: 3.1671  LR: 0.00000712  
Epoch: [3][43000/76895] Elapsed 113m 34s (remain 89m 31s) Loss: 0.0786(0.1288) Grad: 3.6545  LR: 0.00000709  
Epoch: [3][44000/76895] Elapsed 116m 18s (remain 86m 57s) Loss: 0.2333(0.1289) Grad: 3.0487  LR: 0.00000705  
Epoch: [3][45000/76895] Elapsed 119m 11s (remain 84m 28s) Loss: 0.3547(0.1288) Grad: 9.0711  LR: 0.00000702  
Epoch: [3][46000/76895] Elapsed 122m 5s (remain 81m 59s) Loss: 0.0922(0.1289) Grad: 3.7474  LR: 0.00000699  
Epoch: [3][47000/76895] Elapsed 125m 6s (remain 79m 34s) Loss: 0.0046(0.1288) Grad: 0.1806  LR: 0.00000695  
Epoch: [3][4800

Epoch: [4][21000/76895] Elapsed 56m 57s (remain 151m 35s) Loss: 0.1272(0.1139) Grad: 27.6585  LR: 0.00000508  
Epoch: [4][22000/76895] Elapsed 59m 46s (remain 149m 9s) Loss: 0.1729(0.1142) Grad: 11.5602  LR: 0.00000504  
Epoch: [4][23000/76895] Elapsed 62m 32s (remain 146m 33s) Loss: 0.1408(0.1143) Grad: 6.3499  LR: 0.00000500  
Epoch: [4][24000/76895] Elapsed 65m 9s (remain 143m 36s) Loss: 0.1588(0.1146) Grad: 3.3582  LR: 0.00000496  
Epoch: [4][25000/76895] Elapsed 67m 45s (remain 140m 39s) Loss: 0.5351(0.1149) Grad: 13.4911  LR: 0.00000493  
Epoch: [4][26000/76895] Elapsed 70m 25s (remain 137m 51s) Loss: 0.0018(0.1150) Grad: 0.2886  LR: 0.00000489  
Epoch: [4][27000/76895] Elapsed 72m 59s (remain 134m 52s) Loss: 0.0650(0.1150) Grad: 0.7245  LR: 0.00000485  
Epoch: [4][28000/76895] Elapsed 75m 39s (remain 132m 6s) Loss: 0.1791(0.1152) Grad: 13.8533  LR: 0.00000481  
Epoch: [4][29000/76895] Elapsed 78m 15s (remain 129m 15s) Loss: 0.1225(0.1152) Grad: 28.7551  LR: 0.00000478  
Epoch: [

KeyboardInterrupt: 