# About this notebook
- Deberta-v3-base starter code
- pip wheels is [here](https://www.kaggle.com/code/yasufuminakama/fb3-pip-wheels)
- Inference notebook is [here](https://www.kaggle.com/yasufuminakama/fb3-deberta-v3-base-baseline-inference)

If this notebook is helpful, feel free to upvote :)

# Directory settings

In [1]:
# ====================================================
# Directory settings
# ====================================================
import os

OUTPUT_DIR = './'
if not os.path.exists(OUTPUT_DIR):
    os.makedirs(OUTPUT_DIR)

# CFG

In [2]:
# ====================================================
# CFG
# ====================================================
class CFG:
    wandb=True
    competition='FB3'
    _wandb_kernel='nakama'
    debug=False
    apex=True
    print_freq=20
    num_workers=4
    model="gpt2"
    gradient_checkpointing=True
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=8
    max_len=512
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    target_cols=['cohesion', 'syntax', 'vocabulary', 'phraseology', 'grammar', 'conventions']
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    
if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0]

In [3]:
# ====================================================
# wandb
# ====================================================
if CFG.wandb:
    
    import wandb

    try:
        from kaggle_secrets import UserSecretsClient
        user_secrets = UserSecretsClient()
        secret_value_0 = user_secrets.get_secret("wandb_api")
        wandb.login(key=secret_value_0)
        anony = None
    except:
        anony = "must"
        print('If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. \nGet your W&B access token from here: https://wandb.ai/authorize')


    def class2dict(f):
        return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))

    run = wandb.init(project='FB3-Public', 
                     name=CFG.model,
                     config=class2dict(CFG),
                     group=CFG.model,
                     job_type="train",
                     anonymous=anony)

If you want to use your W&B account, go to Add-ons -> Secrets and provide your W&B access token. Use the Label name as wandb_api. 
Get your W&B access token from here: https://wandb.ai/authorize


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


# Library

In [4]:
# ====================================================
# Library
# ====================================================
import os
import gc
import re
import ast
import sys
import copy
import json
import time
import math
import string
import pickle
import random
import joblib
import itertools
import warnings
warnings.filterwarnings("ignore")

import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold

os.system('pip install iterative-stratification==0.1.7')
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

import torch
import torch.nn as nn
from torch.nn import Parameter
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

os.system('pip uninstall -y transformers')
os.system('pip uninstall -y tokenizers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels transformers')
os.system('python -m pip install --no-index --find-links=../input/fb3-pip-wheels tokenizers')
import tokenizers
import transformers
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel

from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

Collecting iterative-stratification==0.1.7
  Downloading iterative_stratification-0.1.7-py3-none-any.whl (8.5 kB)
Installing collected packages: iterative-stratification
Successfully installed iterative-stratification-0.1.7




Found existing installation: transformers 4.20.1
Uninstalling transformers-4.20.1:
  Successfully uninstalled transformers-4.20.1




Found existing installation: tokenizers 0.12.1
Uninstalling tokenizers-0.12.1:
  Successfully uninstalled tokenizers-0.12.1




Looking in links: ../input/fb3-pip-wheels
Processing /kaggle/input/fb3-pip-wheels/transformers-4.21.2-py3-none-any.whl
Processing /kaggle/input/fb3-pip-wheels/tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl
Installing collected packages: tokenizers, transformers


ERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
allennlp 2.10.1 requires transformers<4.21,>=4.1, but you have transformers 4.21.2 which is incompatible.


Successfully installed tokenizers-0.12.1 transformers-4.21.2
Looking in links: ../input/fb3-pip-wheels




tokenizers.__version__: 0.12.1
transformers.__version__: 4.21.2
env: TOKENIZERS_PARALLELISM=true


# Utils

In [5]:
# ====================================================
# Utils
# ====================================================
def MCRMSE(y_trues, y_preds):
    scores = []
    idxes = y_trues.shape[1]
    for i in range(idxes):
        y_true = y_trues[:,i]
        y_pred = y_preds[:,i]
        score = mean_squared_error(y_true, y_pred, squared=False) # RMSE
        scores.append(score)
    mcrmse_score = np.mean(scores)
    return mcrmse_score, scores


def get_score(y_trues, y_preds):
    mcrmse_score, scores = MCRMSE(y_trues, y_preds)
    return mcrmse_score, scores


def get_logger(filename=OUTPUT_DIR+'train'):
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()


def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

# Data Loading

In [6]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv('../input/feedback-prize-english-language-learning/train.csv')
test = pd.read_csv('../input/feedback-prize-english-language-learning/test.csv')
submission = pd.read_csv('../input/feedback-prize-english-language-learning/sample_submission.csv')

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())
print(f"submission.shape: {submission.shape}")
display(submission.head())

train.shape: (3911, 8)


Unnamed: 0,text_id,full_text,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0016926B079C,I think that students would benefit from learn...,3.5,3.5,3.0,3.0,4.0,3.0
1,0022683E9EA5,When a problem is a change you have to let it ...,2.5,2.5,3.0,2.0,2.0,2.5
2,00299B378633,"Dear, Principal\n\nIf u change the school poli...",3.0,3.5,3.0,3.0,3.0,2.5
3,003885A45F42,The best time in life is when you become yours...,4.5,4.5,4.5,4.5,4.0,5.0
4,0049B1DF5CCC,Small act of kindness can impact in other peop...,2.5,3.0,3.0,3.0,2.5,2.5


test.shape: (3, 2)


Unnamed: 0,text_id,full_text
0,0000C359D63E,when a person has no experience on a job their...
1,000BAD50D026,Do you think students would benefit from being...
2,00367BB2546B,"Thomas Jefferson once states that ""it is wonde..."


submission.shape: (3, 7)


Unnamed: 0,text_id,cohesion,syntax,vocabulary,phraseology,grammar,conventions
0,0000C359D63E,3.0,3.0,3.0,3.0,3.0,3.0
1,000BAD50D026,3.0,3.0,3.0,3.0,3.0,3.0
2,00367BB2546B,3.0,3.0,3.0,3.0,3.0,3.0


# CV split

In [7]:
# ====================================================
# CV split
# ====================================================
Fold = MultilabelStratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_cols])):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    978
1    977
2    978
3    978
dtype: int64

In [8]:
if CFG.debug:
    display(train.groupby('fold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('fold').size())

# tokenizer

In [9]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = GPT2Tokenizer.from_pretrained(CFG.model)
tokenizer.pad_token = tokenizer.eos_token
CFG.tokenizer = tokenizer

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [10]:
tokenizer.pad_token

'<|endoftext|>'

# Dataset

In [11]:
# ====================================================
# Define max_len
# ====================================================
lengths = []
tk0 = tqdm(train['full_text'].fillna("").values, total=len(train))
for text in tk0:
    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
    lengths.append(length)
CFG.max_len = max(lengths) + 3 # cls & sep & sep
LOGGER.info(f"max_len: {CFG.max_len}")

  0%|          | 0/3911 [00:00<?, ?it/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1056 > 1024). Running this sequence through the model will result in indexing errors
max_len: 5122


In [12]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(
        text, 
        return_tensors=None, 
        add_special_tokens=True, 
        max_length=768,#CFG.max_len,
        pad_to_max_length=True,
        truncation=True
    )
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['full_text'].values
        self.labels = df[cfg.target_cols].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.texts[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label
    

def collate(inputs):
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    for k, v in inputs.items():
        inputs[k] = inputs[k][:,:mask_len]
    return inputs

# Model

In [13]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = GPT2Config()#.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info('gpt2 config:')
            
            LOGGER.info(self.config.hidden_size)
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = GPT2Model.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = GPT2Model(self.config)
#         if self.cfg.gradient_checkpointing:
#             self.model.gradient_checkpointing_enable()
        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, 6)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
#         print(inputs['input_ids'].shape)
#         print(inputs['attention_mask'].shape)
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

# Loss

In [14]:
# ====================================================
# Loss
# ====================================================
class RMSELoss(nn.Module):
    def __init__(self, reduction='mean', eps=1e-9):
        super().__init__()
        self.mse = nn.MSELoss(reduction='none')
        self.reduction = reduction
        self.eps = eps

    def forward(self, y_pred, y_true):
        loss = torch.sqrt(self.mse(y_pred, y_true) + self.eps)
        if self.reduction == 'none':
            loss = loss
        elif self.reduction == 'sum':
            loss = loss.sum()
        elif self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Helpler functions

In [15]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    model.train()
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    for step, (inputs, labels) in enumerate(train_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))
        if CFG.wandb:
            wandb.log({f"[fold{fold}] loss": losses.val,
                       f"[fold{fold}] lr": scheduler.get_lr()[0]})
    return losses.avg


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        inputs = collate(inputs)
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
            loss = criterion(y_preds, labels)
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    return losses.avg, predictions

# train loop

In [16]:
# ====================================================
# train loop
# ====================================================

from transformers import AdamW, get_linear_schedule_with_warmup
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['fold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['fold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds[CFG.target_cols].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)

    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size * 2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
#     optimizer = AdamW(model.parameters(), lr=CFG.encoder_lr)

    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)
#     scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps = -1)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.SmoothL1Loss(reduction='mean') # RMSELoss(reduction="mean")
    
    best_score = np.inf

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score, scores = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}  Scores: {scores}')
        if CFG.wandb:
            wandb.log({f"[fold{fold}] epoch": epoch+1, 
                       f"[fold{fold}] avg_train_loss": avg_loss, 
                       f"[fold{fold}] avg_val_loss": avg_val_loss,
                       f"[fold{fold}] score": score})
        
        if best_score > score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds[[f"pred_{c}" for c in CFG.target_cols]] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [17]:
import os
if __name__ == '__main__':
    
    os.environ['CUDA_LAUNCH_BLOCKING'] = "1" 
    def get_result(oof_df):
        labels = oof_df[CFG.target_cols].values
        preds = oof_df[[f"pred_{c}" for c in CFG.target_cols]].values
        score, scores = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}  Scores: {scores}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_DIR+'oof_df.pkl')
        
    if CFG.wandb:
        wandb.finish()

gpt2 config:
768
GPT2Config {
  "activation_function": "gelu_new",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.21.2",
  "use_cache": true,
  "vocab_size": 50257
}



Downloading pytorch_model.bin:   0%|          | 0.00/523M [00:00<?, ?B/s]

Epoch: [1][0/366] Elapsed 0m 2s (remain 13m 51s) Loss: 3.1083(3.1083) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 19s (remain 5m 27s) Loss: 1.8949(2.5588) Grad: 51964.1953  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 37s (remain 5m 0s) Loss: 0.4694(1.9076) Grad: 20302.7559  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 55s (remain 4m 39s) Loss: 0.1829(1.3629) Grad: 5299.7510  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 13s (remain 4m 17s) Loss: 0.1476(1.0742) Grad: 8130.2217  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 30s (remain 3m 58s) Loss: 0.1409(0.8999) Grad: 11384.2422  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 48s (remain 3m 40s) Loss: 0.1482(0.7807) Grad: 3879.1602  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 6s (remain 3m 21s) Loss: 0.1873(0.6952) Grad: 8181.3599  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 23s (remain 3m 2s) Loss: 0.1538(0.6296) Grad: 6837.6978  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 41s (remain 2m 45s) Loss: 0

Epoch 1 - avg_train_loss: 0.3661  avg_val_loss: 0.1678  time: 354s
Epoch 1 - Score: 0.5773  Scores: [0.5337771895533633, 0.5388325881214225, 0.4687858883748661, 0.5433814644171789, 0.575590040295545, 0.8032213671425926]
Epoch 1 - Save Best Score: 0.5773 Model


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1377(0.1678) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1858(0.1678) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 6m 45s) Loss: 0.1668(0.1668) Grad: inf  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 18s (remain 5m 8s) Loss: 0.0838(0.1220) Grad: 40543.5469  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 35s (remain 4m 42s) Loss: 0.1892(0.1276) Grad: 56423.9258  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 53s (remain 4m 26s) Loss: 0.1149(0.1264) Grad: 84264.4531  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 10s (remain 4m 6s) Loss: 0.1993(0.1265) Grad: 138538.3125  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 27s (remain 3m 50s) Loss: 0.2058(0.1287) Grad: 64422.5117  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 44s (remain 3m 32s) Loss: 0.0901(0.1291) Grad: 26811.4082  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 1s (remain 3m 14s) Loss: 0.0805(0.1288) Grad: 59079.4648  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 

Epoch 2 - avg_train_loss: 0.1269  avg_val_loss: 0.1583  time: 353s
Epoch 2 - Score: 0.5645  Scores: [0.5268011066277839, 0.5710675059316824, 0.46725565226122834, 0.5743004536946676, 0.5537098054374111, 0.6941568091953478]
Epoch 2 - Save Best Score: 0.5645 Model


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1393(0.1582) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2057(0.1583) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 31s) Loss: 0.1310(0.1310) Grad: inf  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 18s (remain 5m 4s) Loss: 0.0909(0.1267) Grad: 28447.0312  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 35s (remain 4m 42s) Loss: 0.0789(0.1225) Grad: 25144.2168  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 52s (remain 4m 23s) Loss: 0.1804(0.1252) Grad: 22982.4609  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 10s (remain 4m 6s) Loss: 0.0903(0.1248) Grad: 14293.7080  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 26s (remain 3m 48s) Loss: 0.0729(0.1272) Grad: 15146.7109  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 44s (remain 3m 32s) Loss: 0.0771(0.1267) Grad: 18917.9258  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 1s (remain 3m 14s) Loss: 0.0845(0.1260) Grad: 15221.8262  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 2

Epoch 3 - avg_train_loss: 0.1227  avg_val_loss: 0.1705  time: 350s
Epoch 3 - Score: 0.5862  Scores: [0.5402390320754478, 0.6025179382452129, 0.4930225395283537, 0.5690073514580838, 0.5683011290321411, 0.7443419964225229]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1559(0.1704) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2540(0.1705) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 8m 42s) Loss: 0.0908(0.0908) Grad: inf  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 19s (remain 5m 17s) Loss: 0.1005(0.1229) Grad: 43815.3008  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 37s (remain 4m 54s) Loss: 0.1433(0.1246) Grad: 31314.0547  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 53s (remain 4m 29s) Loss: 0.1366(0.1193) Grad: 44265.9375  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 11s (remain 4m 11s) Loss: 0.1311(0.1174) Grad: 61020.7188  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 29s (remain 3m 54s) Loss: 0.1808(0.1187) Grad: 81481.1641  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 46s (remain 3m 36s) Loss: 0.1783(0.1199) Grad: 88210.5234  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 4s (remain 3m 19s) Loss: 0.1259(0.1196) Grad: 76414.7031  LR: 0.00000116  
Epoch: [4][160/366] Elapsed

Epoch 4 - avg_train_loss: 0.1168  avg_val_loss: 0.1693  time: 354s
Epoch 4 - Score: 0.5842  Scores: [0.5351375721575766, 0.5977699426872755, 0.4882273054518955, 0.5830966507414316, 0.56267013051383, 0.7380103689570754]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1549(0.1692) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2483(0.1693) 


Score: 0.5645  Scores: [0.5268011066277839, 0.5710675059316824, 0.46725565226122834, 0.5743004536946676, 0.5537098054374111, 0.6941568091953478]
gpt2 config:
768
GPT2Config {
  "activation_function": "gelu_new",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.21.2",
  "use_cache": true,
  "vocab_size": 50257
}



Epoch: [1][0/366] Elapsed 0m 1s (remain 7m 20s) Loss: 1.8736(1.8736) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 18s (remain 4m 59s) Loss: 0.6798(1.4369) Grad: 39607.3672  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 34s (remain 4m 35s) Loss: 0.2757(0.9632) Grad: 19914.5391  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 52s (remain 4m 21s) Loss: 0.0828(0.7245) Grad: 8577.6455  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 9s (remain 4m 5s) Loss: 0.0776(0.5896) Grad: 9537.0674  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 27s (remain 3m 48s) Loss: 0.2181(0.5076) Grad: 29573.9512  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 44s (remain 3m 32s) Loss: 0.2145(0.4500) Grad: 14554.5029  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 2s (remain 3m 14s) Loss: 0.0971(0.4074) Grad: 13158.8896  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 18s (remain 2m 56s) Loss: 0.1582(0.3793) Grad: 11754.2764  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 36s (remain 2m 40s) Loss:

Epoch 1 - avg_train_loss: 0.2459  avg_val_loss: 0.2986  time: 350s
Epoch 1 - Score: 0.7890  Scores: [0.7795718458618217, 0.652559944840086, 0.8903904491950136, 0.7850207101791259, 0.677656726264996, 0.949022319107724]
Epoch 1 - Save Best Score: 0.7890 Model


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2964(0.2987) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1379(0.2986) 
Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 28s) Loss: 0.0844(0.0844) Grad: inf  LR: 0.00001707  
Epoch: [2][20/366] Elapsed 0m 18s (remain 5m 1s) Loss: 0.1307(0.1519) Grad: 33921.6875  LR: 0.00001676  
Epoch: [2][40/366] Elapsed 0m 35s (remain 4m 44s) Loss: 0.0669(0.1331) Grad: 23898.8965  LR: 0.00001644  
Epoch: [2][60/366] Elapsed 0m 53s (remain 4m 25s) Loss: 0.1478(0.1287) Grad: 29981.5449  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 10s (remain 4m 9s) Loss: 0.1822(0.1260) Grad: 33735.9961  LR: 0.00001576  
Epoch: [2][100/366] Elapsed 1m 27s (remain 3m 50s) Loss: 0.1987(0.1240) Grad: 23091.2754  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 45s (remain 3m 34s) Loss: 0.1481(0.1257) Grad: 12635.3633  LR: 0.00001504  
Epoch: [2][140/366] Elapsed 2m 3s (remain 3m 17s) Loss: 0.1716(0.1248) Grad: 6498.1372  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m

Epoch 2 - avg_train_loss: 0.1187  avg_val_loss: 0.2339  time: 350s
Epoch 2 - Score: 0.6955  Scores: [0.6852914670125942, 0.6580568027803579, 0.6958798633971811, 0.719449046878276, 0.6423181917102706, 0.7719555854629456]
Epoch 2 - Save Best Score: 0.6955 Model


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2371(0.2340) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1109(0.2339) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 5m 38s) Loss: 0.1002(0.1002) Grad: inf  LR: 0.00001001  
Epoch: [3][20/366] Elapsed 0m 17s (remain 4m 55s) Loss: 0.1179(0.1105) Grad: 37139.7656  LR: 0.00000958  
Epoch: [3][40/366] Elapsed 0m 35s (remain 4m 41s) Loss: 0.1118(0.1063) Grad: 33460.5273  LR: 0.00000916  
Epoch: [3][60/366] Elapsed 0m 53s (remain 4m 25s) Loss: 0.1565(0.1063) Grad: 52632.5938  LR: 0.00000873  
Epoch: [3][80/366] Elapsed 1m 10s (remain 4m 7s) Loss: 0.0865(0.1061) Grad: 32217.9609  LR: 0.00000831  
Epoch: [3][100/366] Elapsed 1m 27s (remain 3m 50s) Loss: 0.1462(0.1092) Grad: 61627.6055  LR: 0.00000789  
Epoch: [3][120/366] Elapsed 1m 45s (remain 3m 33s) Loss: 0.1263(0.1096) Grad: 42503.6328  LR: 0.00000747  
Epoch: [3][140/366] Elapsed 2m 2s (remain 3m 15s) Loss: 0.1151(0.1103) Grad: 41100.6367  LR: 0.00000706  
Epoch: [3][160/366] Elapsed 

Epoch 3 - avg_train_loss: 0.1084  avg_val_loss: 0.2561  time: 351s
Epoch 3 - Score: 0.7284  Scores: [0.7403776335665391, 0.6303049038150544, 0.8022195792728489, 0.7628125996928729, 0.635986310218025, 0.7988600274519578]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2690(0.2563) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1055(0.2561) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 10s) Loss: 0.0596(0.0596) Grad: inf  LR: 0.00000295  
Epoch: [4][20/366] Elapsed 0m 18s (remain 5m 4s) Loss: 0.1334(0.0948) Grad: 37540.4648  LR: 0.00000265  
Epoch: [4][40/366] Elapsed 0m 35s (remain 4m 41s) Loss: 0.0793(0.0934) Grad: 41195.2812  LR: 0.00000237  
Epoch: [4][60/366] Elapsed 0m 52s (remain 4m 23s) Loss: 0.1178(0.1000) Grad: 59584.5703  LR: 0.00000210  
Epoch: [4][80/366] Elapsed 1m 9s (remain 4m 5s) Loss: 0.1097(0.1014) Grad: 25195.0098  LR: 0.00000184  
Epoch: [4][100/366] Elapsed 1m 27s (remain 3m 49s) Loss: 0.1016(0.1016) Grad: 43908.4297  LR: 0.00000160  
Epoch: [4][120/366] Elapsed 1m 45s (remain 3m 33s) Loss: 0.1021(0.1006) Grad: 42363.2266  LR: 0.00000138  
Epoch: [4][140/366] Elapsed 2m 2s (remain 3m 15s) Loss: 0.1385(0.1020) Grad: 33468.1250  LR: 0.00000117  
Epoch: [4][160/366] Elapsed 2m

Epoch 4 - avg_train_loss: 0.1034  avg_val_loss: 0.2741  time: 353s
Epoch 4 - Score: 0.7547  Scores: [0.7423104715417335, 0.657040350661553, 0.8240398070259034, 0.7859734518119356, 0.6515550016847323, 0.8672723922720886]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2900(0.2742) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1294(0.2741) 


Score: 0.6955  Scores: [0.6852914670125942, 0.6580568027803579, 0.6958798633971811, 0.719449046878276, 0.6423181917102706, 0.7719555854629456]
gpt2 config:
768
GPT2Config {
  "activation_function": "gelu_new",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.21.2",
  "use_cache": true,
  "vocab_size": 50257
}



Epoch: [1][0/366] Elapsed 0m 1s (remain 7m 51s) Loss: 6.1865(6.1865) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 19s (remain 5m 14s) Loss: 3.0739(4.8769) Grad: 25536.3867  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 37s (remain 4m 53s) Loss: 0.6269(3.3009) Grad: 28005.9609  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 54s (remain 4m 31s) Loss: 0.2399(2.3340) Grad: 9954.8975  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 11s (remain 4m 12s) Loss: 0.2439(1.8076) Grad: 8907.5312  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 29s (remain 3m 54s) Loss: 0.3136(1.4952) Grad: 13542.8975  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 46s (remain 3m 35s) Loss: 0.1336(1.2840) Grad: 7251.4419  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 4s (remain 3m 18s) Loss: 0.1466(1.1272) Grad: 7656.7529  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 21s (remain 3m 0s) Loss: 0.3602(1.0096) Grad: 14618.6104  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 39s (remain 2m 43s) Loss: 

Epoch 1 - avg_train_loss: 0.5394  avg_val_loss: 0.1436  time: 355s
Epoch 1 - Score: 0.5398  Scores: [0.588749602497341, 0.5387929057371663, 0.46483233895878445, 0.5496353125088658, 0.5715980631899438, 0.5249578186939294]
Epoch 1 - Save Best Score: 0.5398 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 35s) Loss: 0.0698(0.0698) Grad: inf  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 19s (remain 5m 16s) Loss: 0.1900(0.1405) Grad: 89197.7891  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 36s (remain 4m 51s) Loss: 0.0924(0.1407) Grad: 25783.4160  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 54s (remain 4m 30s) Loss: 0.0829(0.1352) Grad: 41587.3008  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 11s (remain 4m 12s) Loss: 0.1632(0.1387) Grad: 44375.8984  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 29s (remain 3m 54s) Loss: 0.1628(0.1382) Grad: 73968.1953  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 46s (remain 3m 36s) Loss: 0.1617(0.1401) Grad: 21248.5820  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 4s (remain 3m 19s) Loss: 0.1927(0.1377) Grad: 31425.4258  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 23s (remain 3m 2s) Loss: 0.1397(0.1387) Grad: 53759.6250  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 2m 40s (remain 2m 44s) Lo

Epoch 2 - avg_train_loss: 0.1342  avg_val_loss: 0.1475  time: 357s
Epoch 2 - Score: 0.5473  Scores: [0.6404767164001333, 0.531406570930958, 0.4677771102723569, 0.5401255361229886, 0.5634724189685062, 0.5403928994248295]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1224(0.1472) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2815(0.1475) 
Epoch: [3][0/366] Elapsed 0m 0s (remain 5m 1s) Loss: 0.2019(0.2019) Grad: inf  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 18s (remain 4m 59s) Loss: 0.0658(0.1290) Grad: 66764.2266  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 36s (remain 4m 45s) Loss: 0.0788(0.1265) Grad: 38978.1953  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 53s (remain 4m 25s) Loss: 0.1844(0.1255) Grad: 107623.5469  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 11s (remain 4m 10s) Loss: 0.1180(0.1305) Grad: 31593.2109  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 29s (remain 3m 53s) Loss: 0.1402(0.1292) Grad: 24636.9512  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 46s (remain 3m 36s) Loss: 0.1361(0.1303) Grad: 60066.8203  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 4s (remain 3m 18s) Loss: 0.1062(0.1287) Grad: 36801.4570  LR: 0.00000704  
Epoch: [3][160/366] Elapsed

Epoch 3 - avg_train_loss: 0.1278  avg_val_loss: 0.1554  time: 355s
Epoch 3 - Score: 0.5628  Scores: [0.6432057937180309, 0.5212264392065294, 0.4848481807544647, 0.5598531941724364, 0.5810067703545823, 0.5869582324488041]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1296(0.1553) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2260(0.1554) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 35s) Loss: 0.1578(0.1578) Grad: inf  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 18s (remain 5m 8s) Loss: 0.1050(0.1232) Grad: 57600.4727  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 36s (remain 4m 51s) Loss: 0.0755(0.1196) Grad: 23888.5645  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 54s (remain 4m 33s) Loss: 0.1005(0.1193) Grad: 41192.4414  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 12s (remain 4m 14s) Loss: 0.1242(0.1237) Grad: 35047.5508  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 30s (remain 3m 56s) Loss: 0.1669(0.1225) Grad: 25921.8008  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 47s (remain 3m 37s) Loss: 0.1266(0.1228) Grad: 62232.8398  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 5s (remain 3m 20s) Loss: 0.1146(0.1217) Grad: 37557.4023  LR: 0.00000116  
Epoch: [4][160/366] Elapsed 

Epoch 4 - avg_train_loss: 0.1243  avg_val_loss: 0.1632  time: 356s
Epoch 4 - Score: 0.5771  Scores: [0.6688646109692281, 0.5195803148613332, 0.503173369458256, 0.5755276856560217, 0.5914000085582883, 0.6039569140860594]
Score: 0.5398  Scores: [0.588749602497341, 0.5387929057371663, 0.46483233895878445, 0.5496353125088658, 0.5715980631899438, 0.5249578186939294]
gpt2 config:
768
GPT2Config {
  "activation_function": "gelu_new",
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_

Epoch: [1][0/366] Elapsed 0m 1s (remain 6m 51s) Loss: 1.4110(1.4110) Grad: inf  LR: 0.00002000  
Epoch: [1][20/366] Elapsed 0m 18s (remain 5m 9s) Loss: 0.8306(1.1895) Grad: 50632.1758  LR: 0.00001999  
Epoch: [1][40/366] Elapsed 0m 36s (remain 4m 46s) Loss: 0.4782(0.9196) Grad: 37261.8594  LR: 0.00001996  
Epoch: [1][60/366] Elapsed 0m 53s (remain 4m 28s) Loss: 0.2405(0.6887) Grad: 15373.0107  LR: 0.00001991  
Epoch: [1][80/366] Elapsed 1m 11s (remain 4m 12s) Loss: 0.1960(0.5641) Grad: 31225.6680  LR: 0.00001985  
Epoch: [1][100/366] Elapsed 1m 29s (remain 3m 55s) Loss: 0.2268(0.4866) Grad: 11992.4814  LR: 0.00001977  
Epoch: [1][120/366] Elapsed 1m 47s (remain 3m 37s) Loss: 0.2068(0.4328) Grad: 20021.1777  LR: 0.00001967  
Epoch: [1][140/366] Elapsed 2m 4s (remain 3m 19s) Loss: 0.0897(0.3980) Grad: 19754.5352  LR: 0.00001955  
Epoch: [1][160/366] Elapsed 2m 21s (remain 3m 0s) Loss: 0.1802(0.3677) Grad: 18343.2891  LR: 0.00001941  
Epoch: [1][180/366] Elapsed 2m 39s (remain 2m 42s) Los

Epoch 1 - avg_train_loss: 0.2377  avg_val_loss: 0.2061  time: 352s
Epoch 1 - Score: 0.6464  Scores: [0.5406440852823401, 0.5813227140530306, 0.648238374275336, 0.5866284001432251, 0.7797008585611795, 0.7416743587443512]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1926(0.2062) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1370(0.2061) 


Epoch 1 - Save Best Score: 0.6464 Model


Epoch: [2][0/366] Elapsed 0m 1s (remain 7m 33s) Loss: 0.2044(0.2044) Grad: inf  LR: 0.00001706  
Epoch: [2][20/366] Elapsed 0m 18s (remain 5m 7s) Loss: 0.1207(0.1153) Grad: 42044.8867  LR: 0.00001675  
Epoch: [2][40/366] Elapsed 0m 36s (remain 4m 51s) Loss: 0.0894(0.1133) Grad: 48393.9766  LR: 0.00001643  
Epoch: [2][60/366] Elapsed 0m 53s (remain 4m 29s) Loss: 0.0518(0.1165) Grad: 36811.5977  LR: 0.00001610  
Epoch: [2][80/366] Elapsed 1m 11s (remain 4m 10s) Loss: 0.1166(0.1161) Grad: 50976.4258  LR: 0.00001575  
Epoch: [2][100/366] Elapsed 1m 28s (remain 3m 53s) Loss: 0.0833(0.1173) Grad: 28029.2383  LR: 0.00001540  
Epoch: [2][120/366] Elapsed 1m 46s (remain 3m 35s) Loss: 0.0827(0.1161) Grad: 57429.9492  LR: 0.00001503  
Epoch: [2][140/366] Elapsed 2m 4s (remain 3m 18s) Loss: 0.0683(0.1153) Grad: 29319.8828  LR: 0.00001466  
Epoch: [2][160/366] Elapsed 2m 21s (remain 3m 0s) Loss: 0.0892(0.1149) Grad: 37694.5547  LR: 0.00001427  
Epoch: [2][180/366] Elapsed 2m 39s (remain 2m 42s) Los

Epoch 2 - avg_train_loss: 0.1158  avg_val_loss: 0.2608  time: 354s
Epoch 2 - Score: 0.7290  Scores: [0.5340989066763732, 0.6821109493758525, 0.7054858384749135, 0.6625224967356279, 0.8630169022812334, 0.9264806227856693]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2514(0.2611) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1469(0.2608) 
Epoch: [3][0/366] Elapsed 0m 1s (remain 7m 26s) Loss: 0.0669(0.0669) Grad: inf  LR: 0.00001000  
Epoch: [3][20/366] Elapsed 0m 18s (remain 5m 7s) Loss: 0.0649(0.1113) Grad: 44785.8984  LR: 0.00000957  
Epoch: [3][40/366] Elapsed 0m 36s (remain 4m 46s) Loss: 0.1384(0.1145) Grad: 34060.5742  LR: 0.00000914  
Epoch: [3][60/366] Elapsed 0m 53s (remain 4m 26s) Loss: 0.1498(0.1121) Grad: 59864.6875  LR: 0.00000872  
Epoch: [3][80/366] Elapsed 1m 10s (remain 4m 7s) Loss: 0.0959(0.1113) Grad: 32411.2539  LR: 0.00000829  
Epoch: [3][100/366] Elapsed 1m 27s (remain 3m 49s) Loss: 0.0899(0.1102) Grad: 55432.7031  LR: 0.00000787  
Epoch: [3][120/366] Elapsed 1m 44s (remain 3m 32s) Loss: 0.1070(0.1121) Grad: 44712.3398  LR: 0.00000746  
Epoch: [3][140/366] Elapsed 2m 2s (remain 3m 15s) Loss: 0.1029(0.1121) Grad: 72438.3203  LR: 0.00000704  
Epoch: [3][160/366] Elapsed 2

Epoch 3 - avg_train_loss: 0.1095  avg_val_loss: 0.2815  time: 352s
Epoch 3 - Score: 0.7591  Scores: [0.5731706193554047, 0.714413426019476, 0.6957009933101974, 0.7084180181921719, 0.9081229046029197, 0.9546283857611111]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2668(0.2817) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1667(0.2815) 
Epoch: [4][0/366] Elapsed 0m 1s (remain 7m 28s) Loss: 0.0694(0.0694) Grad: inf  LR: 0.00000294  
Epoch: [4][20/366] Elapsed 0m 18s (remain 5m 9s) Loss: 0.1388(0.1046) Grad: 50002.0547  LR: 0.00000264  
Epoch: [4][40/366] Elapsed 0m 36s (remain 4m 48s) Loss: 0.1279(0.1027) Grad: 72790.2188  LR: 0.00000236  
Epoch: [4][60/366] Elapsed 0m 53s (remain 4m 28s) Loss: 0.1288(0.1027) Grad: 117231.3906  LR: 0.00000209  
Epoch: [4][80/366] Elapsed 1m 11s (remain 4m 10s) Loss: 0.1319(0.1039) Grad: 47928.4062  LR: 0.00000183  
Epoch: [4][100/366] Elapsed 1m 28s (remain 3m 52s) Loss: 0.0812(0.1038) Grad: 34376.2734  LR: 0.00000159  
Epoch: [4][120/366] Elapsed 1m 45s (remain 3m 34s) Loss: 0.1479(0.1057) Grad: 79032.9062  LR: 0.00000137  
Epoch: [4][140/366] Elapsed 2m 3s (remain 3m 17s) Loss: 0.0728(0.1062) Grad: 43399.0898  LR: 0.00000116  
Epoch: [4][160/366] Elapsed

Epoch 4 - avg_train_loss: 0.1053  avg_val_loss: 0.2744  time: 352s
Epoch 4 - Score: 0.7481  Scores: [0.5536962618752406, 0.6944383858052888, 0.6808446524479363, 0.6973974135897467, 0.9179943869148077, 0.943971366819366]


EVAL: [60/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.2662(0.2746) 
EVAL: [61/62] Elapsed 0m 32s (remain 0m 0s) Loss: 0.1566(0.2744) 


Score: 0.6464  Scores: [0.5406440852823401, 0.5813227140530306, 0.648238374275336, 0.5866284001432251, 0.7797008585611795, 0.7416743587443512]
Score: 0.6167  Scores: [0.5886297674892379, 0.5889190506133998, 0.5785122697760718, 0.6110467096543957, 0.6430058099302994, 0.6898021233791629]


VBox(children=(Label(value='0.000 MB of 0.000 MB uploaded (0.000 MB deduped)\r'), FloatProgress(value=1.0, max…

0,1
[fold0] avg_train_loss,█▁▁▁
[fold0] avg_val_loss,▆▁█▇
[fold0] epoch,▁▃▆█
[fold0] loss,█▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
[fold0] lr,███████▇▇▇▇▇▇▆▆▆▆▅▅▅▄▄▄▄▃▃▃▃▂▂▂▂▂▁▁▁▁▁▁▁
[fold0] score,▅▁█▇
[fold1] avg_train_loss,█▂▁▁
[fold1] avg_val_loss,█▁▃▅
[fold1] epoch,▁▃▆█
[fold1] loss,█▃▂▁▂▂▁▁▂▁▁▁▁▂▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁

0,1
[fold0] avg_train_loss,0.11683
[fold0] avg_val_loss,0.16934
[fold0] epoch,4.0
[fold0] loss,0.07048
[fold0] lr,0.0
[fold0] score,0.58415
[fold1] avg_train_loss,0.10342
[fold1] avg_val_loss,0.27409
[fold1] epoch,4.0
[fold1] loss,0.08438
