In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Fri Sep 16 10:15:12 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    debug2 = False
    apex=True
    print_freq=100
    num_workers=4
    #model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=128
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

if CFG.debug2:
    CFG.epochs = 3
    CFG.trn_fold = [0]

In [6]:
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'submission')
#OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'model')
OUTPUT_MODEL_DIR = DIR + '/output/model/EXP25/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return f1_score(y_true, (y_pred>thresh).astype(int))


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(CFG.seed)

<torch._C.Generator at 0x7f955c224b50>

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
train = pd.read_csv(os.path.join(INPUT_DIR,'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'))
sub = pd.read_csv(os.path.join(INPUT_DIR,'sample_submit.csv'),header=None)
sub.columns = ['id','state']

In [10]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('',text)

def remove_html(text):
    html=re.compile(r"<[^>]*?>")
    return html.sub('',text)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_URL(text)
        text = remove_html(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        #改行削除
        #text = text.replace("\n","")
        clean_texts.append(text)
    return clean_texts

def get_goal_values(df):
  df["goal"].replace("100000+","100000-100000",inplace=True)
  _df = df["goal"].str.split('-').apply(pd.Series).astype(float)
  _df.columns = ["goal_max","goal_min"]
  df["goal_max"] = _df["goal_max"].astype(str)
  df["goal_min"] = _df["goal_min"].astype(str)
  df["goal_median"] = _df[["goal_max","goal_min"]].median(axis=1)
  df["goal_median"] = df["goal_median"].astype(int)
  return df

if CFG.clean_content==True:
    train['html_content'] = train['html_content'].map(lambda x: str(x))
    train['html_content'] = train['html_content'].apply(html.unescape)
    p = re.compile(r"<[^>]*?>|&amp;|[/'’\"”]")
    train['html_content'] = train['html_content'].map(lambda x: p.sub("", x))
    train['html_content'] = train['html_content'].map(lambda x: x.lstrip())
    train['html_content'] = train['html_content'].fillna('missing')

train = get_goal_values(train)
train['inputs'] = train.goal_median.astype(str) + ' [SEP] ' + train.duration.astype(str) + ' [SEP] ' + train.country + ' [SEP] ' + train.category1 + ' [SEP] ' + train.category2 + ' [SEP] ' + train.html_content

In [11]:
train

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs
0,train_00000,20001-21000,US,45,art,mixed media,"http:dummy.comIn its first year, The Shillitos...",1,20001.0,21000.0,20500,20500 [SEP] 45 [SEP] US [SEP] art [SEP] mixed ...
1,train_00001,19001-20000,US,59,food,restaurants,Cultural Pretzel Sports Bar is a place where p...,0,19001.0,20000.0,19500,19500 [SEP] 59 [SEP] US [SEP] food [SEP] resta...
2,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...
3,train_00003,1001-2000,US,30,art,mixed media,"Canyon de Chelley, Dine (Navajo) Reservation, ...",1,1001.0,2000.0,1500,1500 [SEP] 30 [SEP] US [SEP] art [SEP] mixed m...
4,train_00004,1001-2000,US,29,film & video,webseries,"The story of the show, both on and off screen,...",1,1001.0,2000.0,1500,1500 [SEP] 29 [SEP] US [SEP] film & video [SEP...
...,...,...,...,...,...,...,...,...,...,...,...,...
9786,train_09786,1-1000,US,15,music,electronic music,So the story behind this is that Ive been maki...,0,1.0,1000.0,500,500 [SEP] 15 [SEP] US [SEP] music [SEP] electr...
9787,train_09787,3001-4000,CA,30,fashion,ready-to-wear,THE HIGH CLOTHINGMy vision is to create high q...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] CA [SEP] fashion [SEP] rea...
9788,train_09788,100000-100000,GB,30,technology,software,We dont think anybody looks forward to filling...,0,100000.0,100000.0,100000,100000 [SEP] 30 [SEP] GB [SEP] technology [SEP...
9789,train_09789,79001-80000,US,35,technology,gadgets,What is Droplet?\nDroplet is a wireless button...,1,79001.0,80000.0,79500,79500 [SEP] 35 [SEP] US [SEP] technology [SEP]...


In [12]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.state)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

In [14]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['inputs'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 6 # cls & sep & sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['inputs'].values
        self.labels = df['state'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [17]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        criterion,
        adv_param="weight",
        adv_lr=1e-4,
        adv_eps=1e-2,
        start_epoch=0,
        adv_step=1,
        device="cpu",
    ):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.device = device

    def attack_backward(self, inputs, label):
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            self.save()
            self.attack_step() # モデルを近傍の悪い方へ改変
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss
        
        

    def attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    #if not epoch < CFG.nth_awp_start_epoch:
    #    LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    #awp = AWP(model=model,
    #          optimizer=optimizer,
    #          criterion=criterion,
    #          adv_eps=0.01, 
    #          device=device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    #tot_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        #if CFG.nth_awp_start_epoch <= epoch:
        #      loss = awp.attack_backward(inputs, labels)
        #      scaler.scale(loss).backward()
        #      awp.restore()
        #tot_loss += loss.item()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg
    #model.train()
    #return tot_loss/(step+1)


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['state'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [20]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['state'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')

XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "output_hidden_states": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.22.0",
  "untie_r": true,
  "use_mems_eval": t

Epoch: [1][0/458] Elapsed 0m 2s (remain 18m 36s) Loss: 0.7645(0.7645) Grad: inf  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 32s (remain 5m 26s) Loss: 0.5425(0.6974) Grad: 76828.3438  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 3m 2s (remain 3m 53s) Loss: 0.6721(0.6853) Grad: 6965.7393  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 4m 32s (remain 2m 22s) Loss: 0.6922(0.6718) Grad: 1023.7669  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 6m 3s (remain 0m 51s) Loss: 0.6843(0.6859) Grad: 7751.0000  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 6m 54s (remain 0m 0s) Loss: 0.7752(0.6879) Grad: 22595.4473  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 16s) Loss: 0.7067(0.7067) 


Epoch 1 - avg_train_loss: 0.6879  avg_val_loss: 0.6952  time: 459s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6879  avg_val_loss: 0.6952  time: 459s
Epoch 1 - Score: 0.0000
INFO:__main__:Epoch 1 - Score: 0.0000


EVAL: [76/77] Elapsed 0m 44s (remain 0m 0s) Loss: 0.6862(0.6952) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 9m 1s) Loss: 0.7498(0.7498) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 31s (remain 5m 21s) Loss: 1.0115(0.7024) Grad: 42712.3984  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 3m 1s (remain 3m 51s) Loss: 0.8067(0.7119) Grad: 24882.6387  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 4m 31s (remain 2m 21s) Loss: 0.6004(0.7109) Grad: 8350.4609  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 6m 1s (remain 0m 51s) Loss: 0.7057(0.7104) Grad: 11178.3516  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 6m 53s (remain 0m 0s) Loss: 0.6966(0.7102) Grad: 6408.5576  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 16s) Loss: 0.6873(0.6873) 


Epoch 2 - avg_train_loss: 0.7102  avg_val_loss: 0.6960  time: 458s
INFO:__main__:Epoch 2 - avg_train_loss: 0.7102  avg_val_loss: 0.6960  time: 458s
Epoch 2 - Score: 0.6599
INFO:__main__:Epoch 2 - Score: 0.6599
Epoch 2 - Save Best Score: 0.6599 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.6599 Model


EVAL: [76/77] Elapsed 0m 44s (remain 0m 0s) Loss: 0.7028(0.6960) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 9m 5s) Loss: 0.7127(0.7127) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 31s (remain 5m 21s) Loss: 0.6707(0.7015) Grad: 8098.5117  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 3m 1s (remain 3m 51s) Loss: 0.6749(0.7011) Grad: 32270.6367  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 4m 31s (remain 2m 21s) Loss: 0.6590(0.6985) Grad: 6332.9136  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 6m 1s (remain 0m 51s) Loss: 0.4944(0.6921) Grad: 1084.4373  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 6m 53s (remain 0m 0s) Loss: 0.4963(0.6892) Grad: 13903.0693  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 13s) Loss: 0.5770(0.5770) 


Epoch 3 - avg_train_loss: 0.6892  avg_val_loss: 0.6535  time: 458s
INFO:__main__:Epoch 3 - avg_train_loss: 0.6892  avg_val_loss: 0.6535  time: 458s
Epoch 3 - Score: 0.7312
INFO:__main__:Epoch 3 - Score: 0.7312
Epoch 3 - Save Best Score: 0.7312 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7312 Model


EVAL: [76/77] Elapsed 0m 44s (remain 0m 0s) Loss: 0.6493(0.6535) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 9m 17s) Loss: 0.6554(0.6554) Grad: inf  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 31s (remain 5m 22s) Loss: 0.4269(0.5823) Grad: 10455.1182  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 3m 1s (remain 3m 51s) Loss: 0.4409(0.5590) Grad: 1275.4867  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 4m 31s (remain 2m 21s) Loss: 0.4845(0.5459) Grad: 5803.2773  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 6m 1s (remain 0m 51s) Loss: 0.4741(0.5331) Grad: 3226.7869  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 6m 53s (remain 0m 0s) Loss: 0.4160(0.5262) Grad: 5456.4668  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 18s) Loss: 0.4616(0.4616) 


Epoch 4 - avg_train_loss: 0.5262  avg_val_loss: 0.5260  time: 458s
INFO:__main__:Epoch 4 - avg_train_loss: 0.5262  avg_val_loss: 0.5260  time: 458s
Epoch 4 - Score: 0.7513
INFO:__main__:Epoch 4 - Score: 0.7513
Epoch 4 - Save Best Score: 0.7513 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7513 Model


EVAL: [76/77] Elapsed 0m 44s (remain 0m 0s) Loss: 0.4979(0.5260) 


Score: 0.7513
INFO:__main__:Score: 0.7513
XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "output_hidden_states": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.22.0

Epoch: [1][0/458] Elapsed 0m 1s (remain 9m 7s) Loss: 1.3529(1.3529) Grad: inf  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.6608(0.7303) Grad: 5609.7935  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 2m 59s (remain 3m 48s) Loss: 0.6612(0.7096) Grad: 754.5640  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.6283(0.6708) Grad: 3280.9736  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.4026(0.6444) Grad: 1737.9275  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.4912(0.6281) Grad: 3289.6599  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5810(0.5810) 


Epoch 1 - avg_train_loss: 0.6281  avg_val_loss: 0.4917  time: 452s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6281  avg_val_loss: 0.4917  time: 452s
Epoch 1 - Score: 0.7768
INFO:__main__:Epoch 1 - Score: 0.7768
Epoch 1 - Save Best Score: 0.7768 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7768 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.6838(0.4917) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 9m 37s) Loss: 0.4955(0.4955) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.1582(0.4237) Grad: 3047.8652  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.7976(0.4131) Grad: 11623.6309  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.2933(0.4057) Grad: 6007.1494  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.2672(0.4027) Grad: 6119.9067  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.4833(0.3953) Grad: 6756.9897  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 14s) Loss: 0.5108(0.5108) 


Epoch 2 - avg_train_loss: 0.3953  avg_val_loss: 0.4408  time: 452s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3953  avg_val_loss: 0.4408  time: 452s
Epoch 2 - Score: 0.7926
INFO:__main__:Epoch 2 - Score: 0.7926
Epoch 2 - Save Best Score: 0.7926 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7926 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.5638(0.4408) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 9m 27s) Loss: 0.4517(0.4517) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.1512(0.2790) Grad: 4518.4727  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.1642(0.2891) Grad: 4852.7417  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.1812(0.2906) Grad: 4156.0835  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.1378(0.2887) Grad: 5679.2383  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.2904(0.2846) Grad: 11543.2490  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4683(0.4683) 


Epoch 3 - avg_train_loss: 0.2846  avg_val_loss: 0.5208  time: 452s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2846  avg_val_loss: 0.5208  time: 452s
Epoch 3 - Score: 0.7998
INFO:__main__:Epoch 3 - Score: 0.7998
Epoch 3 - Save Best Score: 0.7998 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7998 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.9424(0.5208) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 9m 20s) Loss: 0.2652(0.2652) Grad: inf  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.1311(0.1821) Grad: 15049.8555  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.2338(0.1866) Grad: 25444.8242  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.0449(0.1865) Grad: 3033.4211  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.2332(0.1922) Grad: 19265.0234  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.4234(0.1944) Grad: 56255.7031  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.3792(0.3792) 


Epoch 4 - avg_train_loss: 0.1944  avg_val_loss: 0.5649  time: 452s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1944  avg_val_loss: 0.5649  time: 452s
Epoch 4 - Score: 0.8027
INFO:__main__:Epoch 4 - Score: 0.8027
Epoch 4 - Save Best Score: 0.8027 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8027 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.8833(0.5649) 


Score: 0.8027
INFO:__main__:Score: 0.8027
XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "output_hidden_states": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.22.0

Epoch: [1][0/458] Elapsed 0m 1s (remain 9m 24s) Loss: 1.5142(1.5142) Grad: inf  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.7105(0.7736) Grad: 31968.4160  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.7322(0.7274) Grad: 15718.7285  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.6251(0.6864) Grad: 16350.7197  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.5362(0.6521) Grad: 11128.6719  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.4755(0.6428) Grad: 16525.8867  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 16s) Loss: 0.3947(0.3947) 


Epoch 1 - avg_train_loss: 0.6428  avg_val_loss: 0.5082  time: 452s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6428  avg_val_loss: 0.5082  time: 452s
Epoch 1 - Score: 0.7213
INFO:__main__:Epoch 1 - Score: 0.7213
Epoch 1 - Save Best Score: 0.7213 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7213 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.2651(0.5082) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 9m 28s) Loss: 0.3959(0.3959) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.3801(0.4231) Grad: 28577.4141  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.5281(0.4204) Grad: 50172.0508  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 4m 28s (remain 2m 20s) Loss: 0.4319(0.4124) Grad: 18841.2480  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.4398(0.4074) Grad: 8172.1606  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.7772(0.4111) Grad: 8266.5645  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 18s) Loss: 0.3425(0.3425) 


Epoch 2 - avg_train_loss: 0.4111  avg_val_loss: 0.4619  time: 452s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4111  avg_val_loss: 0.4619  time: 452s
Epoch 2 - Score: 0.7738
INFO:__main__:Epoch 2 - Score: 0.7738
Epoch 2 - Save Best Score: 0.7738 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7738 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1089(0.4619) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 9m 11s) Loss: 0.2010(0.2010) Grad: inf  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.1538(0.2948) Grad: 5289.9463  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.7951(0.2866) Grad: 76037.2344  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.1343(0.2793) Grad: 22111.7676  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.1030(0.2809) Grad: 10606.2598  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.0931(0.2784) Grad: 9212.8330  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 14s) Loss: 0.4053(0.4053) 


Epoch 3 - avg_train_loss: 0.2784  avg_val_loss: 0.5384  time: 452s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2784  avg_val_loss: 0.5384  time: 452s
Epoch 3 - Score: 0.7976
INFO:__main__:Epoch 3 - Score: 0.7976
Epoch 3 - Save Best Score: 0.7976 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.7976 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.0947(0.5384) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 9m 14s) Loss: 0.1105(0.1105) Grad: inf  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 30s (remain 5m 18s) Loss: 0.0287(0.1531) Grad: 8407.9775  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 2m 59s (remain 3m 49s) Loss: 0.1100(0.1556) Grad: 20004.3145  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 4m 28s (remain 2m 19s) Loss: 0.0115(0.1586) Grad: 2378.3506  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 5m 57s (remain 0m 50s) Loss: 0.3593(0.1611) Grad: 62061.7773  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 6m 48s (remain 0m 0s) Loss: 0.0375(0.1624) Grad: 11723.0068  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 13s) Loss: 0.5786(0.5786) 


Epoch 4 - avg_train_loss: 0.1624  avg_val_loss: 0.7446  time: 452s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1624  avg_val_loss: 0.7446  time: 452s
Epoch 4 - Score: 0.7986
INFO:__main__:Epoch 4 - Score: 0.7986
Epoch 4 - Save Best Score: 0.7986 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.7986 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.1256(0.7446) 


Score: 0.7986
INFO:__main__:Score: 0.7986
XLNetConfig {
  "_name_or_path": "xlnet-large-cased",
  "architectures": [
    "XLNetLMHeadModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "attn_type": "bi",
  "bi_data": false,
  "bos_token_id": 1,
  "clamp_len": -1,
  "d_head": 64,
  "d_inner": 4096,
  "d_model": 1024,
  "dropout": 0.1,
  "end_n_top": 5,
  "eos_token_id": 2,
  "ff_activation": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-12,
  "mem_len": null,
  "model_type": "xlnet",
  "n_head": 16,
  "n_layer": 24,
  "output_hidden_states": true,
  "pad_token_id": 5,
  "reuse_len": null,
  "same_length": false,
  "start_n_top": 5,
  "summary_activation": "tanh",
  "summary_last_dropout": 0.1,
  "summary_type": "last",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 250
    }
  },
  "transformers_version": "4.22.0

Epoch: [1][0/459] Elapsed 0m 1s (remain 9m 32s) Loss: 2.9012(2.9012) Grad: nan  LR: 0.00002000  
Epoch: [1][100/459] Elapsed 1m 30s (remain 5m 19s) Loss: 0.4671(1.1072) Grad: 4099.3408  LR: 0.00001985  
Epoch: [1][200/459] Elapsed 2m 59s (remain 3m 50s) Loss: 0.4522(0.9015) Grad: 8290.7031  LR: 0.00001941  
Epoch: [1][300/459] Elapsed 4m 28s (remain 2m 20s) Loss: 0.6844(0.8159) Grad: 11262.2588  LR: 0.00001870  
Epoch: [1][400/459] Elapsed 5m 57s (remain 0m 51s) Loss: 0.6196(0.7692) Grad: 10228.5635  LR: 0.00001774  
Epoch: [1][458/459] Elapsed 6m 49s (remain 0m 0s) Loss: 0.6719(0.7474) Grad: 45960.2109  LR: 0.00001707  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 19s) Loss: 0.6188(0.6188) 


Epoch 1 - avg_train_loss: 0.7474  avg_val_loss: 0.5976  time: 453s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7474  avg_val_loss: 0.5976  time: 453s
Epoch 1 - Score: 0.7222
INFO:__main__:Epoch 1 - Score: 0.7222
Epoch 1 - Save Best Score: 0.7222 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7222 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.6320(0.5976) 
Epoch: [2][0/459] Elapsed 0m 1s (remain 9m 30s) Loss: 0.5059(0.5059) Grad: nan  LR: 0.00001706  
Epoch: [2][100/459] Elapsed 1m 30s (remain 5m 19s) Loss: 0.6303(0.6832) Grad: 13064.1846  LR: 0.00001575  
Epoch: [2][200/459] Elapsed 2m 59s (remain 3m 50s) Loss: 0.2884(0.6618) Grad: 31700.7109  LR: 0.00001427  
Epoch: [2][300/459] Elapsed 4m 28s (remain 2m 20s) Loss: 0.3640(0.5938) Grad: 33173.4102  LR: 0.00001267  
Epoch: [2][400/459] Elapsed 5m 57s (remain 0m 51s) Loss: 0.5336(0.5592) Grad: 15069.7031  LR: 0.00001099  
Epoch: [2][458/459] Elapsed 6m 49s (remain 0m 0s) Loss: 0.4632(0.5407) Grad: 29585.7246  LR: 0.00001000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.5976(0.5976) 


Epoch 2 - avg_train_loss: 0.5407  avg_val_loss: 0.5589  time: 454s
INFO:__main__:Epoch 2 - avg_train_loss: 0.5407  avg_val_loss: 0.5589  time: 454s
Epoch 2 - Score: 0.7793
INFO:__main__:Epoch 2 - Score: 0.7793
Epoch 2 - Save Best Score: 0.7793 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7793 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.4964(0.5589) 
Epoch: [3][0/459] Elapsed 0m 1s (remain 9m 46s) Loss: 0.4234(0.4234) Grad: inf  LR: 0.00000998  
Epoch: [3][100/459] Elapsed 1m 30s (remain 5m 19s) Loss: 0.4604(0.3774) Grad: 45185.9062  LR: 0.00000828  
Epoch: [3][200/459] Elapsed 2m 59s (remain 3m 50s) Loss: 0.2764(0.3674) Grad: 29671.7168  LR: 0.00000663  
Epoch: [3][300/459] Elapsed 4m 28s (remain 2m 20s) Loss: 0.2386(0.3702) Grad: 24096.6660  LR: 0.00000507  
Epoch: [3][400/459] Elapsed 5m 57s (remain 0m 51s) Loss: 0.3681(0.3665) Grad: 39076.3633  LR: 0.00000366  
Epoch: [3][458/459] Elapsed 6m 49s (remain 0m 0s) Loss: 0.5074(0.3640) Grad: 40152.0781  LR: 0.00000293  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 18s) Loss: 0.3967(0.3967) 


Epoch 3 - avg_train_loss: 0.3640  avg_val_loss: 0.4550  time: 454s
INFO:__main__:Epoch 3 - avg_train_loss: 0.3640  avg_val_loss: 0.4550  time: 454s
Epoch 3 - Score: 0.8015
INFO:__main__:Epoch 3 - Score: 0.8015
Epoch 3 - Save Best Score: 0.8015 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8015 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.4205(0.4550) 
Epoch: [4][0/459] Elapsed 0m 1s (remain 9m 33s) Loss: 0.5335(0.5335) Grad: inf  LR: 0.00000292  
Epoch: [4][100/459] Elapsed 1m 30s (remain 5m 20s) Loss: 0.4955(0.3173) Grad: 97126.7031  LR: 0.00000182  
Epoch: [4][200/459] Elapsed 2m 59s (remain 3m 50s) Loss: 0.3618(0.3141) Grad: 71450.6953  LR: 0.00000096  
Epoch: [4][300/459] Elapsed 4m 28s (remain 2m 21s) Loss: 0.4880(0.3095) Grad: 97486.9688  LR: 0.00000036  
Epoch: [4][400/459] Elapsed 5m 57s (remain 0m 51s) Loss: 0.3285(0.3097) Grad: 72500.3750  LR: 0.00000005  
Epoch: [4][458/459] Elapsed 6m 49s (remain 0m 0s) Loss: 0.1187(0.3122) Grad: 33226.0664  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 16s) Loss: 0.3766(0.3766) 


Epoch 4 - avg_train_loss: 0.3122  avg_val_loss: 0.4459  time: 454s
INFO:__main__:Epoch 4 - avg_train_loss: 0.3122  avg_val_loss: 0.4459  time: 454s
Epoch 4 - Score: 0.8024
INFO:__main__:Epoch 4 - Score: 0.8024
Epoch 4 - Save Best Score: 0.8024 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8024 Model


EVAL: [76/77] Elapsed 0m 43s (remain 0m 0s) Loss: 0.4381(0.4459) 


Score: 0.8024
INFO:__main__:Score: 0.8024
Score: 0.7889
INFO:__main__:Score: 0.7889


In [21]:
A = pd.read_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
A.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs,kfold,pred
0,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...,0,0.148836
1,train_00006,2001-3000,CA,30,music,classical music,The Crimson String Quartet (CSQ) is returning ...,1,2001.0,3000.0,2500,2500 [SEP] 30 [SEP] CA [SEP] music [SEP] class...,0,0.80092
2,train_00016,3001-4000,GB,30,journalism,web,I am interested in setting up a small news web...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] GB [SEP] journalism [SEP] ...,0,0.023478
3,train_00017,11001-12000,US,30,food,drinks,Why Schenk-Atwood?Our neighborhood has just ab...,1,11001.0,12000.0,11500,11500 [SEP] 30 [SEP] US [SEP] food [SEP] drink...,0,0.291198
4,train_00018,12001-13000,US,28,journalism,web,Mega Visions app has been approved on all majo...,1,12001.0,13000.0,12500,12500 [SEP] 28 [SEP] US [SEP] journalism [SEP]...,0,0.541055
