In [1]:
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.22.1-py3-none-any.whl (4.9 MB)
[K     |████████████████████████████████| 4.9 MB 6.7 MB/s 
[?25hCollecting huggingface-hub<1.0,>=0.9.0
  Downloading huggingface_hub-0.9.1-py3-none-any.whl (120 kB)
[K     |████████████████████████████████| 120 kB 72.0 MB/s 
Collecting tokenizers!=0.11.3,<0.13,>=0.11.1
  Downloading tokenizers-0.12.1-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.6 MB)
[K     |████████████████████████████████| 6.6 MB 55.6 MB/s 
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.9.1 tokenizers-0.12.1 transformers-4.22.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.4.0-py3-none-any.whl (365 kB)
[K     |████████████████████████████████| 365 kB 7.3 

In [3]:
!nvidia-smi

Mon Sep 19 06:55:40 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   37C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

The cache for model files in Transformers v4.22.0 has been updated. Migrating your old cache. This is a one-time only operation. You can interrupt this and resume the migration later on by calling `transformers.utils.move_cache()`.


Moving 0 files to the new cache system


0it [00:00, ?it/s]

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    debug2 = False
    apex=True
    print_freq=100
    num_workers=4
    # model="microsoft/deberta-v3-base"
    # model='microsoft/deberta-base'
    # model='roberta-base'
    # model='roberta-large'
    # model='roberta-large-mnli'
    # model='xlnet-large-cased'
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large"
    # model="microsoft/deberta-v3-large"
    # model='microsoft/deberta-v2-xlarge'
    # model='funnel-transformer/large'
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'
    # model='google/electra-large-discriminator'
    # model='google/electra-base-discriminator'
    model="facebook/bart-large-mnli"
    # model="facebook/bart-large"
    # model="facebook/bart-base"
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=180
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=4
    trn_fold=[0, 1, 2, 3]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = False
    freezing = False
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

if CFG.debug2:
    CFG.epochs = 3
    CFG.trn_fold = [0]

In [6]:
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'submission')
#OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'model')
OUTPUT_MODEL_DIR = DIR + '/output/model/EXP45/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return f1_score(y_true, (y_pred>thresh).astype(int))


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

def seed_worker(worker_id):
    worker_seed = torch.initial_seed() % 2**32
    np.random.seed(worker_seed)
    random.seed(worker_seed)

g = torch.Generator()
g.manual_seed(CFG.seed)

<torch._C.Generator at 0x7fda6f6b8eb0>

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
train = pd.read_csv(os.path.join(INPUT_DIR,'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'))
sub = pd.read_csv(os.path.join(INPUT_DIR,'sample_submit.csv'),header=None)
sub.columns = ['id','state']

In [10]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('',text)

def remove_html(text):
    html=re.compile(r"<[^>]*?>")
    return html.sub('',text)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_URL(text)
        text = remove_html(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        #改行削除
        #text = text.replace("\n","")
        clean_texts.append(text)
    return clean_texts

def get_goal_values(df):
  df["goal"].replace("100000+","100000-100000",inplace=True)
  _df = df["goal"].str.split('-').apply(pd.Series).astype(float)
  _df.columns = ["goal_max","goal_min"]
  df["goal_max"] = _df["goal_max"].astype(str)
  df["goal_min"] = _df["goal_min"].astype(str)
  df["goal_median"] = _df[["goal_max","goal_min"]].median(axis=1)
  df["goal_median"] = df["goal_median"].astype(int)
  return df

if CFG.clean_content==True:
    train['html_content'] = train['html_content'].map(lambda x: str(x))
    train['html_content'] = train['html_content'].apply(html.unescape)
    p = re.compile(r"<[^>]*?>|&amp;|[/'’\"”]")
    train['html_content'] = train['html_content'].map(lambda x: p.sub("", x))
    train['html_content'] = train['html_content'].map(lambda x: x.lstrip())
    train['html_content'] = train['html_content'].fillna('missing')

train = get_goal_values(train)
train['inputs'] = train.goal_median.astype(str) + ' [SEP] ' + train.duration.astype(str) + ' [SEP] ' + train.country + ' [SEP] ' + train.category1 + ' [SEP] ' + train.category2 + ' [SEP] ' + train.html_content

In [11]:
train

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs
0,train_00000,20001-21000,US,45,art,mixed media,"http:dummy.comIn its first year, The Shillitos...",1,20001.0,21000.0,20500,20500 [SEP] 45 [SEP] US [SEP] art [SEP] mixed ...
1,train_00001,19001-20000,US,59,food,restaurants,Cultural Pretzel Sports Bar is a place where p...,0,19001.0,20000.0,19500,19500 [SEP] 59 [SEP] US [SEP] food [SEP] resta...
2,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...
3,train_00003,1001-2000,US,30,art,mixed media,"Canyon de Chelley, Dine (Navajo) Reservation, ...",1,1001.0,2000.0,1500,1500 [SEP] 30 [SEP] US [SEP] art [SEP] mixed m...
4,train_00004,1001-2000,US,29,film & video,webseries,"The story of the show, both on and off screen,...",1,1001.0,2000.0,1500,1500 [SEP] 29 [SEP] US [SEP] film & video [SEP...
...,...,...,...,...,...,...,...,...,...,...,...,...
9786,train_09786,1-1000,US,15,music,electronic music,So the story behind this is that Ive been maki...,0,1.0,1000.0,500,500 [SEP] 15 [SEP] US [SEP] music [SEP] electr...
9787,train_09787,3001-4000,CA,30,fashion,ready-to-wear,THE HIGH CLOTHINGMy vision is to create high q...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] CA [SEP] fashion [SEP] rea...
9788,train_09788,100000-100000,GB,30,technology,software,We dont think anybody looks forward to filling...,0,100000.0,100000.0,100000,100000 [SEP] 30 [SEP] GB [SEP] technology [SEP...
9789,train_09789,79001-80000,US,35,technology,gadgets,What is Droplet?\nDroplet is a wireless button...,1,79001.0,80000.0,79500,79500 [SEP] 35 [SEP] US [SEP] technology [SEP]...


In [12]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.state)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [13]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/899k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/456k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [14]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['inputs'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 6 # cls & sep & sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [15]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['inputs'].values
        self.labels = df['state'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [16]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel.from_config(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        #self.pool = MeanPooling()
        self.fc_dropout = nn.Dropout(cfg.fc_dropout)
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        #self._init_weights(self.fc)
        self.attention = nn.Sequential(
            nn.Linear(self.config.hidden_size, 512),
            nn.Tanh(),
            nn.Linear(512, 1),
            nn.Softmax(dim=1)
        )
        #self._init_weights(self.attention)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        weights = self.attention(last_hidden_states)
        feature = torch.sum(weights * last_hidden_states, dim=1)
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(self.fc_dropout(feature))
        return output

In [17]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        criterion,
        adv_param="weight",
        adv_lr=1e-4,
        adv_eps=1e-2,
        start_epoch=0,
        adv_step=1,
        device="cpu",
    ):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.device = device

    def attack_backward(self, inputs, label):
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            self.save()
            self.attack_step() # モデルを近傍の悪い方へ改変
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss
        
        

    def attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [18]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    #if not epoch < CFG.nth_awp_start_epoch:
    #    LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    #awp = AWP(model=model,
    #          optimizer=optimizer,
    #          criterion=criterion,
    #          adv_eps=0.01, 
    #          device=device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    #tot_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        #if CFG.nth_awp_start_epoch <= epoch:
        #      loss = awp.attack_backward(inputs, labels)
        #      scaler.scale(loss).backward()
        #      awp.restore()
        #tot_loss += loss.item()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg
    #model.train()
    #return tot_loss/(step+1)


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [19]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['state'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0.

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [20]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['state'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')

BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label2id": {
    "contradiction": 0,
    "entai

Downloading:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

Some weights of the model checkpoint at facebook/bart-large-mnli were not used when initializing BartModel: ['classification_head.dense.weight', 'classification_head.dense.bias', 'classification_head.out_proj.weight', 'classification_head.out_proj.bias']
- This IS expected if you are initializing BartModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BartModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/458] Elapsed 0m 5s (remain 38m 10s) Loss: 0.8660(0.8660) Grad: inf  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 58s (remain 7m 0s) Loss: 0.7295(0.6553) Grad: 352889.2812  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 3m 52s (remain 4m 57s) Loss: 0.5238(0.6272) Grad: 214720.2188  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 5m 46s (remain 3m 0s) Loss: 0.4334(0.5951) Grad: 281218.1562  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 7m 40s (remain 1m 5s) Loss: 0.6132(0.5741) Grad: 313088.3438  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 8m 45s (remain 0m 0s) Loss: 0.5555(0.5611) Grad: 394228.5000  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 55s) Loss: 0.5957(0.5957) 


Epoch 1 - avg_train_loss: 0.5611  avg_val_loss: 0.5344  time: 577s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5611  avg_val_loss: 0.5344  time: 577s
Epoch 1 - Score: 0.6240
INFO:__main__:Epoch 1 - Score: 0.6240
Epoch 1 - Save Best Score: 0.6240 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.6240 Model


EVAL: [76/77] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3747(0.5344) 
Epoch: [2][0/458] Elapsed 0m 1s (remain 13m 39s) Loss: 0.5403(0.5403) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 55s (remain 6m 48s) Loss: 0.4054(0.4179) Grad: 332885.3125  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 3m 49s (remain 4m 53s) Loss: 0.1358(0.4131) Grad: 111626.0469  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 5m 42s (remain 2m 58s) Loss: 0.4757(0.4132) Grad: 292191.2500  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 7m 36s (remain 1m 4s) Loss: 0.3907(0.4125) Grad: 385581.5938  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 8m 41s (remain 0m 0s) Loss: 0.4373(0.4051) Grad: 289196.6875  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 1s (remain 2m 8s) Loss: 0.3312(0.3312) 


Epoch 2 - avg_train_loss: 0.4051  avg_val_loss: 0.4324  time: 572s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4051  avg_val_loss: 0.4324  time: 572s
Epoch 2 - Score: 0.8098
INFO:__main__:Epoch 2 - Score: 0.8098
Epoch 2 - Save Best Score: 0.8098 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8098 Model


EVAL: [76/77] Elapsed 0m 50s (remain 0m 0s) Loss: 0.2855(0.4324) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 13m 1s) Loss: 0.1584(0.1584) Grad: 298204.1562  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 54s (remain 6m 46s) Loss: 0.4679(0.3039) Grad: 393253.7812  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 3m 48s (remain 4m 51s) Loss: 0.2440(0.3022) Grad: 463826.7812  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 5m 41s (remain 2m 58s) Loss: 0.3387(0.2998) Grad: 432035.6250  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 7m 34s (remain 1m 4s) Loss: 0.2649(0.2987) Grad: 311420.3750  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 8m 39s (remain 0m 0s) Loss: 0.0911(0.2957) Grad: 164253.2656  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 32s) Loss: 0.4006(0.4006) 


Epoch 3 - avg_train_loss: 0.2957  avg_val_loss: 0.4619  time: 570s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2957  avg_val_loss: 0.4619  time: 570s
Epoch 3 - Score: 0.8013
INFO:__main__:Epoch 3 - Score: 0.8013


EVAL: [76/77] Elapsed 0m 50s (remain 0m 0s) Loss: 0.3500(0.4619) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 12m 1s) Loss: 0.1083(0.1083) Grad: 408940.3438  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 54s (remain 6m 45s) Loss: 0.2343(0.1993) Grad: 388704.0000  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 3m 48s (remain 4m 51s) Loss: 0.1019(0.1915) Grad: 251781.3594  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 5m 41s (remain 2m 58s) Loss: 0.2173(0.1933) Grad: 169176.8594  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 7m 34s (remain 1m 4s) Loss: 0.1987(0.1999) Grad: 368619.4062  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 8m 39s (remain 0m 0s) Loss: 0.0366(0.1966) Grad: 79789.5547  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 43s) Loss: 0.4770(0.4770) 


Epoch 4 - avg_train_loss: 0.1966  avg_val_loss: 0.5654  time: 570s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1966  avg_val_loss: 0.5654  time: 570s
Epoch 4 - Score: 0.8084
INFO:__main__:Epoch 4 - Score: 0.8084


EVAL: [76/77] Elapsed 0m 50s (remain 0m 0s) Loss: 0.5922(0.5654) 


Score: 0.8098
INFO:__main__:Score: 0.8098
BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label

Epoch: [1][0/458] Elapsed 0m 1s (remain 12m 4s) Loss: 0.7227(0.7227) Grad: 277421.0625  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 54s (remain 6m 45s) Loss: 0.6283(0.6714) Grad: 128588.5703  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 3m 47s (remain 4m 51s) Loss: 0.5463(0.6259) Grad: 216037.0000  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.3907(0.5912) Grad: 261945.0312  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.8763(0.5702) Grad: 536006.5000  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.2795(0.5571) Grad: 99098.7422  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 14s) Loss: 0.4376(0.4376) 


Epoch 1 - avg_train_loss: 0.5571  avg_val_loss: 0.4366  time: 568s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5571  avg_val_loss: 0.4366  time: 568s
Epoch 1 - Score: 0.8034
INFO:__main__:Epoch 1 - Score: 0.8034


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.5473(0.4366) 


Epoch 1 - Save Best Score: 0.8034 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.8034 Model


Epoch: [2][0/458] Elapsed 0m 1s (remain 10m 31s) Loss: 0.4908(0.4908) Grad: inf  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 54s (remain 6m 44s) Loss: 0.3638(0.4475) Grad: 115213.2734  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 3m 47s (remain 4m 50s) Loss: 0.4817(0.4353) Grad: 136370.1406  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.3635(0.4447) Grad: 70210.1562  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.7909(0.4438) Grad: 94339.9062  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.4556(0.4380) Grad: 41855.0898  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 15s) Loss: 0.4443(0.4443) 


Epoch 2 - avg_train_loss: 0.4380  avg_val_loss: 0.4228  time: 568s
INFO:__main__:Epoch 2 - avg_train_loss: 0.4380  avg_val_loss: 0.4228  time: 568s
Epoch 2 - Score: 0.7634
INFO:__main__:Epoch 2 - Score: 0.7634


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.4109(0.4228) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 11m 20s) Loss: 0.3522(0.3522) Grad: 230120.5312  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 54s (remain 6m 44s) Loss: 0.3008(0.3173) Grad: 314307.4062  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 3m 47s (remain 4m 50s) Loss: 0.1614(0.2955) Grad: 64856.0234  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.1245(0.2931) Grad: 84466.5781  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.2919(0.2881) Grad: 143103.1406  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.1523(0.2842) Grad: 110709.7969  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 9s) Loss: 0.5568(0.5568) 


Epoch 3 - avg_train_loss: 0.2842  avg_val_loss: 0.4611  time: 568s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2842  avg_val_loss: 0.4611  time: 568s
Epoch 3 - Score: 0.8178
INFO:__main__:Epoch 3 - Score: 0.8178
Epoch 3 - Save Best Score: 0.8178 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8178 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.4793(0.4611) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 11m 0s) Loss: 0.1351(0.1351) Grad: 283376.0000  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 54s (remain 6m 45s) Loss: 0.2507(0.2103) Grad: 256640.7812  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 3m 47s (remain 4m 51s) Loss: 0.0786(0.2062) Grad: 261617.0625  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.0508(0.2003) Grad: 75075.9297  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.0724(0.2049) Grad: 93393.9297  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.1022(0.2031) Grad: 383121.3125  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 0s (remain 1m 11s) Loss: 0.6419(0.6419) 


Epoch 4 - avg_train_loss: 0.2031  avg_val_loss: 0.5206  time: 568s
INFO:__main__:Epoch 4 - avg_train_loss: 0.2031  avg_val_loss: 0.5206  time: 568s
Epoch 4 - Score: 0.8182
INFO:__main__:Epoch 4 - Score: 0.8182
Epoch 4 - Save Best Score: 0.8182 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8182 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.5304(0.5206) 


Score: 0.8182
INFO:__main__:Score: 0.8182
BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label

Epoch: [1][0/458] Elapsed 0m 1s (remain 15m 9s) Loss: 0.6983(0.6983) Grad: 315915.9688  LR: 0.00002000  
Epoch: [1][100/458] Elapsed 1m 54s (remain 6m 46s) Loss: 0.4966(0.6742) Grad: 72276.2812  LR: 0.00001985  
Epoch: [1][200/458] Elapsed 3m 48s (remain 4m 51s) Loss: 0.5037(0.6321) Grad: 120007.3203  LR: 0.00001941  
Epoch: [1][300/458] Elapsed 5m 41s (remain 2m 57s) Loss: 0.5508(0.6066) Grad: 114996.0391  LR: 0.00001870  
Epoch: [1][400/458] Elapsed 7m 34s (remain 1m 4s) Loss: 0.3039(0.5746) Grad: 64527.0352  LR: 0.00001773  
Epoch: [1][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.8484(0.5599) Grad: 179411.2500  LR: 0.00001708  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 21s) Loss: 0.3426(0.3426) 
EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.1891(0.4521) 


Epoch 1 - avg_train_loss: 0.5599  avg_val_loss: 0.4521  time: 569s
INFO:__main__:Epoch 1 - avg_train_loss: 0.5599  avg_val_loss: 0.4521  time: 569s
Epoch 1 - Score: 0.7765
INFO:__main__:Epoch 1 - Score: 0.7765
Epoch 1 - Save Best Score: 0.7765 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7765 Model


Epoch: [2][0/458] Elapsed 0m 1s (remain 11m 19s) Loss: 0.2981(0.2981) Grad: 396174.5938  LR: 0.00001707  
Epoch: [2][100/458] Elapsed 1m 54s (remain 6m 45s) Loss: 0.4108(0.3890) Grad: 236312.8750  LR: 0.00001576  
Epoch: [2][200/458] Elapsed 3m 47s (remain 4m 51s) Loss: 0.1995(0.3855) Grad: 148490.2031  LR: 0.00001428  
Epoch: [2][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.3548(0.3818) Grad: 132793.0469  LR: 0.00001268  
Epoch: [2][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.3317(0.3741) Grad: 189225.1094  LR: 0.00001100  
Epoch: [2][457/458] Elapsed 8m 38s (remain 0m 0s) Loss: 0.3041(0.3795) Grad: 93092.1562  LR: 0.00001003  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 20s) Loss: 0.4380(0.4380) 


Epoch 2 - avg_train_loss: 0.3795  avg_val_loss: 0.4438  time: 568s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3795  avg_val_loss: 0.4438  time: 568s
Epoch 2 - Score: 0.7812
INFO:__main__:Epoch 2 - Score: 0.7812
Epoch 2 - Save Best Score: 0.7812 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.7812 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.2225(0.4438) 
Epoch: [3][0/458] Elapsed 0m 1s (remain 11m 25s) Loss: 0.4143(0.4143) Grad: 520409.2812  LR: 0.00001001  
Epoch: [3][100/458] Elapsed 1m 54s (remain 6m 44s) Loss: 0.5281(0.3077) Grad: 137806.0469  LR: 0.00000830  
Epoch: [3][200/458] Elapsed 3m 47s (remain 4m 50s) Loss: 0.4018(0.2941) Grad: 150054.4062  LR: 0.00000665  
Epoch: [3][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.4431(0.2807) Grad: 229112.5469  LR: 0.00000509  
Epoch: [3][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.3193(0.2795) Grad: 242901.1562  LR: 0.00000368  
Epoch: [3][457/458] Elapsed 8m 37s (remain 0m 0s) Loss: 0.3641(0.2833) Grad: 320985.5312  LR: 0.00000296  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 18s) Loss: 0.4124(0.4124) 


Epoch 3 - avg_train_loss: 0.2833  avg_val_loss: 0.4772  time: 568s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2833  avg_val_loss: 0.4772  time: 568s
Epoch 3 - Score: 0.8009
INFO:__main__:Epoch 3 - Score: 0.8009
Epoch 3 - Save Best Score: 0.8009 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8009 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.1701(0.4772) 
Epoch: [4][0/458] Elapsed 0m 1s (remain 11m 14s) Loss: 0.1600(0.1600) Grad: 355437.4688  LR: 0.00000294  
Epoch: [4][100/458] Elapsed 1m 54s (remain 6m 44s) Loss: 0.2915(0.1654) Grad: 302143.7188  LR: 0.00000184  
Epoch: [4][200/458] Elapsed 3m 47s (remain 4m 50s) Loss: 0.1215(0.1703) Grad: 283360.3125  LR: 0.00000097  
Epoch: [4][300/458] Elapsed 5m 40s (remain 2m 57s) Loss: 0.2565(0.1773) Grad: 167906.5156  LR: 0.00000037  
Epoch: [4][400/458] Elapsed 7m 33s (remain 1m 4s) Loss: 0.1336(0.1750) Grad: 102879.1016  LR: 0.00000005  
Epoch: [4][457/458] Elapsed 8m 37s (remain 0m 0s) Loss: 0.0613(0.1748) Grad: 28696.3281  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 22s) Loss: 0.5343(0.5343) 


Epoch 4 - avg_train_loss: 0.1748  avg_val_loss: 0.6382  time: 568s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1748  avg_val_loss: 0.6382  time: 568s
Epoch 4 - Score: 0.8030
INFO:__main__:Epoch 4 - Score: 0.8030
Epoch 4 - Save Best Score: 0.8030 Model
INFO:__main__:Epoch 4 - Save Best Score: 0.8030 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.2508(0.6382) 


Score: 0.8030
INFO:__main__:Score: 0.8030
BartConfig {
  "_name_or_path": "facebook/bart-large-mnli",
  "_num_labels": 3,
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "add_final_layer_norm": false,
  "architectures": [
    "BartForSequenceClassification"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "bos_token_id": 0,
  "classif_dropout": 0.0,
  "classifier_dropout": 0.0,
  "d_model": 1024,
  "decoder_attention_heads": 16,
  "decoder_ffn_dim": 4096,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 12,
  "decoder_start_token_id": 2,
  "dropout": 0.1,
  "encoder_attention_heads": 16,
  "encoder_ffn_dim": 4096,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 12,
  "eos_token_id": 2,
  "forced_eos_token_id": 2,
  "gradient_checkpointing": false,
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "id2label": {
    "0": "contradiction",
    "1": "neutral",
    "2": "entailment"
  },
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "label

Epoch: [1][0/459] Elapsed 0m 1s (remain 13m 51s) Loss: 0.6624(0.6624) Grad: 269454.1875  LR: 0.00002000  
Epoch: [1][100/459] Elapsed 1m 54s (remain 6m 46s) Loss: 0.6403(0.6697) Grad: 263257.8438  LR: 0.00001985  
Epoch: [1][200/459] Elapsed 3m 47s (remain 4m 52s) Loss: 0.4869(0.6366) Grad: 191153.6719  LR: 0.00001941  
Epoch: [1][300/459] Elapsed 5m 40s (remain 2m 58s) Loss: 0.3984(0.6107) Grad: 124220.4062  LR: 0.00001870  
Epoch: [1][400/459] Elapsed 7m 33s (remain 1m 5s) Loss: 0.6138(0.5841) Grad: 214795.4844  LR: 0.00001774  
Epoch: [1][458/459] Elapsed 8m 39s (remain 0m 0s) Loss: 0.4172(0.5726) Grad: 75575.7891  LR: 0.00001707  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 27s) Loss: 0.4019(0.4019) 


Epoch 1 - avg_train_loss: 0.5726  avg_val_loss: 0.4614  time: 570s


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.5963(0.4614) 


INFO:__main__:Epoch 1 - avg_train_loss: 0.5726  avg_val_loss: 0.4614  time: 570s
Epoch 1 - Score: 0.7190
INFO:__main__:Epoch 1 - Score: 0.7190
Epoch 1 - Save Best Score: 0.7190 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.7190 Model


Epoch: [2][0/459] Elapsed 0m 1s (remain 11m 24s) Loss: 0.4393(0.4393) Grad: 259854.6406  LR: 0.00001706  
Epoch: [2][100/459] Elapsed 1m 54s (remain 6m 46s) Loss: 0.2117(0.4134) Grad: 207405.5625  LR: 0.00001575  
Epoch: [2][200/459] Elapsed 3m 47s (remain 4m 52s) Loss: 0.2856(0.4030) Grad: 251915.6250  LR: 0.00001427  
Epoch: [2][300/459] Elapsed 5m 40s (remain 2m 58s) Loss: 0.6026(0.3899) Grad: 370860.7188  LR: 0.00001267  
Epoch: [2][400/459] Elapsed 7m 33s (remain 1m 5s) Loss: 0.1766(0.3877) Grad: 57564.8242  LR: 0.00001099  
Epoch: [2][458/459] Elapsed 8m 39s (remain 0m 0s) Loss: 0.2897(0.3902) Grad: 66220.7109  LR: 0.00001000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 21s) Loss: 0.4327(0.4327) 


Epoch 2 - avg_train_loss: 0.3902  avg_val_loss: 0.3902  time: 569s
INFO:__main__:Epoch 2 - avg_train_loss: 0.3902  avg_val_loss: 0.3902  time: 569s
Epoch 2 - Score: 0.8072
INFO:__main__:Epoch 2 - Score: 0.8072
Epoch 2 - Save Best Score: 0.8072 Model
INFO:__main__:Epoch 2 - Save Best Score: 0.8072 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.4301(0.3902) 
Epoch: [3][0/459] Elapsed 0m 1s (remain 11m 18s) Loss: 0.3475(0.3475) Grad: 460313.4375  LR: 0.00000998  
Epoch: [3][100/459] Elapsed 1m 54s (remain 6m 46s) Loss: 0.4563(0.3262) Grad: 460285.3438  LR: 0.00000828  
Epoch: [3][200/459] Elapsed 3m 47s (remain 4m 52s) Loss: 0.4063(0.3140) Grad: 176267.6406  LR: 0.00000663  
Epoch: [3][300/459] Elapsed 5m 40s (remain 2m 58s) Loss: 0.2947(0.3077) Grad: 188056.5469  LR: 0.00000507  
Epoch: [3][400/459] Elapsed 7m 33s (remain 1m 5s) Loss: 0.5726(0.2913) Grad: 259780.8125  LR: 0.00000366  
Epoch: [3][458/459] Elapsed 8m 39s (remain 0m 0s) Loss: 0.3686(0.2968) Grad: 160262.8750  LR: 0.00000293  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 18s) Loss: 0.4246(0.4246) 


Epoch 3 - avg_train_loss: 0.2968  avg_val_loss: 0.4230  time: 569s
INFO:__main__:Epoch 3 - avg_train_loss: 0.2968  avg_val_loss: 0.4230  time: 569s
Epoch 3 - Score: 0.8150
INFO:__main__:Epoch 3 - Score: 0.8150
Epoch 3 - Save Best Score: 0.8150 Model
INFO:__main__:Epoch 3 - Save Best Score: 0.8150 Model


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.4134(0.4230) 
Epoch: [4][0/459] Elapsed 0m 1s (remain 11m 14s) Loss: 0.4163(0.4163) Grad: 504424.2812  LR: 0.00000292  
Epoch: [4][100/459] Elapsed 1m 54s (remain 6m 45s) Loss: 0.2504(0.2151) Grad: 426893.9688  LR: 0.00000182  
Epoch: [4][200/459] Elapsed 3m 47s (remain 4m 52s) Loss: 0.1114(0.2056) Grad: 257770.2344  LR: 0.00000096  
Epoch: [4][300/459] Elapsed 5m 40s (remain 2m 58s) Loss: 0.1750(0.1940) Grad: 203207.9219  LR: 0.00000036  
Epoch: [4][400/459] Elapsed 7m 33s (remain 1m 5s) Loss: 0.2813(0.1898) Grad: 236803.5000  LR: 0.00000005  
Epoch: [4][458/459] Elapsed 8m 39s (remain 0m 0s) Loss: 0.1067(0.1899) Grad: 236133.5625  LR: 0.00000000  
EVAL: [0/77] Elapsed 0m 1s (remain 1m 21s) Loss: 0.5363(0.5363) 


Epoch 4 - avg_train_loss: 0.1899  avg_val_loss: 0.5374  time: 569s
INFO:__main__:Epoch 4 - avg_train_loss: 0.1899  avg_val_loss: 0.5374  time: 569s
Epoch 4 - Score: 0.8147
INFO:__main__:Epoch 4 - Score: 0.8147


EVAL: [76/77] Elapsed 0m 49s (remain 0m 0s) Loss: 0.6104(0.5374) 


Score: 0.8150
INFO:__main__:Score: 0.8150
Score: 0.8115
INFO:__main__:Score: 0.8115


In [21]:
A = pd.read_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
A.head()

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs,kfold,pred
0,train_00002,2001-3000,US,38,art,performance art,"I want to perform this piece guerilla style, o...",0,2001.0,3000.0,2500,2500 [SEP] 38 [SEP] US [SEP] art [SEP] perform...,0,0.618032
1,train_00006,2001-3000,CA,30,music,classical music,The Crimson String Quartet (CSQ) is returning ...,1,2001.0,3000.0,2500,2500 [SEP] 30 [SEP] CA [SEP] music [SEP] class...,0,0.944546
2,train_00016,3001-4000,GB,30,journalism,web,I am interested in setting up a small news web...,0,3001.0,4000.0,3500,3500 [SEP] 30 [SEP] GB [SEP] journalism [SEP] ...,0,0.00072
3,train_00017,11001-12000,US,30,food,drinks,Why Schenk-Atwood?Our neighborhood has just ab...,1,11001.0,12000.0,11500,11500 [SEP] 30 [SEP] US [SEP] food [SEP] drink...,0,0.819439
4,train_00018,12001-13000,US,28,journalism,web,Mega Visions app has been approved on all majo...,1,12001.0,13000.0,12500,12500 [SEP] 28 [SEP] US [SEP] journalism [SEP]...,0,0.113744
