In [1]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install transformers
!pip install datasets
!pip install sentencepiece

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
!nvidia-smi

Sun Sep 18 08:57:28 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   44C    P0    28W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [4]:
import os
import gc
import math
import time
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.simplefilter('ignore')
from tqdm import tqdm
import re
import html

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import Adam, SGD, AdamW, RAdam
from torch.optim import lr_scheduler
from torch.utils.data import DataLoader, Dataset

from sklearn.model_selection import StratifiedKFold,StratifiedGroupKFold,GroupKFold
from sklearn.metrics import log_loss,f1_score

from transformers import AutoModel, AutoConfig, AutoTokenizer, AdamW, DataCollatorWithPadding
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
# ====================================================
# CFG
# ====================================================
class CFG:
    debug=False
    debug2 = False
    apex=True
    print_freq=100
    num_workers=4
    # model="microsoft/deberta-v3-base" o
    # model='microsoft/deberta-base'  △
    # model='roberta-base'  x
    # model='roberta-large'  x
    # model='roberta-large-mnli'
    # model='google/bigbird-roberta-base'
    # model='google/bigbird-roberta-large'
    # model='xlnet-large-cased'  △
    # model='albert-xxlarge-v2'
    # model="microsoft/deberta-large" o
    # model="microsoft/deberta-v3-large"  o
    # model='microsoft/deberta-v2-xlarge'
    # model='microsoft/deberta-v2-xxlarge'
    # model='microsoft/deberta-xlarge' o
    # model='funnel-transformer/large' o
    # model='funnel-transformer/medium'
    # model='albert-base-v2'
    # model='albert-large-v2'  x
    # model='google/electra-large-discriminator'  x
    # model='google/electra-base-discriminator'  x
    # model="facebook/bart-large-mnli"
    # model="facebook/bart-large"  o
    # model="facebook/bart-base"
    # model = "distilbert-base-uncased" x
    # model = "allenai/longformer-large-4096" x
    # model = "allenai/longformer-base-4096"
    # model = "uw-madison/yoso-4096"  x
    # model = "xlm-roberta-large"
    # model = "xlm-roberta-base"
    model = "google/muril-large-cased" #x
    # model = "google/rembert" x
    scheduler='cosine' # ['linear', 'cosine']
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    epochs=4
    encoder_lr=2e-5
    decoder_lr=2e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    batch_size=16
    fc_dropout=0.2
    target_size=1
    max_len=256
    weight_decay=0.01
    gradient_accumulation_steps=1
    max_grad_norm=1000
    seed=42
    n_fold=10
    trn_fold=[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
    train=True
    nth_awp_start_epoch=1
    gradient_checkpointing = True
    freezing = True
    clean_content = True

if CFG.debug:
    CFG.epochs = 2
    CFG.trn_fold = [0, 1]

if CFG.debug2:
    CFG.epochs = 3
    CFG.trn_fold = [0]

In [6]:
DIR = '/content/drive/MyDrive/Competitions/Signate/MUFJ'
INPUT_DIR = os.path.join(DIR,'input')
OUTPUT_DIR = os.path.join(DIR,'output')
OUTPUT_SUB_DIR = os.path.join(OUTPUT_DIR,'submission')
#OUTPUT_MODEL_DIR = os.path.join(OUTPUT_DIR,'model')
OUTPUT_MODEL_DIR = DIR + '/output/model/EXP36/'
if not os.path.exists(OUTPUT_MODEL_DIR):
    os.makedirs(OUTPUT_MODEL_DIR)

In [7]:
def get_score(labels, outputs):
    thresh = 0.5
    y_pred = outputs
    y_true = labels
    return f1_score(y_true, (y_pred>thresh).astype(int))


def get_logger(filename=OUTPUT_MODEL_DIR+'train'):
    from logging import getLogger, INFO, FileHandler, Formatter, StreamHandler
    logger = getLogger(__name__)
    logger.setLevel(INFO)
    handler1 = StreamHandler()
    handler1.setFormatter(Formatter("%(message)s"))
    handler2 = FileHandler(filename=f"{filename}.log")
    handler2.setFormatter(Formatter("%(message)s"))
    logger.addHandler(handler1)
    logger.addHandler(handler2)
    return logger

LOGGER = get_logger()

def seed_everything(seed=CFG.seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=CFG.seed)

In [8]:
def freeze(module):
    """
    Freezes module's parameters.
    """
    
    for parameter in module.parameters():
        parameter.requires_grad = False
        
def get_freezed_parameters(module):
    """
    Returns names of freezed parameters of the given module.
    """
    
    freezed_parameters = []
    for name, parameter in module.named_parameters():
        if not parameter.requires_grad:
            freezed_parameters.append(name)
            
    return freezed_parameters

def set_embedding_parameters_bits(embeddings_path, optim_bits=32):
    """
    https://github.com/huggingface/transformers/issues/14819#issuecomment-1003427930
    """
    
    embedding_types = ("word", "position", "token_type")
    for embedding_type in embedding_types:
        attr_name = f"{embedding_type}_embeddings"
        
        if hasattr(embeddings_path, attr_name): 
            bnb.optim.GlobalOptimManager.get_instance().register_module_override(
                getattr(embeddings_path, attr_name), 'weight', {'optim_bits': optim_bits}
            )

In [9]:
df = pd.read_csv(os.path.join(INPUT_DIR,'train.csv'))
test = pd.read_csv(os.path.join(INPUT_DIR,'test.csv'))
sub = pd.read_csv(os.path.join(INPUT_DIR,'sample_submit.csv'),header=None)
sub.columns = ['id','state']

In [10]:
English_Country = ['US','CA','GB','AU','NZ']
train = df.loc[~df['country'].isin(English_Country)]
train = train.reset_index(drop=True)
df.shape, train.shape

((9791, 8), (1109, 8))

In [11]:
def remove_URL(text):
    url = re.compile(r'https?://\S+|www\.\S+')
    return url.sub('',text)

def remove_html(text):
    html=re.compile(r"<[^>]*?>")
    return html.sub('',text)

def cleaning(texts):
    clean_texts = []
    for text in texts:
        # htmlタグを削除
        text = remove_URL(text)
        text = remove_html(text)
        #アルファベット以外をスペースに置き換え
        #clean_punc = re.sub(r'[^a-zA-Z]', ' ', text)
        #改行削除
        #text = text.replace("\n","")
        clean_texts.append(text)
    return clean_texts

def get_goal_values(df):
  df["goal"].replace("100000+","100000-100000",inplace=True)
  _df = df["goal"].str.split('-').apply(pd.Series).astype(float)
  _df.columns = ["goal_max","goal_min"]
  df["goal_max"] = _df["goal_max"].astype(str)
  df["goal_min"] = _df["goal_min"].astype(str)
  df["goal_median"] = _df[["goal_max","goal_min"]].median(axis=1)
  df["goal_median"] = df["goal_median"].astype(int)
  return df

if CFG.clean_content==True:
    train['html_content'] = train['html_content'].map(lambda x: str(x))
    train['html_content'] = train['html_content'].apply(html.unescape)
    p = re.compile(r"<[^>]*?>|&amp;|[/'’\"”]")
    train['html_content'] = train['html_content'].map(lambda x: p.sub("", x))
    train['html_content'] = train['html_content'].map(lambda x: x.lstrip())
    train['html_content'] = train['html_content'].fillna('missing')

train = get_goal_values(train)
train['inputs'] = train.goal_median.astype(str) + ' [SEP] ' + train.duration.astype(str) + ' [SEP] ' + train.country + ' [SEP] ' + train.category1 + ' [SEP] ' + train.category2 + ' [SEP] ' + train.html_content

In [12]:
train

Unnamed: 0,id,goal,country,duration,category1,category2,html_content,state,goal_max,goal_min,goal_median,inputs
0,train_00010,1-1000,FR,15,technology,diy electronics,Excellent board for Internet of Things (IoT) ...,1,1.0,1000.0,500,500 [SEP] 15 [SEP] FR [SEP] technology [SEP] d...
1,train_00027,5001-6000,IT,40,film & video,shorts,INTRO[EN] Every day we are bombarded by tragic...,1,5001.0,6000.0,5500,5500 [SEP] 40 [SEP] IT [SEP] film & video [SEP...
2,train_00038,2001-3000,DE,32,games,playing cards,Front\n\n\n\n\n\nTuck Back\n\n\n\n\n\nComposit...,1,2001.0,3000.0,2500,2500 [SEP] 32 [SEP] DE [SEP] games [SEP] playi...
3,train_00064,22001-23000,DE,30,food,drinks,Fresh T - ice tea with new flavors and combina...,0,22001.0,23000.0,22500,22500 [SEP] 30 [SEP] DE [SEP] food [SEP] drink...
4,train_00066,3001-4000,DE,29,art,installations,It will be a tasty interactive experience! The...,1,3001.0,4000.0,3500,3500 [SEP] 29 [SEP] DE [SEP] art [SEP] install...
...,...,...,...,...,...,...,...,...,...,...,...,...
1104,train_09754,2001-3000,SG,30,music,pop,"Im a singer-songwriter based in Singapore, and...",1,2001.0,3000.0,2500,2500 [SEP] 30 [SEP] SG [SEP] music [SEP] pop [...
1105,train_09771,75001-76000,IT,48,film & video,fantasy,Dear lords and ladiesI am Herold the South Tyr...,0,75001.0,76000.0,75500,75500 [SEP] 48 [SEP] IT [SEP] film & video [SE...
1106,train_09774,2001-3000,MX,32,film & video,documentary,SINOPSIS: Miles de migrantes centroamericanos ...,1,2001.0,3000.0,2500,2500 [SEP] 32 [SEP] MX [SEP] film & video [SEP...
1107,train_09779,56001-57000,FR,60,technology,apps,I.\tCERTISIO ? \nThe mobile application ‘andro...,0,56001.0,57000.0,56500,56500 [SEP] 60 [SEP] FR [SEP] technology [SEP]...


In [13]:
skf = StratifiedKFold(n_splits=CFG.n_fold,shuffle=True,random_state=CFG.seed)
for fold, ( _, val_) in enumerate(skf.split(train, train.state)):
    train.loc[val_ , "kfold"] = int(fold)
    
train["kfold"] = train["kfold"].astype(int)

if CFG.debug:
    display(train.groupby('kfold').size())
    train = train.sample(n=1000, random_state=0).reset_index(drop=True)
    display(train.groupby('kfold').size())

In [14]:
# ====================================================
# tokenizer
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
tokenizer.save_pretrained(OUTPUT_MODEL_DIR+'tokenizer/')
CFG.tokenizer = tokenizer

Downloading:   0%|          | 0.00/181 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/406 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.16M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

In [15]:
# ====================================================
# Define max_len
# ====================================================
#lengths = []
#tk0 = tqdm(train['inputs'].fillna("").values, total=len(train))
#for text in tk0:
#    length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
#    lengths.append(length)
#CFG.max_len = max(lengths) + 6 # cls & sep & sep
#LOGGER.info(f"max_len: {CFG.max_len}")

In [16]:
# ====================================================
# Dataset
# ====================================================
def prepare_input(cfg, text):
    inputs = cfg.tokenizer(text,
                           add_special_tokens=True,
                           max_length=cfg.max_len,
                           padding="max_length",
                           return_offsets_mapping=False,
                           truncation=True)
    for k, v in inputs.items():
        inputs[k] = torch.tensor(v, dtype=torch.long)
    return inputs


class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.inputs = df['inputs'].values
        self.labels = df['state'].values

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, item):
        inputs = prepare_input(self.cfg, self.inputs[item])
        label = torch.tensor(self.labels[item], dtype=torch.float)
        return inputs, label

#collate_fn = DataCollatorWithPadding(tokenizer=tokenizer)

In [17]:
# ====================================================
# Model
# ====================================================
class MeanPooling(nn.Module):
    def __init__(self):
        super(MeanPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * input_mask_expanded, 1)
        sum_mask = input_mask_expanded.sum(1)
        sum_mask = torch.clamp(sum_mask, min=1e-9)
        mean_embeddings = sum_embeddings / sum_mask
        return mean_embeddings

class MaxPooling(nn.Module):
    def __init__(self):
        super(MaxPooling, self).__init__()
        
    def forward(self, last_hidden_state, attention_mask):
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        embeddings = last_hidden_state.clone()
        embeddings[input_mask_expanded == 0] = -1e4
        max_embeddings, _ = torch.max(embeddings, dim=1)
        return max_embeddings
    

class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
            self.config.hidden_dropout = 0.
            self.config.hidden_dropout_prob = 0.
            self.config.attention_dropout = 0.
            self.config.attention_probs_dropout_prob = 0.
            LOGGER.info(self.config)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if self.cfg.gradient_checkpointing:
            self.model.gradient_checkpointing_enable()

        # Freezing
        if cfg.freezing:
            # freezing embeddings and first 2 layers of encoder
            freeze((self.model).embeddings)
            freeze((self.model).encoder.layer[:2])
            cfg.after_freezed_parameters = filter(lambda parameter: parameter.requires_grad, (self.model).parameters())

        self.pool = MeanPooling()
        self.fc = nn.Linear(self.config.hidden_size, cfg.target_size)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)
        
    def feature(self, inputs):
        outputs = self.model(**inputs)
        last_hidden_states = outputs[0]
        feature = self.pool(last_hidden_states, inputs['attention_mask'])
        return feature

    def forward(self, inputs):
        feature = self.feature(inputs)
        output = self.fc(feature)
        return output

In [18]:
class AWP:
    def __init__(
        self,
        model,
        optimizer,
        criterion,
        adv_param="weight",
        adv_lr=1e-4,
        adv_eps=1e-2,
        start_epoch=0,
        adv_step=1,
        device="cpu",
    ):
        self.model = model
        self.optimizer = optimizer
        self.criterion = criterion
        self.adv_param = adv_param
        self.adv_lr = adv_lr
        self.adv_eps = adv_eps
        self.start_epoch = start_epoch
        self.adv_step = adv_step
        self.backup = {}
        self.backup_eps = {}
        self.device = device

    def attack_backward(self, inputs, label):
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            self.save()
            self.attack_step() # モデルを近傍の悪い方へ改変
            y_preds = self.model(inputs)
            adv_loss = self.criterion(
                y_preds.view(-1, 1), label.view(-1, 1))
            mask = (label.view(-1, 1) != -1)
            adv_loss = torch.masked_select(adv_loss, mask).mean()
            self.optimizer.zero_grad()
        return adv_loss
        
        

    def attack_step(self):
        e = 1e-6
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                norm1 = torch.norm(param.grad)
                norm2 = torch.norm(param.data.detach())
                if norm1 != 0 and not torch.isnan(norm1):
                    r_at = self.adv_lr * param.grad / (norm1 + e) * (norm2 + e)
                    param.data.add_(r_at)
                    param.data = torch.min(
                        torch.max(param.data, self.backup_eps[name][0]), self.backup_eps[name][1]
                    )
                # param.data.clamp_(*self.backup_eps[name])

    def save(self):
        for name, param in self.model.named_parameters():
            if param.requires_grad and param.grad is not None and self.adv_param in name:
                if name not in self.backup:
                    self.backup[name] = param.data.clone()
                    grad_eps = self.adv_eps * param.abs().detach()
                    self.backup_eps[name] = (
                        self.backup[name] - grad_eps,
                        self.backup[name] + grad_eps,
                    )

    def restore(self,):
        for name, param in self.model.named_parameters():
            if name in self.backup:
                param.data = self.backup[name]
        self.backup = {}
        self.backup_eps = {}

In [19]:
# ====================================================
# Helper functions
# ====================================================
class AverageMeter(object):
    """Computes and stores the average and current value"""
    def __init__(self):
        self.reset()

    def reset(self):
        self.val = 0
        self.avg = 0
        self.sum = 0
        self.count = 0

    def update(self, val, n=1):
        self.val = val
        self.sum += val * n
        self.count += n
        self.avg = self.sum / self.count


def asMinutes(s):
    m = math.floor(s / 60)
    s -= m * 60
    return '%dm %ds' % (m, s)


def timeSince(since, percent):
    now = time.time()
    s = now - since
    es = s / (percent)
    rs = es - s
    return '%s (remain %s)' % (asMinutes(s), asMinutes(rs))


def train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device):
    #if not epoch < CFG.nth_awp_start_epoch:
    #    LOGGER.info(f'AWP training with epoch {epoch+1}')
    model.train()
    #awp = AWP(model=model,
    #          optimizer=optimizer,
    #          criterion=criterion,
    #          adv_eps=0.01, 
    #          device=device)
    scaler = torch.cuda.amp.GradScaler(enabled=CFG.apex)
    losses = AverageMeter()
    start = end = time.time()
    global_step = 0
    #tot_loss = 0
    for step, (inputs, labels) in enumerate(train_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.cuda.amp.autocast(enabled=CFG.apex):
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        scaler.scale(loss).backward()
        grad_norm = torch.nn.utils.clip_grad_norm_(model.parameters(), CFG.max_grad_norm)
        #if CFG.nth_awp_start_epoch <= epoch:
        #      loss = awp.attack_backward(inputs, labels)
        #      scaler.scale(loss).backward()
        #      awp.restore()
        #tot_loss += loss.item()
        if (step + 1) % CFG.gradient_accumulation_steps == 0:
            scaler.step(optimizer)
            scaler.update()
            optimizer.zero_grad()
            global_step += 1
            if CFG.batch_scheduler:
                scheduler.step()
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(train_loader)-1):
            print('Epoch: [{0}][{1}/{2}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  'Grad: {grad_norm:.4f}  '
                  'LR: {lr:.8f}  '
                  .format(epoch+1, step, len(train_loader), 
                          remain=timeSince(start, float(step+1)/len(train_loader)),
                          loss=losses,
                          grad_norm=grad_norm,
                          lr=scheduler.get_lr()[0]))

    return losses.avg
    #model.train()
    #return tot_loss/(step+1)


def valid_fn(valid_loader, model, criterion, device):
    losses = AverageMeter()
    model.eval()
    preds = []
    start = end = time.time()
    for step, (inputs, labels) in enumerate(valid_loader):
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        labels = labels.to(device)
        batch_size = labels.size(0)
        with torch.no_grad():
            y_preds = model(inputs)
        loss = criterion(y_preds.view(-1, 1), labels.view(-1, 1))
        if CFG.gradient_accumulation_steps > 1:
            loss = loss / CFG.gradient_accumulation_steps
        losses.update(loss.item(), batch_size)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
        end = time.time()
        if step % CFG.print_freq == 0 or step == (len(valid_loader)-1):
            print('EVAL: [{0}/{1}] '
                  'Elapsed {remain:s} '
                  'Loss: {loss.val:.4f}({loss.avg:.4f}) '
                  .format(step, len(valid_loader),
                          loss=losses,
                          remain=timeSince(start, float(step+1)/len(valid_loader))))
    predictions = np.concatenate(preds)
    predictions = np.concatenate(predictions)
    return losses.avg, predictions


def inference_fn(test_loader, model, device):
    preds = []
    model.eval()
    model.to(device)
    tk0 = tqdm(test_loader, total=len(test_loader))
    for inputs in tk0:
        for k, v in inputs.items():
            inputs[k] = v.to(device)
        with torch.no_grad():
            y_preds = model(inputs)
        preds.append(y_preds.sigmoid().to('cpu').numpy())
    predictions = np.concatenate(preds)
    return predictions

In [20]:
# ====================================================
# train loop
# ====================================================
def train_loop(folds, fold):
    
    LOGGER.info(f"========== fold: {fold} training ==========")

    # ====================================================
    # loader
    # ====================================================
    train_folds = folds[folds['kfold'] != fold].reset_index(drop=True)
    valid_folds = folds[folds['kfold'] == fold].reset_index(drop=True)
    valid_labels = valid_folds['state'].values
    
    train_dataset = TrainDataset(CFG, train_folds)
    valid_dataset = TrainDataset(CFG, valid_folds)


    train_loader = DataLoader(train_dataset,
                              batch_size=CFG.batch_size,
                              shuffle=True,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=True)
    valid_loader = DataLoader(valid_dataset,
                              batch_size=CFG.batch_size*2,
                              shuffle=False,
                              num_workers=CFG.num_workers, pin_memory=True, drop_last=False)

    # ====================================================
    # model & optimizer
    # ====================================================
    model = CustomModel(CFG, config_path=None, pretrained=True)
    torch.save(model.config, OUTPUT_MODEL_DIR+'config.pth')
    model.to(device)
    
    def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': weight_decay},
            {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
             'lr': encoder_lr, 'weight_decay': 0.0},
            {'params': [p for n, p in model.named_parameters() if "model" not in n],
             'lr': decoder_lr, 'weight_decay': 0.0}
        ]
        return optimizer_parameters

    optimizer_parameters = get_optimizer_params(model,
                                                encoder_lr=CFG.encoder_lr, 
                                                decoder_lr=CFG.decoder_lr,
                                                weight_decay=CFG.weight_decay)
    optimizer = AdamW(optimizer_parameters, lr=CFG.encoder_lr, eps=CFG.eps, betas=CFG.betas)
    
    # ====================================================
    # scheduler
    # ====================================================
    def get_scheduler(cfg, optimizer, num_train_steps):
        if cfg.scheduler == 'linear':
            scheduler = get_linear_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
            )
        elif cfg.scheduler == 'cosine':
            scheduler = get_cosine_schedule_with_warmup(
                optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
            )
        return scheduler
    
    num_train_steps = int(len(train_folds) / CFG.batch_size * CFG.epochs)
    scheduler = get_scheduler(CFG, optimizer, num_train_steps)

    # ====================================================
    # loop
    # ====================================================
    criterion = nn.BCEWithLogitsLoss(reduction="mean")
    
    best_score = 0

    for epoch in range(CFG.epochs):

        start_time = time.time()

        # train
        avg_loss = train_fn(fold, train_loader, model, criterion, optimizer, epoch, scheduler, device)

        # eval
        avg_val_loss, predictions = valid_fn(valid_loader, model, criterion, device)
        
        # scoring
        score = get_score(valid_labels, predictions)

        elapsed = time.time() - start_time

        LOGGER.info(f'Epoch {epoch+1} - avg_train_loss: {avg_loss:.4f}  avg_val_loss: {avg_val_loss:.4f}  time: {elapsed:.0f}s')
        LOGGER.info(f'Epoch {epoch+1} - Score: {score:.4f}')

        
        if best_score < score:
            best_score = score
            LOGGER.info(f'Epoch {epoch+1} - Save Best Score: {best_score:.4f} Model')
            torch.save({'model': model.state_dict(),
                        'predictions': predictions},
                        OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth")

    predictions = torch.load(OUTPUT_MODEL_DIR+f"{CFG.model.replace('/', '-')}_fold{fold}_best.pth", 
                             map_location=torch.device('cpu'))['predictions']
    valid_folds['pred'] = predictions

    torch.cuda.empty_cache()
    gc.collect()
    
    return valid_folds

In [21]:
if __name__ == '__main__':
    
    def get_result(oof_df):
        labels = oof_df['state'].values
        preds = oof_df['pred'].values
        score = get_score(labels, preds)
        LOGGER.info(f'Score: {score:<.4f}')
    
    if CFG.train:
        oof_df = pd.DataFrame()
        for fold in range(CFG.n_fold):
            if fold in CFG.trn_fold:
                _oof_df = train_loop(train, fold)
                oof_df = pd.concat([oof_df, _oof_df])
                LOGGER.info(f"========== fold: {fold} result ==========")
                get_result(_oof_df)
        oof_df = oof_df.reset_index(drop=True)
        LOGGER.info(f"========== CV ==========")
        get_result(oof_df)
        oof_df.to_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')

BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

INFO:__main__:BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_

Downloading:   0%|          | 0.00/2.03G [00:00<?, ?B/s]

Some weights of the model checkpoint at google/muril-large-cased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch: [1][0/62] Elapsed 0m 5s (remain 5m 46s) Loss: 0.6668(0.6668) Grad: 47066.2109  LR: 0.00002000  
Epoch: [1][61/62] Elapsed 0m 31s (remain 0m 0s) Loss: 0.6778(0.7427) Grad: 5057.8706  LR: 0.00001709  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 5s) Loss: 0.7141(0.7141) 


Epoch 1 - avg_train_loss: 0.7427  avg_val_loss: 0.7155  time: 36s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7427  avg_val_loss: 0.7155  time: 36s
Epoch 1 - Score: 0.5802
INFO:__main__:Epoch 1 - Score: 0.5802
Epoch 1 - Save Best Score: 0.5802 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5802 Model


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7173(0.7155) 
Epoch: [2][0/62] Elapsed 0m 1s (remain 1m 9s) Loss: 0.7347(0.7347) Grad: inf  LR: 0.00001700  
Epoch: [2][61/62] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7531(0.7281) Grad: 22703.6074  LR: 0.00001006  
EVAL: [0/4] Elapsed 0m 2s (remain 0m 6s) Loss: 0.7091(0.7091) 


Epoch 2 - avg_train_loss: 0.7281  avg_val_loss: 0.7080  time: 32s
INFO:__main__:Epoch 2 - avg_train_loss: 0.7281  avg_val_loss: 0.7080  time: 32s
Epoch 2 - Score: 0.5179
INFO:__main__:Epoch 2 - Score: 0.5179


EVAL: [3/4] Elapsed 0m 4s (remain 0m 0s) Loss: 0.6963(0.7080) 
Epoch: [3][0/62] Elapsed 0m 1s (remain 1m 9s) Loss: 0.6997(0.6997) Grad: inf  LR: 0.00000994  
Epoch: [3][61/62] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7502(0.7212) Grad: 32633.3613  LR: 0.00000300  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 5s) Loss: 0.7072(0.7072) 


Epoch 3 - avg_train_loss: 0.7212  avg_val_loss: 0.7049  time: 31s
INFO:__main__:Epoch 3 - avg_train_loss: 0.7212  avg_val_loss: 0.7049  time: 31s
Epoch 3 - Score: 0.5047
INFO:__main__:Epoch 3 - Score: 0.5047


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6876(0.7049) 
Epoch: [4][0/62] Elapsed 0m 1s (remain 1m 7s) Loss: 0.7229(0.7229) Grad: inf  LR: 0.00000291  
Epoch: [4][61/62] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7117(0.7193) Grad: 5972.5869  LR: 0.00000000  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 5s) Loss: 0.7070(0.7070) 


Epoch 4 - avg_train_loss: 0.7193  avg_val_loss: 0.7045  time: 31s
INFO:__main__:Epoch 4 - avg_train_loss: 0.7193  avg_val_loss: 0.7045  time: 31s
Epoch 4 - Score: 0.4762
INFO:__main__:Epoch 4 - Score: 0.4762


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6865(0.7045) 


Score: 0.5802
INFO:__main__:Score: 0.5802
BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

INFO:__main__:BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size"

Epoch: [1][0/62] Elapsed 0m 0s (remain 0m 55s) Loss: 0.7092(0.7092) Grad: inf  LR: 0.00002000  
Epoch: [1][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.7491(0.7395) Grad: 11527.7061  LR: 0.00001709  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 3s) Loss: 0.7257(0.7257) 


Epoch 1 - avg_train_loss: 0.7395  avg_val_loss: 0.7172  time: 30s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7395  avg_val_loss: 0.7172  time: 30s
Epoch 1 - Score: 0.2703
INFO:__main__:Epoch 1 - Score: 0.2703
Epoch 1 - Save Best Score: 0.2703 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.2703 Model


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7167(0.7172) 
Epoch: [2][0/62] Elapsed 0m 0s (remain 0m 57s) Loss: 0.7131(0.7131) Grad: 104666.3672  LR: 0.00001700  
Epoch: [2][61/62] Elapsed 0m 27s (remain 0m 0s) Loss: 0.7532(0.7328) Grad: 32505.1953  LR: 0.00001006  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7238(0.7238) 


Epoch 2 - avg_train_loss: 0.7328  avg_val_loss: 0.7148  time: 31s
INFO:__main__:Epoch 2 - avg_train_loss: 0.7328  avg_val_loss: 0.7148  time: 31s
Epoch 2 - Score: 0.2535
INFO:__main__:Epoch 2 - Score: 0.2535


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7141(0.7148) 
Epoch: [3][0/62] Elapsed 0m 0s (remain 0m 59s) Loss: 0.6964(0.6964) Grad: 93608.1328  LR: 0.00000994  
Epoch: [3][61/62] Elapsed 0m 27s (remain 0m 0s) Loss: 0.6571(0.7310) Grad: 5818.8750  LR: 0.00000300  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7230(0.7230) 


Epoch 3 - avg_train_loss: 0.7310  avg_val_loss: 0.7138  time: 31s
INFO:__main__:Epoch 3 - avg_train_loss: 0.7310  avg_val_loss: 0.7138  time: 31s
Epoch 3 - Score: 0.2319
INFO:__main__:Epoch 3 - Score: 0.2319


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7130(0.7138) 
Epoch: [4][0/62] Elapsed 0m 0s (remain 0m 53s) Loss: 0.6442(0.6442) Grad: inf  LR: 0.00000291  
Epoch: [4][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.6934(0.7288) Grad: 8848.3506  LR: 0.00000000  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 3s) Loss: 0.7229(0.7229) 


Epoch 4 - avg_train_loss: 0.7288  avg_val_loss: 0.7136  time: 30s
INFO:__main__:Epoch 4 - avg_train_loss: 0.7288  avg_val_loss: 0.7136  time: 30s
Epoch 4 - Score: 0.2319
INFO:__main__:Epoch 4 - Score: 0.2319


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7128(0.7136) 


Score: 0.2703
INFO:__main__:Score: 0.2703
BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

INFO:__main__:BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size"

Epoch: [1][0/62] Elapsed 0m 0s (remain 0m 56s) Loss: 0.8735(0.8735) Grad: inf  LR: 0.00002000  
Epoch: [1][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 1.0055(0.7911) Grad: 40517.4453  LR: 0.00001709  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7710(0.7710) 


Epoch 1 - avg_train_loss: 0.7911  avg_val_loss: 0.7609  time: 31s
INFO:__main__:Epoch 1 - avg_train_loss: 0.7911  avg_val_loss: 0.7609  time: 31s
Epoch 1 - Score: 0.5323
INFO:__main__:Epoch 1 - Score: 0.5323
Epoch 1 - Save Best Score: 0.5323 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.5323 Model


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7071(0.7609) 
Epoch: [2][0/62] Elapsed 0m 0s (remain 0m 52s) Loss: 0.8371(0.8371) Grad: inf  LR: 0.00001700  
Epoch: [2][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.7334(0.7751) Grad: 9835.3740  LR: 0.00001006  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7538(0.7538) 


Epoch 2 - avg_train_loss: 0.7751  avg_val_loss: 0.7523  time: 31s
INFO:__main__:Epoch 2 - avg_train_loss: 0.7751  avg_val_loss: 0.7523  time: 31s
Epoch 2 - Score: 0.5043
INFO:__main__:Epoch 2 - Score: 0.5043


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7026(0.7523) 
Epoch: [3][0/62] Elapsed 0m 0s (remain 0m 54s) Loss: 0.8731(0.8731) Grad: inf  LR: 0.00000994  
Epoch: [3][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.8529(0.7677) Grad: 23871.6934  LR: 0.00000300  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 3s) Loss: 0.7458(0.7458) 


Epoch 3 - avg_train_loss: 0.7677  avg_val_loss: 0.7484  time: 30s
INFO:__main__:Epoch 3 - avg_train_loss: 0.7677  avg_val_loss: 0.7484  time: 30s
Epoch 3 - Score: 0.4865
INFO:__main__:Epoch 3 - Score: 0.4865


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7007(0.7484) 
Epoch: [4][0/62] Elapsed 0m 0s (remain 0m 56s) Loss: 0.6434(0.6434) Grad: 38362.9414  LR: 0.00000291  
Epoch: [4][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.7071(0.7631) Grad: 4687.2754  LR: 0.00000000  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7446(0.7446) 


Epoch 4 - avg_train_loss: 0.7631  avg_val_loss: 0.7478  time: 30s
INFO:__main__:Epoch 4 - avg_train_loss: 0.7631  avg_val_loss: 0.7478  time: 30s
Epoch 4 - Score: 0.4865
INFO:__main__:Epoch 4 - Score: 0.4865


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.7004(0.7478) 


Score: 0.5323
INFO:__main__:Score: 0.5323
BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size": 1024,
  "hidden_act": "gelu",
  "hidden_dropout": 0.0,
  "hidden_dropout_prob": 0.0,
  "hidden_size": 1024,
  "initializer_range": 0.02,
  "intermediate_size": 4096,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 16,
  "num_hidden_layers": 24,
  "output_hidden_states": true,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.22.1",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 197285
}

INFO:__main__:BertConfig {
  "_name_or_path": "google/muril-large-cased",
  "architectures": [
    "BertModel"
  ],
  "attention_dropout": 0.0,
  "attention_probs_dropout_prob": 0.0,
  "classifier_dropout": null,
  "embedding_size"

Epoch: [1][0/62] Elapsed 0m 0s (remain 0m 55s) Loss: 0.7474(0.7474) Grad: inf  LR: 0.00002000  
Epoch: [1][61/62] Elapsed 0m 26s (remain 0m 0s) Loss: 0.6548(0.6795) Grad: 10735.1816  LR: 0.00001709  
EVAL: [0/4] Elapsed 0m 1s (remain 0m 4s) Loss: 0.7213(0.7213) 


Epoch 1 - avg_train_loss: 0.6795  avg_val_loss: 0.7000  time: 31s
INFO:__main__:Epoch 1 - avg_train_loss: 0.6795  avg_val_loss: 0.7000  time: 31s
Epoch 1 - Score: 0.3288
INFO:__main__:Epoch 1 - Score: 0.3288
Epoch 1 - Save Best Score: 0.3288 Model
INFO:__main__:Epoch 1 - Save Best Score: 0.3288 Model


EVAL: [3/4] Elapsed 0m 3s (remain 0m 0s) Loss: 0.6743(0.7000) 
Epoch: [2][0/62] Elapsed 0m 0s (remain 0m 57s) Loss: 0.6769(0.6769) Grad: 76873.4922  LR: 0.00001700  


KeyboardInterrupt: ignored

In [None]:
A = pd.read_pickle(OUTPUT_MODEL_DIR+'oof_df.pkl')
A.head()