https://www.kaggle.com/code/columbia2131/training-inference-code-xlm-roberta-base

### import

In [None]:
import numpy as np 
import pandas as pd 
import sys 
import os 
import logzero 
import wandb 
import pickle 
from tqdm.auto import tqdm
import matplotlib.pyplot as plt 
import seaborn as sns 
sns.set()

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler
from pytorch_lightning.utilities.seed import seed_everything


### config

In [None]:

class Config:
    """Experiment settings: paths, W&B options, CV, data loading and model hyper-parameters."""

    # ---- common ----
    version = "003"
    comment = "test"
    input_dir = "/home/user/work/input/we-are-all-alike-on-the-inside"
    output_dir = f'/home/user/work/output/{version}'
    seed = 42
    target_col = None

    # ---- wandb: keyword arguments forwarded to wandb.init ----
    wandb_init = dict(
        project="debug",
        entity="kuto5046",
        group=f"exp{version}",
        dir=output_dir,
        tags=[],
        # mode="disabled",
    )

    # ---- cv ----
    n_splits = 5
    # [0] to stop after a single fold, [0, 1, 2, 3, 4] to run every fold
    use_fold = [0]

    # ---- dataloader: keyword arguments forwarded to DataLoader ----
    loader_params = dict(batch_size=32, shuffle=False, num_workers=4)

    # ---- model / optimisation ----
    n_epochs = 10
    model_path = "xlm-roberta-base"
    max_len = 128
    weight_decay = 1e-5
    beta = (0.9, 0.98)
    lr = 2e-5
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1  # 1 means no accumulation

# Instantiate the settings, make sure the output directory exists, then
# create a logger that also writes to result.log inside that directory.
c = Config()
# c = HydraConfig.get_cnf(config_path='/home/user/work/configs/', config_name='config.yaml')
os.makedirs(c.output_dir, exist_ok=True)
# level=10 is the numeric value of logging.DEBUG
logger = logzero.setup_logger(name='main', logfile=f'{c.output_dir}/result.log', level=10)

In [None]:
# Use the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    c.device = torch.device('cuda')
else:
    c.device = torch.device('cpu')
c.device

In [None]:
# Seed the random number generators with the configured seed for reproducibility.
seed_everything(c.seed)

### read data

In [None]:
# Load the train and test CSV files from the configured input directory.
train = pd.read_csv(f'{c.input_dir}/train.csv')
test = pd.read_csv(f'{c.input_dir}/test.csv')
# Notebook display: (rows, columns) of both frames.
train.shape, test.shape

In [None]:
# Notebook display: first rows of the training frame.
train.head()

### preprocess

In [None]:
from src.features.base import get_categorical_col, get_numerical_col
from src.features.encoder import pp_for_categorical_encoding

In [None]:
# Stack train on top of test with a fresh 0..n-1 index so both can be
# preprocessed in one pass.
whole = pd.concat([train, test], ignore_index=True)

In [None]:
# Target column plus the label <-> class-id lookup tables.
c.target_col = 'category'
c.target_map = {'association': 0, 'disagreement': 1, 'unbiased': 2}
# The reverse table and the class count follow directly from target_map.
c.target_map_rev = {class_id: label for label, class_id in c.target_map.items()}
c.n_class = len(c.target_map)

In [None]:
# Notebook display of the project helper's result for the combined frame
# (presumably the categorical column names — confirm in src.features.base).
get_categorical_col(whole)

In [None]:
# Notebook display of the project helper's result for the combined frame
# (presumably the numerical column names — confirm in src.features.base).
get_numerical_col(whole)

In [None]:
# import ast 
# def fix_s1s2(data):
#     new_s1 = []
#     new_s2 = []
#     for idx, row in tqdm(data.iterrows(), total=len(data)):
#         if row["s1"].startswith("["):
#             try:
#                 temp_s1 = " ".join(ast.literal_eval(row["s1"]))
#             except SyntaxError:
#                 temp_s1 = row["s1"][1:-1]
#         else:
#             temp_s1 = row["s1"]

#         if row["s2"].startswith("["):
#             try:
#                 temp_s2 = " ".join(ast.literal_eval(row["s2"]))
#             except SyntaxError:
#                 temp_s2 = row["s2"][1:-1]
#         else:
#             temp_s2 = row["s2"]

#         new_s1.append(temp_s1)
#         new_s2.append(temp_s2)
#     data["s1"] = new_s1
#     data["s2"] = new_s2
#     return data

In [None]:
# (old, new) substitutions applied to every s1 string, in this exact order:
# first the bracket+quote wrappers, then any remaining square brackets.
_S1_REPLACEMENTS = (
    ("['", ''),
    ("']", ''),
    ('["', ''),
    ('"]', ''),
    ('[«', '«'),
    ('»]', '»'),
    ('[', ''),
    (']', ''),
)


def _flatten_s1(raw):
    """Strip the list-literal wrapping from one s1 string and join its items with spaces."""
    text = raw
    for old, new in _S1_REPLACEMENTS:
        text = text.replace(old, new)
    # "', '" is the separator between items of the stringified list
    return ' '.join(text.split("', '"))


whole['s1'] = whole['s1'].map(_flatten_s1)

In [None]:
# Rows that carry a target value go back to train; rows without one are test.
has_label = whole[c.target_col].notna()
train = whole[has_label].reset_index(drop=True)
test = whole[~has_label].reset_index(drop=True)

In [None]:
# Convert the string labels to integer class ids
train[c.target_col] = train[c.target_col].map(c.target_map)

### model

In [None]:
import transformers
from transformers import AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

In [None]:
class CustomDataset(Dataset):
    """Sentence-pair dataset that tokenizes the ``s1``/``s2`` columns on access.

    Every item is a tuple ``(x, y)``:

    * ``x`` is a dict with ``token1`` (input ids) and ``mask1`` (attention
      mask), both ``torch.long`` tensors padded/truncated to
      ``config.max_len``.
    * ``y`` is the value of ``config.target_col`` for that row in the
      ``'train'``/``'valid'`` phases. The ``'test'`` phase has no labels, so
      ``NO_LABEL`` is returned instead; this keeps the item structure the
      same for every phase.
    """

    # Placeholder returned as the label when the dataset has no targets.
    NO_LABEL = -1

    def __init__(self, df: pd.DataFrame, config: "Config", phase: str = 'train'):
        """Store the text columns (and labels, if any) and load the tokenizer.

        Args:
            df: frame with ``s1`` and ``s2`` columns, plus ``config.target_col``
                for the labelled phases.
            config: settings object providing ``model_path``, ``max_len`` and
                ``target_col``.
            phase: one of ``'train'``, ``'valid'`` or ``'test'``.
        """
        assert phase in ['train', 'valid', 'test']
        self.config = config
        self.tokenizer = AutoTokenizer.from_pretrained(config.model_path)
        self.phase = phase
        self.s1 = df['s1'].to_numpy()
        self.s2 = df['s2'].to_numpy()
        # None marks "no labels"; the previous np.nan scalar could not be
        # indexed, so the test phase crashed on the first __getitem__ call.
        self.y = None
        if self.phase in ['train', 'valid']:
            self.y = df[config.target_col].to_numpy()

    def __len__(self):
        return self.s1.shape[0]

    def __getitem__(self, idx):
        # Encode s1 and s2 together as one sentence-pair input.
        encoded = self.tokenizer.encode_plus(
            self.s1[idx],
            self.s2[idx],
            max_length=self.config.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
        )

        x = {
            'token1': torch.tensor(encoded['input_ids'], dtype=torch.long),
            'mask1': torch.tensor(encoded['attention_mask'], dtype=torch.long),
        }
        label = self.NO_LABEL if self.y is None else self.y[idx]
        return x, label

In [None]:
class CustomModel(nn.Module):
    """Transformer backbone followed by LayerNorm and a two-layer classification head."""

    def __init__(self, model_path, n_class):
        """Load the pretrained backbone and build the head.

        Args:
            model_path: model name or path passed to ``AutoModel.from_pretrained``.
            n_class: number of output classes (size of the final linear layer).
        """
        super().__init__()
        self.backbone = AutoModel.from_pretrained(model_path)
        # Take the width from the backbone instead of hard-coding 768, so
        # backbones with another hidden size work without editing the head.
        hidden_size = self.backbone.config.hidden_size
        self.ln = nn.LayerNorm(hidden_size)
        self.linear1 = nn.Sequential(
            nn.Linear(hidden_size, 256),
            nn.ReLU(),
            nn.Linear(256, n_class)
        )

    def forward(self, x):
        """Return class logits for a batch dict with ``token1`` and ``mask1`` entries."""
        # Only the hidden state at sequence position 0 feeds the head.
        output = self.backbone(x['token1'], attention_mask=x['mask1'])["last_hidden_state"][:, 0, :]
        output = self.ln(output)
        output = self.linear1(output)
        return output

In [None]:
import pytorch_lightning as pl
from pytorch_lightning.loggers import WandbLogger
from pytorch_lightning.callbacks import ModelCheckpoint
from pytorch_lightning.callbacks import EarlyStopping
from torchmetrics import F1Score

class SimpleLearner(pl.LightningModule):
    """LightningModule that trains a classifier with cross-entropy, AdamW and a warmup + linear-decay schedule.

    ``config`` is an attribute-style settings object that must provide
    ``weight_decay``, ``lr``, ``beta``, ``len_loader`` (batches per training
    epoch), ``n_epochs``, ``gradient_accumulation_steps`` and
    ``num_warmup_steps_rate``.
    """

    def __init__(self, model, config):
        super().__init__()
        self.model = model
        self.criterion = self.get_criterion(config)
        self.optimizer = self.get_optimizer(config)
        # The scheduler wraps self.optimizer, so it must be built after it.
        self.scheduler = self.get_scheduler(config)
        self.metric = self.get_metric(config)

    def _shared_step(self, batch, stage):
        """Run the model on one batch and log epoch-level loss and score under ``*/<stage>``."""
        x, y = batch
        output = self.model(x)
        loss = self.criterion(output, y)
        score = self.metric(output, y)

        self.log(f'Loss/{stage}', loss, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        self.log(f'Score/{stage}', score, on_step=False, on_epoch=True, prog_bar=False, logger=True)
        return loss

    def training_step(self, batch, batch_idx):
        """Return the training loss for one batch (logged as ``Loss/train``, ``Score/train``)."""
        return self._shared_step(batch, 'train')

    def validation_step(self, batch, batch_idx):
        """Return the validation loss for one batch (logged as ``Loss/val``, ``Score/val``)."""
        return self._shared_step(batch, 'val')

    def configure_optimizers(self):
        """Return the optimizer and the per-step LR scheduler configuration."""
        return {
            "optimizer": self.optimizer,
            "lr_scheduler": {
                "scheduler": self.scheduler,
                # get_scheduler sizes the schedule in optimizer steps
                # (len_loader * n_epochs), so it has to advance every step.
                # Lightning's default interval is "epoch", which would move
                # the schedule only n_epochs times in total.
                "interval": "step",
                "frequency": 1,
            },
            "monitor": "Loss/val",
        }

    def get_metric(self, config):
        """Return the micro-averaged F1 metric used for the logged score."""
        return F1Score(average='micro')

    def get_optimizer(self, config: "Config"):
        """Build AdamW with weight decay disabled for bias and LayerNorm parameters."""
        param_optimizer = list(self.model.named_parameters())
        # parameters whose name contains one of these are excluded from weight decay
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in param_optimizer if not any(nd in n for nd in no_decay)],
                'weight_decay': config.weight_decay
            },
            {
                'params': [p for n, p in param_optimizer if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0
            }
        ]

        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=config.lr,
            betas=config.beta,
            weight_decay=config.weight_decay,
        )
        return optimizer

    def get_scheduler(self, config: "Config"):
        """Build a linear warmup / linear decay schedule over all optimizer steps of the run."""
        num_train_optimization_steps = int(
            config.len_loader * config.n_epochs // config.gradient_accumulation_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * config.num_warmup_steps_rate)

        scheduler = get_linear_schedule_with_warmup(
            self.optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps
        )
        return scheduler

    def get_criterion(self, config: "Config"):
        """Return the classification loss (cross-entropy)."""
        criterion = nn.CrossEntropyLoss()
        return criterion

### cv

In [None]:
from src.cv import get_kfold, get_stratifiedkfold, get_groupkfold
# Stratified folds on the target; the fold count comes from the config
# instead of a second hard-coded 5.
cv = get_stratifiedkfold(train, c.target_col, n_splits=c.n_splits)
cv

### callback

In [None]:
def get_callbacks(i_fold):
    """Return the Lightning callbacks for fold ``i_fold``.

    The list holds, in order, a ModelCheckpoint that writes
    ``model_fold{i_fold}`` into ``c.output_dir`` and an EarlyStopping
    callback; both watch ``Loss/val`` and treat lower as better.
    """
    checkpoint_callback = ModelCheckpoint(
        monitor='Loss/val',
        mode='min',
        dirpath=c.output_dir,
        verbose=False,
        filename=f'model_fold{i_fold}',
    )
    early_stop_callback = EarlyStopping(
        monitor='Loss/val',
        min_delta=0.00,
        patience=20,
        verbose=False,
        mode='min',
    )
    return [checkpoint_callback, early_stop_callback]

### train

In [None]:
from sklearn.metrics import f1_score, roc_auc_score
def calc_score(true, pred):
    """Return the micro-averaged F1 score of ``pred`` against ``true``.

    Args:
        true: ground-truth class labels.
        pred: predicted class labels, or a 2-D ``(n_samples, n_classes)``
            array of per-class scores/logits when ``true`` is 1-D. In that
            case the highest-scoring class of each row is used.
    """
    # f1_score rejects continuous 2-D scores against 1-D labels, so reduce
    # them to class ids first. 2-D `true` (multilabel) is left untouched.
    if np.ndim(true) == 1 and np.ndim(pred) == 2:
        pred = np.asarray(pred).argmax(axis=1)
    return f1_score(true, pred, average='micro')

In [None]:
def to_np(input):
    """Detach a tensor from the autograd graph, move it to the CPU and return it as a NumPy array."""
    host_tensor = input.detach().cpu()
    return host_tensor.numpy()

def worker_init_fn(worker_id):
    """Re-seed NumPy's global RNG with the first word of its current state plus ``worker_id``.

    Passed to DataLoader so that each worker gets its own NumPy seed.
    """
    base_seed = np.random.get_state()[1][0]
    np.random.seed(base_seed + worker_id)

def _move_to_device(obj, device):
    """Move a tensor, or every tensor inside a (possibly nested) dict, to ``device``."""
    if torch.is_tensor(obj):
        return obj.to(device)
    if isinstance(obj, dict):
        return {key: _move_to_device(value, device) for key, value in obj.items()}
    return obj


def inference(model, loader, phase='test'):
    """Run ``model`` over every batch of ``loader`` and return the stacked outputs.

    Args:
        model: ``nn.Module`` called as ``model(x)``; it is switched to eval mode.
        loader: iterable of ``(x, y)`` batches; ``y`` is ignored.
        phase: unused, kept so existing callers keep working.

    Returns:
        ``np.ndarray`` with the per-batch outputs concatenated along axis 0.
    """
    model.eval()
    # Inputs must be on the same device as the weights. The loader yields CPU
    # tensors, so follow whatever device the model currently lives on.
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else None

    pred = []
    with torch.no_grad():
        for x, _ in loader:
            if device is not None:
                x = _move_to_device(x, device)
            output = model(x)
            pred.append(output.detach().cpu().numpy())
    return np.concatenate(pred)

In [None]:
def train_pipeline(train, test, cv, config, target_col):
    """Train the folds selected by ``config.use_fold`` and save OOF and test predictions.

    For each selected fold the function starts a W&B run, trains a fresh
    ``CustomModel`` with Lightning, predicts the validation and test sets,
    logs the validation F1 and saves the test logits to
    ``{config.output_dir}/pred_test_{fold}.npy``.

    Args:
        train: labelled frame; ``cv`` indices are looked up with ``train.loc``.
        test: unlabelled frame.
        cv: sequence of ``(train_index, valid_index)`` pairs.
        config: settings object (paths, loader/model parameters, ``use_fold``, ...).
            ``config.len_loader`` is set here for the LR scheduler.
        target_col: name of the label column in ``train``.

    Returns:
        ``(model, oof)``: the model of the last trained fold and the
        out-of-fold logits of all trained folds, ordered by ``train`` index.

    Raises:
        ValueError: if no fold of ``cv`` is listed in ``config.use_fold``.
    """
    oofs = []
    model = None
    for i, (idx_train, idx_valid) in enumerate(cv):

        if i not in config.use_fold:
            continue

        logger.info("############")
        logger.info(f"fold {i}")
        logger.info("############")
        wandb.init(**config.wandb_init, name=f'exp{config.version}-fold{i}')

        _train = train.loc[idx_train].reset_index(drop=True)
        _valid = train.loc[idx_valid].reset_index(drop=True)

        # Training batches are reshuffled every epoch. Validation and test
        # must keep their order, because the predictions are matched back to
        # idx_valid / the test rows by position.
        train_loader_params = {**config.loader_params, 'shuffle': True}
        eval_loader_params = {**config.loader_params, 'shuffle': False}

        loaders = {}
        loaders["train"] = DataLoader(CustomDataset(_train, config, phase="train"), **train_loader_params, worker_init_fn=worker_init_fn)
        loaders["valid"] = DataLoader(CustomDataset(_valid, config, phase="valid"), **eval_loader_params, worker_init_fn=worker_init_fn)
        loaders["test"] = DataLoader(CustomDataset(test, config, phase="test"), **eval_loader_params, worker_init_fn=worker_init_fn)

        # read by SimpleLearner.get_scheduler to size the LR schedule
        config.len_loader = len(loaders['train'])

        model = CustomModel(config.model_path, config.n_class)
        # model_name = model.__class__.__name__
        callbacks = get_callbacks(i)
        learner = SimpleLearner(model, config)
        trainer = pl.Trainer(
            logger=[WandbLogger()],
            callbacks=callbacks,
            max_epochs=config.n_epochs,
            devices='auto',
            accelerator='auto',
            # fast_dev_run=True,
            deterministic=True,
            # precision=16,
            amp_backend='apex',
            )
        print('start train')
        trainer.fit(learner, train_dataloaders=loaders['train'], val_dataloaders=loaders['valid'])

        # NOTE(review): predictions below use the weights in memory after the
        # last epoch, not the best checkpoint saved by ModelCheckpoint.
        print('create oof')
        pred = inference(model, loaders['valid'])
        oof_df = pd.DataFrame(pred, index=idx_valid)
        oofs.append(oof_df)

        # evaluate: `pred` holds per-class logits, the metric needs class ids
        print('evaluate valid data')
        score = calc_score(_valid[target_col], pred.argmax(axis=1))
        logger.info(f'fold-{i} score: {score}')
        wandb.log({'CV': score})

        # pred
        print('inference test data')
        pred_test = inference(model, loaders['test'])
        np.save(f"{config.output_dir}/pred_test_{i}", pred_test)

        # the run of the last configured fold is left open on purpose
        if i != config.use_fold[-1]:
            wandb.finish()

    if not oofs:
        raise ValueError(f'no fold in cv matches config.use_fold={config.use_fold}')

    # save the out-of-fold predictions
    oof = np.array(pd.concat(oofs).sort_index())
    np.save(f"{config.output_dir}/oof", oof)
    return model, oof

In [None]:
# Run training for the configured folds; keeps the returned model and the
# out-of-fold prediction array.
model, oof = train_pipeline(train, test, cv, c, c.target_col)

### inference

In [None]:
# Average the saved test predictions over the folds that were actually trained.
preds = []
for i in range(len(cv)):
    pred_path = f'{c.output_dir}/pred_test_{i}.npy'
    if not os.path.exists(pred_path):
        # folds not listed in c.use_fold never wrote a prediction file
        continue
    pred = np.load(pred_path)
    preds.append(pred)
if not preds:
    raise FileNotFoundError(f'no pred_test_*.npy file found in {c.output_dir}')
# NOTE(review): files left over from an earlier run in the same output
# directory are picked up as well.
pred_test = np.mean(preds, axis=0)

### submission

In [None]:
# Write the submission file.
sub = pd.read_csv(f'{c.input_dir}/sample_submission.csv')
# pred_test holds one row of per-class scores per sample, which cannot be
# assigned to a single column: pick the best class id for every row.
pred_class = np.asarray(pred_test).argmax(axis=1)
# NOTE(review): assumes the submission expects the original string labels
# (hence c.target_map_rev) — confirm against sample_submission.csv.
sub[c.target_col] = [c.target_map_rev[int(class_id)] for class_id in pred_class]
sub.to_csv(f'{c.output_dir}/submission_exp{c.version}.csv', index=False)