In [1]:
import os

class Config:
    """Static experiment settings; ``setup`` later attaches runtime attributes to it."""

    # ---- experiment identity -------------------------------------------
    # Name of the output folder for this run.
    NAME = "test-11-deberta-large-epoch5"

    # Hugging Face Hub id of the backbone to fine-tune
    # (any id from https://huggingface.co/, e.g. "microsoft/deberta-base").
    MODEL_PATH = "microsoft/deberta-v3-large"

    # ---- base directories ----------------------------------------------
    COLAB_PATH = "/content/drive/MyDrive/signate/MUFG Data Science Champion Ship"
    # Both names point at the same Drive folder.
    DRIVE_PATH = COLAB_PATH

    # ---- reproducibility / cross-validation ----------------------------
    # Random seed.
    seed = 42
    # Number of cross-validation splits.
    num_fold = 3
    # Folds that are actually trained.
    trn_fold = [0, 1, 2]

    # ---- data loading ---------------------------------------------------
    # Batch size.
    batch_size = 8
    # Maximum number of tokens per example.
    max_len = 256

    # ---- optimisation ---------------------------------------------------
    # Number of epochs.
    n_epochs = 5
    # Learning rate.
    lr = 2e-5
    # Optimizer / scheduler settings.
    weight_decay = 2e-5
    beta = (0.9, 0.98)
    num_warmup_steps_rate = 0.01
    clip_grad_norm = None
    gradient_accumulation_steps = 1
    num_eval = 1

In [2]:
# ========================================
# Library
# ========================================
import os
import gc
import re
import sys
import json
import time
import shutil
import joblib
import random
import requests
import warnings
warnings.filterwarnings('ignore')
from ast import literal_eval

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import scipy 
import itertools
from pathlib import Path
from glob import glob
from tqdm.auto import tqdm
from sklearn.model_selection import (
    StratifiedKFold, 
    KFold, 
    GroupKFold,
    StratifiedGroupKFold,
)
from sklearn.metrics import (
    accuracy_score, 
    f1_score,
    roc_auc_score,
)

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, Subset
from torch.cuda.amp import autocast, GradScaler

from google.colab import drive
# Mount Google Drive only when /content/drive is not already present.
if not os.path.isdir('/content/drive'):
    drive.mount('/content/drive') 

In [3]:
def cleaning(texts):
  """Apply ``remove_tag`` to every element of ``texts`` and return the results as a new list."""
  return [remove_tag(raw) for raw in texts]

def remove_tag(x):
  """Delete every ``<...>`` span (a ``<``, any run of non-``>`` characters, then ``>``) from ``x``."""
  return re.sub(r"<[^>]*?>", '', x)

In [4]:
def setup(cfg):
    """Prepare the runtime and attach derived settings to ``cfg``.

    Sets ``cfg.device``, mounts Google Drive if needed, pip-installs
    ``transformers`` and ``sentencepiece`` (IPython shell escapes, so this
    function only runs inside a notebook), derives the input/output directory
    paths under ``cfg.DRIVE_PATH`` and creates the ones that are written to.

    Args:
        cfg: object exposing ``NAME`` and ``DRIVE_PATH``; it is mutated in place.

    Returns:
        The same ``cfg`` object, now carrying ``device``, ``DRIVE``, ``EXP``,
        ``INPUT``, ``OUTPUT``, ``DATASET``, ``OUTPUT_EXP``, ``EXP_MODEL``,
        ``EXP_FIG`` and ``EXP_PREDS``.
    """
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    cfg.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # Mount Google Drive unless /content/drive already exists.
    from google.colab import drive
    if not os.path.isdir('/content/drive'):
        drive.mount('/content/drive') 

    # Install the NLP dependencies into the running environment (quiet mode).
    ! pip install -q transformers
    ! pip install -q sentencepiece

    # Derive the directory layout.
    cfg.DRIVE = cfg.DRIVE_PATH
    # Experiment name: cfg.NAME when given; otherwise the name of the first
    # session reported by a local HTTP endpoint with its last 6 characters
    # dropped (presumably the Colab notebook name minus ".ipynb" -- TODO confirm).
    cfg.EXP = (cfg.NAME if cfg.NAME is not None 
        else requests.get('http://172.28.0.2:9000/api/sessions').json()[0]['name'][:-6]
    )
    cfg.INPUT = os.path.join(cfg.DRIVE, 'Input')
    # NOTE(review): 'Oututput' looks like a typo for 'Output'. It is left as is
    # because it determines where models and predictions are written.
    cfg.OUTPUT = os.path.join(cfg.DRIVE, 'Oututput')
    cfg.DATASET = os.path.join(cfg.DRIVE, 'Dataset')

    # Per-experiment sub-directories for weights, figures and predictions.
    cfg.OUTPUT_EXP = os.path.join(cfg.OUTPUT, cfg.EXP) 
    cfg.EXP_MODEL = os.path.join(cfg.OUTPUT_EXP, 'model')
    cfg.EXP_FIG = os.path.join(cfg.OUTPUT_EXP, 'fig')
    cfg.EXP_PREDS = os.path.join(cfg.OUTPUT_EXP, 'preds')

    # Create the directories (cfg.DATASET is only recorded, not created here).
    for d in [cfg.INPUT, cfg.EXP_MODEL, cfg.EXP_FIG, cfg.EXP_PREDS]:
        os.makedirs(d, exist_ok=True)
    return cfg

In [5]:
# =====================
# Utils
# =====================
# Seed
def set_seed(seed=42):
    """Seed every random number generator used in this notebook.

    Seeds Python's ``random``, NumPy's legacy global RNG and torch (CPU and
    all CUDA devices), exports ``PYTHONHASHSEED`` and configures cuDNN for
    deterministic behaviour.

    Args:
        seed: integer seed shared by all generators.
    """
    random.seed(seed)
    # Only affects interpreters started after this point (e.g. worker processes).
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not just the current one; silently ignored
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # The auto-tuner may pick a different convolution algorithm from run to
    # run, so it must be off for the deterministic flag to be meaningful.
    torch.backends.cudnn.benchmark = False

# KFold
def get_stratifiedkfold(train, target_col, n_splits, seed):
    """Assign each row of ``train`` to a stratified validation fold.

    Args:
        train: DataFrame to split.
        target_col: name of the column used for stratification.
        n_splits: number of folds.
        seed: random state for the shuffled split.

    Returns:
        Series of fold numbers indexed by the positional row indices that the
        splitter yields, sorted by that index.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed)
    per_fold = [
        pd.Series(fold_no, index=valid_pos)
        for fold_no, (_, valid_pos) in enumerate(splitter.split(train, train[target_col]))
    ]
    return pd.concat(per_fold).sort_index()

# collatte
def collatte(inputs, labels=None):
    """Trim a padded batch down to its longest attended sequence.

    The cut-off is the largest per-row sum of ``attention_mask``, so this
    assumes padding sits at the end of each row. Only ``input_ids`` and
    ``attention_mask`` are kept; any other key in ``inputs`` is dropped.

    Args:
        inputs: mapping with 2-D ``input_ids`` and ``attention_mask`` arrays
            (rows = examples, columns = token positions).
        labels: optional labels. Arrays with more than one dimension are
            trimmed along the second axis like the inputs; 1-D (per-example)
            labels are returned untouched.

    Returns:
        ``(inputs, mask_len)`` when ``labels`` is None, otherwise
        ``(inputs, labels, mask_len)``.
    """
    mask_len = int(inputs["attention_mask"].sum(axis=1).max())
    trimmed = {
        "input_ids": inputs['input_ids'][:, :mask_len],
        "attention_mask": inputs['attention_mask'][:, :mask_len],
    }
    if labels is None:
        return trimmed, mask_len

    # Per-example labels have no token axis; slicing them with [:, :mask_len]
    # would raise, so only token-level labels are trimmed.
    if labels.ndim > 1:
        labels = labels[:, :mask_len]
    return trimmed, labels, mask_len

In [6]:
# =====================
# Dataset & Model
# =====================
class BERTDataset(Dataset):
    """Tokenizes one text per item, optionally paired with an integer label."""

    def __init__(self, cfg, texts, labels=None):
        """Keep references to the config, the texts and (optionally) the labels."""
        self.cfg, self.texts, self.labels = cfg, texts, labels

    def __len__(self):
        """Number of texts."""
        return len(self.texts)

    def __getitem__(self, index):
        """Return the encoded text, or ``(encoded, label)`` when labels were given."""
        encoded = self.prepare_input(self.cfg, self.texts[index])
        if self.labels is None:
            return encoded
        return encoded, torch.tensor(self.labels[index], dtype=torch.int64)

    @staticmethod
    def prepare_input(cfg, text):
        """Tokenize ``text`` to exactly ``cfg.max_len`` positions and tensorize every field.

        The object returned by ``cfg.tokenizer`` is updated in place so that
        each of its values becomes a ``torch.long`` tensor, and is then returned.
        """
        encoded = cfg.tokenizer(
            text,
            add_special_tokens=True,
            max_length=cfg.max_len,
            padding="max_length",
            truncation=True,
            return_offsets_mapping=False,
        )
        for field in list(encoded.keys()):
            encoded[field] = torch.tensor(encoded[field], dtype=torch.long)
        return encoded


class BERTModel(nn.Module):
    """Pretrained backbone followed by a 2-way linear head on the first token."""

    def __init__(self, cfg, criterion=None):
        """Load the backbone named by ``cfg.MODEL_PATH`` and build the head.

        Args:
            cfg: configuration exposing ``MODEL_PATH``.
            criterion: loss callable used by ``forward`` when labels are passed.
        """
        super().__init__()
        self.cfg = cfg
        self.criterion = criterion
        self.config = AutoConfig.from_pretrained(cfg.MODEL_PATH, output_hidden_states=True)
        self.backbone = AutoModel.from_pretrained(cfg.MODEL_PATH, config=self.config)
        self.fc = nn.Sequential(nn.Linear(self.config.hidden_size, 2))

    def forward(self, inputs, labels=None):
        """Return logits, or ``(logits, loss)`` when ``labels`` is provided.

        The head is applied to position 0 of the backbone's
        ``last_hidden_state``.
        """
        first_token = self.backbone(**inputs)["last_hidden_state"][:, 0, :]
        logits = self.fc(first_token)
        if labels is None:
            return logits
        return logits, self.criterion(logits, labels)

In [7]:
def training(cfg, train):
    """Fine-tune one model per fold and return the out-of-fold macro-F1.

    For every fold in ``cfg.trn_fold`` the rows with ``cfg.folds != fold`` are
    used for training and the rows with ``cfg.folds == fold`` for validation.
    After each epoch the validation macro-F1 is computed; whenever it improves
    the weights are saved to ``<cfg.EXP_MODEL>/fold{fold}.pth`` and the
    validation probabilities become that fold's out-of-fold (OOF) predictions.
    Per-fold and combined OOF arrays are saved under ``cfg.EXP_PREDS``.

    Args:
        cfg: configuration carrying the hyper-parameters (``seed``,
            ``trn_fold``, ``batch_size``, ``n_epochs``, ``lr``, ``beta``,
            ``weight_decay``, ``num_warmup_steps_rate``, ``clip_grad_norm``,
            ``gradient_accumulation_steps``) plus ``device``, ``tokenizer``,
            ``folds``, ``EXP_MODEL`` and ``EXP_PREDS``.
        train: DataFrame with a text column ``html_content`` and a label
            column ``state``.

    Returns:
        Macro-F1 of the argmax of the OOF predictions against ``train['state']``.
    """
    set_seed(cfg.seed)
    oof_pred = np.zeros((len(train), 2), dtype=np.float32)

    # Loss function.
    criterion = nn.CrossEntropyLoss()
    accum_steps = cfg.gradient_accumulation_steps

    for fold in cfg.trn_fold:
        print('-'*20, fold, '-'*20)

        # ---- data ------------------------------------------------------
        train_df = train.loc[cfg.folds!=fold]
        valid_df = train.loc[cfg.folds==fold]
        # Index labels are used as row positions into oof_pred below, so this
        # assumes `train` has the default 0..n-1 index -- TODO confirm.
        valid_idx = list(valid_df.index)

        train_dataset = BERTDataset(
            cfg,
            train_df['html_content'].to_numpy(),
            train_df['state'].to_numpy(),
        )
        valid_dataset = BERTDataset(
            cfg,
            valid_df['html_content'].to_numpy(),
            valid_df['state'].to_numpy(),
        )
        train_loader = DataLoader(
            dataset=train_dataset,
            batch_size=cfg.batch_size,
            shuffle=True,
            pin_memory=True,
            drop_last=True,
        )
        valid_loader = DataLoader(
            dataset=valid_dataset,
            batch_size=cfg.batch_size,
            shuffle=False,
            pin_memory=True,
            drop_last=False,
        )

        # ---- model / optimizer / scheduler -----------------------------
        best_val_preds = None
        best_val_score = -1

        model = BERTModel(cfg, criterion)
        model = model.to(cfg.device)

        # Biases and LayerNorm parameters are excluded from weight decay.
        named_params = list(model.named_parameters())
        no_decay = ['bias', 'LayerNorm.bias', 'LayerNorm.weight']
        optimizer_grouped_parameters = [
            {
                'params': [p for n, p in named_params if not any(nd in n for nd in no_decay)],
                'weight_decay': cfg.weight_decay,
            },
            {
                'params': [p for n, p in named_params if any(nd in n for nd in no_decay)],
                'weight_decay': 0.0,
            },
        ]
        optimizer = AdamW(
            optimizer_grouped_parameters,
            lr=cfg.lr,
            betas=cfg.beta,
            weight_decay=cfg.weight_decay,
        )
        num_train_optimization_steps = int(
            len(train_loader) * cfg.n_epochs // accum_steps
        )
        num_warmup_steps = int(num_train_optimization_steps * cfg.num_warmup_steps_rate)
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=num_warmup_steps,
            num_training_steps=num_train_optimization_steps,
        )

        # One scaler per fold: re-creating it every epoch would throw away
        # the loss scale it has adapted so far.
        scaler = GradScaler()

        for epoch in range(cfg.n_epochs):
            # ---- train -------------------------------------------------
            print(f"# ============ start epoch:{epoch} ============== #")
            model.train()
            # Start each epoch from clean gradients (drops any partial
            # accumulation left over from the previous epoch).
            optimizer.zero_grad()
            with tqdm(train_loader, total=len(train_loader)) as pbar:
                for step, (inputs, labels) in enumerate(pbar):
                    inputs, _ = collatte(inputs)
                    for k, v in inputs.items():
                        inputs[k] = v.to(cfg.device)
                    labels = labels.to(cfg.device)

                    with autocast():
                        _, loss = model(inputs, labels)
                    pbar.set_postfix({
                        'loss': loss.item(),
                        'lr': scheduler.get_last_lr()[0]
                    })

                    if accum_steps > 1:
                        loss = loss / accum_steps
                    scaler.scale(loss).backward()

                    if (step+1) % accum_steps == 0:
                        if cfg.clip_grad_norm is not None:
                            # Gradients are still multiplied by the loss scale
                            # here; unscale them first so the threshold applies
                            # to the true gradient norm.
                            scaler.unscale_(optimizer)
                            torch.nn.utils.clip_grad_norm_(
                                model.parameters(),
                                cfg.clip_grad_norm
                            )
                        scaler.step(optimizer)
                        scaler.update()
                        # Zero only after an optimizer step so that gradients
                        # really accumulate across micro-batches.
                        optimizer.zero_grad()
                        scheduler.step()

            # ---- validate ----------------------------------------------
            val_preds = []
            val_losses = []
            val_nums = []
            model.eval()
            with torch.no_grad():
                with tqdm(valid_loader, total=len(valid_loader)) as pbar:
                    for (inputs, labels) in pbar:
                        inputs, _ = collatte(inputs)
                        for k, v in inputs.items():
                            inputs[k] = v.to(cfg.device)
                        labels = labels.to(cfg.device)
                        with autocast():
                            output, loss = model(inputs, labels)
                        # Softmax over the class axis, matching `inferring`, so
                        # OOF and test predictions live on the same scale.
                        output = output.softmax(dim=1).detach().cpu().numpy()
                        val_preds.append(output)
                        # Weight by batch size so the mean is per example.
                        val_losses.append(loss.item() * len(labels))
                        val_nums.append(len(labels))
                        pbar.set_postfix({
                            'val_loss': loss.item()
                        })

            val_preds = np.concatenate(val_preds)
            val_loss = sum(val_losses) / sum(val_nums)
            # sklearn's signature is (y_true, y_pred).
            score = f1_score(valid_df['state'], np.argmax(val_preds, axis=1), average='macro')
            val_log = {
                'val_loss': val_loss,
                'score': score,
            }
            display(val_log)
            if best_val_score < score:
                print("save model weight")
                best_val_preds = val_preds
                best_val_score = score
                torch.save(
                    model.state_dict(),
                    os.path.join(cfg.EXP_MODEL, f"fold{fold}.pth")
                )

        oof_pred[valid_idx] = best_val_preds.astype(np.float32)
        np.save(os.path.join(cfg.EXP_PREDS, f'oof_pred_fold{fold}.npy'), best_val_preds)
        # The optimizer and scheduler also reference the parameters; drop them
        # too so the weights can actually be freed before the next fold.
        del model, optimizer, scheduler, scaler
        gc.collect()
        torch.cuda.empty_cache()

    # ---- overall CV score ------------------------------------------------
    np.save(os.path.join(cfg.EXP_PREDS, 'oof_pred.npy'), oof_pred)
    score = f1_score(train['state'], np.argmax(oof_pred, axis=1), average='macro')
    print('CV:', round(score, 5))
    return score

In [8]:
def inferring(cfg, test):
    """Average the softmax predictions of every saved checkpoint on ``test``.

    Each path in ``cfg.model_weights`` is loaded into a fresh ``BERTModel``;
    its class probabilities are saved to
    ``<cfg.EXP_PREDS>/sub_pred_fold{i}.npy`` (``i`` is the position of the
    checkpoint in the list) and the mean over all checkpoints is saved to
    ``sub_pred.npy``.

    Args:
        cfg: configuration exposing ``model_weights``, ``batch_size``,
            ``device``, ``tokenizer`` and ``EXP_PREDS``.
        test: DataFrame with a text column ``html_content``.

    Returns:
        float32 array of shape ``(len(test), 2)`` holding the averaged
        probabilities (all zeros when ``cfg.model_weights`` is empty).
    """
    print('\n'.join(cfg.model_weights))
    sub_pred = np.zeros((len(test), 2), dtype=np.float32)

    # The test set is the same for every checkpoint, so build it once.
    test_dataset = BERTDataset(
        cfg,
        test['html_content'].to_numpy()
    )
    test_loader = DataLoader(
        dataset=test_dataset,
        batch_size=cfg.batch_size,
        shuffle=False,
        pin_memory=True
    )

    for fold, model_weight in enumerate(cfg.model_weights):
        model = BERTModel(cfg)
        # Load onto the CPU first: this works on CPU-only runtimes and avoids
        # holding a second copy of the weights on the GPU.
        model.load_state_dict(torch.load(model_weight, map_location='cpu'))
        model = model.to(cfg.device)

        model.eval()
        fold_pred = []
        with torch.no_grad():
            for inputs in tqdm(test_loader, total=len(test_loader)):
                inputs, _ = collatte(inputs)
                for k, v in inputs.items():
                    inputs[k] = v.to(cfg.device)
                with autocast():
                    output = model(inputs)
                output = output.softmax(axis=1).detach().cpu().numpy()
                fold_pred.append(output)
        fold_pred = np.concatenate(fold_pred)
        np.save(os.path.join(cfg.EXP_PREDS, f'sub_pred_fold{fold}.npy'), fold_pred)
        sub_pred += fold_pred / len(cfg.model_weights)
        del model
        gc.collect()
        torch.cuda.empty_cache()

    np.save(os.path.join(cfg.EXP_PREDS, 'sub_pred.npy'), sub_pred)
    return sub_pred

In [9]:
# =====================
# Main
# =====================
# Environment: device, Drive mount, pip packages, output directories.
cfg = setup(Config)

# Imported only now because setup() is what pip-installs transformers.
import transformers
from transformers import AutoConfig, AutoModel, AutoTokenizer
from transformers import AdamW, get_linear_schedule_with_warmup

# Load the competition tables.
train = pd.read_csv(os.path.join(cfg.INPUT, 'train.csv'))
test = pd.read_csv(os.path.join(cfg.INPUT, 'test.csv'))
sub = pd.read_csv(os.path.join(cfg.INPUT, 'sample_submit.csv'), header=None)

# Columns concatenated (in this order) into the single model input text.
TEXT_COLUMNS = ['goal', 'country', 'duration', 'category1', 'category2', 'html_content']

# Load the tokenizer that matches the backbone.
cfg.tokenizer = AutoTokenizer.from_pretrained(cfg.MODEL_PATH)

# First raw example, before the columns are merged.
print(train['html_content'][0])

# Overwrite html_content with all text columns joined by the tokenizer's
# separator token; missing values become the literal string 'NAN'.
# (Tag stripping via cleaning() is deliberately not applied.)
lead_col, rest_cols = TEXT_COLUMNS[0], TEXT_COLUMNS[1:]
for frame in (train, test):
    lead_text = frame[lead_col].fillna('NAN').astype(str)
    rest_text = frame[rest_cols].fillna('NAN').astype(str)
    frame['html_content'] = lead_text.str.cat(rest_text, sep=cfg.tokenizer.sep_token)

# The same example after merging.
print(train['html_content'][0])

# Validation split assignment, persisted next to the predictions.
cfg.folds = get_stratifiedkfold(train, 'state', cfg.num_fold, cfg.seed)
cfg.folds.to_csv(os.path.join(cfg.EXP_PREDS, 'folds.csv'))

# Fine-tune one model per fold.
score = training(cfg, train)

# Predict the test set with every saved fold checkpoint and average.
cfg.model_weights = sorted(glob(os.path.join(cfg.EXP_MODEL, 'fold*.pth')))
sub_pred = inferring(cfg, test)
sub[1] = np.argmax(sub_pred, axis=1).astype(int)

# Submission file.
sub.to_csv(os.path.join(cfg.EXP_PREDS, 'submission_bert.csv'), index=False, header=False)

Downloading:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


<div class="contents"><div><p><a href="http://dummy.com">http://dummy.com<p>In its first year, The Shillito's Elves Display won an international 
design award for Shillito's department store.  The elves display is arts
 and crafts at its finest.  The mixed media exhibit displays the talents
 of local fine arts graduates, and the display, while "folksy", is as 
technologically advanced as Disney World's famous "It's a Small World" 
ride. </p><p>The Shillito's Elves attracted close to 100,000 people each
 year.  It was one of the most beloved Christmas traditions in 
Cincinnati.  For many in the Cincinnati area, it is a fond childhood 
holiday memory and one that they would love to share with their own 
families.  In the next 40 days, we are asking for your help to make the 
entire display viewable again for the first time in 25 years.  In order 
to make this happen, we must meet our financial goal.  </p><p>Your money will be used in the following ways:</p><ul>
<li>Repair broken animated

Downloading:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.42579925118708145, 'score': 0.7914656642618045}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.4071206705754294, 'score': 0.8017322776244578}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.5568436348409045, 'score': 0.8036070665726109}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.8154590122547805, 'score': 0.800855151408657}



  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 1.0442934406592566, 'score': 0.8017030859100895}

-------------------- 1 --------------------


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.48389844333424287, 'score': 0.7635618898796637}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.4107734357901648, 'score': 0.7970417920042986}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.4950208503606857, 'score': 0.7971709274794574}

save model weight


  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.7178780146162299, 'score': 0.7970920138888888}



  0%|          | 0/815 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.9895656901393451, 'score': 0.7835709648722079}

-------------------- 2 --------------------


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).




  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.5483627097485367, 'score': 0.7172866732926917}

save model weight


  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.43526958658893966, 'score': 0.8026129012138332}

save model weight


  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.47409980641600913, 'score': 0.8043353133576451}

save model weight


  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.6050139939057985, 'score': 0.8003385820347556}



  0%|          | 0/816 [00:00<?, ?it/s]

  0%|          | 0/408 [00:00<?, ?it/s]

{'val_loss': 0.7182686376929684, 'score': 0.8098736463179548}

save model weight
CV: 0.80362
/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Oututput/test-11-deberta-large-epoch5/model/fold0.pth
/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Oututput/test-11-deberta-large-epoch5/model/fold1.pth
/content/drive/MyDrive/signate/MUFG Data Science Champion Ship/Oututput/test-11-deberta-large-epoch5/model/fold2.pth


Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1225 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1225 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-v3-large were not used when initializing DebertaV2Model: ['mask_predictions.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'mask_predictions.LayerNorm.bias', 'mask_predictions.classifier.bias', 'mask_predictions.classifier.weight', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'mask_predictions.dense.bias', 'mask_predictions.dense.weight']
- This IS expected if you are initializing DebertaV2Model from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaV2Model from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/1225 [00:00<?, ?it/s]

In [10]:
# Display the submission frame as a final sanity check.
sub

Unnamed: 0,0,1
0,test_00000,1
1,test_00001,1
2,test_00002,1
3,test_00003,0
4,test_00004,0
...,...,...
9795,test_09795,1
9796,test_09796,1
9797,test_09797,0
9798,test_09798,1


# やったこと

huggingfaceを使ってcolumnをすべてtextとして繋げ, debertaで学習しました

# この後にできそうなこと

baseからlargeにする

mask augmentation

以下のコメントアウト外せばrandomにtokenをmaskして学習するようになります.

```

# train_transform = RandomMask(tokenizer, mask_prob=0.2) # [MASK] augmentation

# ds['train'].set_transform(train_transform)

```

HTMLタグをtokenとして追加する

ちゃんと交差検証する

データ多くないのでハイパラ調整する際は交差検証必須だと思います

CVと言いつつhold outしかしてないので5foldとかで回して平均すれば若干精度伸びそうです.

MLM

htmlを文章として扱っているので事前学習が効きそう

AWP, FGM

大抵のコンペでやれば精度上がる

max_lenの調整

かなり文章が長いものもあるのでどこまで使うかは大事そう

切り詰める場合にも後半を切り落とすのではなく中間部分を切り落とした方が良いかも

文章の長さを特徴量としてあげてもいいかも

大量にアンサンブル

MODEL_NAME変えて色々混ぜれば精度伸びそうです

疑似ラベル

ハイパラ調整

huggingfaceにはハイパラ調整用のメソッドが用意されているので, 使うと楽に調整できそうです: https://huggingface.co/docs/transformers/v4.21.2/en/main_classes/trainer#transformers.Trainer.hyperparameter_search

# 感想

タスクがシンプルすぎて差別化が難しい気がしています

全columnを文字列としても自然に扱えそうなので, GBDTよりも言語モデルが強い気がしています