# 各種インポート

In [None]:
# Mount Google Drive at /content/drive; the data, model and output paths used
# throughout this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 4.3 MB/s 
[K     |████████████████████████████████| 596 kB 50.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 22.5 MB/s 
[K     |████████████████████████████████| 101 kB 10.7 MB/s 
[?25h

In [None]:
!pip install -q nlp

[K     |████████████████████████████████| 1.7 MB 4.3 MB/s 
[K     |████████████████████████████████| 212 kB 56.3 MB/s 
[?25h

In [None]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer


# パラメータ設定

In [None]:
# seeds
SEED = 42
def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Covers Python's ``random``, the ``PYTHONHASHSEED`` environment variable,
    NumPy's global generator and PyTorch (CPU, plus all CUDA devices when
    CUDA is available).

    Args:
        seed: Integer seed applied to every generator.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    # Seed the current CUDA device first, then every visible device.
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(SEED)

if torch.cuda.is_available():
    # Request deterministic cuDNN algorithms and switch off its autotuner so
    # repeated runs with the same seed are comparable.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))


# config
# Absolute Google Drive path. It is deliberately not joined onto $HOME:
# os.path.join() drops every component that precedes an absolute one, so a
# HOME prefix would never be used and would only raise KeyError when HOME is
# not set.
data_dir = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/data"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_FILE = os.path.join(data_dir, "train.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
# NOTE(review): there is no trailing separator here, and both the training and
# inference cells build checkpoint paths as `MODELS_DIR + "best_deberta_{fold}.pth"`.
# Checkpoints therefore land beside this directory as
# ".../Datascientist/modelsbest_deberta_{fold}.pth". The value is kept unchanged
# so already-saved checkpoints still load; change it together with both cells.
MODELS_DIR = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/models"
#MODEL_NAME = 'bert-base-uncased'
#MODEL_NAME = 'roberta-base'
MODEL_NAME = 'microsoft/deberta-base'
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 128
NUM_CLASSES = 4
EPOCHS = 10
NUM_SPLITS = 5

Device: Tesla P100-PCIE-16GB


# モデルの設定

In [None]:
# dataset
def make_folded_df(csv_file, num_splits=5):
    """Load the training CSV and tag every row with a stratified fold id.

    The ``jobflag`` column is shifted down by one and renamed to ``labels``.
    A float ``kfold`` column is appended holding, for each row, the index of
    the fold in which that row belongs to the validation split.

    Args:
        csv_file: Path (or anything ``pd.read_csv`` accepts) of the training CSV.
            It must contain a ``jobflag`` column.
        num_splits: Number of stratified folds.

    Returns:
        The loaded DataFrame with ``labels`` and ``kfold`` columns.
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df = df.rename(columns={'jobflag': 'labels'})
    label = df["labels"].tolist()

    # Collect fold ids in a positional array and attach it by column name, so
    # the result does not depend on how many columns the CSV happens to have.
    fold_ids = np.full(len(df), np.nan)
    skfold = StratifiedKFold(n_splits=num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        fold_ids[valid_indexes] = fold
    df["kfold"] = fold_ids
    return df

def make_dataset(df, tokenizer, device):
    """Turn a DataFrame into a tokenized, torch-formatted ``nlp.Dataset``.

    Args:
        df: Frame with a ``description`` text column and a ``labels`` column.
        tokenizer: Callable tokenizer applied to each row's ``description``.
        device: Device handed to ``set_format`` for the returned tensors.

    Returns:
        The dataset, formatted to expose ``input_ids``, ``attention_mask``
        and ``labels`` as torch tensors.
    """
    def encode(example):
        # Pad or truncate every description to exactly 128 tokens.
        return tokenizer(example["description"],
                         padding="max_length",
                         truncation=True,
                         max_length=128)

    encoded = nlp.Dataset.from_pandas(df).map(encode)
    encoded.set_format(type='torch',
                       columns=['input_ids', 'attention_mask', 'labels'],
                       device=device)
    return encoded


# model
# The triple-quoted block below is a bare string literal, so it is never
# executed. It keeps an earlier classifier variant for reference whose
# forward() additionally takes token_type_ids; the active Classifier is the
# class defined right after it.
"""
class Classifier(nn.Module):
    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output, _ = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            return_dict=False) # Pythonの実行上必要なので加筆しました。)
        output = output[:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output
"""

# Active classifier: forward() takes only input_ids and attention_mask
# (no token_type_ids).
class Classifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Despite the attribute name ``bert``, the encoder is whatever
    ``AutoModel.from_pretrained(model_name)`` returns.
    """

    def __init__(self, model_name, num_classes=4):
        """Load the encoder and initialise the head.

        Args:
            model_name: Identifier passed to ``AutoModel.from_pretrained``.
            num_classes: Number of output logits.
        """
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of a literal
        # 768 so encoders with a different hidden width also work.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask):
        """Return class logits for a batch.

        Args:
            input_ids: Token id tensor passed to the encoder.
            attention_mask: Attention mask tensor passed to the encoder.

        Returns:
            Output of the linear head applied to the first token's hidden state.
        """
        # token_type_ids is intentionally not passed to the encoder.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )

        # With return_dict=False the encoder returns a tuple; element 0 is the
        # per-token hidden states, from which the first token is taken.
        output = encoder_outputs[0][:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output

# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch and report loss, accuracy and macro-F1.

    Args:
        dataloader: Yields mapping batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Stepped once per batch.
        scheduler: Stepped once per batch, right after the optimizer.
        device: Unused here; kept so existing callers keep working.
        epoch: Zero-based epoch index, used only for the progress bar label.

    Returns:
        Tuple ``(mean batch loss, accuracy over the dataset, macro-F1)``.
    """
    model.train()
    total_loss = 0.0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by key: unpacking batch.values() positionally would
        # silently swap inputs if the column order of the batch ever changed.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted label = arg-max logit

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        # Accumulate a plain int so the accuracy below is also well defined
        # when the loader yields no batches.
        total_corrects += int(torch.sum(preds == labels).item())

        all_labels += labels.tolist()
        all_preds += preds.tolist()

        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on a validation loader without gradient tracking.

    Args:
        dataloader: Yields mapping batches with ``input_ids``,
            ``attention_mask`` and ``labels`` tensors.
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Unused here; kept so existing callers keep working.
        epoch: Zero-based epoch index, used only for the progress bar label.

    Returns:
        Tuple ``(mean batch loss, accuracy over the dataset, macro-F1)``.
    """
    model.eval()
    total_loss = 0.0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by key: unpacking batch.values() positionally
            # would silently swap inputs if the batch's column order changed.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]

            outputs = model(input_ids, attention_mask)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            total_loss += loss.item()
            # Accumulate a plain int so the accuracy below is also well
            # defined when the loader yields no batches.
            total_corrects += int(torch.sum(preds == labels).item())

            all_labels += labels.tolist()
            all_preds += preds.tolist()

            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Draw train/valid curves for loss, accuracy and F1, then clear each figure.

    Args:
        train_losses, train_accs, train_f1s: Per-epoch training metrics so far.
        valid_losses, valid_accs, valid_f1s: Per-epoch validation metrics so far.
        epoch: Zero-based index of the epoch just finished; the x axis runs
            from 1 to ``epoch + 1``.
        fold: Fold id. Only the disabled save step would use it.
    """
    epoch_axis = range(1, epoch+2)
    metric_pairs = (
        (train_losses, valid_losses),
        (train_accs, valid_accs),
        (train_f1s, valid_f1s),
    )

    for train_values, valid_values in metric_pairs:
        curve_df = pd.DataFrame({"Train": train_values,
                                 "Valid": valid_values},
                                index=epoch_axis)
        figure = sns.lineplot(data=curve_df).get_figure()
        # Saving is currently disabled (it used to write
        # ./figures/<metric>_plot_fold=<fold>.png at dpi=300), so the figure
        # is cleared straight away without being stored anywhere.
        figure.clf()

def trainer(fold, df):
    """Train and validate one cross-validation fold, checkpointing on best macro-F1.

    Args:
        fold: Fold id. Rows with ``kfold == fold`` form the validation split;
            all other rows are used for training.
        df: Frame with a ``kfold`` column plus the columns ``make_dataset`` reads.

    Returns:
        The highest validation macro-F1 observed across the ``EPOCHS`` epochs.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Dummy scheduler: gamma=1.0 multiplies the learning rate by one, so the
    # rate never changes; it only satisfies train_fn's scheduler argument.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100000, gamma=1.0)

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    # best_loss / best_acc are tracked for reference only; checkpointing and
    # the return value are driven by best_f1.
    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        best_loss = valid_loss if valid_loss < best_loss else best_loss
        # Previously assigned to a misspelled name, so best_acc never moved.
        best_acc = valid_acc if valid_acc > best_acc else best_acc
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            # The inference cell rebuilds this exact path expression, so keep
            # the two in sync if it is ever changed.
            torch.save(model.state_dict(), MODELS_DIR + f"best_deberta_{fold}.pth")
            #torch.save(model.state_dict(), MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth")
        print("\n")

    return best_f1



# 学習

In [None]:
# training
# Train one model per fold and collect each fold's best validation macro-F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "=" * 80)
    f1_scores.append(trainer(fold, df))
    print(f"<fold={fold}> best score: {f1_scores[-1]}\n")

# Cross-validation score = plain mean of the per-fold best scores.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Text summary of the run. Writing it to ./result/{MODEL_NAME}_result.txt is
# currently disabled, so the string is only kept in memory.
lines = "".join(f"fold={i}: {score}\n" for i, score in enumerate(f1_scores))
lines += f"CV    : {cv}"
#with open(f"./result/{MODEL_NAME}_result.txt", mode='w') as f:
 #   f.write(lines)




  0%|          | 0/1212 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0823329091072083  Acc: 0.5723684210526315  f1: 0.4301270825444907  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7356710632642111  Acc: 0.7105263157894737  f1: 0.5467850105291645  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6449893712997437  Acc: 0.7368421052631579  f1: 0.5947287427722256  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6700308124224345  Acc: 0.756578947368421  f1: 0.7004537596316402  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7307419379552206  Acc: 0.7730263157894737  f1: 0.7246520112560915  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8670056660970052  Acc: 0.7368421052631579  f1: 0.7146829397619154  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9661642909049988  Acc: 0.75  f1: 0.7340603714451652  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9667586088180542  Acc: 0.7598684210526315  f1: 0.7085135526074968  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9666877786318461  Acc: 0.743421052631579  f1: 0.7109552557836751  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9741695523262024  Acc: 0.7467105263157895  f1: 0.7120233303212027  

<fold=0> best score: 0.7340603714451652



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8753301898638407  Acc: 0.6534653465346535  f1: 0.5023857420536904  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7212763428688049  Acc: 0.7293729372937293  f1: 0.6614908082738358  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.680015504360199  Acc: 0.7392739273927392  f1: 0.6832339352439856  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7084400455156962  Acc: 0.7425742574257426  f1: 0.6880782844462131  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8487138350804647  Acc: 0.7326732673267327  f1: 0.6641342360253624  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.064737359682719  Acc: 0.7260726072607261  f1: 0.6611520501333223  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0429141521453857  Acc: 0.7392739273927392  f1: 0.6830051229746049  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.226798176765442  Acc: 0.7128712871287128  f1: 0.6590559445455064  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2827481826146443  Acc: 0.7425742574257426  f1: 0.6603411619615701  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3685799439748128  Acc: 0.7326732673267327  f1: 0.6764840953081372  

<fold=1> best score: 0.6880782844462131



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7350272138913473  Acc: 0.6897689768976898  f1: 0.5166497569224265  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6228053371111552  Acc: 0.7755775577557755  f1: 0.5970915360491462  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.577313760916392  Acc: 0.7920792079207921  f1: 0.6761900025400508  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.638958215713501  Acc: 0.7623762376237624  f1: 0.6697268166585979  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6891530354817709  Acc: 0.7656765676567657  f1: 0.695100096162252  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7676763733228048  Acc: 0.768976897689769  f1: 0.6990675057208238  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8339446187019348  Acc: 0.7854785478547854  f1: 0.7108811211915376  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1119725306828816  Acc: 0.7326732673267327  f1: 0.656063588615483  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0288856625556946  Acc: 0.735973597359736  f1: 0.6583161752552625  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0193401177724202  Acc: 0.7722772277227723  f1: 0.6906655591189131  

<fold=2> best score: 0.7108811211915376



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3596436182657878  Acc: 0.3333333333333333  f1: 0.125  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2720828851064045  Acc: 0.3333333333333333  f1: 0.125  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7174848715464274  Acc: 0.7128712871287128  f1: 0.5518781056687962  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6228316426277161  Acc: 0.7458745874587459  f1: 0.6605812976342877  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.656363288561503  Acc: 0.7458745874587459  f1: 0.6237353091466531  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6696022152900696  Acc: 0.7887788778877888  f1: 0.7387178165441276  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8735940456390381  Acc: 0.7755775577557755  f1: 0.7122604107480734  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9014694889386495  Acc: 0.7887788778877888  f1: 0.7473383287460784  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9134237964948019  Acc: 0.7920792079207921  f1: 0.7458355343968979  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1011091272036235  Acc: 0.759075907590759  f1: 0.6681730366443335  

<fold=3> best score: 0.7473383287460784



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2287884950637817  Acc: 0.39603960396039606  f1: 0.2995033024796183  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7077197631200155  Acc: 0.7194719471947195  f1: 0.5757778314100888  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6969084143638611  Acc: 0.7458745874587459  f1: 0.6482472206156418  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.781010905901591  Acc: 0.7425742574257426  f1: 0.6693863139825219  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7261192798614502  Acc: 0.759075907590759  f1: 0.7065191409278025  model saving!



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7901626030604044  Acc: 0.7656765676567657  f1: 0.7028543632231068  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.862297515074412  Acc: 0.759075907590759  f1: 0.7065023466632379  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9994696974754333  Acc: 0.7557755775577558  f1: 0.6968261881023596  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0965217749277751  Acc: 0.7425742574257426  f1: 0.691063819776691  



  0%|          | 0/38 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.117427905400594  Acc: 0.759075907590759  f1: 0.7041540020263424  

<fold=4> best score: 0.7065191409278025

CV: 0.7173754493513594


<Figure size 432x288 with 0 Axes>

# 推論

In [None]:
# inference
# Load the best checkpoint of every fold; predictions are averaged over them.
models = []
for fold in range(NUM_SPLITS):
    # Build the head with the same class count that was used for training.
    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    #model.load_state_dict(torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"))
    # Same path expression as the training cell. map_location lets a
    # checkpoint written on GPU be loaded on a CPU-only runtime as well.
    state_dict = torch.load(MODELS_DIR + f"best_deberta_{fold}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Placeholder labels: make_dataset exposes a 'labels' column, which the test
# CSV is given here.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))

    for batch in progress:
        progress.set_description("<Test>")

        # Look tensors up by key rather than relying on the order of
        # batch.values().
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        fold_logits = [fold_model(input_ids, attention_mask) for fold_model in models]
        mean_logits = sum(fold_logits) / len(fold_logits)

        # Softmax is monotonic, so the arg-max of the averaged logits is the
        # predicted class; no probability conversion is needed.
        final_output.extend(mean_logits.argmax(dim=1).cpu().tolist())

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
submit["labels"] = final_output
# Shift back to the original label range (make_folded_df subtracts 1 for training).
submit["labels"] = submit["labels"] + 1
# The output name no longer depends on the `cv` variable from the training
# cell, so the former NameError fallback was unreachable and has been dropped.
submit.to_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_deberta.csv", index=False, header=False)
#submit.to_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_cv{}.csv".format(str(cv).replace(".", "")[:10]), index=False, header=False)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

# アンサンブル

In [None]:
import pandas as pd
import numpy as np

# Header-less submission files: column 0 is the sample id, column 1 the
# predicted label.
bert = pd.read_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_cv0711864244.csv",header=None)
roberta = pd.read_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submissionroberta-base.csv",header=None)
deberta = pd.read_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_deberta.csv",header=None)

# Put the three models' predictions side by side on the BERT frame
# (rows are matched by position, not by id).
for model_name, submission in (("bert", bert), ("roberta", roberta), ("deberta", deberta)):
    bert[model_name] = submission[1]

# Row-wise majority vote. Column 0 of the result is the first mode of each
# row; presumably the smallest label when all three models disagree, since
# pandas returns modes in sorted order.
votes = bert[["bert", "roberta", "deberta"]].mode(axis=1)
bert["voting"] = votes[0].astype(int)

bert[[0,"voting"]].to_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/Emsamble.csv",index=None,header=None)