# 各種インポート

In [None]:
# Mount Google Drive at /content/drive; the data, model and output paths used
# later in this notebook all live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install the `transformers` and `nlp` packages imported below (-q keeps pip quiet).
# NOTE(review): versions are not pinned, so a fresh runtime may pull different releases.
!pip install -q transformers
!pip install -q nlp

[K     |████████████████████████████████| 4.7 MB 7.5 MB/s 
[K     |████████████████████████████████| 6.6 MB 43.9 MB/s 
[K     |████████████████████████████████| 101 kB 10.8 MB/s 
[?25h

In [None]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup

# パラメータ設定

In [None]:
# Global random seed; also used as random_state for the stratified CV split.
SEED = 42


def seed_everything(seed):
    """Seed every random number generator this notebook relies on.

    Sets PYTHONHASHSEED in the environment and seeds Python's `random`,
    NumPy's legacy global RNG and PyTorch's CPU generator. When CUDA is
    available, the CUDA generators (current device and all devices) are
    seeded as well.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # CPU-side generators, seeded in the order: random, numpy, torch.
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)

    # Nothing more to do on a CPU-only runtime.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Seed all RNGs once, before any dataset, data loader or model is built.
seed_everything(SEED)

if torch.cuda.is_available():
    # Request deterministic cuDNN behaviour and switch off its benchmark mode.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    # Report which GPU the runtime was given.
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))


# config
# Absolute Drive path of the competition data. It used to be joined onto
# os.environ["HOME"], which had no effect (os.path.join drops earlier components
# when a later one is absolute) and raised KeyError whenever HOME was unset.
data_dir = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/data"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_FILE = os.path.join(data_dir, "train.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
# Checkpoint paths are built by plain concatenation (MODELS_DIR + "best_...pth"),
# so the trailing separator is required for the files to land *inside* the
# directory. Without it they were written next to it as ".../modelsbest_*.pth";
# checkpoints produced by such earlier runs must be moved/renamed to be loaded.
MODELS_DIR = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/models/"
MODEL_NAME = 'bert-base-uncased'
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 128
NUM_CLASSES = 4
EPOCHS = 10
NUM_SPLITS = 5

# Optimizer-related settings.
# NOTE(review): in the code visible in this notebook these values are only
# referenced by the disabled warmup-scheduler snippet (or not at all); the
# active training loop does not read them.
#weight_decay = 2e-5
beta = (0.9, 0.98)
num_warmup_steps_rate = 0.01
clip_grad_norm = None
gradient_accumulation_steps = 1
#num_eval = 1

Device: Tesla P100-PCIE-16GB


# モデルの設定

In [None]:
# dataset
def make_folded_df(csv_file, num_splits=5):
    """Load the training CSV and tag every row with a stratified fold id.

    The `jobflag` column is shifted down by one and renamed to `labels`
    (the inference cell adds one back before writing the submission). A
    `kfold` column is added holding, for each row, the index of the fold in
    which that row is part of the validation split.

    Args:
        csv_file: Path to a CSV file containing a `jobflag` column.
        num_splits: Number of stratified folds.

    Returns:
        The DataFrame with `labels` and `kfold` columns.
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df["kfold"] = np.nan
    df = df.rename(columns={'jobflag': 'labels'})
    label = df["labels"].tolist()

    # Address the fold column by name: a hard-coded position (previously 3)
    # writes into the wrong column as soon as the CSV has a different layout.
    kfold_col = df.columns.get_loc("kfold")

    skfold = StratifiedKFold(num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # valid_indexes are row positions, hence iloc; one vectorized write per fold.
        df.iloc[valid_indexes, kfold_col] = fold
    return df

def make_dataset(df, tokenizer, device, max_length=128):
    """Convert a DataFrame into a tokenized, torch-formatted `nlp.Dataset`.

    Each row's `description` text is tokenized, padded to `max_length` and
    truncated to it. The dataset is then formatted to yield torch tensors for
    `input_ids`, `token_type_ids`, `attention_mask` and `labels` on `device`.

    Args:
        df: DataFrame with at least `description` and `labels` columns.
        tokenizer: Callable tokenizer accepting `padding`, `truncation` and
            `max_length` keyword arguments.
        device: Device passed to `set_format` for the produced tensors.
        max_length: Padded/truncated sequence length (defaults to the
            previously hard-coded 128).

    Returns:
        The formatted dataset.
    """
    def tokenize(example):
        return tokenizer(example["description"],
                         padding="max_length",
                         truncation=True,
                         max_length=max_length)

    dataset = nlp.Dataset.from_pandas(df)
    dataset = dataset.map(tokenize)
    dataset.set_format(type='torch',
                       columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
                       device=device)
    return dataset


# model
class Classifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    The head reads the hidden state at sequence position 0 of the encoder's
    first output and maps it to `num_classes` logits.

    Args:
        model_name: Name or path handed to `AutoModel.from_pretrained`.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's own config instead of assuming 768,
        # so encoders with a different hidden width work unchanged.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return logits of shape (batch, num_classes) for a tokenized batch."""
        # return_dict=False makes the encoder return a plain tuple (the original
        # note says this was added because the code would not run without it).
        # Index the first element rather than unpacking exactly two values, so
        # the number of returned items does not matter.
        hidden_states = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False)[0]
        first_token_state = hidden_states[:, 0, :]
        return self.linear(self.dropout(first_token_state))


# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch and report loss, accuracy and macro-F1.

    Args:
        dataloader: Yields dict batches with `input_ids`, `attention_mask`,
            `token_type_ids` and `labels` tensors.
        model: Module called as `model(input_ids, attention_mask, token_type_ids)`.
        criterion: Loss callable applied to `(outputs, labels)`.
        optimizer: Optimizer stepped once per batch.
        scheduler: LR scheduler stepped once per batch.
        device: Unused here; kept for interface compatibility.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Tuple `(train_loss, train_acc, train_f1)`: mean loss per batch,
        fraction of correctly classified samples, and macro-averaged F1.
    """
    model.train()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by key; positional unpacking of batch.values() only
        # works while the dict happens to be ordered alphabetically.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        labels = batch["labels"]

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, token_type_ids)
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted class = arg-max logit

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        # Accumulate as a Python int so the final accuracy is a plain float.
        total_corrects += (preds == labels).sum().item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()

        # Running metrics over everything seen so far in this epoch.
        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on a data loader without gradient tracking.

    Args:
        dataloader: Yields dict batches with `input_ids`, `attention_mask`,
            `token_type_ids` and `labels` tensors.
        model: Module called as `model(input_ids, attention_mask, token_type_ids)`.
        criterion: Loss callable applied to `(outputs, labels)`.
        device: Unused here; kept for interface compatibility.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Tuple `(valid_loss, valid_acc, valid_f1)`: mean loss per batch,
        fraction of correctly classified samples, and macro-averaged F1.
    """
    model.eval()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by key; positional unpacking of batch.values()
            # only works while the dict happens to be ordered alphabetically.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            token_type_ids = batch["token_type_ids"]
            labels = batch["labels"]

            outputs = model(input_ids, attention_mask, token_type_ids)
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)

            total_loss += loss.item()
            # Accumulate as a Python int so the final accuracy is a plain float.
            total_corrects += (preds == labels).sum().item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()

            # Running metrics over everything seen so far in this pass.
            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Draw train/valid curves for loss, accuracy and F1, then clear each figure.

    For each metric a two-column ("Train", "Valid") frame indexed by epoch
    number 1..epoch+1 is plotted with seaborn and the figure is cleared right
    away. Saving to disk is currently disabled, so `fold` (which only appeared
    in the disabled file names) is not used.

    Args:
        train_losses, train_accs, train_f1s: Per-epoch training metrics.
        valid_losses, valid_accs, valid_f1s: Per-epoch validation metrics.
        epoch: Zero-based index of the latest epoch; each list is expected to
            hold epoch+1 values.
        fold: Fold index (unused while saving is disabled).
    """
    epoch_numbers = range(1, epoch+2)
    metric_pairs = (
        (train_losses, valid_losses),   # loss
        (train_accs, valid_accs),       # accuracy
        (train_f1s, valid_f1s),         # macro-F1
    )

    for train_values, valid_values in metric_pairs:
        curves = pd.DataFrame({"Train": train_values,
                               "Valid": valid_values},
                              index=epoch_numbers)
        figure = sns.lineplot(data=curves).get_figure()
        # Disabled: figure.savefig(f"./figures/<metric>_plot_fold={fold}.png", dpi=300)
        figure.clf()

def trainer(fold, df):
    """Train and validate one cross-validation fold, checkpointing the best model.

    Rows whose `kfold` equals `fold` form the validation set; all others form
    the training set. After every epoch the validation macro-F1 is compared
    with the best so far and, on improvement, the model's state dict is saved.

    Args:
        fold: Index of the fold to hold out for validation.
        df: DataFrame with `description`, `labels` and `kfold` columns.

    Returns:
        The best validation macro-F1 reached over all epochs.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    # Untried loss options left by the author:
    # weight=torch.tensor(1/np.array([0.3, 0.1, 0.3, 0.3])).cuda().float(), label_smoothing=0.1
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Dummy scheduler: gamma=1.0 multiplies the learning rate by one, so it
    # stays constant while train_fn can still call scheduler.step().
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100000, gamma=1.0)
    # Disabled alternative (linear schedule with warmup):
    # num_train_optimization_steps = int(
    #         len(train_dataloader) * EPOCHS // gradient_accumulation_steps
    #         )
    # num_warmup_steps = int(num_train_optimization_steps * num_warmup_steps_rate)
    # scheduler = get_linear_schedule_with_warmup(
    #     optimizer,
    #     num_warmup_steps=num_warmup_steps,
    #     num_training_steps=num_train_optimization_steps
    # )

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    # Built by plain concatenation on purpose: the inference cell loads the
    # checkpoint with the very same expression, so both must stay in sync.
    checkpoint_path = MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"
    checkpoint_dir = os.path.dirname(checkpoint_path)

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        best_loss = valid_loss if valid_loss < best_loss else best_loss
        # Was assigned to a misspelled name ("besl_acc"), so best_acc never changed.
        best_acc = valid_acc if valid_acc > best_acc else best_acc
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            # Make sure the target directory exists before writing.
            if checkpoint_dir:
                os.makedirs(checkpoint_dir, exist_ok=True)
            torch.save(model.state_dict(), checkpoint_path)
        print("\n")

    return best_f1



# 学習

In [None]:
# training
# Build the fold assignment once, then train each fold and collect its best F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)

f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "="*80)
    fold_f1 = trainer(fold, df)
    f1_scores += [fold_f1]
    print(f"<fold={fold}> best score: {fold_f1}\n")

# Cross-validation score: plain mean of the per-fold best macro-F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Text summary of the run (one line per fold plus the CV mean).
lines = "".join(f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores))
lines = lines + f"CV    : {cv}"
# Writing the summary to disk is currently disabled:
#with open(f"./result/{MODEL_NAME}_result.txt", mode='w') as f:
 #   f.write(lines)




  0%|          | 0/1212 [00:00<?, ?it/s]

  0%|          | 0/304 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7579819560050964  Acc: 0.7171052631578947  f1: 0.557489945238627  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6203917562961578  Acc: 0.7664473684210527  f1: 0.7057448584930566  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6071707109610239  Acc: 0.7763157894736842  f1: 0.7603256805114391  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6750717361768087  Acc: 0.7828947368421053  f1: 0.7645130084534992  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8739175001780192  Acc: 0.7631578947368421  f1: 0.732864387134164  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8340888619422913  Acc: 0.7894736842105263  f1: 0.7385097176435876  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0254599054654439  Acc: 0.7368421052631579  f1: 0.6939292958562735  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1414199670155842  Acc: 0.7532894736842105  f1: 0.711973833526374  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1758747299512227  Acc: 0.756578947368421  f1: 0.7177359339642513  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9691357811292013  Acc: 0.7631578947368421  f1: 0.7233717894985374  

<fold=0> best score: 0.7645130084534992



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7829303741455078  Acc: 0.6897689768976898  f1: 0.5321557771090498  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7461820046106974  Acc: 0.7557755775577558  f1: 0.7134247279813782  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7572135726610819  Acc: 0.7227722772277227  f1: 0.6383919498524078  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8296920657157898  Acc: 0.7623762376237624  f1: 0.7001391625230634  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.154526948928833  Acc: 0.7062706270627063  f1: 0.6501000479386938  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0719426274299622  Acc: 0.759075907590759  f1: 0.694970483078627  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2878456910451253  Acc: 0.735973597359736  f1: 0.6876747132233539  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.2060903708140056  Acc: 0.7458745874587459  f1: 0.6870930606125508  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3413182099660237  Acc: 0.7656765676567657  f1: 0.7168029252103678  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3651082118352253  Acc: 0.7722772277227723  f1: 0.724353873944483  model saving!

<fold=1> best score: 0.724353873944483



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8130361437797546  Acc: 0.6798679867986799  f1: 0.5197892210228092  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.64552108446757  Acc: 0.7458745874587459  f1: 0.6343695742042167  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6768148144086202  Acc: 0.7557755775577558  f1: 0.6246250650845167  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7313060760498047  Acc: 0.7524752475247525  f1: 0.6577895116082288  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8640336592992147  Acc: 0.7425742574257426  f1: 0.687939087970936  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9443755547205607  Acc: 0.7458745874587459  f1: 0.6183979621693259  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9335550864537557  Acc: 0.7392739273927392  f1: 0.6399906545067835  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.0117711623509724  Acc: 0.7425742574257426  f1: 0.6325165184400902  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1235684553782146  Acc: 0.735973597359736  f1: 0.625503742084053  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1641011238098145  Acc: 0.7293729372937293  f1: 0.604373307004886  

<fold=2> best score: 0.687939087970936



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7200391491254171  Acc: 0.6765676567656765  f1: 0.5195642453756355  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6458527942498525  Acc: 0.7326732673267327  f1: 0.6474667177944271  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.616533507903417  Acc: 0.7656765676567657  f1: 0.6386497761497761  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7868344187736511  Acc: 0.7557755775577558  f1: 0.7176703639378571  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.819569448630015  Acc: 0.7887788778877888  f1: 0.685129368866185  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1045818726221721  Acc: 0.7458745874587459  f1: 0.6587327728308822  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9422621726989746  Acc: 0.7722772277227723  f1: 0.7095028367863804  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.025556782881419  Acc: 0.7656765676567657  f1: 0.6858163622387302  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1094741423924763  Acc: 0.759075907590759  f1: 0.7087533880459451  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.1588546832402546  Acc: 0.7656765676567657  f1: 0.7088227393276674  

<fold=3> best score: 0.7176703639378571



  0%|          | 0/1213 [00:00<?, ?it/s]

  0%|          | 0/303 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.7029343446095785  Acc: 0.7128712871287128  f1: 0.5477456011730206  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6831811666488647  Acc: 0.7260726072607261  f1: 0.6491442970050658  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.6861290335655212  Acc: 0.7293729372937293  f1: 0.608817565841339  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.662526547908783  Acc: 0.7623762376237624  f1: 0.6889669411887698  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.8102550705273946  Acc: 0.7491749174917491  f1: 0.6981238032970625  model saving!



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 0.9594442844390869  Acc: 0.7458745874587459  f1: 0.6913911968790017  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.048702637354533  Acc: 0.7194719471947195  f1: 0.6640851096314324  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.202429970105489  Acc: 0.7227722772277227  f1: 0.6767829186321245  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3155246178309123  Acc: 0.7227722772277227  f1: 0.6695098218632295  



  0%|          | 0/76 [00:00<?, ?it/s]

  0%|          | 0/3 [00:00<?, ?it/s]

Loss: 1.3561618725458782  Acc: 0.7227722772277227  f1: 0.677888291217791  

<fold=4> best score: 0.6981238032970625

CV: 0.7185200275207676


<Figure size 432x288 with 0 Axes>

# 推論

In [None]:
# inference
# Rebuild one classifier per fold and load that fold's best checkpoint.
models = []
for fold in range(NUM_SPLITS):
    # Same head size as in training (previously relied on the default of 4).
    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    # map_location lets checkpoints saved on a GPU load on a CPU-only runtime.
    # The path expression must match the one used when saving in trainer().
    state_dict = torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth", map_location=DEVICE)
    model.load_state_dict(state_dict)
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Placeholder labels: make_dataset formats a 'labels' column, which the test
# frame does not get from any other line in this cell.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))

    for batch in progress:
        progress.set_description("<Test>")

        # Look tensors up by key; positional unpacking of batch.values() only
        # works while the dict happens to be ordered alphabetically.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]

        # Ensemble: average the logits of all fold models, then take the arg-max.
        outputs = []
        for model in models:
            output = model(input_ids, attention_mask, token_type_ids)
            outputs.append(output)

        outputs = sum(outputs) / len(outputs)
        outputs = torch.softmax(outputs, dim=1).cpu().detach().tolist()
        outputs = np.argmax(outputs, axis=1)

        final_output.extend(outputs)

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
submit["labels"] = final_output
# Undo the "-1" shift applied to jobflag when the training frame was built.
submit["labels"] = submit["labels"] + 1

# Only the lookup of `cv` is guarded: it is undefined when the training cell was
# not run in this session. Errors raised while writing the file are no longer
# mistaken for that case.
try:
    submission_name = "submission_cv{}.csv".format(str(cv).replace(".", "")[:10])
except NameError:
    submission_name = "submission.csv"
submit.to_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/" + submission_name, index=False, header=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.pred

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]