In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored on Drive
# can be accessed through the local filesystem under that path.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 6.7 MB/s 
[K     |████████████████████████████████| 6.6 MB 56.5 MB/s 
[K     |████████████████████████████████| 101 kB 10.3 MB/s 
[?25h

In [None]:
!pip install -q nlp

[K     |████████████████████████████████| 1.7 MB 8.6 MB/s 
[K     |████████████████████████████████| 212 kB 69.2 MB/s 
[?25h

In [None]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer


In [None]:
# seeds
SEED = 42
def seed_everything(seed):
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch (CPU, and
    the CUDA generators when a GPU is available), and exports the value as
    ``PYTHONHASHSEED``.

    Args:
        seed: Integer seed applied to all generators.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # The CPU-side generators are independent of each other, so the order in
    # which they are seeded does not matter.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

# Apply the global seed before anything stochastic runs.
seed_everything(SEED)

if torch.cuda.is_available():
    # Disable cuDNN autotuning and request deterministic kernels so repeated
    # runs are as reproducible as possible.
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

    # Report which GPU this session is running on.
    current_device = torch.cuda.current_device()
    print(f"Device: {torch.cuda.get_device_name(current_device)}")


# config
# Data directory on the mounted Google Drive.
# The path is absolute, so it is written directly: the previous
# os.path.join(os.environ["HOME"], "/content/...") always evaluated to this
# same string (an absolute component discards everything before it) while
# raising KeyError whenever HOME was not set.
data_dir = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/data"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_FILE = os.path.join(data_dir, "train.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
PSEUDO_FILE = os.path.join(data_dir, "pseudo_label.csv")
MODELS_DIR = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/models"
# Pretrained encoder checkpoint; alternatives that were tried are kept below.
#MODEL_NAME = 'bert-base-uncased'
MODEL_NAME = 'roberta-base'
#MODEL_NAME = 'microsoft/deberta-base'
TRAIN_BATCH_SIZE = 32
VALID_BATCH_SIZE = 128
NUM_CLASSES = 4   # number of target classes
EPOCHS = 10       # training epochs per fold
NUM_SPLITS = 5    # number of cross-validation folds

Device: Tesla P100-PCIE-16GB


In [None]:
def make_folded_df(csv_file, num_splits=5):
    """Load the training and pseudo-label CSVs and assign stratified folds.

    The training CSV's ``jobflag`` column is shifted to start at 0 and renamed
    to ``labels``. The pseudo-label CSV (read from the module-level
    ``PSEUDO_FILE``) has its ``labels`` column shifted the same way. Both
    frames are concatenated and every row receives a fold id in ``kfold``
    from a shuffled ``StratifiedKFold`` seeded with the module-level ``SEED``.

    Args:
        csv_file: Path to the training CSV; must contain a ``jobflag`` column.
        num_splits: Number of folds.

    Returns:
        The combined DataFrame with ``labels`` and ``kfold`` columns and a
        fresh 0..n-1 index.
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df["kfold"] = np.nan
    df = df.rename(columns={'jobflag': 'labels'})

    dfp = pd.read_csv(PSEUDO_FILE)
    dfp["labels"] = dfp["labels"] - 1
    dfp["kfold"] = np.nan

    # ignore_index avoids duplicated index labels coming from the two files.
    df = pd.concat([df, dfp], axis=0, ignore_index=True)

    label = df["labels"].tolist()

    # Look the column up by name instead of assuming it sits at position 3,
    # so an extra or reordered CSV column cannot redirect the write.
    kfold_col = df.columns.get_loc("kfold")

    skfold = StratifiedKFold(num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # valid_indexes are positional, hence iloc; one assignment per fold.
        df.iloc[valid_indexes, kfold_col] = fold
    return df

def make_dataset(df, tokenizer, device, max_length=128):
    """Turn a DataFrame into a tokenized, torch-formatted ``nlp.Dataset``.

    Args:
        df: DataFrame with a ``description`` text column and a ``labels``
            column.
        tokenizer: Callable tokenizer applied to each description.
        device: Device passed to ``set_format`` for the returned tensors.
        max_length: Length every sequence is padded / truncated to.
            Defaults to 128, the value that was previously hard-coded.

    Returns:
        The dataset, formatted to expose only ``input_ids``,
        ``attention_mask`` and ``labels`` as torch tensors.
    """
    def tokenize(example):
        # Pad and truncate to a fixed length so every example has the same size.
        return tokenizer(example["description"],
                         padding="max_length",
                         truncation=True,
                         max_length=max_length)

    dataset = nlp.Dataset.from_pandas(df)
    dataset = dataset.map(tokenize)
    dataset.set_format(type='torch',
                       columns=['input_ids', 'attention_mask', 'labels'],
                       device=device)
    return dataset


# model
# NOTE: the triple-quoted string below is a disabled BERT variant of the
# classifier whose forward() additionally takes token_type_ids. It is a bare
# string expression, so none of it is executed; the active Classifier is
# defined after it.
"""
class Classifier(nn.Module):
    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(768, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        output, _ = self.bert(
            input_ids = input_ids,
            attention_mask = attention_mask,
            token_type_ids = token_type_ids,
            return_dict=False) # Pythonの実行上必要なので加筆しました。)
        output = output[:, 0, :]
        output = self.dropout(output)
        output = self.linear(output)
        return output
"""

# roberta
class Classifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    The head is applied to the hidden state of the first token of each
    sequence.

    Args:
        model_name: Checkpoint name or path passed to
            ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        # Size the head from the encoder's config instead of assuming 768, so
        # encoders with a different width also work; 768 remains the fallback
        # for configs that do not expose hidden_size.
        hidden_size = getattr(self.bert.config, "hidden_size", 768)
        self.dropout = nn.Dropout(0.1)
        self.linear = nn.Linear(hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask):
        """Return logits for a batch.

        Args:
            input_ids: Token id tensor for the batch.
            attention_mask: Attention mask tensor for the batch.

        Returns:
            The output of the linear head, one row of ``num_classes`` logits
            per sequence.
        """
        # token_type_ids is not used here.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            return_dict=False,
        )

        # With return_dict=False the encoder returns a tuple; its first
        # element is indexed as [batch, token, feature] below.
        sequence_output = encoder_outputs[0]
        first_token = sequence_output[:, 0, :]
        return self.linear(self.dropout(first_token))

# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch and return its metrics.

    Args:
        dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        optimizer: Stepped once per batch.
        scheduler: Stepped once per batch, right after the optimizer.
        device: Not used inside this function; kept for call compatibility.
        epoch: Zero-based epoch number, used only for the progress-bar label.

    Returns:
        Tuple ``(train_loss, train_acc, train_f1)``: mean batch loss, accuracy
        over ``len(dataloader.dataset)`` and macro-averaged F1.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    if len(dataloader) == 0:
        raise ValueError("train_fn received an empty dataloader")

    model.train()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Read the tensors by key: positional unpacking of batch.values()
        # depends on column ordering and could silently swap input_ids and
        # attention_mask.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        del batch

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask)
        del input_ids, attention_mask
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted label = arg-max logit
        del outputs

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        del loss
        # Accumulate a plain Python int so no tensor is kept across batches.
        total_corrects += torch.sum(preds == labels).item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()
        del labels, preds

        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on a validation dataloader without gradients.

    Args:
        dataloader: Yields dict batches with ``input_ids``,
            ``attention_mask`` and ``labels`` entries.
        model: Called as ``model(input_ids, attention_mask)``; returns logits.
        criterion: Loss called as ``criterion(logits, labels)``.
        device: Not used inside this function; kept for call compatibility.
        epoch: Zero-based epoch number, used only for the progress-bar label.

    Returns:
        Tuple ``(valid_loss, valid_acc, valid_f1)``: mean batch loss, accuracy
        over ``len(dataloader.dataset)`` and macro-averaged F1.

    Raises:
        ValueError: If the dataloader yields no batches.
    """
    if len(dataloader) == 0:
        raise ValueError("eval_fn received an empty dataloader")

    model.eval()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Read the tensors by key: positional unpacking of batch.values()
            # depends on column ordering and could silently swap input_ids and
            # attention_mask.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            labels = batch["labels"]
            del batch

            outputs = model(input_ids, attention_mask)
            del input_ids, attention_mask
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)  # predicted label = arg-max logit
            del outputs

            total_loss += loss.item()
            del loss
            # Accumulate a plain Python int so no tensor is kept across batches.
            total_corrects += torch.sum(preds == labels).item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()
            del labels, preds

            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)
    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Draw train/valid curves for loss, accuracy and F1, then clear them.

    For each metric a two-column ("Train", "Valid") frame indexed by epoch
    number 1..epoch+1 is drawn with seaborn and the resulting figure is
    cleared straight away. Saving to disk is currently disabled, so nothing
    is written and ``fold`` is only referenced by the disabled file names.

    Args:
        train_losses, train_accs, train_f1s: Per-epoch training metrics.
        valid_losses, valid_accs, valid_f1s: Per-epoch validation metrics.
        epoch: Zero-based index of the epoch that just finished; every list
            is expected to hold ``epoch + 1`` values.
        fold: Fold id (unused while saving is disabled).
    """
    epoch_numbers = range(1, epoch+2)

    # (train series, valid series), in the original drawing order.
    # Disabled output files, one per metric:
    #   ./figures/loss_plot_fold={fold}.png
    #   ./figures/acc_plot_fold={fold}.png
    #   ./figures/f1_plot_fold={fold}.png
    metric_pairs = (
        (train_losses, valid_losses),
        (train_accs, valid_accs),
        (train_f1s, valid_f1s),
    )

    for train_values, valid_values in metric_pairs:
        curve_df = pd.DataFrame({"Train": train_values,
                                 "Valid": valid_values},
                                index=epoch_numbers)
        figure = sns.lineplot(data=curve_df).get_figure()
        #figure.savefig(<path listed above>, dpi=300)
        figure.clf()

def trainer(fold, df):
    """Train and validate one cross-validation fold, saving the best model.

    Rows whose ``kfold`` equals ``fold`` form the validation set and all other
    rows the training set. A fresh ``Classifier`` is trained for ``EPOCHS``
    epochs; whenever the validation macro-F1 improves, the weights are saved
    to ``MODELS_DIR/best_<model name>_<fold>.pth``.

    NOTE(review): checkpoints used to be written to
    ``MODELS_DIR + "best_..."`` with no path separator, i.e. next to the
    models directory with a ``modelsbest_`` prefix. Code that loads them must
    use the corrected location.

    Args:
        fold: Fold id used for validation.
        df: DataFrame with ``description``, ``labels`` and ``kfold`` columns.

    Returns:
        The best validation macro-F1 reached during training.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Dummy scheduler: gamma=1.0 leaves the learning rate unchanged.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100000, gamma=1.0)

    # Hub names such as "microsoft/deberta-base" contain a slash, which would
    # otherwise be read as a (non-existent) sub-directory.
    safe_model_name = MODEL_NAME.replace("/", "_")
    checkpoint_path = os.path.join(MODELS_DIR, f"best_{safe_model_name}_{fold}.pth")
    os.makedirs(MODELS_DIR, exist_ok=True)

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        # Track the best loss and accuracy as well. The accuracy update was
        # previously assigned to a misspelled name and therefore lost.
        best_loss = min(best_loss, valid_loss)
        best_acc = max(best_acc, valid_acc)
        if valid_f1 > best_f1:
            # Model selection is driven by validation macro-F1 only.
            best_f1 = valid_f1
            print("model saving!", end="")
            torch.save(model.state_dict(), checkpoint_path)
        print("\n")

    return best_f1



In [None]:
# training
# Build the fold assignment once, then train one model per fold and collect
# each fold's best validation macro-F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print("fold", fold, "=" * 80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

# Cross-validation score = plain mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Text summary of the run; writing it to disk is currently disabled.
lines = "".join(f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores))
lines += f"CV    : {cv}"
#with open(f"./result/{MODEL_NAME}_result.txt", mode='w') as f:
 #   f.write(lines)



Downloading config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/478M [00:00<?, ?B/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.41726004928350446  Acc: 0.8192307692307692  f1: 0.727682427012948  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.37226945757865904  Acc: 0.8538461538461538  f1: 0.8027881647673314  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3468510925769806  Acc: 0.8653846153846154  f1: 0.7997746008652369  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.39030743986368177  Acc: 0.8538461538461538  f1: 0.7931446104878423  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4695721805095673  Acc: 0.8653846153846154  f1: 0.8134413251620625  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5623389840126037  Acc: 0.8269230769230769  f1: 0.7631854087989437  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5072648018598557  Acc: 0.8615384615384616  f1: 0.8175969145796732  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.575519397854805  Acc: 0.8596153846153847  f1: 0.8230332319176092  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6141914159059525  Acc: 0.8615384615384616  f1: 0.8185436169307136  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6268876045942307  Acc: 0.8673076923076923  f1: 0.8116685433462556  

<fold=0> best score: 0.8230332319176092



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4139712303876877  Acc: 0.8192307692307692  f1: 0.7159875511615814  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3532965511083603  Acc: 0.8596153846153847  f1: 0.7711507205783958  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.42410953640937804  Acc: 0.8538461538461538  f1: 0.7605473521892236  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.46619475483894346  Acc: 0.8403846153846154  f1: 0.7499438731033566  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.42044958248734476  Acc: 0.8711538461538462  f1: 0.8099027142901436  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4963513039052486  Acc: 0.8442307692307692  f1: 0.7642007223180759  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4932095818221569  Acc: 0.8769230769230769  f1: 0.814758860865517  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6853167682886123  Acc: 0.8346153846153846  f1: 0.7816086366258914  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5627501740120351  Acc: 0.8557692307692307  f1: 0.7922474465990627  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5574162617325783  Acc: 0.8576923076923076  f1: 0.8033437397919493  

<fold=1> best score: 0.814758860865517



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4877507507801056  Acc: 0.8153846153846154  f1: 0.7431295646476704  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.382385790348053  Acc: 0.8538461538461538  f1: 0.7810715912604447  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4962368309497833  Acc: 0.8326923076923077  f1: 0.7857316348631964  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4860677033662796  Acc: 0.8269230769230769  f1: 0.7721132897986078  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.47071752846241  Acc: 0.8365384615384616  f1: 0.7903897795117486  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5724344104528427  Acc: 0.8307692307692308  f1: 0.7669605594917371  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.536116500198841  Acc: 0.8346153846153846  f1: 0.7901778236814687  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5145371362566948  Acc: 0.8461538461538461  f1: 0.7826062874048603  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.7050230264663696  Acc: 0.8384615384615385  f1: 0.7741431025015475  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.597073820233345  Acc: 0.8442307692307692  f1: 0.7881950098059913  

<fold=2> best score: 0.7903897795117486



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.48445236682891846  Acc: 0.8015414258188824  f1: 0.687289204706367  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4337353318929672  Acc: 0.8323699421965318  f1: 0.783004596992284  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4320682242512703  Acc: 0.8400770712909441  f1: 0.7785269230578182  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.42711883336305617  Acc: 0.8516377649325626  f1: 0.7925757848070933  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5186408363282681  Acc: 0.8323699421965318  f1: 0.7740207968098387  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5092728029936552  Acc: 0.8420038535645472  f1: 0.7913857517603603  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5370549176819622  Acc: 0.8535645472061657  f1: 0.7990327698428163  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5518335649743676  Acc: 0.8497109826589595  f1: 0.8009873436449901  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5833077383227646  Acc: 0.8362235067437379  f1: 0.7833694191473517  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6600758589105681  Acc: 0.8420038535645472  f1: 0.7872528069195477  

<fold=3> best score: 0.8009873436449901



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5192247867584229  Acc: 0.8111753371868978  f1: 0.7248390562911367  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4365305632352829  Acc: 0.8362235067437379  f1: 0.7858462517564676  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5296590477228165  Acc: 0.8439306358381503  f1: 0.8191152631845758  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4790163069963455  Acc: 0.8362235067437379  f1: 0.7954051937092668  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4851788114756346  Acc: 0.8400770712909441  f1: 0.8049011145886661  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5295920699834824  Acc: 0.8458574181117534  f1: 0.8173660165233728  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5418550137430429  Acc: 0.8574181117533719  f1: 0.8189071952567503  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6735701739788056  Acc: 0.8574181117533719  f1: 0.842162415061857  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5970324516296387  Acc: 0.8554913294797688  f1: 0.822435010928511  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.8120075643062592  Acc: 0.8285163776493256  f1: 0.7840826403326404  

<fold=4> best score: 0.842162415061857

CV: 0.8142663262003443


<Figure size 432x288 with 0 Axes>

In [None]:
# training
# NOTE(review): this cell repeats the training driver; the output recorded
# under it loads microsoft/deberta-base, so it appears to have been run with a
# different MODEL_NAME than the config cell shows — confirm before re-running.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)
f1_scores = []
for fold in range(NUM_SPLITS):
    print("fold", fold, "=" * 80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

# Cross-validation score = plain mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Text summary of the run; writing it to disk is currently disabled.
lines = "".join(f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores))
lines += f"CV    : {cv}"
#with open(f"./result/{MODEL_NAME}_result.txt", mode='w') as f:
 #   f.write(lines)


        id                                        description  labels  kfold
0        0  <li>Develop cutting-edge web applications that...       2    NaN
1        1  <li> Designs and develops high quality, scalab...       2    NaN
2        2  <li>Functions as a point person for Network St...       3    NaN
3        3  <li> Work on the technical design, development...       2    NaN
4        4  <li>Quantify the resources required for a task...       3    NaN
...    ...                                                ...     ...    ...
1511  1511  <li>Support detailed reporting, statistical an...       0    NaN
1512  1512  <li>Collaborate with teams to support the ML t...       1    NaN
1513  1513  <li> Work with executives and other business l...       0    NaN
1514  1514  <li>Leading design ideation sessions to ensure...       2    NaN
1515  1515  <li>Detection of Issues &amp; Impact Assessmen...       0    NaN

[1516 rows x 4 columns]
        id                                        d

Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/474 [00:00<?, ?B/s]

Downloading vocab.json:   0%|          | 0.00/878k [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Downloading pytorch_model.bin:   0%|          | 0.00/533M [00:00<?, ?B/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4338808342814445  Acc: 0.8096153846153846  f1: 0.6722718191937367  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.33646690249443056  Acc: 0.875  f1: 0.813200042295966  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3494553752243519  Acc: 0.8711538461538462  f1: 0.8127723716041759  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3820335894823074  Acc: 0.8673076923076923  f1: 0.8162205195985296  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.37576880641281607  Acc: 0.8711538461538462  f1: 0.8176911093311349  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3907688602805138  Acc: 0.8730769230769231  f1: 0.8118505434630229  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4307014048099518  Acc: 0.8923076923076924  f1: 0.8429124815459244  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5316593080759049  Acc: 0.8653846153846154  f1: 0.8065320766940571  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.55443474650383  Acc: 0.8711538461538462  f1: 0.8093801876410572  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6198937356472015  Acc: 0.8730769230769231  f1: 0.8193413515630564  

<fold=0> best score: 0.8429124815459244



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.7116844832897187  Acc: 0.6846153846153846  f1: 0.5169887583680687  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.40697163343429565  Acc: 0.8519230769230769  f1: 0.7715532125619964  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.37333319038152696  Acc: 0.8596153846153847  f1: 0.784607452244718  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.43165757358074186  Acc: 0.8480769230769231  f1: 0.7772734878060825  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.37052715495228766  Acc: 0.8557692307692307  f1: 0.7971267351002009  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.44881040044128895  Acc: 0.8384615384615385  f1: 0.7669355703748821  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5141763359308242  Acc: 0.8461538461538461  f1: 0.7762150189330463  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5007702860515565  Acc: 0.8557692307692307  f1: 0.7960754865897801  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5743578806053847  Acc: 0.85  f1: 0.7267656278812464  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6819874880369753  Acc: 0.825  f1: 0.773232894556424  

<fold=1> best score: 0.7971267351002009



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.7867355227470398  Acc: 0.7153846153846154  f1: 0.5429302191176388  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.43765740692615507  Acc: 0.8403846153846154  f1: 0.763731307208113  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.43071475625038147  Acc: 0.8384615384615385  f1: 0.7676021353907964  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.47945402562618256  Acc: 0.8403846153846154  f1: 0.7833131817434515  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5118482738733292  Acc: 0.8442307692307692  f1: 0.799930679494166  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6408377856016159  Acc: 0.8365384615384616  f1: 0.7933853703865477  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.7036480486392975  Acc: 0.8326923076923077  f1: 0.728575153875469  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6724226653575898  Acc: 0.8480769230769231  f1: 0.7928249831236558  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6870972335338592  Acc: 0.8557692307692307  f1: 0.7706129370847289  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.721855890750885  Acc: 0.8442307692307692  f1: 0.7705033904148948  

<fold=2> best score: 0.799930679494166



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 1.2678148031234742  Acc: 0.31021194605009633  f1: 0.11838235294117648  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.44785037338733674  Acc: 0.8265895953757225  f1: 0.7557411644847634  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3962913557887077  Acc: 0.8516377649325626  f1: 0.7928401025228886  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4033586591482162  Acc: 0.8420038535645472  f1: 0.7775000805180038  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3986905999481678  Acc: 0.861271676300578  f1: 0.7998721936221935  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.43028239980340005  Acc: 0.8689788053949904  f1: 0.8399916309473257  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.47478230241686104  Acc: 0.8574181117533719  f1: 0.8119130015492688  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5599281344562769  Acc: 0.8497109826589595  f1: 0.793347004054735  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5130700960755348  Acc: 0.8631984585741811  f1: 0.81353811120477  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5702265337109566  Acc: 0.8477842003853564  f1: 0.7900097028362804  

<fold=3> best score: 0.8399916309473257



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaModel: ['lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 1.2197998046875  Acc: 0.32755298651252407  f1: 0.12336719883889696  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4622264474630356  Acc: 0.8188824662813102  f1: 0.7095006383173776  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4004869759082794  Acc: 0.8362235067437379  f1: 0.7888628696321004  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5031900823116302  Acc: 0.8362235067437379  f1: 0.8218430601076321  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4359537035226822  Acc: 0.8516377649325626  f1: 0.8152563992080607  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4503773391246796  Acc: 0.8574181117533719  f1: 0.8400744407464802  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.47173920935019853  Acc: 0.8631984585741811  f1: 0.8307254665868707  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5381560860434547  Acc: 0.8554913294797688  f1: 0.8519826727694626  model saving!



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6199456243775785  Acc: 0.8477842003853564  f1: 0.822330342038238  



  0%|          | 0/65 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6519071936607361  Acc: 0.8574181117533719  f1: 0.8350009290226682  

<fold=4> best score: 0.8519826727694626

CV: 0.826388839971416


<Figure size 432x288 with 0 Axes>

In [None]:
# inference
# Fold ensemble: load the best checkpoint of every CV fold, average the
# per-fold model outputs for each test batch, pick the top-scoring class and
# write the predictions into a copy of the sample submission file.
models = []
for fold in range(NUM_SPLITS):
    model = Classifier(MODEL_NAME)
    # NOTE(review): MODELS_DIR is defined without a trailing "/", so this
    # concatenation glues the file name directly onto the directory name.
    # Left untouched because it has to match the path used when the
    # checkpoints were saved -- confirm against the training cell.
    # Alternative name used previously: MODELS_DIR + f"best_deberta_{fold}.pth"
    checkpoint_path = MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"
    # map_location remaps tensors that were saved from a CUDA device, so the
    # checkpoints also load when DEVICE has fallen back to the CPU.
    model.load_state_dict(torch.load(checkpoint_path, map_location=DEVICE))
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
# Dummy label for the unlabeled test rows; presumably make_dataset expects a
# "labels" column -- the unpacked labels are never used below.
test_df["labels"] = -1
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader), desc="<Test>")

    for batch in progress:
        # NOTE(review): positional unpacking relies on the key order of the
        # batch dict (attention_mask, input_ids, labels) -- confirm against
        # make_dataset before changing the dataset columns.
        attention_mask, input_ids, labels = batch.values()

        fold_outputs = [model(input_ids, attention_mask) for model in models]
        mean_output = sum(fold_outputs) / len(fold_outputs)

        # softmax is monotonic within a row, so the argmax of the averaged
        # raw outputs is the same class the softmax probabilities would give.
        final_output.extend(torch.argmax(mean_output, dim=1).cpu().tolist())

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
# Shift the predicted class indices by one; presumably the competition labels
# are 1-based -- confirm against the label encoding used for training.
submit["labels"] = np.asarray(final_output, dtype=int) + 1
# NOTE(review): the file name is fixed to "roberta" regardless of MODEL_NAME;
# rename it when running another backbone so earlier submissions survive.
submit.to_csv("/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_roberta.csv", index=False, header=False)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.weight', 'lm_head.decoder.weight', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaMod

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

In [None]:
import pandas as pd
import numpy as np

# Majority-vote ensemble of the per-model submission files.
# The submissions are header-less CSVs; column 0 is read as the sample id and
# column 1 as the predicted label.
OUTPUTS_DIR = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs"

bert = pd.read_csv(f"{OUTPUTS_DIR}/submission_cv0830116346.csv", header=None)
roberta = pd.read_csv(f"{OUTPUTS_DIR}/submission_roberta.csv", header=None)
deberta = pd.read_csv(f"{OUTPUTS_DIR}/submission_deberta.csv", header=None)

# The votes are combined row by row, so every file must list exactly the same
# ids in the same order; otherwise labels of different samples would be mixed
# (or silently dropped as NaN) without any error.
for model_name, submission in (("roberta", roberta), ("deberta", deberta)):
    if not submission[0].equals(bert[0]):
        raise ValueError(
            f"id column of the {model_name} submission does not match the bert submission"
        )

votes = pd.DataFrame({
    "bert": bert[1],
    "roberta": roberta[1],
    "deberta": deberta[1],
})

# mode(axis=1) lists each row's most frequent label(s) in ascending order, so
# column 0 is the majority label, or the smallest label when all three models
# disagree.
voting = votes.mode(axis=1)[0].astype(int)

ensemble = pd.DataFrame({0: bert[0], "voting": voting})
ensemble.to_csv(f"{OUTPUTS_DIR}/Ensemble_pse.csv", index=False, header=False)