In [None]:
# Mount Google Drive at /content/drive (Colab only); the data and checkpoint
# paths configured further down all live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install transformers (provides AdamW, AutoModel and AutoTokenizer imported below).
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install -q transformers

[K     |████████████████████████████████| 4.7 MB 5.0 MB/s 
[K     |████████████████████████████████| 6.6 MB 53.2 MB/s 
[K     |████████████████████████████████| 101 kB 12.3 MB/s 
[?25h

In [None]:
# Install the `nlp` package; make_dataset relies on nlp.Dataset.from_pandas.
# NOTE(review): the version is not pinned, so a re-run may pull a different release.
!pip install -q nlp

[K     |████████████████████████████████| 1.7 MB 5.1 MB/s 
[K     |████████████████████████████████| 212 kB 73.5 MB/s 
[?25h

In [None]:
import collections
import os
import random

import matplotlib.pyplot as plt
import nlp
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from sklearn.model_selection import StratifiedKFold
from tqdm.notebook import tqdm
from transformers import AdamW, AutoModel, AutoTokenizer, get_linear_schedule_with_warmup


In [None]:
# Reproducibility: one seed shared by every random number generator in use.
SEED = 42


def seed_everything(seed):
    """Seed Python, NumPy and PyTorch (CPU and, when present, CUDA) RNGs.

    Also exports the seed as PYTHONHASHSEED in the process environment.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)

    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

seed_everything(SEED)

if torch.cuda.is_available():
    # Ask cuDNN for deterministic kernels and disable its autotuner so that
    # repeated runs with the same seed are comparable.
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    current_device = torch.cuda.current_device()
    print("Device:", torch.cuda.get_device_name(current_device))


# config
# The data directory is an absolute path on the mounted Drive. It used to be
# built with os.path.join(os.environ["HOME"], "/content/..."), but os.path.join
# drops every component before an absolute one, so HOME never contributed to
# the result and only added a KeyError when the variable was unset.
data_dir = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/data"
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
TRAIN_FILE = os.path.join(data_dir, "train.csv")
TEST_FILE = os.path.join(data_dir, "test.csv")
PSEUDO_FILE = os.path.join(data_dir, "pseudo_label.csv")
# NOTE(review): checkpoint paths are built as MODELS_DIR + f"best_..." (plain
# string concatenation). Without a trailing separator the files are therefore
# written next to the `models` folder as `modelsbest_<model>_<fold>.pth`.
# The value is left as-is so checkpoints saved by earlier runs still load.
MODELS_DIR = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/models"
MODEL_NAME = 'bert-base-uncased'
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 128
NUM_CLASSES = 4
EPOCHS = 10
NUM_SPLITS = 5

# Optimizer and related settings.
# NOTE(review): none of the values below are read by the training code in this
# notebook (the warmup scheduler that would use them is commented out).
#weight_decay = 2e-5
beta = (0.9, 0.98)
num_warmup_steps_rate = 0.01
clip_grad_norm = None
gradient_accumulation_steps = 1
#num_eval = 1

Device: Tesla P100-PCIE-16GB


In [None]:
# make pseudo label data
#
# Ensembles the per-fold checkpoints over the test set and keeps the rows whose
# highest class probability reaches PSEUDO_LABEL_THRESHOLD as extra training data.
#
# NOTE(review): this cell uses Classifier and make_dataset, which are defined in
# a later cell, and loads checkpoints written by a previous training run. On a
# fresh kernel the definitions cell has to be executed before this one.

# Minimum ensemble probability for a test row to be accepted as a pseudo label.
PSEUDO_LABEL_THRESHOLD = 0.9

# Never filled by this cell; kept only so the name stays defined.
psuedo_texts = []

models = []
for fold in range(NUM_SPLITS):
    model = Classifier(MODEL_NAME)
    model.load_state_dict(torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"))
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
test_df["labels"] = -1  # placeholder: make_dataset selects a 'labels' column
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

# One list of class probabilities per test row, in test_df row order
# (the loader does not shuffle).
final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))

    for batch in progress:
        progress.set_description("<Test>")

        # Look tensors up by name instead of unpacking batch.values(), which
        # silently depends on the key order of the batch dict.
        fold_logits = [
            model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])
            for model in models
        ]

        # Average the logits across folds, then turn them into probabilities.
        mean_logits = sum(fold_logits) / len(fold_logits)
        final_output.extend(torch.softmax(mean_logits, dim=1).cpu().tolist())

# reshape keeps the array two-dimensional even when there are no predictions.
probs = np.asarray(final_output, dtype=float).reshape(-1, NUM_CLASSES)
assert len(probs) == len(test_df), "expected exactly one prediction per test row"

# Select confident rows positionally so labels and rows cannot get misaligned
# (the previous version matched them indirectly through ids read from
# submit_sample.csv).
confident = probs.max(axis=1) >= PSEUDO_LABEL_THRESHOLD
psuedo_labels = probs[confident].argmax(axis=1).tolist()
psuedo_ids = test_df.loc[confident, "id"].tolist()

print(len(psuedo_labels))
print(len(psuedo_ids))

# .copy() makes this an independent frame, so the assignment below does not
# trigger pandas' SettingWithCopyWarning.
test_df_psuedo = test_df.loc[confident].copy()
# Shift 0-based class indices back by one; make_folded_df subtracts 1 again on load.
test_df_psuedo["labels"] = np.asarray(psuedo_labels, dtype=np.int64) + 1

print(test_df_psuedo)

test_df_psuedo.to_csv(PSEUDO_FILE, index=False)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predic

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]

1082
1082
        id                                        description  labels
0     1516  <li>Building decision-making models and propos...       1
1     1517  <li>Educate homeowners on the benefits of sola...       4
2     1518  <li><span>Design, develop, document, and imple...       3
3     1519  <li>Apply advanced technical expertise and ski...       4
5     1521  <li>Model and execute on concurrent Product A/...       1
...    ...                                                ...     ...
1511  3027  <li>Completes programming and performs testing...       3
1512  3028  <li> Manages the development of interface requ...       3
1513  3029  <li>Lead the implementation of new statistical...       1
1514  3030  <li>Configure Zuora to fulfill use cases</li><...       3
1516  3032  <li>Support a business intelligence environmen...       1

[1082 rows x 3 columns]


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# dataset
def make_folded_df(csv_file, num_splits=5):
    """Load the training CSV plus the pseudo-label CSV and assign stratified folds.

    Args:
        csv_file: Path to the training CSV; must contain a ``jobflag`` column,
            which is shifted down by one and renamed to ``labels``.
        num_splits: Number of stratified folds.

    Returns:
        A DataFrame holding the training rows followed by the rows of
        ``PSEUDO_FILE`` (whose ``labels`` column is likewise shifted down by
        one), with a fresh 0..n-1 index and a ``kfold`` column holding the
        validation fold of every row.

    Note:
        The pseudo-labelled rows are concatenated before splitting, so they
        take part in the validation folds as well as the training folds.
    """
    df = pd.read_csv(csv_file)
    df["jobflag"] = df["jobflag"] - 1
    df["kfold"] = np.nan
    df = df.rename(columns={'jobflag': 'labels'})
    print(df)

    dfp = pd.read_csv(PSEUDO_FILE)
    dfp["labels"] = dfp["labels"] - 1
    dfp["kfold"] = np.nan
    print(dfp)

    # ignore_index avoids duplicate index labels coming from the two source frames.
    df = pd.concat([df, dfp], axis=0, ignore_index=True)

    label = df["labels"].tolist()
    # Resolve the column position by name instead of assuming it is column 3.
    kfold_col = df.columns.get_loc("kfold")

    skfold = StratifiedKFold(num_splits, shuffle=True, random_state=SEED)
    for fold, (_, valid_indexes) in enumerate(skfold.split(range(len(label)), label)):
        # valid_indexes are positional, hence iloc; one assignment per fold.
        df.iloc[valid_indexes, kfold_col] = fold
    return df

def make_dataset(df, tokenizer, device, max_length=128):
    """Tokenize a DataFrame into a torch-formatted ``nlp.Dataset``.

    Args:
        df: DataFrame with a ``description`` text column and a ``labels`` column.
        tokenizer: Callable tokenizer applied to each description.
        device: Device passed to ``set_format`` for the returned tensors.
        max_length: Length every sequence is padded or truncated to
            (defaults to the previously hard-coded 128).

    Returns:
        The dataset, formatted to expose ``input_ids``, ``token_type_ids``,
        ``attention_mask`` and ``labels`` as torch tensors.
    """
    dataset = nlp.Dataset.from_pandas(df)
    dataset = dataset.map(
        lambda example: tokenizer(example["description"],
                                  padding="max_length",
                                  truncation=True,
                                  max_length=max_length))
    dataset.set_format(type='torch',
                       columns=['input_ids', 'token_type_ids', 'attention_mask', 'labels'],
                       device=device)
    return dataset


# model
class Classifier(nn.Module):
    """Pretrained encoder followed by dropout and a linear classification head.

    Args:
        model_name: Name or path handed to ``AutoModel.from_pretrained``.
        num_classes: Number of output logits.
    """

    def __init__(self, model_name, num_classes=4):
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(0.1)
        # Size the head from the encoder's config rather than hard-coding 768,
        # so encoders with a different hidden width work as well.
        self.linear = nn.Linear(self.bert.config.hidden_size, num_classes)
        nn.init.normal_(self.linear.weight, std=0.02)
        nn.init.zeros_(self.linear.bias)

    def forward(self, input_ids, attention_mask, token_type_ids):
        """Return one row of ``num_classes`` logits per input sequence."""
        # return_dict=False makes the encoder return a plain tuple.
        encoder_outputs = self.bert(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
            return_dict=False)
        # Element 0 is the per-token output; indexing (rather than unpacking a
        # pair) does not depend on how many elements the tuple has.
        output = encoder_outputs[0]
        output = output[:, 0, :]  # keep only the first token's vector
        output = self.dropout(output)
        output = self.linear(output)
        return output


# training function
def train_fn(dataloader, model, criterion, optimizer, scheduler, device, epoch):
    """Run one training epoch.

    Args:
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` tensors.
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Stepped once per batch.
        scheduler: Stepped once per batch, right after the optimizer.
        device: Unused; kept for interface compatibility.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        ``(train_loss, train_acc, train_f1)``: mean batch loss, accuracy over
        ``len(dataloader.dataset)`` and macro F1 over all samples seen.
    """
    model.train()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    progress = tqdm(dataloader, total=len(dataloader))

    for i, batch in enumerate(progress):
        progress.set_description(f"<Train> Epoch{epoch+1}")

        # Look tensors up by name instead of unpacking batch.values(), which
        # silently depends on the key order of the batch dict.
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        token_type_ids = batch["token_type_ids"]
        labels = batch["labels"]
        del batch

        optimizer.zero_grad()

        outputs = model(input_ids, attention_mask, token_type_ids)
        del input_ids, attention_mask, token_type_ids
        loss = criterion(outputs, labels)  # compute the loss
        _, preds = torch.max(outputs, 1)  # predicted class = index of the largest logit
        del outputs

        loss.backward()
        optimizer.step()
        scheduler.step()

        total_loss += loss.item()
        del loss
        # Accumulate a plain int so the final division also works when the
        # loader yielded no batches (the int 0 has no .double()).
        total_corrects += (preds == labels).sum().item()

        all_labels += labels.tolist()
        all_preds += preds.tolist()
        del labels, preds

        # Running macro F1 over everything seen so far in this epoch.
        progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    train_loss = total_loss / len(dataloader)
    train_acc = total_corrects / len(dataloader.dataset)
    train_f1 = f1_score(all_labels, all_preds, average="macro")

    return train_loss, train_acc, train_f1


def eval_fn(dataloader, model, criterion, device, epoch):
    """Evaluate the model on a validation loader without computing gradients.

    Args:
        dataloader: Yields dict batches with ``input_ids``, ``attention_mask``,
            ``token_type_ids`` and ``labels`` tensors.
        model: Called as ``model(input_ids, attention_mask, token_type_ids)``.
        criterion: Loss called as ``criterion(outputs, labels)``.
        device: Unused; kept for interface compatibility.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        ``(valid_loss, valid_acc, valid_f1)``: mean batch loss, accuracy over
        ``len(dataloader.dataset)`` and macro F1 over all samples.
    """
    model.eval()
    total_loss = 0
    total_corrects = 0
    all_labels = []
    all_preds = []

    with torch.no_grad():
        progress = tqdm(dataloader, total=len(dataloader))

        for i, batch in enumerate(progress):
            progress.set_description(f"<Valid> Epoch{epoch+1}")

            # Look tensors up by name instead of unpacking batch.values(),
            # which silently depends on the key order of the batch dict.
            input_ids = batch["input_ids"]
            attention_mask = batch["attention_mask"]
            token_type_ids = batch["token_type_ids"]
            labels = batch["labels"]
            del batch

            outputs = model(input_ids, attention_mask, token_type_ids)
            del input_ids, attention_mask, token_type_ids
            loss = criterion(outputs, labels)
            _, preds = torch.max(outputs, 1)  # predicted class = index of the largest logit
            del outputs

            total_loss += loss.item()
            del loss
            # Accumulate a plain int so the final division also works when the
            # loader yielded no batches (the int 0 has no .double()).
            total_corrects += (preds == labels).sum().item()

            all_labels += labels.tolist()
            all_preds += preds.tolist()
            del labels, preds

            # Running macro F1 over everything seen so far.
            progress.set_postfix(loss=total_loss/(i+1), f1=f1_score(all_labels, all_preds, average="macro"))

    valid_loss = total_loss / len(dataloader)
    valid_acc = total_corrects / len(dataloader.dataset)

    valid_f1 = f1_score(all_labels, all_preds, average="macro")

    return valid_loss, valid_acc, valid_f1


def plot_training(train_losses, train_accs, train_f1s,
                  valid_losses, valid_accs, valid_f1s,
                  epoch, fold):
    """Draw train/valid curves for loss, accuracy and F1, then clear each figure.

    Each metric is drawn with seaborn against epochs 1..epoch+1 and the figure
    is cleared right away; saving to disk is currently disabled, so nothing is
    persisted. ``fold`` is only needed by the disabled save call.
    """
    epoch_axis = range(1, epoch + 2)
    metric_histories = (
        (train_losses, valid_losses),  # loss
        (train_accs, valid_accs),      # accuracy
        (train_f1s, valid_f1s),        # macro F1
    )

    for train_history, valid_history in metric_histories:
        curves = pd.DataFrame({"Train": train_history,
                               "Valid": valid_history},
                              index=epoch_axis)
        figure = sns.lineplot(data=curves).get_figure()
        # Saving is disabled; re-enable with figure.savefig(<path>, dpi=300).
        figure.clf()

def trainer(fold, df):
    """Train and validate one cross-validation fold, checkpointing the best model.

    Args:
        fold: Fold id; rows with ``df.kfold == fold`` form the validation set
            and all other rows the training set.
        df: DataFrame with ``description``, ``labels`` and ``kfold`` columns.

    Returns:
        The best validation macro F1 reached over ``EPOCHS`` epochs. The model
        weights of that epoch are saved under
        ``MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"``.
    """
    train_df = df[df.kfold != fold].reset_index(drop=True)
    valid_df = df[df.kfold == fold].reset_index(drop=True)

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

    train_dataset = make_dataset(train_df, tokenizer, DEVICE)
    valid_dataset = make_dataset(valid_df, tokenizer, DEVICE)

    train_dataloader = torch.utils.data.DataLoader(
        train_dataset, batch_size=TRAIN_BATCH_SIZE, shuffle=True
    )
    valid_dataloader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False
    )

    model = Classifier(MODEL_NAME, num_classes=NUM_CLASSES)
    model = model.to(DEVICE)

    # Untested alternative loss arguments, kept for reference:
    # weight=torch.tensor(1/np.array([0.3, 0.1, 0.3, 0.3])).cuda().float(), label_smoothing=0.1
    criterion = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5)
    # Dummy scheduler: gamma=1.0 leaves the learning rate unchanged; it exists
    # only because train_fn expects a scheduler to step.
    scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=100000, gamma=1.0)
    # Disabled alternative (linear warmup schedule), kept for reference:
    #   num_train_optimization_steps = int(
    #       len(train_dataloader) * EPOCHS // gradient_accumulation_steps
    #   )
    #   num_warmup_steps = int(num_train_optimization_steps * num_warmup_steps_rate)
    #   scheduler = get_linear_schedule_with_warmup(
    #       optimizer,
    #       num_warmup_steps=num_warmup_steps,
    #       num_training_steps=num_train_optimization_steps
    #   )

    train_losses = []
    train_accs = []
    train_f1s = []
    valid_losses = []
    valid_accs = []
    valid_f1s = []

    best_loss = np.inf
    best_acc = 0
    best_f1 = 0

    for epoch in range(EPOCHS):
        train_loss, train_acc, train_f1 = train_fn(train_dataloader, model, criterion, optimizer, scheduler, DEVICE, epoch)
        valid_loss, valid_acc, valid_f1 = eval_fn(valid_dataloader, model, criterion, DEVICE, epoch)
        print(f"Loss: {valid_loss}  Acc: {valid_acc}  f1: {valid_f1}  ", end="")

        train_losses.append(train_loss)
        train_accs.append(train_acc)
        train_f1s.append(train_f1)
        valid_losses.append(valid_loss)
        valid_accs.append(valid_acc)
        valid_f1s.append(valid_f1)

        plot_training(train_losses, train_accs, train_f1s,
                      valid_losses, valid_accs, valid_f1s,
                      epoch, fold)

        # Track the best loss/accuracy seen so far (informational only).
        # The accuracy line used to assign to the misspelled name `besl_acc`,
        # so best_acc was never updated.
        best_loss = valid_loss if valid_loss < best_loss else best_loss
        best_acc = valid_acc if valid_acc > best_acc else best_acc
        # Checkpoint selection is driven by validation macro F1 alone.
        if valid_f1 > best_f1:
            best_f1 = valid_f1
            print("model saving!", end="")
            torch.save(model.state_dict(), MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth")
        print("\n")

    return best_f1



In [None]:
# training
# Train every fold in turn and collect each fold's best validation macro F1.
df = make_folded_df(TRAIN_FILE, NUM_SPLITS)

f1_scores = []
for fold in range(NUM_SPLITS):
    print(f"fold {fold}", "="*80)
    f1 = trainer(fold, df)
    f1_scores.append(f1)
    print(f"<fold={fold}> best score: {f1}\n")

# Cross-validation score: plain mean of the per-fold best F1 values.
cv = sum(f1_scores) / len(f1_scores)
print(f"CV: {cv}")

# Text summary of the run; writing it to ./result/{MODEL_NAME}_result.txt is
# currently disabled, so it only lives in `lines`.
lines = "".join(
    f"fold={i}: {f1}\n" for i, f1 in enumerate(f1_scores)
) + f"CV    : {cv}"


        id                                        description  labels  kfold
0        0  <li>Develop cutting-edge web applications that...       2    NaN
1        1  <li> Designs and develops high quality, scalab...       2    NaN
2        2  <li>Functions as a point person for Network St...       3    NaN
3        3  <li> Work on the technical design, development...       2    NaN
4        4  <li>Quantify the resources required for a task...       3    NaN
...    ...                                                ...     ...    ...
1511  1511  <li>Support detailed reporting, statistical an...       0    NaN
1512  1512  <li>Collaborate with teams to support the ML t...       1    NaN
1513  1513  <li> Work with executives and other business l...       0    NaN
1514  1514  <li>Leading design ideation sessions to ensure...       2    NaN
1515  1515  <li>Detection of Issues &amp; Impact Assessmen...       0    NaN

[1516 rows x 4 columns]
        id                                        d

  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.42995551228523254  Acc: 0.8326923076923077  f1: 0.7669261294261295  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4147233933210373  Acc: 0.8557692307692307  f1: 0.7967789486105684  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3895430326461792  Acc: 0.8730769230769231  f1: 0.826543696892202  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4633715353906155  Acc: 0.8596153846153847  f1: 0.8134947677718252  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5004901915788651  Acc: 0.8461538461538461  f1: 0.7987137466121355  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6195539608597755  Acc: 0.8538461538461538  f1: 0.8079738221361429  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5713064640760421  Acc: 0.8596153846153847  f1: 0.7982794843417604  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6032618165016175  Acc: 0.8596153846153847  f1: 0.7825195489742294  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.7138511270284653  Acc: 0.8519230769230769  f1: 0.7989402006510815  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.582192312180996  Acc: 0.8403846153846154  f1: 0.7998172320876402  

<fold=0> best score: 0.826543696892202



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4615212559700012  Acc: 0.8269230769230769  f1: 0.6703204357330886  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3819276783615351  Acc: 0.8538461538461538  f1: 0.7760182090961844  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4258216476067901  Acc: 0.8576923076923076  f1: 0.7955388611917055  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4730533391237259  Acc: 0.8538461538461538  f1: 0.7833290549226118  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.554118238389492  Acc: 0.8596153846153847  f1: 0.7908575951280965  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6699317168444395  Acc: 0.8615384615384616  f1: 0.7903243519879106  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6248144473880529  Acc: 0.8711538461538462  f1: 0.8028350025662643  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6051684290170669  Acc: 0.8634615384615385  f1: 0.8018788569976933  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.603958060592413  Acc: 0.8692307692307693  f1: 0.8050408290756565  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6948220245540142  Acc: 0.8538461538461538  f1: 0.790791204693871  

<fold=1> best score: 0.8050408290756565



  0%|          | 0/2078 [00:00<?, ?it/s]

  0%|          | 0/520 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.39413820803165434  Acc: 0.8480769230769231  f1: 0.7626809433195099  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.353364372253418  Acc: 0.8596153846153847  f1: 0.8054920918824083  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4148722603917122  Acc: 0.8403846153846154  f1: 0.784708788577734  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4205557182431221  Acc: 0.8615384615384616  f1: 0.8012198763050256  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4690649973228574  Acc: 0.8519230769230769  f1: 0.7412273499529856  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5261447825469077  Acc: 0.8442307692307692  f1: 0.7784025394925568  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5669312492012978  Acc: 0.8519230769230769  f1: 0.7964912639673243  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.503302606381476  Acc: 0.875  f1: 0.8149360482209319  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5422608129680156  Acc: 0.85  f1: 0.7949418660303058  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5640535340644419  Acc: 0.8615384615384616  f1: 0.7967181669974965  

<fold=2> best score: 0.8149360482209319



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4080572485923767  Acc: 0.838150289017341  f1: 0.7601121736623284  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.43017974495887756  Acc: 0.8304431599229287  f1: 0.7636313365805313  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3952509999275208  Acc: 0.859344894026975  f1: 0.7954672934171015  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3843531858175993  Acc: 0.8709055876685935  f1: 0.8002980334145493  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.39827166590839624  Acc: 0.884393063583815  f1: 0.8429711342667453  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.49902799278497695  Acc: 0.8497109826589595  f1: 0.8055393726644555  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5678376723080873  Acc: 0.8420038535645472  f1: 0.8054075583280319  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4931875397451222  Acc: 0.8805394990366089  f1: 0.8346004573490156  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5434604816604406  Acc: 0.8747591522157996  f1: 0.8355893240203676  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5089591078693048  Acc: 0.8901734104046243  f1: 0.8505317233656111  model saving!

<fold=3> best score: 0.8505317233656111



  0%|          | 0/2079 [00:00<?, ?it/s]

  0%|          | 0/519 [00:00<?, ?it/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4585642635822296  Acc: 0.7996146435452793  f1: 0.6932076969914387  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3588937796652317  Acc: 0.8535645472061657  f1: 0.8114563463422179  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.3537159744650126  Acc: 0.8670520231213873  f1: 0.853529434473706  model saving!



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.41047248281538484  Acc: 0.8670520231213873  f1: 0.8419273113896368  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.4595057857222855  Acc: 0.8651252408477842  f1: 0.8435642982598228  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6359576727263629  Acc: 0.8265895953757225  f1: 0.804434120224278  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5009288183413446  Acc: 0.8574181117533719  f1: 0.8366622604301369  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5992785815149546  Acc: 0.8497109826589595  f1: 0.8091167937628556  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.6550239883363247  Acc: 0.8554913294797688  f1: 0.7947236640995394  



  0%|          | 0/130 [00:00<?, ?it/s]

  0%|          | 0/5 [00:00<?, ?it/s]

Loss: 0.5736539493314922  Acc: 0.861271676300578  f1: 0.8283129771978699  

<fold=4> best score: 0.853529434473706

CV: 0.8301163464056215


<Figure size 432x288 with 0 Axes>

In [None]:
# inference
# Load the best checkpoint of every fold and ensemble them over the test set.
models = []
for fold in range(NUM_SPLITS):
    model = Classifier(MODEL_NAME)
    model.load_state_dict(torch.load(MODELS_DIR + f"best_{MODEL_NAME}_{fold}.pth"))
    model.to(DEVICE)
    model.eval()
    models.append(model)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
test_df = pd.read_csv(TEST_FILE)
test_df["labels"] = -1  # placeholder: make_dataset selects a 'labels' column
test_dataset = make_dataset(test_df, tokenizer, DEVICE)
test_dataloader = torch.utils.data.DataLoader(
    test_dataset, batch_size=VALID_BATCH_SIZE, shuffle=False)

# Predicted 0-based class index per test row, in test_df row order.
final_output = []
with torch.no_grad():
    progress = tqdm(test_dataloader, total=len(test_dataloader))

    for batch in progress:
        progress.set_description("<Test>")

        # Look tensors up by name instead of unpacking batch.values(), which
        # silently depends on the key order of the batch dict.
        fold_logits = [
            model(batch["input_ids"], batch["attention_mask"], batch["token_type_ids"])
            for model in models
        ]

        # Average the logits across folds, convert to probabilities and take
        # the most likely class.
        mean_logits = sum(fold_logits) / len(fold_logits)
        probs = torch.softmax(mean_logits, dim=1).cpu().numpy()
        final_output.extend(np.argmax(probs, axis=1).tolist())

submit = pd.read_csv(os.path.join(data_dir, "submit_sample.csv"), names=["id", "labels"])
submit["labels"] = final_output
submit["labels"] = submit["labels"] + 1  # the submission uses 1-based label ids

# `cv` exists only if the training cell ran in this kernel session; fall back
# to a generic file name otherwise. Only the name lookup is guarded, so an
# error raised while writing the file is not mistaken for a missing `cv`.
try:
    submission_path = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission_cv{}.csv".format(str(cv).replace(".", "")[:10])
except NameError:
    submission_path = "/content/drive/MyDrive/Competition/SIGNATE/Datascientist/outputs/submission.csv"
submit.to_csv(submission_path, index=False, header=False)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.dense.weight', 'cls.predic

  0%|          | 0/1517 [00:00<?, ?it/s]

  0%|          | 0/12 [00:00<?, ?it/s]