# CFG

In [1]:
class CFG:
    # ====================================================
    # general
    # ====================================================
    seed = 42
    n_fold=5
    trn_fold=[0, 1]
    train=True
    train_csv = '../input/train.csv'
    test_csv = '../input/test.csv'
    target_col='label'
    output_dir = '../output/'
    num_workers=4
    # ====================================================
    # model
    # ====================================================
    model="studio-ousia/luke-japanese-large"
    # mlm_dir='./drive/MyDrive/Colab Notebooks/hate-speech-detection/mlm/exp02/'
    rnn='GRU' # [None, 'GRU', 'LSTM']
    pooling='max_old' #["None", "mean", "max", "attention"] # RANDOM SAMPLING
    # ====================================================
    # model tuning
    # ====================================================
    reinit_layers=-1
    multi_sample_dropout=0.2
    n_msd = 7 # 5~8
    # ====================================================
    # optimizer
    # ====================================================
    encoder_lr=1e-5
    decoder_lr=1e-5
    min_lr=1e-6
    eps=1e-6
    betas=(0.9, 0.999)
    weight_decay=0.2
    # ====================================================
    # scheduler
    # ====================================================
    epochs=1
    scheduler='cosine'
    batch_scheduler=True
    num_cycles=0.5
    num_warmup_steps=0
    # ====================================================
    # batch size
    # ====================================================
    train_batch_size=16
    valid_batch_size=16
    # ====================================================
    # gradient
    # ====================================================
    max_grad_norm=1
    gradient_accumulation_steps=1
    
def class2dict(f):
    return dict((name, getattr(f, name)) for name in dir(f) if not name.startswith('__'))
cfg_dict = class2dict(CFG)

In [57]:
[1,2,3,4,5][-1:]

[5]

# Library

In [2]:
# ====================================================
# Library
# ====================================================

import warnings
warnings.filterwarnings("ignore")

import gc
import scipy as sp
import numpy as np
import pandas as pd
import hashlib
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from pathlib import Path
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
from pytorch_lightning.loggers import CSVLogger, WandbLogger
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.1
transformers.__version__: 4.23.1
env: TOKENIZERS_PARALLELISM=true


# Data Loading

In [3]:
# ====================================================
# Data Loading
# ====================================================
train = pd.read_csv(CFG.train_csv)
test = pd.read_csv(CFG.test_csv)
def clean_text(text):
    return text.replace(' ', '').replace('　', '').replace('__BR__', '\n').replace('\xa0', '').replace('\r', '').lstrip('\n')

train['text'] = train['text'].apply(clean_text)
test['text'] = test['text'].apply(clean_text)

print(f"train.shape: {train.shape}")
display(train.head())
print(f"test.shape: {test.shape}")
display(test.head())

train.shape: (5256, 4)


Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1
3,e76638295,livejupiter,よくよく思えば川上は配布にしたらとんでもなく有能だよな\nガチャから引いたら圧倒的歓喜レベルやで,0
4,51e4036bf,newsplus,押井は原作レイプの専門家だから\n原作マンガの真意を誤解させることに関してはプロだが\nそれ...,0


test.shape: (3223, 3)


Unnamed: 0,id,source,text
0,001026808,news4vip,上でも言ったけどオタクレベルの知識求めてる訳じゃない\nただ囲碁やります！って人が誰1人プロ...
1,00465ac96,livejupiter,たとえば、黒人なんかは、生物学的欠陥はないのに、文化的要因で、悪循環に陥り、実力をつけられず...
2,004674725,livejupiter,そうなんやろなあ色々と勿体ない感じしたわ\n終わり方と黒幕キャラは好きやったで\n\nちなワ...
3,00474460f,news4vip,法的というか自治体ごとにバラバラの条例で定めてるだけだからな\n普通の淫行条例だと「青少年に...
4,004a7525c,newsplus,別のジャーナリストの感想として言われてるので客観的な事実とは言えないけど、\n現地は不測の事...


# CV split

In [4]:
# ====================================================
# CV split
# ====================================================
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_col].astype(int))):
    train.loc[val_index, 'fold'] = int(n)
train['fold'] = train['fold'].astype(int)
display(train.groupby('fold').size())

fold
0    1052
1    1051
2    1051
3    1051
4    1051
dtype: int64

# tokenizer

# Dataset

In [5]:
# ====================================================
# Define max_len
# ====================================================
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
for text_col in ['text']:
    train_lengths = []
    tk0 = tqdm(train[text_col].fillna("").values, total=len(train))
    for text in tk0:
        length = len(tokenizer(text, add_special_tokens=False)['input_ids'])
        train_lengths.append(length)

CFG.max_len = max(train_lengths) + 3 # cls & sep & sep
print(f"max_len: {CFG.max_len}")

  0%|          | 0/5256 [00:00<?, ?it/s]

max_len: 77


In [6]:
# ====================================================
# Dataset
# ====================================================
class TrainDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df["text"].values
        self.labels = df[cfg.target_col].values
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoding = self.tokenizer.encode_plus(
            self.texts[item],
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return dict(
            text=self.texts[item],
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
            labels=torch.tensor(self.labels[item]),
        )


class TestDataset(Dataset):
    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df["text"].values
        self.tokenizer = AutoTokenizer.from_pretrained(cfg.model)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        encoding = self.tokenizer.encode_plus(
            self.texts[item],
            add_special_tokens=True,
            max_length=self.cfg.max_len,
            padding="max_length",
            truncation=True,
            return_attention_mask=True,
            return_tensors="pt",
        )

        return dict(
            text=self.texts[item],
            input_ids=encoding["input_ids"].flatten(),
            attention_mask=encoding["attention_mask"].flatten(),
        )

# Model

In [7]:
class AttentionPooling(nn.Module):
    def __init__(self, num_layers, hidden_size, hiddendim_fc):
        super(AttentionPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_fc = hiddendim_fc
        self.dropout = nn.Dropout(0.1)

        q_t = np.random.normal(loc=0.0, scale=0.1, size=(1, self.hidden_size))
        self.q = nn.Parameter(torch.from_numpy(q_t)).float()
        w_ht = np.random.normal(loc=0.0, scale=0.1, size=(self.hidden_size, self.hiddendim_fc))
        self.w_h = nn.Parameter(torch.from_numpy(w_ht)).float()

    def forward(self, all_hidden_states):
        hidden_states = torch.stack([all_hidden_states[layer_i][:, 0].squeeze()
                                     for layer_i in range(1, self.num_hidden_layers+1)], dim=-1)
        hidden_states = hidden_states.view(-1, self.num_hidden_layers, self.hidden_size)
        out = self.attention(hidden_states)
        out = self.dropout(out)
        return out

    def attention(self, h):
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)
        v = F.softmax(v, -1)
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)
        return v
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            self.model = AutoModel(self.config)
        if cfg.rnn == "LSTM":
            self.rnn = nn.LSTM(
                self.config.hidden_size,
                self.config.hidden_size,
                # bidirectional=True,
                batch_first=True,
            )
        elif cfg.rnn == "GRU":
            self.rnn = nn.GRU(
                self.config.hidden_size,
                self.config.hidden_size,
                # bidirectional=True,
                batch_first=True,
            )
        self.dropouts = nn.ModuleList([nn.Dropout(self.cfg.multi_sample_dropout) for _ in range(self.cfg.n_msd)])
        self.pooler = AttentionPooling(self.config.num_hidden_layers+1, self.config.hidden_size, 128)
        self.fc = nn.Linear(self.config.hidden_size, 2)
        for layer in self.model.encoder.layer[self.cfg.reinit_layers:]:
            for module in layer.modules():
                self._init_weights(module)
        self._init_weights(self.fc)
        
    def _init_weights(self, module):
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, labels=None):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        all_hidden_states = torch.stack(outputs['hidden_states'])
        rnn_out, _ = self.rnn(outputs['last_hidden_state'], None)
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(rnn_out.size()).float()
        if self.cfg.pooling == "mean_old":
            sequence_output = rnn_out.mean(axis=1)
        elif self.cfg.pooling == "max_old":
            sequence_output, _ = rnn_out.max(1)
        elif self.cfg.pooling == "mean":
            sum_embeddings = torch.sum(rnn_out * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            sequence_output = sum_embeddings / sum_mask
        elif self.cfg.pooling == "max":
            rnn_out[input_mask_expanded == 0] = -1e9  # Set padding tokens to large negative value
            sequence_output = torch.max(rnn_out, 1)[0]
        elif self.cfg.pooling == "attention":
            torch.cat([all_hidden_states, rnn_out.unsqueeze(0)])
            sequence_output = self.pooler(torch.cat([all_hidden_states, rnn_out.unsqueeze(0)]))
        else:
            sequence_output = rnn_out[:,-1,:]
        output = sum([self.fc(dropout(sequence_output)) for dropout in self.dropouts])/self.cfg.n_msd
        return output

In [8]:
class CustomDataModule(pl.LightningDataModule):
    """
    DataFrameからモデリング時に使用するDataModuleを作成
    """

    def __init__(self, cfg, train_df, fold):
        super().__init__()
        self.cfg = cfg
        self.fold = fold
        self.train_df = train_df

    def setup(self, stage=None):
        self.train_folds = self.train_df[self.train_df['fold'] != self.fold].reset_index(drop=True)
        self.valid_folds = self.train_df[self.train_df['fold'] == self.fold].reset_index(drop=True)
        self.cfg.num_train_steps = int(len(self.train_folds) / self.cfg.train_batch_size * self.cfg.epochs)

    def train_dataloader(self):
        return DataLoader(TrainDataset(self.cfg, self.train_folds),
                        batch_size=self.cfg.train_batch_size,
                        shuffle=True,
                        num_workers=self.cfg.num_workers, pin_memory=True, drop_last=True)

    def val_dataloader(self):
        return DataLoader(TrainDataset(self.cfg, self.valid_folds),
                        batch_size=self.cfg.valid_batch_size,
                        shuffle=False,
                        num_workers=self.cfg.num_workers, pin_memory=True, drop_last=False)

class CustomLitModule(pl.LightningModule):
    def __init__(self, cfg, fold):
        super().__init__()
        self.save_hyperparameters()
        
        self.model = CustomModel(cfg, config_path=None, pretrained=True)
        self.criterion = nn.CrossEntropyLoss()
        self.cfg = cfg
        self.fold = fold

    def forward(self, input_ids, attention_mask, labels=None):
        logits = self.model(input_ids, attention_mask)
        loss = 0 if labels is None else self.criterion(logits, labels)
        return loss, logits

    def training_step(self, batch, batch_idx):
        loss, preds = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        
        sch = self.lr_schedulers()
        if self.cfg.batch_scheduler:
            sch.step()
        return {"loss": loss, "batch_preds": preds, "batch_labels": batch["labels"]}

    def validation_step(self, batch, batch_idx):
        loss, preds = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
            labels=batch["labels"],
        )
        return {"loss": loss, "batch_preds": preds, "batch_labels": batch["labels"]}
    
    def predict_step(self, batch, batch_idx):
        loss, preds = self.forward(
            input_ids=batch["input_ids"],
            attention_mask=batch["attention_mask"],
        )
        return preds

    def validation_epoch_end(self, outputs, mode="val"):
        # loss計算
        epoch_preds = torch.cat([x["batch_preds"] for x in outputs])
        epoch_labels = torch.cat([x["batch_labels"] for x in outputs])
        epoch_loss = self.criterion(epoch_preds, epoch_labels)
        self.log(f"[fold{self.fold}]{mode}_loss", epoch_loss, logger=True, prog_bar=True)
        self.log(f"[fold{self.fold}]{mode}_f1", f1_score(epoch_labels.cpu(), epoch_preds.cpu().argmax(dim=1)), logger=True, prog_bar=True)

    def configure_optimizers(self):
        def get_optimizer_params(model, encoder_lr, decoder_lr, weight_decay=0.0):
            param_optimizer = list(model.named_parameters())
            no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
            optimizer_parameters = [
                {'params': [p for n, p in model.model.named_parameters() if not any(nd in n for nd in no_decay)],
                'lr': encoder_lr, 'weight_decay': weight_decay},
                {'params': [p for n, p in model.model.named_parameters() if any(nd in n for nd in no_decay)],
                'lr': encoder_lr, 'weight_decay': 0.0},
                {'params': [p for n, p in model.named_parameters() if "model" not in n],
                'lr': decoder_lr, 'weight_decay': 0.0}
            ]
            return optimizer_parameters

        optimizer_parameters = get_optimizer_params(self.model,
                                                    encoder_lr=self.cfg.encoder_lr, 
                                                    decoder_lr=self.cfg.decoder_lr,
                                                    weight_decay=self.cfg.weight_decay)
        optimizer = AdamW(optimizer_parameters, lr=self.cfg.encoder_lr, eps=self.cfg.eps, betas=self.cfg.betas)
        # ====================================================
        # scheduler
        # ====================================================
        def get_scheduler(cfg, optimizer, num_train_steps):
            if cfg.scheduler=='linear':
                scheduler = get_linear_schedule_with_warmup(
                    optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps
                )
            elif cfg.scheduler=='cosine':
                scheduler = get_cosine_schedule_with_warmup(
                    optimizer, num_warmup_steps=cfg.num_warmup_steps, num_training_steps=num_train_steps, num_cycles=cfg.num_cycles
                )
            return scheduler

        scheduler = {
                "scheduler": get_scheduler(self.cfg, optimizer, self.cfg.num_train_steps),
                "interval": "step" if self.cfg.batch_scheduler else "epoch",
            }

        return [optimizer], [scheduler]

In [9]:
pl.seed_everything(CFG.seed)
preds = []
results = []
test_dataloader = DataLoader(TestDataset(CFG, test),
                        batch_size=32,
                        shuffle=False,
                        num_workers=CFG.num_workers, pin_memory=True, drop_last=False)
out_dir = Path(CFG.output_dir)/CFG.model/CFG.rnn/CFG.pooling
out_dir.mkdir(parents=True, exist_ok=True)
wandb_logger = WandbLogger(
    name=f"{CFG.model}_{CFG.rnn}_{CFG.pooling}",
    save_dir=str(out_dir),
    project="hatespeech_detection",
    group=CFG.model,
    anonymous=True,
)
for k, v in cfg_dict.items():
    wandb_logger.experiment.config[k] = v
for fold in CFG.trn_fold:
    logger = CSVLogger(save_dir=out_dir, name=f"fold_{fold}")
    checkpoint_callback = ModelCheckpoint(
        dirpath=out_dir,
        filename=f"fold_{fold}",
        verbose=True,
        save_top_k=1,
        monitor=f"[fold{fold}]val_f1",
        mode="max",
        save_weights_only=True,
    )
    trainer = pl.Trainer(
        max_epochs=CFG.epochs,
        accelerator="gpu",
        # strategy="dp",
        devices=1,
        gradient_clip_val=CFG.max_grad_norm,
        gradient_clip_algorithm="value",
        amp_backend="native",
        deterministic=True,
        auto_select_gpus=False,
        benchmark=False,
        default_root_dir=os.getcwd(),
        limit_train_batches=0.2,
        limit_val_batches=0.2,
        accumulate_grad_batches=CFG.gradient_accumulation_steps,
        callbacks=[checkpoint_callback],
        logger=[wandb_logger,logger],
    )
    datamodule = CustomDataModule(CFG, train, fold)
    model = CustomLitModule(CFG, fold)
    trainer.fit(model, datamodule=datamodule)
    results += trainer.validate(model=model, dataloaders=datamodule.val_dataloader())
    logits = trainer.predict(model=CustomLitModule(CFG, fold), dataloaders=test_dataloader)
    pred = torch.cat(logits)
    test["label"] = pred.argmax(1)
    test[["id", "label"]].to_csv(out_dir/f"submission_fold{fold}.csv", index=None)
    display(test[["id", "label"]].groupby("label").count())
    preds.append(pred)
    del trainer, datamodule, model; gc.collect()

wandb_logger.finalize("success")
result_df = pd.DataFrame([{k[7:]: v for k,v in result.items()} for result in results])
display(result_df)
test["label"] = (sum(preds) / len(preds)).argmax(1)
test[["id", "label"]].to_csv(out_dir/"submission_ave.csv", index=None)
display(test[["id", "label"]].groupby("label").count())

Global seed set to 42
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeModel: ['lm_head.dense.bias', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'entity_predictions.transform.dense.bias', 'entity_predictions.transform.dense.weight', 'entity_predictions.bias', 'entity_predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 52: '[fold0]val_f1' reached 0.00000 (best 0.00000), saving model to '/home/workspace/labo/hatespeech_detection/output/studio-ousia/luke-japanese-large/GRU/max_old/fold_0-v5.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      [fold0]val_f1                 0.0
     [fold0]val_loss        0.23703131079673767
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeModel: ['lm_head.dense.bias', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'entity_predictions.transform.dense.bias', 'entity_predictions.transform.dense.weight', 'entity_predictions.bias', 'entity_predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting: 52it [00:00, ?it/s]

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,3223


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeModel: ['lm_head.dense.bias', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'entity_predictions.transform.dense.bias', 'entity_predictions.transform.dense.weight', 'entity_predictions.bias', 'entity_predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification 

Sanity Checking: 0it [00:00, ?it/s]

Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

Epoch 0, global step 52: '[fold1]val_f1' reached 0.00000 (best 0.00000), saving model to '/home/workspace/labo/hatespeech_detection/output/studio-ousia/luke-japanese-large/GRU/max_old/fold_1-v3.ckpt' as top 1
`Trainer.fit` stopped: `max_epochs=1` reached.
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Validation: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
     Validate metric           DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
      [fold1]val_f1                 0.0
     [fold1]val_loss        0.17478825151920319
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


Some weights of the model checkpoint at studio-ousia/luke-japanese-large were not used when initializing LukeModel: ['lm_head.dense.bias', 'entity_predictions.transform.LayerNorm.bias', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'entity_predictions.transform.dense.bias', 'entity_predictions.transform.dense.weight', 'entity_predictions.bias', 'entity_predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing LukeModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing LukeModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0,1,2,3]


Predicting: 52it [00:00, ?it/s]

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,3221
1,2


Unnamed: 0,[fold0]val_loss,[fold0]val_f1,[fold1]val_loss,[fold1]val_f1
0,0.237031,0.0,,
1,,,0.174788,0.0


Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,3223


In [43]:
result_df = pd.DataFrame([{k[7:]: v for k,v in result.items()} for result in results])

In [48]:
result_df["val_f1"].mean()

0.0

In [10]:
test["label"] = sum(preds[0:4]).argmax(1)
test[["id", "label"]].to_csv(out_dir/"submission_04.csv", index=None)
display(test[["id", "label"]].groupby("label").count())

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,3223


In [11]:
test["label"] = sum(preds[0:3]).argmax(1)
test[["id", "label"]].to_csv(out_dir/"submission_03.csv", index=None)
display(test[["id", "label"]].groupby("label").count())

Unnamed: 0_level_0,id
label,Unnamed: 1_level_1
0,3223


## To Do
- Wandbが保存されない問題を解決(Foldごとにスクリプトを分割する)
- Hydraを用いたtrainの書き方に変更する(ハイパラ探索のため)
- いろいろ