# Directory settings

# CFG

In [1]:
class CFG:
    """Experiment configuration, used as a plain namespace of class attributes.

    The class is never instantiated in this notebook; it is passed around as-is
    and other cells attach further attributes to it at runtime
    (``tokenizer``, ``max_len``, ``num_train_steps``).
    """

    # ---------------- general ----------------
    seed = 42
    n_fold = 5
    trn_fold = [0, 1, 2, 3, 4]
    train = True
    target_col = 'label'
    num_workers = 4

    # ---------------- model ----------------
    model = "studio-ousia/luke-japanese-large"
    mlm_dir = './drive/MyDrive/Colab Notebooks/hate-speech-detection/mlm/exp02/'
    rnn = 'GRU'  # options listed by the author: [None, 'GRU', 'LSTM']
    pooling = 'mean'  # TODO: ["mean", "max", "attention"]

    # ---------------- model tuning ----------------
    reinit_layers = -1  # TODO; used as a slice start over the encoder layers
    multi_sample_dropout = 0.2  # dropout probability of each multi-sample head
    n_msd = 7  # number of multi-sample dropout heads (author's range: 5~8)

    # ---------------- optimizer ----------------
    encoder_lr = 1e-5
    decoder_lr = 1e-5
    min_lr = 1e-6
    eps = 1e-6
    betas = (0.9, 0.999)
    weight_decay = 0.2

    # ---------------- scheduler ----------------
    epochs = 4
    scheduler = 'cosine'
    batch_scheduler = True
    num_cycles = 0.5
    num_warmup_steps = 0

    # ---------------- batch size ----------------
    train_batch_size = 32
    valid_batch_size = 32

    # ---------------- gradient ----------------
    max_grad_norm = 1
    gradient_accumulation_steps = 1

# Library

In [2]:
# ====================================================
# Library
# ====================================================

import warnings
warnings.filterwarnings("ignore")

import gc
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam, SGD, AdamW
from torch.utils.data import DataLoader, Dataset

import tokenizers
import transformers
import os
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, ModelCheckpoint
print(f"tokenizers.__version__: {tokenizers.__version__}")
print(f"transformers.__version__: {transformers.__version__}")
from transformers import AutoTokenizer, AutoModel, AutoConfig
from transformers import get_linear_schedule_with_warmup, get_cosine_schedule_with_warmup
%env TOKENIZERS_PARALLELISM=true

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

tokenizers.__version__: 0.13.1
transformers.__version__: 4.23.1
env: TOKENIZERS_PARALLELISM=true


# Attention pooling layer

In [3]:
class AttentionPooling(nn.Module):
    """Attention-weighted pooling over the first-token vectors of several layers.

    For every sample, the vector at sequence position 0 is taken from layers
    ``1 .. num_layers`` of ``all_hidden_states``. A learned query ``q`` scores each
    layer, the scores are soft-maxed over the layer axis, and the weighted sum of
    the layer vectors is projected with ``w_h``.

    Args:
        num_layers: number of layers to pool over (entry 0 of the input is skipped).
        hidden_size: width of each layer vector.
        hiddendim_fc: width of the projected output.
    """

    def __init__(self, num_layers, hidden_size, hiddendim_fc):
        super(AttentionPooling, self).__init__()
        self.num_hidden_layers = num_layers
        self.hidden_size = hidden_size
        self.hiddendim_fc = hiddendim_fc
        self.dropout = nn.Dropout(0.1)

        # Cast to float32 *before* wrapping in nn.Parameter. Calling ``.float()`` on the
        # Parameter returns an ordinary non-leaf tensor, which nn.Module does not register:
        # it would be missing from ``parameters()`` / ``state_dict()`` and ignored by ``.to()``.
        q_t = np.random.normal(loc=0.0, scale=0.1, size=(1, self.hidden_size))
        self.q = nn.Parameter(torch.from_numpy(q_t).float())
        w_ht = np.random.normal(loc=0.0, scale=0.1, size=(self.hidden_size, self.hiddendim_fc))
        self.w_h = nn.Parameter(torch.from_numpy(w_ht).float())

    def forward(self, all_hidden_states):
        """Pool the position-0 vectors of layers ``1 .. num_layers``.

        Args:
            all_hidden_states: a sequence (tuple or stacked tensor) indexable by layer,
                holding at least ``num_layers + 1`` entries; in this notebook each
                entry is ``(batch, seq_len, hidden_size)``.

        Returns:
            Tensor of shape ``(batch, hiddendim_fc)`` (dropout applied in train mode).
        """
        # Stack along dim=1 so the result is directly (batch, num_layers, hidden_size).
        # Stacking on the last axis and then calling .view() would reinterpret memory
        # and mix layer and hidden axes; .squeeze() would also drop a batch axis of 1.
        hidden_states = torch.stack(
            [all_hidden_states[layer_i][:, 0]
             for layer_i in range(1, self.num_hidden_layers + 1)],
            dim=1,
        )
        out = self.attention(hidden_states)
        out = self.dropout(out)
        return out

    def attention(self, h):
        """Apply the learned layer attention to ``h`` of shape (batch, layers, hidden)."""
        # (1, hidden) @ (batch, hidden, layers) -> (batch, 1, layers) -> (batch, layers)
        v = torch.matmul(self.q, h.transpose(-2, -1)).squeeze(1)
        v = F.softmax(v, -1)
        # Weighted sum of layer vectors: (batch, 1, layers) @ (batch, layers, hidden),
        # transposed to (batch, hidden, 1) for the projection below.
        v_temp = torch.matmul(v.unsqueeze(1), h).transpose(-2, -1)
        # (fc, hidden) @ (batch, hidden, 1) -> (batch, fc, 1) -> (batch, fc)
        v = torch.matmul(self.w_h.transpose(1, 0), v_temp).squeeze(2)
        return v

In [4]:
# ====================================================
# Data Loading
# ====================================================
# Read the train and test tables from the CSV files under ../input/.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')
def clean_text(text):
    """Normalise whitespace artefacts in one raw text string.

    The substitutions run in the listed order: ASCII spaces and full-width spaces
    are removed, the ``__BR__`` marker becomes a newline, non-breaking spaces and
    carriage returns are removed. Leading newlines are stripped from the result.
    """
    substitutions = (
        (' ', ''),
        ('　', ''),
        ('__BR__', '\n'),
        ('\xa0', ''),
        ('\r', ''),
    )
    cleaned = text
    for old, new in substitutions:
        cleaned = cleaned.replace(old, new)
    return cleaned.lstrip('\n')

# Clean the text column of both splits in place.
for split_df in (train, test):
    split_df['text'] = split_df['text'].apply(clean_text)

# Report the size and the first rows of each split.
print(f"train.shape: {train.shape}")
display(train.head())

print(f"test.shape: {test.shape}")
display(test.head())

train.shape: (5256, 4)


Unnamed: 0,id,source,text,label
0,80074aa43,news4vip,まともに相手されてない人との関係なんて\nそんな大事にするものか？,0
1,6378fea6b,livejupiter,最近はアヘアヘQSマンやない？ｲｲ!(・∀・)+1-0(・Ａ・)ｲｸﾅｲ!,0
2,c535f5613,livejupiter,日本人として生まれても無能な低学歴って分かったら日本人の権利剥奪して追放すべきやろ\n甘えるな,1
3,e76638295,livejupiter,よくよく思えば川上は配布にしたらとんでもなく有能だよな\nガチャから引いたら圧倒的歓喜レベルやで,0
4,51e4036bf,newsplus,押井は原作レイプの専門家だから\n原作マンガの真意を誤解させることに関してはプロだが\nそれ...,0


test.shape: (3223, 3)


Unnamed: 0,id,source,text
0,001026808,news4vip,上でも言ったけどオタクレベルの知識求めてる訳じゃない\nただ囲碁やります！って人が誰1人プロ...
1,00465ac96,livejupiter,たとえば、黒人なんかは、生物学的欠陥はないのに、文化的要因で、悪循環に陥り、実力をつけられず...
2,004674725,livejupiter,そうなんやろなあ色々と勿体ない感じしたわ\n終わり方と黒幕キャラは好きやったで\n\nちなワ...
3,00474460f,news4vip,法的というか自治体ごとにバラバラの条例で定めてるだけだからな\n普通の淫行条例だと「青少年に...
4,004a7525c,newsplus,別のジャーナリストの感想として言われてるので客観的な事実とは言えないけど、\n現地は不測の事...


# CV split

In [5]:
# ====================================================
# CV split
# ====================================================
# Assign every training row to one of CFG.n_fold stratified validation folds.
Fold = StratifiedKFold(n_splits=CFG.n_fold, shuffle=True, random_state=CFG.seed)
# The splitter yields *positional* indices, so the fold ids are collected in a
# positional integer array rather than written through label-based ``.loc``.
# This stays correct if ``train`` does not carry a default RangeIndex and avoids
# the intermediate float/NaN column.
fold_ids = np.full(len(train), -1, dtype=int)
for n, (train_index, val_index) in enumerate(Fold.split(train, train[CFG.target_col].astype(int))):
    fold_ids[val_index] = n
# Every row must appear in exactly one validation fold.
assert (fold_ids >= 0).all(), "some rows were not assigned to a fold"
train['fold'] = fold_ids
display(train.groupby('fold').size())

fold
0    1052
1    1051
2    1051
3    1051
4    1051
dtype: int64

# tokenizer

In [6]:
# ====================================================
# tokenizer
# ====================================================
# Load the tokenizer that matches CFG.model and attach it to CFG so the
# Dataset classes can reach it through their cfg argument.
tokenizer = AutoTokenizer.from_pretrained(CFG.model)
CFG.tokenizer = tokenizer

# Dataset

In [7]:
# ====================================================
# Define max_len
# ====================================================
# Measure the token length of every training text (special tokens excluded).
for text_col in ['text']:
    column_texts = train[text_col].fillna("").values
    train_lengths = [
        len(tokenizer(text, add_special_tokens=False)['input_ids'])
        for text in tqdm(column_texts, total=len(train))
    ]

# Longest training text plus headroom for special tokens (author's note: cls & sep & sep).
CFG.max_len = max(train_lengths) + 3
print(f"max_len: {CFG.max_len}")

  0%|          | 0/5256 [00:00<?, ?it/s]

max_len: 77


In [8]:
# ====================================================
# Dataset
# ====================================================
class TrainDataset(Dataset):
    """Labelled dataset that tokenises one text per item.

    Args:
        cfg: object providing ``tokenizer``, ``max_len`` and ``target_col``.
        df: DataFrame with a ``'text'`` column and the ``cfg.target_col`` column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values
        self.labels = df[cfg.target_col].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text, flattened token tensors and the label."""
        raw_text = self.texts[item]
        # Pad / truncate every sample to cfg.max_len and ask for PyTorch tensors.
        tokenizer_kwargs = {
            "add_special_tokens": True,
            "max_length": self.cfg.max_len,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt",
        }
        encoded = self.cfg.tokenizer.encode_plus(raw_text, **tokenizer_kwargs)

        sample = {
            "text": raw_text,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
            "labels": torch.tensor(self.labels[item]),
        }
        return sample

class TestDataset(Dataset):
    """Unlabelled dataset that tokenises one text per item.

    Args:
        cfg: object providing ``tokenizer`` and ``max_len``.
        df: DataFrame with a ``'text'`` column.
    """

    def __init__(self, cfg, df):
        self.cfg = cfg
        self.texts = df['text'].values

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, item):
        """Return a dict with the raw text and its flattened token tensors."""
        raw_text = self.texts[item]
        # Pad / truncate every sample to cfg.max_len and ask for PyTorch tensors.
        tokenizer_kwargs = {
            "add_special_tokens": True,
            "max_length": self.cfg.max_len,
            "padding": "max_length",
            "truncation": True,
            "return_attention_mask": True,
            "return_tensors": "pt",
        }
        encoded = self.cfg.tokenizer.encode_plus(raw_text, **tokenizer_kwargs)

        sample = {
            "text": raw_text,
            "input_ids": encoded["input_ids"].flatten(),
            "attention_mask": encoded["attention_mask"].flatten(),
        }
        return sample

# Model

In [9]:
# ====================================================
# Model
# ====================================================
class CustomModel(nn.Module):
    """Transformer encoder + optional RNN + pooling + multi-sample-dropout head.

    Args:
        cfg: configuration object; reads ``model``, ``rnn``, ``pooling``,
            ``multi_sample_dropout``, ``n_msd`` and ``reinit_layers``.
        config_path: optional path to a saved model config; when ``None`` the
            config is fetched for ``cfg.model``.
        pretrained: load pretrained encoder weights when ``True``; otherwise build
            the architecture from the config only.

    Raises:
        NotImplementedError: if ``cfg.pooling == "attention"`` (not implemented).
    """

    def __init__(self, cfg, config_path=None, pretrained=False):
        super().__init__()
        self.cfg = cfg
        # Fail before any weights are loaded instead of crashing in the first forward pass.
        if cfg.pooling == "attention":
            raise NotImplementedError('pooling="attention" is not implemented in CustomModel')
        if config_path is None:
            self.config = AutoConfig.from_pretrained(cfg.model, output_hidden_states=True)
        else:
            # NOTE(security): torch.load unpickles the file and can execute arbitrary
            # code; only pass config files you created yourself.
            self.config = torch.load(config_path)
        if pretrained:
            self.model = AutoModel.from_pretrained(cfg.model, config=self.config)
        else:
            # AutoModel cannot be instantiated directly; from_config builds the
            # architecture without loading pretrained weights.
            self.model = AutoModel.from_config(self.config)
        # The RNN is unidirectional with hidden_size outputs, matching the width
        # that self.fc expects.
        if cfg.rnn == "LSTM":
            self.rnn = nn.LSTM(
                self.config.hidden_size,
                self.config.hidden_size,
                batch_first=True,
            )
        elif cfg.rnn == "GRU":
            self.rnn = nn.GRU(
                self.config.hidden_size,
                self.config.hidden_size,
                batch_first=True,
            )
        else:
            # No RNN requested: forward() pools the encoder output directly.
            self.rnn = None
        self.dropouts = nn.ModuleList([nn.Dropout(self.cfg.multi_sample_dropout) for _ in range(self.cfg.n_msd)])
        self.fc = nn.Linear(self.config.hidden_size, 2)
        # NOTE(review): reinit_layers is used as a slice *start*: -k re-initialises the
        # last k encoder layers, 0 re-initialises every layer. With the value -1 the last
        # layer is re-initialised even when pretrained=True — confirm this is intended.
        for layer in self.model.encoder.layer[self.cfg.reinit_layers:]:
            for module in layer.modules():
                self._init_weights(module)
        self._init_weights(self.fc)

    def _init_weights(self, module):
        """Re-initialise Linear / Embedding / LayerNorm modules; other modules are left as-is."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, input_ids, attention_mask, labels=None):
        """Compute 2-way logits for a batch.

        Args:
            input_ids: token ids passed to the encoder.
            attention_mask: mask with 0 at padded positions; also drives pooling.
            labels: accepted for interface compatibility; not used here.

        Returns:
            Logits averaged over the multi-sample dropout heads.

        Raises:
            NotImplementedError: if ``cfg.pooling`` is ``"attention"``.
        """
        pooling = self.cfg.pooling
        # cfg may be a shared object mutated after construction, so check again here.
        if pooling == "attention":
            raise NotImplementedError('pooling="attention" is not implemented in CustomModel')

        outputs = self.model(input_ids, attention_mask=attention_mask)
        out = outputs['last_hidden_state']
        if self.rnn is not None:
            out, _ = self.rnn(out, None)
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(out.size()).float()
        if pooling == "mean":
            # Masked mean over the sequence axis; the clamp guards against an all-zero mask.
            sum_embeddings = torch.sum(out * input_mask_expanded, 1)
            sum_mask = input_mask_expanded.sum(1)
            sum_mask = torch.clamp(sum_mask, min=1e-9)
            sequence_output = sum_embeddings / sum_mask
        elif pooling == "max":
            # Out-of-place fill: writing into the RNN output in place can invalidate a
            # tensor autograd saved for backward. finfo.min is representable in every
            # floating dtype (a literal -1e9 overflows float16).
            out = out.masked_fill(input_mask_expanded == 0, torch.finfo(out.dtype).min)
            sequence_output = torch.max(out, 1)[0]
        else:
            # Fallback: state at the last sequence position.
            sequence_output = out[:, -1, :]
        output = sum([self.fc(dropout(sequence_output)) for dropout in self.dropouts]) / self.cfg.n_msd
        return output

In [10]:
class CustomDataModule(pl.LightningDataModule):
    """
    Builds the DataModule used for modelling from a DataFrame.

    Rows whose ``'fold'`` value equals ``fold`` form the validation set; all
    other rows form the training set.
    """

    def __init__(self, cfg, train_df, fold):
        super().__init__()
        self.cfg = cfg
        self.fold = fold
        self.train_df = train_df

    def setup(self, stage=None):
        """Split ``train_df`` by fold and store the step count on ``cfg``."""
        is_valid = self.train_df['fold'] == self.fold
        self.train_folds = self.train_df[~is_valid].reset_index(drop=True)
        self.valid_folds = self.train_df[is_valid].reset_index(drop=True)
        # NOTE(review): the train loader uses drop_last=True, so this can exceed the
        # number of batches actually produced by a small amount — confirm the consumer.
        self.cfg.num_train_steps = int(len(self.train_folds) / self.cfg.train_batch_size * self.cfg.epochs)

    def _build_loader(self, frame, batch_size, shuffle, drop_last):
        """Wrap ``frame`` in a TrainDataset and return a DataLoader over it."""
        return DataLoader(
            TrainDataset(self.cfg, frame),
            batch_size=batch_size,
            shuffle=shuffle,
            num_workers=self.cfg.num_workers,
            pin_memory=True,
            drop_last=drop_last,
        )

    def train_dataloader(self):
        """Shuffled training loader; the last incomplete batch is dropped."""
        return self._build_loader(
            self.train_folds,
            batch_size=self.cfg.train_batch_size,
            shuffle=True,
            drop_last=True,
        )

    def val_dataloader(self):
        """Ordered validation loader that keeps every sample."""
        return self._build_loader(
            self.valid_folds,
            batch_size=self.cfg.valid_batch_size,
            shuffle=False,
            drop_last=False,
        )

In [17]:
# Build the data module for fold 3 and pull a single training batch to experiment with.
datamodule = CustomDataModule(CFG,train,3)
datamodule.setup()
# Take only the first batch; list(...) would tokenise and collate the entire
# training split just to keep its first element.
features = next(iter(datamodule.train_dataloader()))

In [None]:
# Load the encoder config with output_hidden_states=True (so the forward pass
# also returns 'hidden_states') and the pretrained encoder itself.
config = AutoConfig.from_pretrained(CFG.model, output_hidden_states=True)
model = AutoModel.from_pretrained(CFG.model, config=config)

In [29]:
# Stand-alone LSTM with the encoder's hidden size for input and output (batch-first).
lstm = nn.LSTM(config.hidden_size, config.hidden_size, batch_first=True)

In [31]:
# Run the sampled batch through the encoder and the LSTM without tracking gradients.
with torch.no_grad():
    outputs = model(
        features['input_ids'],
        features['attention_mask'],
    )
    encoder_last_state = outputs['last_hidden_state']
    rnn_out, _ = lstm(encoder_last_state, None)

In [47]:
# Inspect the shape of the LSTM output.
rnn_out.shape

torch.Size([32, 77, 1024])

In [48]:
# Stack the per-layer hidden states into one tensor and compare its shape
# with the shape of the final layer's output.
last_hidden_state = outputs['last_hidden_state']
all_hidden_states = torch.stack(outputs['hidden_states'])
for state_tensor in (all_hidden_states, last_hidden_state):
    print(state_tensor.shape)

torch.Size([25, 32, 77, 1024])
torch.Size([32, 77, 1024])


In [65]:
# NOTE(review): the output recorded for this cell is an AttributeError about
# `.append`, which this bare expression cannot raise — the saved output looks
# stale (from an earlier version of the cell); re-run to refresh it.
all_hidden_states

AttributeError: 'Tensor' object has no attribute 'append'

In [67]:
# Append the LSTM output as one extra entry after the stacked encoder states and
# show only the resulting shape (the output recorded for this cell is a torch.Size).
torch.cat([all_hidden_states, rnn_out.unsqueeze(0)]).shape

torch.Size([26, 32, 77, 1024])

In [71]:
# Try AttentionPooling on the encoder states with the LSTM output appended as
# one extra entry, projecting to hiddendim_fc features.
hiddendim_fc = 128
pooler = AttentionPooling(
    config.num_hidden_layers+1,
    config.hidden_size,
    hiddendim_fc,
)
states_with_rnn = torch.cat([all_hidden_states, rnn_out.unsqueeze(0)])
attention_pooling_embeddings = pooler(states_with_rnn)

In [72]:
# Inspect the shape of the pooled embeddings.
attention_pooling_embeddings.shape

torch.Size([32, 128])