In [22]:
import os
import pickle
import re
from typing import AnyStr, List, Optional

import nltk
import numpy as np
import pandas as pd
import textstat
import pytorch_lightning as pl
import torch
import torch.nn as nn
import transformers
from nltk import pos_tag
from nltk.corpus import stopwords
from pandarallel import pandarallel
from sklearn.preprocessing import StandardScaler
from torch.utils.data import DataLoader
from transformers import (
    AdamW,
    AutoConfig,
    AutoModel,
    AutoTokenizer,
    get_cosine_schedule_with_warmup,
    get_linear_schedule_with_warmup,
)

# Initialise pandarallel with progress bars enabled; `parallel_apply` is used
# by the excerpt preprocessing further down.
pandarallel.initialize(progress_bar=True)

INFO: Pandarallel will run on 4 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.


## Preprocessing

In [3]:
def get_preprocessed_excerpt(src_data: pd.DataFrame) -> pd.Series:
    """Return a cleaned, lemmatized version of the ``excerpt`` column.

    For every excerpt: each character that is not an ASCII letter is replaced
    by a space, the text is lower-cased and tokenized with NLTK, English stop
    words are dropped, the remaining tokens are lemmatized with WordNet and
    re-joined with single spaces.

    Args:
        src_data: Frame containing an ``excerpt`` column of strings.

    Returns:
        The result of ``parallel_apply`` over ``src_data["excerpt"]``, i.e. one
        cleaned string per excerpt.
    """

    def preprocess_excerpt(text: AnyStr):
        # Build the stop-word set once per excerpt instead of once per token.
        stop_words = set(stopwords.words("english"))
        # NOTE: the lemmatizer converts plural words to their singular form.
        lemmatizer = nltk.WordNetLemmatizer()

        letters_only = re.sub("[^a-zA-Z]", " ", text).lower()
        # NOTE: split the English text into words.
        tokens = nltk.word_tokenize(letters_only)
        return " ".join(
            lemmatizer.lemmatize(word) for word in tokens if word not in stop_words
        )

    # Everything the worker needs is created inside `preprocess_excerpt`, so the
    # function handed to pandarallel carries no closure state.
    return src_data["excerpt"].parallel_apply(preprocess_excerpt)


def get_textstat(src_data: pd.DataFrame) -> pd.DataFrame:
    """Compute standardized text-statistics features for every row.

    Length-based features are taken from ``preprocessed_excerpt``; the
    ``textstat`` readability metrics are computed on the raw ``excerpt``
    (except ``word_count``, which uses the preprocessed text).

    Args:
        src_data: Frame with ``excerpt`` and ``preprocessed_excerpt`` columns.

    Returns:
        A frame with one column per feature, each passed through a freshly
        fitted ``StandardScaler``.
    """
    raw_text = src_data["excerpt"]
    clean_text = src_data["preprocessed_excerpt"]

    # Per-row list of token lengths, averaged below.
    token_lengths = clean_text.apply(lambda x: [len(s) for s in x.split()])

    features = {
        "excerpt_len": clean_text.str.len(),
        "avg_word_len": token_lengths.map(np.mean),
        "char_count": raw_text.map(textstat.char_count),
        "word_count": clean_text.map(textstat.lexicon_count),
        "sentence_count": raw_text.map(textstat.sentence_count),
        "syllable_count": raw_text.apply(textstat.syllable_count),
        "smog_index": raw_text.apply(textstat.smog_index),
        "automated_readability_index": raw_text.apply(
            textstat.automated_readability_index
        ),
        "coleman_liau_index": raw_text.apply(textstat.coleman_liau_index),
        "linsear_write_formula": raw_text.apply(textstat.linsear_write_formula),
    }
    dst_data = pd.DataFrame().assign(**features)

    # NOTE(review): the scaler is fit on the frame passed in, so the scaled
    # values depend on that batch — confirm this matches how the features
    # were scaled at training time.
    scaler = StandardScaler()
    dst_data[dst_data.columns.tolist()] = scaler.fit_transform(dst_data)
    return dst_data

In [4]:
# Load the test excerpts, then attach the cleaned text and the text-statistics
# features as extra columns.
test = pd.read_csv("../data/raw/test.csv", usecols=["id", "excerpt"])
test = test.assign(preprocessed_excerpt=get_preprocessed_excerpt(test))
textstat_feat = get_textstat(test)

test = pd.concat([test, textstat_feat], axis=1)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=2), Label(value='0 / 2'))), HBox(c…

## Predict with RoBERTa

In [27]:
class RMSELoss(nn.Module):
    """Root-mean-squared-error loss.

    ``eps`` is added to the mean squared error before the square root is taken.
    """

    def __init__(self, eps=1e-6):
        super().__init__()
        self.eps = eps
        self.mse = nn.MSELoss()

    def forward(self, yhat, y):
        """Return ``sqrt(MSE(yhat, y) + eps)``."""
        mean_squared_error = self.mse(yhat, y)
        return torch.sqrt(mean_squared_error + self.eps)

In [15]:
class CommonLitDataset(torch.utils.data.Dataset):
    """Dataset that tokenizes one excerpt per item.

    Every item is a dict holding the tokenizer outputs under ``"inputs"`` plus
    ``"textstat"`` and ``"target"`` tensors.
    """

    def __init__(
        self,
        data: pd.DataFrame,
        tokenizer: transformers.PreTrainedTokenizer,
        max_len: int = 256,
        is_test: int = False,
    ):
        """Store the excerpts and, unless ``is_test``, targets and features.

        Args:
            data: Frame with an ``excerpt`` column; when ``is_test`` is falsy it
                must also contain ``target``, and every remaining column is
                treated as a text-statistics feature.
            tokenizer: Callable tokenizer applied to each excerpt on access.
            max_len: Length every excerpt is truncated/padded to.
            is_test: Used as a boolean flag; when truthy, ``target`` and
                ``textstat`` are zero placeholders of shape ``(len(data), 1)``.
        """
        self.tokenizer = tokenizer
        self.max_len = max_len
        # Double brackets keep a 2-D array, one single-element row per excerpt.
        self.excerpt = data[["excerpt"]].to_numpy()

        if not is_test:
            self.target = data[["target"]].to_numpy()
            self.textstat = data.drop(["excerpt", "target"], axis=1).to_numpy()
        else:
            n_rows = len(data)
            self.target = np.zeros((n_rows, 1))
            self.textstat = np.zeros((n_rows, 1))

    def __len__(self):
        """Number of excerpts."""
        return len(self.excerpt)

    def __getitem__(self, idx):
        """Tokenize the excerpt at ``idx`` and bundle it with its tensors."""
        # NOTE(review): `self.excerpt[idx]` is a one-element array row, so
        # `str()` yields the array's printed form rather than the bare text.
        # Kept as-is on purpose — confirm against the training pipeline before
        # changing, since existing checkpoints may have been trained this way.
        encoded = self.tokenizer(
            str(self.excerpt[idx]),
            max_length=self.max_len,
            truncation=True,
            padding="max_length",
            return_attention_mask=True,
            return_token_type_ids=True,
        )
        token_fields = ("input_ids", "attention_mask", "token_type_ids")

        return {
            "inputs": {
                field: torch.tensor(encoded[field], dtype=torch.long)
                for field in token_fields
            },
            "textstat": torch.tensor(self.textstat[idx], dtype=torch.float32),
            "target": torch.tensor(self.target[idx], dtype=torch.float32),
        }

In [28]:
class CommonLitRoBERTaModel(nn.Module):
    """Pretrained transformer backbone followed by a small regression head.

    The head (LayerNorm -> Dropout -> Linear) maps the backbone's
    ``pooler_output`` to a single value per example.
    """

    def __init__(
        self,
        model_name_or_path: str = "roberta-base",
        output_hidden_states: bool = False,
    ):
        """Load the backbone and build the regression head.

        Args:
            model_name_or_path: Identifier or path passed to
                ``AutoModel.from_pretrained``.
            output_hidden_states: Forwarded to ``AutoModel.from_pretrained``.
        """
        super().__init__()
        self.roberta = AutoModel.from_pretrained(
            model_name_or_path,
            output_hidden_states=output_hidden_states,
        )
        self.config = self.roberta.config

        # Size the head from the backbone configuration instead of assuming a
        # base-sized model; 768 (the previous hard-coded width) is the fallback
        # when the config exposes no `hidden_size`.
        reg_input_dim = getattr(self.config, "hidden_size", 768)
        self.regression_head = nn.Sequential(
            nn.LayerNorm(reg_input_dim),
            nn.Dropout(0.5),
            nn.Linear(reg_input_dim, 1),
        )
        # Initialize Weights
        self.regression_head.apply(self._init_weights)

    def _init_weights(self, module):
        """Initialize ``module`` using the backbone's ``initializer_range``.

        Linear and Embedding weights are drawn from a zero-mean normal with
        that standard deviation (biases / padding rows zeroed); LayerNorm is
        reset to weight 1 and bias 0.
        """
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def forward(self, batch):
        """Return the regression output for ``batch``.

        Args:
            batch: Mapping whose ``"inputs"`` entry holds the keyword arguments
                for the backbone.

        Returns:
            The regression head applied to the backbone's ``pooler_output``.
        """
        outputs = self.roberta(**batch["inputs"])
        pooler_output = outputs.pooler_output

        # NOTE: adding the textstat features at this stage interacts badly with
        # the fine-tuning learning rate and is likely to overfit.
        # x = torch.cat((pooler_output, batch["textstat"]), dim=1)
        x = self.regression_head(pooler_output)
        return x


class CommonLitModel(pl.LightningModule):
    """Lightning module wrapping ``CommonLitRoBERTaModel`` for regression.

    Training minimizes MSE (``loss_fn``); at the end of each validation epoch
    both MSE (``val_loss``) and RMSE (``val_metric``) are logged.
    """

    def __init__(
        self,
        lr: float = 5e-5,
        num_epoch: int = 10,
        roberta_model_name_or_path: str = "roberta-base",
        output_hidden_states: bool = False,
        lr_scheduler: str = "linear",
        lr_interval: str = "epoch",
        lr_warmup_step: int = 0,
        lr_num_cycles: float = 0.5,
    ):
        """Store the hyper-parameters and build the wrapped model.

        Args:
            lr: Base learning rate; per-layer groups scale it by 1/2.6, 1, 2.6.
            num_epoch: Number of epochs used to size the LR schedule.
            roberta_model_name_or_path: Backbone identifier or path.
            output_hidden_states: Forwarded to the backbone.
            lr_scheduler: ``"linear"``, ``"cosine"``, or anything else (which
                falls back to a linear schedule with warm-up).
            lr_interval: Scheduler interval handed to Lightning
                (``"step"`` or ``"epoch"``).
            lr_warmup_step: Warm-up steps for the cosine and fallback schedules.
            lr_num_cycles: ``num_cycles`` of the cosine schedule.
        """
        super().__init__()
        self.save_hyperparameters()

        self.roberta_model = CommonLitRoBERTaModel(
            model_name_or_path=roberta_model_name_or_path,
            output_hidden_states=output_hidden_states,
        )
        self.loss_fn = nn.MSELoss()
        self.eval_fn = RMSELoss()

    def forward(self, batch):
        """Run the wrapped model on ``batch`` and return its output."""
        z = self.roberta_model(batch)
        return z

    def configure_optimizers(self):
        """Build the AdamW optimizer and the LR scheduler configuration.

        Reads ``self.train_dataloader_len``, which is not assigned anywhere in
        this class and therefore has to be set on the instance before this
        method runs.
        """
        optimizer_grouped_parameters = self._get_optimizer_params(self.roberta_model)
        optimizer = torch.optim.AdamW(
            optimizer_grouped_parameters,  # self.parameters()
            lr=self.hparams.lr,
            betas=(0.9, 0.999),
            eps=1e-8,
            weight_decay=0,  # default for groups that set no `weight_decay`
        )

        # NOTE(review): this counts optimizer steps (batches x epochs), which
        # matches lr_interval="step"; with the default lr_interval="epoch" the
        # scheduler advances only once per epoch — confirm this is intended.
        total_steps = self.train_dataloader_len * self.hparams.num_epoch
        if self.hparams.lr_scheduler == "linear":
            # Linear scheduler
            # NOTE(review): this branch ignores lr_warmup_step (warm-up fixed
            # at 0), unlike the fallback branch below — confirm intended.
            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=0,
                num_training_steps=total_steps,
            )
        elif self.hparams.lr_scheduler == "cosine":
            # Cosine scheduler
            lr_scheduler = get_cosine_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.lr_warmup_step,
                num_training_steps=total_steps,
                num_cycles=self.hparams.lr_num_cycles,
            )
        else:
            # Linear scheduler
            lr_scheduler = get_linear_schedule_with_warmup(
                optimizer,
                num_warmup_steps=self.hparams.lr_warmup_step,
                num_training_steps=total_steps,
            )

        lr_dict = {
            "scheduler": lr_scheduler,
            "interval": self.hparams.lr_interval,  # step or epoch
            "strict": True,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_dict}

    def _get_optimizer_params(self, model):
        """Build optimizer parameter groups with per-layer LR and weight decay.

        Backbone parameters are split by (a) whether their name contains one of
        the ``no_decay`` markers and (b) which block of encoder layers their
        name refers to. Groups are emitted in this order: decayed parameters
        (non-layer, layers 0-3, 4-7, 8-11), then the same four buckets for
        non-decayed parameters, then every parameter outside the backbone.

        The groups use the key ``weight_decay`` because that is the option
        ``torch.optim.AdamW`` reads; an unknown key would be ignored and the
        decay would silently fall back to the optimizer default.
        """
        # Differential learning rate and weight decay
        learning_rate = self.hparams.lr
        # NOTE(review): "gamma"/"beta" look like legacy LayerNorm parameter
        # names; if the backbone names them `LayerNorm.weight` instead, those
        # weights fall into the decayed groups — confirm that is acceptable.
        no_decay = ["bias", "gamma", "beta"]
        layer_groups = [
            (["layer.0.", "layer.1.", "layer.2.", "layer.3."], learning_rate / 2.6),
            (["layer.4.", "layer.5.", "layer.6.", "layer.7."], learning_rate),
            (["layer.8.", "layer.9.", "layer.10.", "layer.11."], learning_rate * 2.6),
        ]
        group_all = [marker for markers, _ in layer_groups for marker in markers]
        backbone_params = list(model.roberta.named_parameters())

        def select(skip_decay, markers=None):
            """Pick backbone params by decay class and layer bucket.

            ``markers=None`` selects parameters outside every listed layer.
            """
            picked = []
            for name, param in backbone_params:
                if any(nd in name for nd in no_decay) != skip_decay:
                    continue
                if markers is None:
                    keep = not any(marker in name for marker in group_all)
                else:
                    keep = any(marker in name for marker in markers)
                if keep:
                    picked.append(param)
            return picked

        optimizer_parameters = []
        for skip_decay, weight_decay in ((False, 0.01), (True, 0.0)):
            # Non-layer parameters use the optimizer's default learning rate.
            optimizer_parameters.append(
                {"params": select(skip_decay), "weight_decay": weight_decay}
            )
            for markers, group_lr in layer_groups:
                optimizer_parameters.append(
                    {
                        "params": select(skip_decay, markers),
                        "weight_decay": weight_decay,
                        "lr": group_lr,
                    }
                )

        # Everything outside the backbone (the regression head) trains with its
        # own, larger learning rate.
        optimizer_parameters.append(
            {
                "params": [
                    param
                    for name, param in model.named_parameters()
                    if "roberta" not in name
                ],
                "lr": 1e-3,  # learning_rate
            }
        )
        return optimizer_parameters

    def shared_step(self, batch):
        """Forward pass shared by the training and validation steps."""
        z = self(batch)
        return z

    def training_step(self, batch, batch_idx):
        """Compute and log the MSE training loss for one batch."""
        z = self.shared_step(batch)
        loss = self.loss_fn(z, batch["target"])
        self.log("train_loss", loss)
        return {"loss": loss}

    def validation_step(self, batch, batch_idx):
        """Return predictions and targets; metrics are computed per epoch."""
        z = self.shared_step(batch)
        return {"pred": z, "target": batch["target"]}

    def validation_step_end(self, batch_parts):
        """Pass the step outputs through unchanged."""
        return batch_parts

    def validation_epoch_end(self, validation_step_outputs):
        """Concatenate all step outputs and log epoch-level MSE and RMSE."""
        pred = []
        target = []

        for output in validation_step_outputs:
            pred.append(output["pred"])
            target.append(output["target"])

        pred = torch.cat(pred, dim=0)
        target = torch.cat(target, dim=0)

        loss = self.loss_fn(pred, target)
        metric = self.eval_fn(pred, target)

        self.log("val_loss", loss, prog_bar=True)
        self.log("val_metric", metric, prog_bar=True)

In [29]:
def get_dataloader(
    data: pd.DataFrame,
    tokenizer_name: str = "roberta-base",
    max_len: int = 256,
    batch_size: int = 32,
    num_workers: int = 4,
) -> DataLoader:
    """Build an inference ``DataLoader`` over ``data``.

    The dataset is created with ``is_test=True`` and the loader neither
    shuffles nor drops the last batch, so outputs follow the row order of
    ``data``.

    Args:
        data: Frame with an ``excerpt`` column.
        tokenizer_name: Identifier or path passed to
            ``AutoTokenizer.from_pretrained``.
        max_len: Token length every excerpt is truncated/padded to.
        batch_size: Number of excerpts per batch.
        num_workers: Number of loader worker processes.

    Returns:
        The configured ``DataLoader``.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    dataset = CommonLitDataset(data, tokenizer, max_len, is_test=True)
    return DataLoader(
        dataset,
        batch_size=batch_size,
        num_workers=num_workers,
        pin_memory=True,
        shuffle=False,
        drop_last=False,
    )

In [30]:
def predict_by_ckpt(data, checkpoints):
    """Predict ``data`` once per checkpoint.

    Args:
        data: Frame handed to ``get_dataloader``.
        checkpoints: Iterable of checkpoint paths for ``CommonLitModel``.

    Returns:
        A list with one NumPy array per checkpoint, each holding that model's
        outputs for all rows concatenated along the first axis.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    dataloader = get_dataloader(data)

    pred = []
    for ckpt in checkpoints:
        print(f"Predicted by {ckpt}")

        # `load_from_checkpoint` is called on the class so that no throwaway
        # model (with its own pretrained backbone) is built per checkpoint.
        model = CommonLitModel.load_from_checkpoint(ckpt)
        model = model.to(device)
        model.eval()
        model.freeze()

        pred_ckpt = []
        # Inference only: skip autograd bookkeeping.
        with torch.no_grad():
            for batch in dataloader:
                batch["inputs"] = {
                    name: tensor.to(device)
                    for name, tensor in batch["inputs"].items()
                }
                batch["textstat"] = batch["textstat"].to(device)

                z = model(batch)
                # Move each batch off the device right away so predictions do
                # not accumulate in accelerator memory.
                pred_ckpt.append(z.detach().cpu())

        pred_ckpt = torch.cat(pred_ckpt, dim=0).numpy().copy()
        pred.append(pred_ckpt)

    return pred


def get_ckpt_path(
    checkpoint_path: str,
    dir_path: str = "../data/models/roberta/",
    log_prefix: str = "../tb_logs/",
) -> List[str]:
    """Read checkpoint paths from a text file and relocate them.

    The file is expected to hold one checkpoint path per line. Surrounding
    whitespace is stripped, blank lines are skipped, and every occurrence of
    ``log_prefix`` in a path is replaced by ``dir_path``.

    Args:
        checkpoint_path: Text file listing the checkpoint paths.
        dir_path: Directory the checkpoints live in now.
        log_prefix: Prefix the paths were recorded with.

    Returns:
        The relocated checkpoint paths, in file order.
    """
    with open(checkpoint_path, "r") as f:
        # Skip empty / whitespace-only lines (e.g. a trailing newline) so they
        # do not turn into empty checkpoint paths.
        recorded = [line.strip() for line in f if line.strip()]

    return [ckpt.replace(log_prefix, dir_path) for ckpt in recorded]

In [31]:
# Predict by RoBERTa
ckpt_path = "../data/models/roberta/best_checkpoints_0.496413±0.0162.txt"
checkpoints = get_ckpt_path(ckpt_path)

pred = predict_by_ckpt(test, checkpoints)

# `pred` holds one column vector per checkpoint; stack them side by side into a
# single (n_rows, n_checkpoints) matrix so the multi-column assignment does not
# depend on pandas coercing a list of 2-D arrays.
pred_cols = [f"pred_{i}" for i in range(len(checkpoints))]
test[pred_cols] = np.hstack(pred)

# Per-checkpoint predictions are the inputs of the second-stage model.
X_pred = test[pred_cols]

Predicted by ../data/models/roberta/RoBERTa-Baseline/version_0/checkpoints/epoch=07-loss=0.0000-val_loss=0.2481-val_metric=0.4980.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_1/checkpoints/epoch=13-loss=0.0000-val_loss=0.2724-val_metric=0.5219.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_2/checkpoints/epoch=04-loss=0.0000-val_loss=0.2484-val_metric=0.4984.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_3/checkpoints/epoch=05-loss=0.0000-val_loss=0.2018-val_metric=0.4492.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_4/checkpoints/epoch=06-loss=0.0000-val_loss=0.2538-val_metric=0.5038.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_5/checkpoints/epoch=05-loss=0.0000-val_loss=0.2604-val_metric=0.5103.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/version_6/checkpoints/epoch=07-loss=0.0000-val_loss=0.2343-val_metric=0.4841.ckpt
Predicted by ../data/models/roberta/RoBERTa-Baseline/ve

In [32]:
# Preview the per-checkpoint prediction columns.
X_pred.head()

Unnamed: 0,pred_0,pred_1,pred_2,pred_3,pred_4,pred_5,pred_6,pred_7,pred_8,pred_9,pred_10,pred_11,pred_12,pred_13,pred_14
0,-0.50413,-0.473537,-0.540328,-0.432704,-0.379853,-0.250393,-0.230802,-0.461931,-0.320304,-0.396811,-0.434372,-0.266711,-0.336394,-0.188015,-0.262966
1,-0.359742,-0.38418,-0.450077,-0.375443,-0.402787,-0.301277,-0.091749,-0.088621,-0.240885,-0.199681,-0.426405,-0.484604,-0.402243,-0.385206,-0.597047
2,-0.139508,-0.456083,-0.470303,-0.428771,-0.389819,-0.459563,-0.416336,-0.433987,-0.196438,-0.353424,-0.610762,-0.673162,-0.588096,-0.411148,-0.418171
3,-2.508181,-2.309677,-2.22882,-2.345114,-2.402572,-2.391798,-2.559634,-2.266225,-2.113398,-2.698644,-2.476523,-2.241605,-2.594661,-2.761666,-2.130869
4,-1.999642,-1.796057,-1.763693,-1.621618,-1.979182,-1.580597,-1.572743,-1.535815,-1.527643,-1.720838,-2.076024,-1.46719,-1.642941,-1.74186,-1.400126


## Predict with SVR

In [33]:
def predict(data: pd.DataFrame, model_dir: str, n_splits: int) -> np.ndarray:
    """Average the predictions of the pickled fold models in ``model_dir``.

    Models are read from ``{model_dir}/{fold}-fold.pkl`` for
    ``fold = 0 .. n_splits - 1``; each contributes ``predict(data) / n_splits``.

    Args:
        data: Features handed to every fold model's ``predict``.
        model_dir: Directory containing the pickled models.
        n_splits: Number of fold models to load.

    Returns:
        Array with one averaged prediction per row of ``data``.
    """
    averaged = np.zeros(data.shape[0])
    fold_paths = [
        os.path.join(model_dir, f"{fold}-fold.pkl") for fold in range(n_splits)
    ]

    for fold_path in fold_paths:
        # NOTE(review): unpickling can execute arbitrary code — only point
        # `model_dir` at model files you trust.
        with open(fold_path, mode="rb") as handle:
            fold_model = pickle.load(handle)

        averaged += fold_model.predict(data) / n_splits

    return averaged

In [34]:
# Directory holding the pickled fold models that `predict` loads.
model_dir = "../data/models/svr/"
# model_dir = "../data/models/xgb/"

# Second stage: average 5 fold models over the per-checkpoint predictions and
# pair the result with the excerpt ids.
submission = test[["id"]].assign(target=predict(X_pred, model_dir, 5))

# submission.to_csv("submission.csv", index=False)

In [35]:
# Preview the first rows of the submission frame.
submission.head()

Unnamed: 0,id,target
0,c0f722661,-0.43554
1,f0953f0a5,-0.384058
2,0df072751,-0.482642
3,04caf4e0c,-2.479009
4,0e63f8bea,-1.647921
