Config

In [17]:
class Config:
    exp = "003"
    ver = "001"
    training = True
    resume = False

    project_name = f'pii-{exp}-{ver}'

    seed = 42

    num_proc = 4
    val_size = 1000

    threshold = 0.99

    tokenize_options = {
        "return_offsets_mapping": True,
        "truncation": True,
        "max_length": 3072,
    }
    deberta_options = {
        "output_hidden_states": True,
        "hidden_dropout_prob": 0.1,
        "layer_norm_eps": 1e-7,
        "add_pooling_layer": False,
    }

    model_name = "deberta3base-truncation-false"
    freeze_emb = False
    num_freeze_layers = 3

    save_dir = f"/kaggle/input/{project_name}"
    save_path = f"{save_dir}/{model_name}.ckpt"

    output_dir = "/kaggle/output"

    model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-base"

    train_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    test_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    moredata_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json"
    pii_dataset_fixed_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json"
    mpware_path = "/kaggle/input/pii-mixtral8x7b-generated-essays/mpware_mixtral8x7b_v1.1-no-i-username.json"
    sample_sub_path = '/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv'

    batch_size = 1
    epochs = 3
    lr = 1e-5

    all_labels = [
        "B-EMAIL",
        "B-ID_NUM",
        "B-NAME_STUDENT",
        "B-PHONE_NUM",
        "B-STREET_ADDRESS",
        "B-URL_PERSONAL",
        "B-USERNAME",
        "I-ID_NUM",
        "I-NAME_STUDENT",
        "I-PHONE_NUM",
        "I-STREET_ADDRESS",
        "I-URL_PERSONAL",
        "O",
    ]
    num_pii_labels = len(all_labels) - 1
    label2id = {label: index for index, label in enumerate(all_labels)}
    id2label = {index: label for index, label in enumerate(all_labels)}

In [18]:
import os
import sys
os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ['TRANSFORMERS_NO_ADVISORY_WARNINGS'] = 'true'
sys.path.append('/kaggle/input/piimetric')

In [19]:
! pip install -U wandb



In [20]:
import json, argparse, torch, sys, random, gc, os
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path
import ctypes
from tqdm import tqdm
# from rich import print

import wandb
import pytorch_lightning as pl
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, ModelSummary
from pytorch_lightning.loggers import WandbLogger

from comp_metric import compute_metrics

# Transformer
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoModel,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    PreTrainedTokenizer,
    PreTrainedModel,
    PretrainedConfig,
    DebertaV2Config,
    DebertaV2Model,
    get_cosine_schedule_with_warmup,
    EvalPrediction,
)
from transformers.modeling_outputs import BaseModelOutput

In [21]:
# Seed the same seed to all 
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything(Config.seed)


libc = ctypes.CDLL("libc.so.6")
def clear_memory():
    libc.malloc_trim(0)
    torch.cuda.empty_cache()
    gc.collect()

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

Device: cuda


Pre Processeing

In [22]:
def split_df_by_sampling(df: pd.DataFrame, n_samples: int):
    # Get the sample df
    samples_df = df.sample(n=n_samples, random_state=Config.seed)
    # The remaining df
    cond = df["document"].isin(samples_df["document"])
    others_df = df.drop(df[cond].index, inplace=False)
    return samples_df, others_df


def load_train_data():
    df_original = pd.read_json(Path(Config.train_path))
    print(f"kaggle train data = {len(df_original)}")

    df_extra = pd.read_json(Path(Config.mpware_path))
    print(f"mpware data = {len(df_extra)}")

        # df_more = pd.read_json(Path(Config.moredata_path))
    # df_fix = pd.read_json(Path(Config.pii_dataset_fixed_path))
    # df_extra = pd.concat([df_more, df_fix])
    # print(f"moredata + pii_dataset_fixed = {len(df_extra)}")


    df_original["is_original"] = True
    df_extra["is_original"] = False

    df = pd.concat([df_original, df_extra])
    df["document"] = [i for i in range(len(df))]  # Update the document id
    df.reset_index(drop=True, inplace=True)

    df["is_labels"] = df["labels"].apply(
        lambda labels: any(label != "O" for label in labels)
    )

    df_original = df[df["is_original"]]
    df_extra = df[~df["is_original"]]

    # One or more labels are not 'O'
    true_labels = df_original[df_original["is_labels"]]
    # all labels are 'O'
    false_labels = df_original[~df_original["is_labels"]]

    # Reset index to two df
    true_labels = true_labels.reset_index(drop=True, inplace=False)
    false_labels = false_labels.reset_index(drop=True, inplace=False)
    print(f"Number of true_labels = {len(true_labels)}")
    print(f"Number of false_labels = {len(false_labels)}")

    # Get 300 as valid dataset
    n_true_samples = len(true_labels) - int(Config.val_size * len(true_labels) / len(df_original))

    # Sample true labels
    true_samples, true_others = split_df_by_sampling(true_labels, n_true_samples)
    print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")
    n_samples = len(false_labels) - (Config.val_size - int(Config.val_size * len(true_labels) / len(df_original)))
    # Sample false labels
    false_samples, false_others = split_df_by_sampling(false_labels, n_samples)
    print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")
    # Training ds = P * true_labels + P * false_labels
    train_df = pd.concat([true_samples, false_samples, df_extra])
    # Valid ds = (1-P) * true_labels + (1-P) * false_labels
    valid_df = pd.concat([true_others, false_others])

    return train_df, valid_df

In [23]:
class CreateDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: PreTrainedTokenizer, is_test: bool) -> None:
        self.tokenizer = tokenizer
        self.is_test = is_test
        if is_test:
            self.tokenized_df = df.apply(self.tokenize_test, axis=1, result_type="expand")
        else:
            self.tokenized_df = df.apply(self.tokenize_train, axis=1, result_type="expand")

    def __len__(self):
        return len(self.tokenized_df)
    
    def tokenize_train(self, row):
        text = []
        token_map = []
        labels = []
        targets = []
        idx = 0
        for t, l, ws in zip(row["tokens"], row["labels"], row["trailing_whitespace"]):
            text.append(t)
            labels.extend([l]*len(t))
            token_map.extend([idx]*len(t))

            if l in Config.all_labels:  
                targets.append(1)
            else:
                targets.append(0)
            
            if ws:
                text.append(" ")
                labels.append("O")
                token_map.append(-1)
            idx += 1

        tokenized = self.tokenizer("".join(text), **Config.tokenize_options)
         
        target_num = sum(targets)
        labels = np.array(labels)

        text = "".join(text)
        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            if start_idx == 0 and end_idx == 0: 
                token_labels.append(Config.label2id["O"])
                continue
            
            if text[start_idx].isspace():
                start_idx += 1
            try:
                token_labels.append(Config.label2id[labels[start_idx]])
            except:
                continue
        length = len(tokenized.input_ids)
        
        return {
            "input_ids": tokenized.input_ids,
            "attention_mask": tokenized.attention_mask,
            "offset_mapping": tokenized.offset_mapping,
            "labels": token_labels,
            "length": length,
            "target_num": target_num,
            "group": 1 if target_num > 0 else 0,
            "token_map": token_map,
            "document": row["document"],
            "tokens": row["tokens"],      
            "raw_labels": row["labels"],     
        }

    def tokenize_test(self, row):
        text = []
        token_map = []
        
        idx = 0
        for t, ws in zip(row["tokens"], row["trailing_whitespace"]):
            text.append(t)
            token_map.extend([idx]*len(t))
            if ws:
                text.append(" ")
                token_map.append(-1)
                
            idx += 1
            
        tokenized = self.tokenizer("".join(text), **Config.tokenize_options)
        
        return {
            "input_ids": tokenized.input_ids,
            "attention_mask": tokenized.attention_mask,
            "offset_mapping": tokenized.offset_mapping,
            "token_map": token_map,       
            "document": row["document"],
            "tokens": row["tokens"],         
        }

    def __getitem__(self, index):
        if self.is_test:
            row = self.tokenized_df.drop(["document", "tokens"], axis=1).iloc[index]
        else:
            row = self.tokenized_df.drop(["document", "tokens", "raw_labels"], axis=1).iloc[index]
        return row
        
        

In [24]:
class CreateDataModule(pl.LightningDataModule):
    """
    DataFrameからモデリング時に使用するDataModuleを作成
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        valid_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
    ):
        super().__init__()
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df
        self.batch_size = Config.batch_size
        self.tokenizer = tokenizer
        self.collator = DataCollatorForTokenClassification(
            tokenizer, pad_to_multiple_of=512
        )
        self.reference_df = self.create_val_reference_df(valid_df)

    def create_val_reference_df(self, valid_df: pd.DataFrame):
        valid_df = valid_df[['document', 'tokens', 'labels']].copy()
        valid_df = valid_df.explode(['tokens', 'labels']).reset_index(drop=True).rename(columns={'tokens': 'token', 'labels': 'label'})
        valid_df['token'] = valid_df.groupby('document').cumcount()
        
        reference_df = valid_df[valid_df['label'] != 'O'].copy()
        reference_df = reference_df.reset_index().rename(columns={'index': 'row_id'})
        reference_df = reference_df[['row_id', 'document', 'token', 'label']].copy()
        return reference_df

    def setup(self, stage=None):
        self.train_dataset = CreateDataset(
            self.train_df, self.tokenizer, is_test=False
        )
        self.valid_dataset = CreateDataset(
            self.valid_df, self.tokenizer, is_test=False
        )
        self.test_dataset = CreateDataset(
            self.test_df, self.tokenizer, is_test=True
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            num_workers=os.cpu_count(),
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            num_workers=os.cpu_count(),
        )

In [25]:
def gen_dfs():
    # Load data
    # Split 'df' into training and valid dataset (300) based on whether the row is all 'O' or not. 
    train_df, valid_df = load_train_data()
    train_df.reset_index(drop=True, inplace=True)
    valid_df.reset_index(drop=True, inplace=True)
    print(f"Number of train_df = {len(train_df)}")
    print(f"Number of valid_df = {len(valid_df)}")

    test_df = pd.read_json(Path(Config.test_path))
    clear_memory()
    return train_df, valid_df, test_df

In [26]:
train_df, valid_df, test_df = gen_dfs()
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
dm = CreateDataModule(train_df, valid_df, test_df, tokenizer)
dm.setup()

kaggle train data = 6807
mpware data = 2692
Number of true_labels = 945
Number of false_labels = 5862
true_samples = 807 true_others = 138
false_samples = 5000 false_others = 862
Number of train_df = 8499
Number of valid_df = 1000




Model

In [27]:
def post_processing_preds(logits_list: list[torch.Tensor]) -> list[torch.Tensor]:
    preds_final = []
    for logits in logits_list:
        predictions_prob = torch.softmax(logits, dim=-1)
        predictions = logits.argmax(-1)
        predictions_without_O = predictions_prob[:, :12].argmax(-1)

        O_prob = predictions_prob[:, 12]
        pred_final = torch.where(
            O_prob < Config.threshold, predictions_without_O, predictions
        )
        preds_final.append(pred_final)

    return preds_final

In [28]:
def predictions_to_df(preds_list: list[torch.Tensor], tokenized_df: pd.DataFrame):
    triplets = []
    pairs = set()
    document, token, label, token_str = [], [], [], []
    for preds, token_map, offsets, tokens, doc in zip(
        preds_list,
        tokenized_df["token_map"],
        tokenized_df["offset_mapping"],
        tokenized_df["tokens"],
        tokenized_df["document"],
    ):
        # p = p.argmax(-1).cpu().detach().numpy()
        preds = preds.cpu().detach().numpy()

        for token_pred, (start_idx, end_idx) in zip(preds, offsets):
            label_pred = Config.id2label[(token_pred)]

            if start_idx + end_idx == 0:
                continue

            if token_map[start_idx] == -1:
                start_idx += 1

            # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]

            if label_pred == "O" or token_id == -1:
                continue

            pair = (doc, token_id)

            if pair in pairs:
                continue

            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])
            pairs.add(pair)

    df = pd.DataFrame(
        {"document": document, "token": token, "label": label, "token_str": token_str}
    )
    df["row_id"] = list(range(len(df)))

    return df

In [29]:
def freeze(module: nn.Module):
    for param in module.parameters():
        param.requires_grad = False

class LSTMHead(nn.Module):
    def __init__(self, in_features, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            in_features,
            hidden_dim,
            n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.1,
        )
        self.out_features = hidden_dim

    def forward(self, x) -> torch.Tensor:
        self.lstm.flatten_parameters()
        hidden, (_, _) = self.lstm.forward(x)
        out = hidden
        return out


class DebertaLSTMModel(pl.LightningModule):
    def __init__(self, dm: CreateDataModule):
        super(DebertaLSTMModel, self).__init__()

        # self.example_input_array = torch.Tensor(1, 1).to(dtype=torch.int64)
        self.dm = dm

        self.model_config: DebertaV2Config = AutoConfig.from_pretrained(
            Config.model_path
        )

        self.model_config.update(Config.deberta_options)

        self.transformers_model: DebertaV2Model = AutoModel.from_pretrained(
            Config.model_path, config=self.model_config
        )
        self.transformers_model.gradient_checkpointing_enable()
        self.num_features = self.transformers_model.config.hidden_size * 3

        self.head = LSTMHead(
            in_features=self.num_features,
            hidden_dim=self.num_features // 2,
            n_layers=1,
        )

        self.fc = nn.Linear(self.num_features, len(Config.all_labels))
        self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
        self.validation_step_outputs = []        

        if Config.freeze_emb:
            freeze(self.transformers_model.embeddings)
        if Config.num_freeze_layers:
            freeze(self.transformers_model.encoder.layer[:Config.num_freeze_layers])

    def forward(self, input_ids, attention_mask=None, train=True):
        transformer_out: BaseModelOutput = self.transformers_model.forward(
            input_ids, attention_mask=attention_mask
        )
        hidden_states = transformer_out.hidden_states
        transformer_features = torch.cat(hidden_states[-3:], dim=-1)
        assert transformer_features.shape[-1] == self.num_features
        head_output = self.head.forward(transformer_features)
        # last_hidden_state = (batch_size, seq_len, hidden_size)

        logits = self.fc.forward(head_output)

        # logits = (batch_size, seq_len, num_labels)
        return logits

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        target: torch.Tensor = batch["labels"]

        logits = self.forward(input_ids, attention_mask, train=True)
        # output = (seq_len, num_labels)
        loss = self.loss_function(
            logits.view(-1, len(Config.all_labels)), target.view(-1)
        )

        self.log("train_loss", loss, prog_bar=True)
        return {"loss": loss}

    def train_epoch_end(self, outputs):
        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
        print(f"epoch {self.trainer.current_epoch} training loss {avg_loss}")
        return {"train_loss": avg_loss}

    def validation_step(self, batch, batch_idx):
        input_ids: torch.Tensor = batch["input_ids"]
        attention_mask: torch.Tensor = batch["attention_mask"]
        target: torch.Tensor = batch["labels"]

        logits = self.forward(input_ids, attention_mask, train=False)
        # logits.shape = (batch_size, seq_len, num_labels)

        loss = self.loss_function(
            logits.view(-1, len(Config.all_labels)), target.view(-1)
        )

        self.log("val_loss", loss, prog_bar=True)
        self.validation_step_outputs.append(
            {"val_loss": loss, "logits": logits, "targets": target}
        )
        return {"val_loss": loss, "logits": logits, "targets": target}

    def on_validation_epoch_end(self):
        outputs = self.validation_step_outputs
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        logits_list = [logits for batch in outputs for logits in batch["logits"]]
        preds_list = post_processing_preds(logits_list)
        
        pred_df = predictions_to_df(preds_list, self.dm.valid_dataset.tokenized_df)

        self.validation_step_outputs = []    

        avg_score = compute_metrics(pred_df, self.dm.reference_df)
        f5_score = avg_score["ents_f5"]
        self.log("precision", avg_score["ents_p"])
        self.log("recall", avg_score["ents_r"])
        self.log("f5", avg_score["ents_f5"])
        
        print(f"epoch {self.trainer.current_epoch} validation loss {avg_loss}")
        print(avg_score["ents_per_type"])

        return {"val_loss": avg_loss, "val_f5": f5_score}

    def predict_step(self, batch, batch_idx):
        input_ids: torch.Tensor = batch["input_ids"]
        attention_mask: torch.Tensor = batch["attention_mask"]

        logits = self.forward(input_ids, attention_mask, train=False)
        return logits

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p
                    for n, p in self.transformers_model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "lr": encoder_lr,
                "weight_decay": weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in self.transformers_model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "lr": encoder_lr,
                "weight_decay": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if "transformers_model" not in n
                ],
                "lr": decoder_lr,
                "weight_decay": 0.0,
            },
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=Config.lr)

        epoch_steps = len(self.dm.train_dataset)
        batch_size = Config.batch_size

        warmup_steps = 0.05 * epoch_steps // batch_size
        training_steps = Config.epochs * epoch_steps // batch_size
        # scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,training_steps,-1)
        # scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=1e-6, power=3.0)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, training_steps, num_cycles=0.5
        )

        lr_scheduler_config = {
            "scheduler": scheduler,
            "interval": "step",
            "frequency": 1,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}

Train

In [30]:
clear_memory()

In [31]:
if Config.training:
    wandb.login()
    id = wandb.util.generate_id()
    wandb.init(project=Config.project_name, id=id, resume="allow")
    # if Config.resume:
    #     wandb.init(project=f'pii-{Config.exp}-{Config.ver}', id="helpful-water-6")
    wandb_logger = WandbLogger(project=Config.project_name)
    model = DebertaLSTMModel(dm=dm)
    early_stop_callback = EarlyStopping(
        monitor="val_loss", min_delta=0.00, patience=8, verbose=True, mode="min"
    )
    Path(Config.save_dir).mkdir(parents=True, exist_ok=True)
    with open(Path(Config.save_dir).joinpath("dataset-metadata.json"), "w") as f:
        json.dump(
            {
                "title": Config.project_name,
                "id": f"zume666/{Config.project_name}",
                "licenses": [{"name": "CC0-1.0"}],
            },
            f,
        )

    checkpoint_callback = ModelCheckpoint(
        monitor="val_loss",
        dirpath=Config.save_dir,
        save_top_k=1,
        save_last=True,
        save_weights_only=False,
        filename=Config.model_name,
        verbose=True,
        mode="min",
    )
    trainer = pl.Trainer(
        max_epochs=Config.epochs,
        deterministic=True,
        val_check_interval=0.25,
        accumulate_grad_batches=4,
        devices=[0],
        precision="bf16-mixed",
        accelerator="gpu",
        callbacks=[
            checkpoint_callback,
            early_stop_callback,
            ModelSummary(max_depth=-1),
        ],
        logger=wandb_logger,
    )
    if Config.resume:
        trainer.fit(
            model=model,
            datamodule=dm,
            ckpt_path=Path(Config.save_dir).joinpath("last.ckpt"),
        )
    else:
        trainer.fit(model=model, datamodule=dm)
    wandb.finish()

  rank_zero_warn(
  rank_zero_warn(f"Checkpoint directory {dirpath} exists and is not empty.")


Sanity Checking: 0it [00:00, ?it/s]

epoch 0 validation loss 2.564984083175659
{'URL_PERSONAL': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'EMAIL': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'USERNAME': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'STREET_ADDRESS': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'NAME_STUDENT': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'ID_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Training: 0it [00:00, ?it/s]

Validation: 0it [00:00, ?it/s]

epoch 0 validation loss 0.012347443960607052
{'NAME_STUDENT': {'p': 0.006292749658002736, 'r': 0.06036745406824147, 'f5': 0.045371775417298935}, 'URL_PERSONAL': {'p': 0.0027063599458728013, 'r': 1.0, 'f5': 0.0659062103929024}, 'STREET_ADDRESS': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'EMAIL': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'ID_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Validation: 0it [00:00, ?it/s]

epoch 0 validation loss 0.006115811411291361
{'NAME_STUDENT': {'p': 0.14380081300813008, 'r': 0.7427821522309711, 'f5': 0.6402157835204038}, 'URL_PERSONAL': {'p': 0.03934426229508197, 'r': 1.0, 'f5': 0.5157024793388431}, 'STREET_ADDRESS': {'p': 0.5625, 'r': 0.8181818181818182, 'f5': 0.8041237113402062}, 'EMAIL': {'p': 0.4166666666666667, 'r': 0.625, 'f5': 0.6132075471698113}, 'ID_NUM': {'p': 1.0, 'r': 0.5, 'f5': 0.5098039215686274}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Validation: 0it [00:00, ?it/s]

epoch 0 validation loss 0.0035959044471383095
{'NAME_STUDENT': {'p': 0.5063694267515924, 'r': 0.8346456692913385, 'f5': 0.8143405889884763}, 'STREET_ADDRESS': {'p': 0.6923076923076923, 'r': 0.8181818181818182, 'f5': 0.8125000000000002}, 'ID_NUM': {'p': 0.6, 'r': 0.75, 'f5': 0.7428571428571428}, 'URL_PERSONAL': {'p': 0.6111111111111112, 'r': 0.9166666666666666, 'f5': 0.8993710691823898}, 'EMAIL': {'p': 0.75, 'r': 0.75, 'f5': 0.75}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Validation: 0it [00:00, ?it/s]

epoch 0 validation loss 0.0024881805293262005
{'NAME_STUDENT': {'p': 0.37760702524698136, 'r': 0.9028871391076115, 'f5': 0.8570333461096205}, 'URL_PERSONAL': {'p': 0.20689655172413793, 'r': 1.0, 'f5': 0.8715083798882681}, 'STREET_ADDRESS': {'p': 0.6206896551724138, 'r': 0.8181818181818182, 'f5': 0.8082901554404146}, 'EMAIL': {'p': 0.6666666666666666, 'r': 1.0, 'f5': 0.9811320754716982}, 'ID_NUM': {'p': 0.34375, 'r': 0.9166666666666666, 'f5': 0.8614457831325301}, 'PHONE_NUM': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Validation: 0it [00:00, ?it/s]

epoch 1 validation loss 0.0021531246602535248
{'NAME_STUDENT': {'p': 0.693446088794926, 'r': 0.8608923884514436, 'f5': 0.8529705941188237}, 'STREET_ADDRESS': {'p': 0.6428571428571429, 'r': 0.8181818181818182, 'f5': 0.8096885813148791}, 'EMAIL': {'p': 0.5, 'r': 1.0, 'f5': 0.9629629629629629}, 'ID_NUM': {'p': 0.5238095238095238, 'r': 0.9166666666666666, 'f5': 0.8909657320872274}, 'URL_PERSONAL': {'p': 0.3157894736842105, 'r': 1.0, 'f5': 0.9230769230769229}, 'PHONE_NUM': {'p': 1.0, 'r': 1.0, 'f5': 1.0}, 'USERNAME': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


Validation: 0it [00:00, ?it/s]

epoch 1 validation loss 0.0017778610344976187
{'NAME_STUDENT': {'p': 0.44746600741656367, 'r': 0.9501312335958005, 'f5': 0.9107799496806658}, 'STREET_ADDRESS': {'p': 0.6206896551724138, 'r': 0.8181818181818182, 'f5': 0.8082901554404146}, 'EMAIL': {'p': 0.7272727272727273, 'r': 1.0, 'f5': 0.985781990521327}, 'ID_NUM': {'p': 0.36666666666666664, 'r': 0.9166666666666666, 'f5': 0.8666666666666668}, 'URL_PERSONAL': {'p': 0.27906976744186046, 'r': 1.0, 'f5': 0.9096209912536443}, 'PHONE_NUM': {'p': 1.0, 'r': 1.0, 'f5': 1.0}, 'USERNAME': {'p': 0.0, 'r': 0.0, 'f5': 0.0}}


  rank_zero_warn("Detected KeyboardInterrupt, attempting graceful shutdown...")


VBox(children=(Label(value='0.023 MB of 0.023 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
epoch,▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁█████████████████
f5,▁▆▇███
precision,▁▂▇▅█▆
recall,▁▆▇█▇█
train_loss,█▄▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁▁
trainer/global_step,▁▁▁▂▂▂▂▂▂▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇███
val_loss,█▄▂▁▁▁

0,1
epoch,1.0
f5,0.90588
precision,0.44541
recall,0.94495
train_loss,0.00053
trainer/global_step,3699.0
val_loss,0.00178


: 

Infer

In [None]:
def predict(dm: CreateDataModule, model: pl.LightningModule):
    model.eval()  
    model.to(DEVICE)
    test_dataloader = dm.test_dataloader()
    
    trainer = pl.Trainer()
    outputs = trainer.predict(model=model, dataloaders=test_dataloader, ckpt_path=Config.save_path)
        
    logits_list = [logits for batch in outputs for logits in batch]
    preds_list = post_processing_preds(logits_list)

    pred_df = predictions_to_df(preds_list, dm.test_dataset.tokenized_df)

    return pred_df

In [None]:
if ~Config.training:
    model = DebertaLSTMModel(dm=dm)
    sub_df = predict(dm, model)
    sample_sub = pd.read_csv(Config.sample_sub_path)
    sub_df = sub_df[sample_sub.columns]
    sub_df.to_csv('submission.csv',index=False)
    display(sub_df)



Error: 

Post Processing

In [None]:
# # Convert preds to a list of dictionaries
# results = []
# for preds, token_map, offsets, tokens, doc in zip(preds_final,
#                                               tokenized_ds["token_map"], 
#                                               tokenized_ds["offset_mapping"],
#                                               tokenized_ds["tokens"],
#                                               tokenized_ds["document"]):
#     for token_pred, (start_idx, end_idx) in zip(preds, offsets):
#         try:
#             label_pred = Config.id2label[str(token_pred)]
#             if start_idx + end_idx == 0: 
#                 continue

#             if token_map[start_idx] == -1:
#                 start_idx += 1
#              # ignore "\n\n"
#             while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
#                 start_idx += 1

#             if start_idx >= len(token_map): 
#                 break

#             token_id = token_map[start_idx]

#             # ignore "O" predictions and whitespace preds
#             if label_pred != "O" and token_id != -1:
#                 results.append({
#                         "document": doc,
#                         "token": token_id,
#                         "label": label_pred,
#                         "token_str": tokens[token_id]
#                     })
                
#         except Exception as e:
#             print(f"Error {e}")
#             print(f"token_map {len(token_map)} and {token_pred}  {start_idx} {end_idx}")
#             sys.exit(-1)

In [None]:
# import re
# from spacy.lang.en import English
# nlp = English()

# def find_span(target: list[str], document: list[str]) -> list[list[int]]:
#     idx = 0
#     spans = []
#     span = []
    
#     for i, token in enumerate(document):
#         if token != target[idx]:
#             idx = 0
#             span = []
#             continue
#         span.append(i)
        
#         idx += 1
#         if idx == len(target):
#             spans.append(span)
#             span = []
#             idx = 0
#             continue
    
#     return spans

# email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
# phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
# emails = []
# phone_nums = []

# for _data in test_ds:
#     # email
#     for token_idx, token in enumerate(_data["tokens"]):
#         if re.fullmatch(email_regex, token) is not None:
#             emails.append(
#                 {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
#             )
#     # phone number
#     matches = phone_num_regex.findall(_data["full_text"])
#     if not matches:
#         continue
        
#     for match in matches:
#         target = [t.text for t in nlp.tokenizer(match)]
#         matched_spans = find_span(target, _data["tokens"])
        
#     for matched_span in matched_spans:
#         for intermediate, token_idx in enumerate(matched_span):
#             prefix = "I" if intermediate else "B"
#             phone_nums.append(
#                 {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
#             )

# results.extend(emails)
# results.extend(phone_nums)

# def remove_duplicates(df: pd.DataFrame):
#     # Sort by the document and token
#     df.sort_values(by=['document', 'token'])
#     # Combine three columns 
#     df['triplet'] = df[["document", "token", "label"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1) 
#     # display(df)
#     # Drop duplicated triplets and keep the first one as unique row
#     df = df.drop_duplicates(subset=["triplet"], keep='first')
#     # Regenerate 'row_id'
#     df['row_id'] = list(range(len(df)))    
#     df = df.reset_index(drop=True, inplace=False) 
#     print("Remove duplicates")
# #     display(df)
#     return df

In [None]:
# test_df = pd.DataFrame(results)
# test_df = remove_duplicates(test_df)
# test_df = test_df[["row_id", "document", "token", "label"]]
# # Create submission df
# test_df.to_csv("submission.csv", index=False)
# display(test_df)