Config

In [1]:
class Config:
    exp = "002"
    ver = "001"
    training = True

    seed = 42

    num_proc = 4

    threshold = 0.99

    tokenize_options = {
        "return_offsets_mapping": True,
        "truncation": False,
        "max_length": 4096,
    }
    deberta_options = {
        "output_hidden_states": True,
        "hidden_dropout_prob": 0.1,
        "layer_norm_eps": 1e-7,
        "add_pooling_layer": False,
    }

    model_name = "deberta3base-truncation-false"
    freeze_layers = 0

    save_path = f"/kaggle/input/{model_name}-{exp}-{ver}"

    output_dir = "/kaggle/output"

    model_path = "/kaggle/input/huggingfacedebertav3variants/deberta-v3-small"

    train_path = "/kaggle/input/pii-detection-removal-from-educational-data/train.json"
    test_path = "/kaggle/input/pii-detection-removal-from-educational-data/test.json"
    moredata_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/moredata_dataset_fixed.json"
    pii_dataset_fixed_path = "/kaggle/input/fix-punctuation-tokenization-external-dataset/pii_dataset_fixed.json"

    batch_size = 1
    epochs = 3
    lr = 1e-5

    all_labels = [
        "B-EMAIL",
        "B-ID_NUM",
        "B-NAME_STUDENT",
        "B-PHONE_NUM",
        "B-STREET_ADDRESS",
        "B-URL_PERSONAL",
        "B-USERNAME",
        "I-ID_NUM",
        "I-NAME_STUDENT",
        "I-PHONE_NUM",
        "I-STREET_ADDRESS",
        "I-URL_PERSONAL",
        "O",
    ]
    num_pii_labels = len(all_labels) - 1
    label2id = {label: index for index, label in enumerate(all_labels)}
    id2label = {index: label for index, label in enumerate(all_labels)}

In [23]:
import json, argparse, torch, sys, random, gc, os
import numpy as np
import pandas as pd
import functools
from itertools import chain
from functools import partial
from pathlib import Path
import ctypes

import pytorch_lightning as pl
import torch.nn as nn
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset


# Transformer
from transformers import (
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    AutoConfig,
    AutoModel,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    PreTrainedTokenizer,
    PreTrainedModel,
    PretrainedConfig,
    DebertaV2Config,
    get_cosine_schedule_with_warmup,
    EvalPrediction,
)
from transformers.modeling_outputs import BaseModelOutput
# from datasets import Dataset, features
from typing import Iterable, Any, Callable
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer

from seqeval.metrics import recall_score, precision_score

In [24]:
# Seed the same seed to all 
def seed_everything(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)

seed_everything(Config.seed)


libc = ctypes.CDLL("libc.so.6")
def clear_memory():
    libc.malloc_trim(0)
    torch.cuda.empty_cache()
    gc.collect()

DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Device: {DEVICE}")

Device: cuda


Pre Processeing

In [25]:
def load_data():
    # Load training data
    train_data = pd.read_json(Path(Config.train_path))
    print(f"kaggle train data = {len(train_data)}")

    more_data = pd.read_json(Path(Config.moredata_path))
    print(f"more data = {len(more_data)}")

    pii_dataset_fixed = pd.read_json(Path(Config.pii_dataset_fixed_path))
    print(f"pii_dataset_fixed = {len(pii_dataset_fixed)}")

    # Combine to a single df
    df = pd.concat([train_data, more_data, pii_dataset_fixed])
    df["document"] = [i for i in range(len(df))]  # Update the document id
    df.reset_index(drop=True, inplace=True)

    return df


def split_df_by_sampling(df: pd.DataFrame, n_samples: int):
    # Get the sample df
    samples_df = df.sample(n=n_samples, random_state=Config.seed)
    # The remaining df
    cond = df["document"].isin(samples_df["document"])
    others_df = df.drop(df[cond].index, inplace=False)
    return samples_df, others_df


def downsample_df(df: pd.DataFrame):
    """Split the df into training and valid dataset"""
    df["is_labels"] = df["labels"].apply(
        lambda labels: any(label != "O" for label in labels)
    )

    # One or more labels are not 'O'
    true_labels = df[df["is_labels"]]
    # all labels are 'O'
    false_labels = df[~df["is_labels"]]

    # Reset index to two df
    true_labels = true_labels.reset_index(drop=True, inplace=False)
    false_labels = false_labels.reset_index(drop=True, inplace=False)
    print(f"Number of true_labels = {len(true_labels)}")
    print(f"Number of false_labels = {len(false_labels)}")

    # Get 300 as valid dataset
    n_true_samples = len(true_labels) - int(300 * len(true_labels) / len(df))

    # Sample true labels
    true_samples, true_others = split_df_by_sampling(true_labels, n_true_samples)
    print(f"true_samples = {len(true_samples)} true_others = {len(true_others)}")
    n_samples = len(false_labels) - (300 - int(300 * len(true_labels) / len(df)))
    # Sample false labels
    false_samples, false_others = split_df_by_sampling(false_labels, n_samples)
    print(f"false_samples = {len(false_samples)} false_others = {len(false_others)}")
    # Training ds = P * true_labels + P * false_labels
    train_df = pd.concat([true_samples, false_samples])
    # Valid ds = (1-P) * true_labels + (1-P) * false_labels
    valid_df = pd.concat([true_others, false_others])
    return train_df, valid_df

In [35]:
class CreateDataset(Dataset):
    def __init__(self, df: pd.DataFrame, tokenizer: PreTrainedTokenizer, is_test: bool) -> None:
        self.df = df
        self.tokenizer = tokenizer
        self.is_test = is_test

    def __len__(self):
        return len(self.df)
    
    def tokenize_train(self, row):
        text = []
        token_map = []
        labels = []
        targets = []
        idx = 0
        for t, l, ws in zip(row["tokens"], row["labels"], row["trailing_whitespace"]):
            text.append(t)
            labels.extend([l]*len(t))
            token_map.extend([idx]*len(t))

            if l in Config.all_labels:  
                targets.append(1)
            else:
                targets.append(0)
            
            if ws:
                text.append(" ")
                labels.append("O")
                token_map.append(-1)
            idx += 1

        tokenized = self.tokenizer("".join(text), **Config.tokenize_options)
         
        target_num = sum(targets)
        labels = np.array(labels)

        text = "".join(text)
        token_labels = []

        for start_idx, end_idx in tokenized.offset_mapping:
            if start_idx == 0 and end_idx == 0: 
                token_labels.append(Config.label2id["O"])
                continue
            
            if text[start_idx].isspace():
                start_idx += 1
            try:
                token_labels.append(Config.label2id[labels[start_idx]])
            except:
                continue
        length = len(tokenized.input_ids)
        
        return {
            "input_ids": tokenized.input_ids,
            "attention_mask": tokenized.attention_mask,
            "offset_mapping": tokenized.offset_mapping,
            "labels": token_labels,
            "length": length,
            "target_num": target_num,
            "group": 1 if target_num > 0 else 0,
            "token_map": token_map,
            # "tokens": row["tokens"],
        }

    def tokenize_test(self, row):
        text = []
        token_map = []
        
        idx = 0
        for t, ws in zip(row["tokens"], row["trailing_whitespace"]):
            text.append(t)
            token_map.extend([idx]*len(t))
            if ws:
                text.append(" ")
                token_map.append(-1)
                
            idx += 1
            
        tokenized = self.tokenizer("".join(text), **Config.tokenize_options)
        
        return {
            "input_ids": tokenized.input_ids,
            "attention_mask": tokenized.attention_mask,
            "offset_mapping": tokenized.offset_mapping,
            "token_map": token_map,
            # "tokens": row["tokens"],
        }

    def __getitem__(self, index):
        row = self.df.iloc[index]
        if self.is_test:
            return self.tokenize_test(row)
        else:
            return self.tokenize_train(row)
        

In [36]:
class CreateDataModule(pl.LightningDataModule):
    """
    DataFrameからモデリング時に使用するDataModuleを作成
    """

    def __init__(
        self,
        train_df: pd.DataFrame,
        valid_df: pd.DataFrame,
        test_df: pd.DataFrame,
        tokenizer: PreTrainedTokenizer,
    ):
        super().__init__()
        self.train_df = train_df
        self.valid_df = valid_df
        self.test_df = test_df
        self.batch_size = Config.batch_size
        self.tokenizer = tokenizer
        self.collator = DataCollatorForTokenClassification(
            tokenizer, pad_to_multiple_of=512
        )

    def setup(self, stage=None):
        self.train_dataset = CreateDataset(
            self.train_df, self.tokenizer, is_test=False
        )
        self.valid_dataset = CreateDataset(
            self.valid_df, self.tokenizer, is_test=False
        )
        self.test_dataset = CreateDataset(
            self.test_df, self.tokenizer, is_test=True
        )

    def train_dataloader(self):
        return DataLoader(
            self.train_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            shuffle=True,
            num_workers=os.cpu_count(),
        )

    def val_dataloader(self):
        return DataLoader(
            self.valid_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            num_workers=os.cpu_count(),
        )

    def test_dataloader(self):
        return DataLoader(
            self.test_dataset,
            collate_fn=self.collator,
            batch_size=self.batch_size,
            num_workers=os.cpu_count(),
        )

In [37]:
def gen_dfs():
    # Load data
    df = load_data()
    # Split 'df' into training and valid dataset (300) based on whether the row is all 'O' or not. 
    train_df, valid_df = downsample_df(df.copy())
    train_df.reset_index(drop=True, inplace=True)
    valid_df.reset_index(drop=True, inplace=True)
    print(f"Number of train_df = {len(train_df)}")
    print(f"Number of valid_df = {len(valid_df)}")

    test_df = pd.read_json(Path(Config.test_path))
    clear_memory()
    return train_df, valid_df, test_df

In [38]:
train_df, valid_df, test_df = gen_dfs()
tokenizer = AutoTokenizer.from_pretrained(Config.model_path)
dm = CreateDataModule(train_df, valid_df, test_df, tokenizer)
dm.setup()

kaggle train data = 6807
more data = 2000
pii_dataset_fixed = 4434
Number of true_labels = 7369
Number of false_labels = 5872
true_samples = 7203 true_others = 166
false_samples = 5738 false_others = 134
Number of train_df = 12941
Number of valid_df = 300


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Model

In [39]:
def post_processing_preds(logits_list: list[torch.Tensor], is_train: bool = True):
    preds_final = []
    for logits in logits_list:
        logits = logits.cpu().detach().numpy()
        if is_train:
            logits_softmax = np.exp(logits) / np.sum(
                np.exp(logits), axis=1
            ).repeat(logits.shape[1]).reshape(logits.shape[0], logits.shape[1])
        else:
            logits_softmax = np.exp(logits) / np.sum(
                np.exp(logits), axis=1
            ).repeat(logits.shape[1]).reshape(logits.shape[0], logits.shape[1])
        # Get the maximal value as the final preds
        preds = logits_softmax.argmax(-1)
        preds_without_O = logits_softmax[:, : Config.num_pii_labels].argmax(
            -1
        )  # Prob of entity labels (like 'NAME_STUDENT')
        O_preds = logits_softmax[:, Config.num_pii_labels]  # Prob for 'O'

        preds_final.append(np.where(O_preds < Config.threshold, preds_without_O, preds))
    return preds_final

In [40]:
# Compute the model performance metrics using `seqeval`
def compute_metrics(label_pred: list[list[str]], label_gt: list[list[str]]):    
    try:
        #print("Compute metrics")
        
        # Compute recall, precision and f1 score
        recall = recall_score(label_pred, label_gt)
        precision = precision_score(label_pred, label_gt)
        # f5 score to measure the performance
        f1_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
        result = {'f1': f1_score,  
                  'recall': recall,
                  'precision': precision}
        print(f"result = {result}")
        return result
    except Exception as e: 
        print(e)

In [67]:
def predictions_to_df(
    preds_list: list[torch.Tensor], valid_dataset: CreateDataset, valid_df: pd.DataFrame
):
    pairs = set()
    document, token, label, token_str = [], [], [], []
    for index, (preds, row) in enumerate(zip(preds_list, valid_dataset)):
        print(preds)
        for p, token_map, offsets, tokens in zip(
            preds,
            row["token_map"],
            row["offset_mapping"],
            valid_df.iloc[index]["tokens"],            
        ):
            doc = valid_df.iloc[index]["document"]
            # p = p.argmax(-1).cpu().detach().numpy()
            # p = p.cpu().detach().numpy()

            for token_pred, (start_idx, end_idx) in zip(p, offsets):
                label_pred = Config.id2label[(token_pred)]

                if start_idx + end_idx == 0:
                    continue

                if token_map[start_idx] == -1:
                    start_idx += 1

                # ignore "\n\n"
                while (
                    start_idx < len(token_map)
                    and tokens[token_map[start_idx]].isspace()
                ):
                    start_idx += 1

                if start_idx >= len(token_map):
                    break

                token_id = token_map[start_idx]

                if label_pred == "O" or token_id == -1:
                    continue

                pair = (doc, token_id)

                if pair in pairs:
                    continue

                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                pairs.add(pair)

    df = pd.DataFrame(
        {"document": document, "token": token, "label": label, "token_str": token_str}
    )
    df["row_id"] = list(range(len(df)))

    return df

In [68]:
class LSTMHead(nn.Module):
    def __init__(self, in_features, hidden_dim, n_layers):
        super().__init__()
        self.lstm = nn.LSTM(
            in_features,
            hidden_dim,
            n_layers,
            batch_first=True,
            bidirectional=True,
            dropout=0.1,
        )
        self.out_features = hidden_dim

    def forward(self, x) -> torch.Tensor:
        self.lstm.flatten_parameters()
        hidden, (_, _) = self.lstm.forward(x)
        out = hidden
        return out


class DebertaLSTMModel(pl.LightningModule):
    def __init__(self, dm: CreateDataModule):
        super(DebertaLSTMModel, self).__init__()

        self.dm = dm

        self.model_config: DebertaV2Config = AutoConfig.from_pretrained(
            Config.model_path
        )

        self.model_config.update(Config.deberta_options)

        self.transformers_model: PreTrainedModel = AutoModel.from_pretrained(
            Config.model_path
        )
        self.head = LSTMHead(
            in_features=self.model_config.hidden_size,
            hidden_dim=self.model_config.hidden_size // 2,
            n_layers=1,
        )

        self.fc = nn.Linear(self.model_config.hidden_size, len(Config.all_labels))
        self.loss_function = nn.CrossEntropyLoss(reduction='mean',ignore_index=-100) 
        self.validation_step_outputs = []

        # if Config.freeze_layers>0:
        # 	print(f'Freezing {Config.freeze_layers} layers.')
        # 	for layer in self.transformers_model.longformer.encoder.layer[:Config.freeze_layers]:
        # 		for param in layer.parameters():
        # 			param.requires_grad = False

    def forward(self, input_ids, attention_mask=None, train=True):
        transformer_out: BaseModelOutput = self.transformers_model.forward(
            input_ids, attention_mask=attention_mask
        )
        last_hidden_state = transformer_out.last_hidden_state
        head_output = self.head.forward(last_hidden_state)
        # last_hidden_state = (batch_size, seq_len, hidden_size)

        logits = self.fc.forward(head_output)

        # logits = (batch_size, seq_len, num_labels)
        return (logits, _)

    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        target: torch.Tensor = batch["labels"]

        outputs = self.forward(input_ids, attention_mask, train=True)

        ## only batch_size=1
        output = outputs[0]
        # output = (seq_len, num_labels)
        loss = self.loss_function(
            output.view(-1, len(Config.all_labels)), target.view(-1)
        )

        self.log("train_loss", loss, prog_bar=True)
        return {"loss": loss}

    def train_epoch_end(self, outputs):
        avg_loss = torch.stack([x["loss"] for x in outputs]).mean()
        print(f"epoch {self.trainer.current_epoch} training loss {avg_loss}")
        return {"train_loss": avg_loss}

    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        target: torch.Tensor = batch["labels"]

        outputs = self.forward(input_ids, attention_mask, train=False)
        output = outputs[0]

        loss = self.loss_function(
            output.view(-1, len(Config.all_labels)), target.view(-1)
        )

        self.log("val_loss", loss, prog_bar=True)
        self.validation_step_outputs.append(
            {"val_loss": loss, "logits": output, "targets": target, "row": batch}
        )
        return {"val_loss": loss, "logits": output, "targets": target}

    def on_validation_epoch_end(self):
        outputs = self.validation_step_outputs
        avg_loss = torch.stack([x["val_loss"] for x in outputs]).mean()
        logits_list = [logits for batch in outputs for logits in batch["logits"]]
        print(type(logits_list[0]))
        preds_list = post_processing_preds(logits_list)
        # print(flattened_preds.shape)
        pred_df = predictions_to_df(preds_list, self.dm.valid_dataset, self.dm.valid_df)

        print(pred_df.shape)
        print(pred_df)

        self.validation_step_outputs = []

        # print(output_val.shape)
        labels_gt = [target for batch in outputs for target in batch["targets"]]
        avg_score = compute_metrics([pred_df["label"].to_list()], [labels_gt])
        f5_score = avg_score["ents_f5"]
        print(f"epoch {self.trainer.current_epoch} validation loss {avg_loss}")
        print(f"epoch {self.trainer.current_epoch} validation scores {avg_score}")

        return {"val_loss": avg_loss, "val_f5": f5_score}

    def get_optimizer_params(self, encoder_lr, decoder_lr, weight_decay=0.0):
        param_optimizer = list(model.named_parameters())
        no_decay = ["bias", "LayerNorm.bias", "LayerNorm.weight"]
        optimizer_parameters = [
            {
                "params": [
                    p
                    for n, p in self.transformers_model.named_parameters()
                    if not any(nd in n for nd in no_decay)
                ],
                "lr": encoder_lr,
                "weight_decay": weight_decay,
            },
            {
                "params": [
                    p
                    for n, p in self.transformers_model.named_parameters()
                    if any(nd in n for nd in no_decay)
                ],
                "lr": encoder_lr,
                "weight_decay": 0.0,
            },
            {
                "params": [
                    p
                    for n, p in self.named_parameters()
                    if "transformers_model" not in n
                ],
                "lr": decoder_lr,
                "weight_decay": 0.0,
            },
        ]
        return optimizer_parameters

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=Config.lr)

        epoch_steps = len(self.dm.train_dataset)
        batch_size = Config.batch_size

        warmup_steps = 0.05 * epoch_steps // batch_size
        training_steps = Config.epochs * epoch_steps // batch_size
        # scheduler = get_linear_schedule_with_warmup(optimizer,warmup_steps,training_steps,-1)
        # scheduler = get_polynomial_decay_schedule_with_warmup(optimizer, warmup_steps, training_steps, lr_end=1e-6, power=3.0)
        scheduler = get_cosine_schedule_with_warmup(
            optimizer, warmup_steps, training_steps, num_cycles=0.5
        )

        lr_scheduler_config = {
            "scheduler": scheduler,
            "interval": "step",
            "frequency": 1,
        }

        return {"optimizer": optimizer, "lr_scheduler": lr_scheduler_config}

Train

In [69]:
model = DebertaLSTMModel(dm=dm)
trainer = pl.Trainer()
trainer.fit(model=model, datamodule=dm)

  rank_zero_warn(


Sanity Checking: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fas

<class 'torch.Tensor'>
[10 10 10 10 10 10 10 10  0 10 10 10 10  4 10 10  3  3  2 10  2  5  5  4
  3  5 11 11  3  4  4 11  1  3  3  3  3  3  3  1  3 10 10  3  4  3 10 10
 11 11  4  5  1  4  4  5 10  4  4  4  4  4 11  3  1 10  4 11  3  3  1  3
  4  4  4 10  8  3 11  1  1  1  4  1  4  4  0  4  4  4  4  8  6  8  3  5
  5  3 11 11 10  4  4  4  8  4  1 10 10  4  4  4  4  8 10 10 10  3  3  1
  1 11  4 11 11  8  5  1  4  1  1  1  5  8  8  4  4 10  1  4 11 10 10 10
 11  5  1  3 10 10  1 10 10 10  4  4  1 10  3  4  4  9 10  4  4 11  1 10
  2  3  5  3  3  3 10  3  3  3 11  3 10 10  9  3  1 10 10 10 10 10 10 10
  3  1  1  5  3  4  1 11  1  4  4  1  5  5  5  3  4  3  3  4 10 10 10  4
  4  4  4  4  4  1 11 10 10  1  8 10 10 10 10 10 10 10  1 10 10  1 10 11
 11 11  4 11  1  3  1  9  9  4  4  3  3  5  4  4  3 10  1  1 10  1 10  1
 10  1  3  9  4  3  4  9  4  8  4 10 11 11 10 10  4  3  3  5  8  8  4 11
  5  4 10  5 10 10  8 10  3 10 10 10  5  2 10  3  5  4  3  3 10  3  3  3
  4  5  8  8  8 10 10 10 11 

AttributeError: 'numpy.int64' object has no attribute 'cpu'

Infer

In [None]:
# Model Inferer
class ModelInfer:
    def __init__(self):
        self.infer_dir = "/kaggle/working/infer" # Model infer output 
        self.load_model()

    def load_model(self):
        # Create the tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(Config.model_path) 
        # Create the model
        self.model = AutoModelForTokenClassification.from_pretrained(Config.model_path)        
        # # Load the fine-tuned adapter layer on top of base model
        # self.model = self.model.to(DEVICE)n
        print("Complete loading pretrained LLM model")     
    
    def infer_preds(self, ds: Dataset):
        # Tokenize the dataset using customized Tokenizer (the same as Training Tokenizer)
        tokenized_ds = ds.map(tokenize, fn_kwargs={"tokenizer": self.tokenizer}, num_proc=2)
        # Create data loader
        data_collator = DataCollatorForTokenClassification(self.tokenizer,
                                                           pad_to_multiple_of=16)
        # Arguments (infer only)
        args = TrainingArguments(output_dir=self.infer_dir,
                                 per_device_eval_batch_size=1, 
                                 report_to="none")
        # Create the trainer 
        trainer = Trainer(model=self.model, 
                          args=args, 
                          data_collator=data_collator, 
                          tokenizer=self.tokenizer)
        
        # predict for that split
        preds = trainer.predict(tokenized_ds).predictions
                
        # Clear the unused memory
        del self.model, data_collator, trainer, args 
        clear_memory()
        preds_final = post_processing_preds(preds)
        return preds_final, tokenized_ds

In [None]:
test_data = pd.read_json("/kaggle/input/pii-detection-removal-from-educational-data/test.json")

test_ds = Dataset.from_dict({
    "full_text": test_data["full_text"].tolist(),
    "document": test_data["document"].tolist(),
    "tokens": test_data["tokens"].tolist(),
    "trailing_whitespace": test_data["trailing_whitespace"].tolist(),
})
print(f"Total number of test dataset {len(test_ds)}")
# config = json.load(open(Path(Config.model_path) / "config.json"))
# id2label = config["id2label"]
# Load the pretrained model and make the predictions
inferer = ModelInfer()
preds_final, tokenized_ds = inferer.infer_preds(test_ds) 

Post Processing

In [None]:
# Convert preds to a list of dictionaries
results = []
for p, token_map, offsets, tokens, doc in zip(preds_final,
                                              tokenized_ds["token_map"], 
                                              tokenized_ds["offset_mapping"],
                                              tokenized_ds["tokens"],
                                              tokenized_ds["document"]):
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        try:
            label_pred = Config.id2label[str(token_pred)]
            if start_idx + end_idx == 0: 
                continue

            if token_map[start_idx] == -1:
                start_idx += 1
             # ignore "\n\n"
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map): 
                break

            token_id = token_map[start_idx]

            # ignore "O" predictions and whitespace preds
            if label_pred != "O" and token_id != -1:
                results.append({
                        "document": doc,
                        "token": token_id,
                        "label": label_pred,
                        "token_str": tokens[token_id]
                    })
                
        except Exception as e:
            print(f"Error {e}")
            print(f"token_map {len(token_map)} and {token_pred}  {start_idx} {end_idx}")
            sys.exit(-1)

In [None]:
import re
from spacy.lang.en import English
nlp = English()

def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    idx = 0
    spans = []
    span = []
    
    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue
    
    return spans

email_regex = re.compile(r'[\w.+-]+@[\w-]+\.[\w.-]+')
phone_num_regex = re.compile(r"(\(\d{3}\)\d{3}\-\d{4}\w*|\d{3}\.\d{3}\.\d{4})\s")
emails = []
phone_nums = []

for _data in test_ds:
    # email
    for token_idx, token in enumerate(_data["tokens"]):
        if re.fullmatch(email_regex, token) is not None:
            emails.append(
                {"document": _data["document"], "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )
    # phone number
    matches = phone_num_regex.findall(_data["full_text"])
    if not matches:
        continue
        
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, _data["tokens"])
        
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            phone_nums.append(
                {"document": _data["document"], "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": _data["tokens"][token_idx]}
            )

results.extend(emails)
results.extend(phone_nums)

def remove_duplicates(df: pd.DataFrame):
    # Sort by the document and token
    df.sort_values(by=['document', 'token'])
    # Combine three columns 
    df['triplet'] = df[["document", "token", "label"]].apply(lambda row: '_'.join(row.values.astype(str)), axis=1) 
    # display(df)
    # Drop duplicated triplets and keep the first one as unique row
    df = df.drop_duplicates(subset=["triplet"], keep='first')
    # Regenerate 'row_id'
    df['row_id'] = list(range(len(df)))    
    df = df.reset_index(drop=True, inplace=False) 
    print("Remove duplicates")
#     display(df)
    return df

In [None]:
test_df = pd.DataFrame(results)
test_df = remove_duplicates(test_df)
test_df = test_df[["row_id", "document", "token", "label"]]
# Create submission df
test_df.to_csv("submission.csv", index=False)
display(test_df)