In [1]:
!cp //input/piidd-postprocessing-code-dataset/piidd_postprocessing.py .
!ls .

__notebook__.ipynb  piidd_postprocessing.py


In [2]:
%%writefile custom_model_23.py

from transformers import AutoTokenizer,AutoConfig, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import DebertaV2Config, DebertaV2ForTokenClassification
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput
from torch import nn
import torch
from transformers import Trainer
from torch.nn import CrossEntropyLoss
import numpy as np
from transformers.models.deberta.modeling_deberta import (
    DebertaPreTrainedModel,
    DebertaModel
)
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2Model,DebertaV2PreTrainedModel
)
## Pooling Strategies
class MeanPooling(nn.Module):
    """Average hidden states over dim 1, counting only positions the mask keeps."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: tensor to pool (presumably batch x tokens x features).
            attention_mask: mask with one entry per position of the first two
                dims; entries equal to 0 contribute nothing to the mean.

        Returns:
            Tensor with dim 1 reduced away.
        """
        # Broadcast the mask over the feature axis so it can weight every value.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of zero.
        kept_count = weights.sum(1).clamp(min=1e-9)
        return masked_total / kept_count

class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked positions pushed to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature maximum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by -1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, -1e4)
        return candidates.max(dim=1).values

class MinPooling(nn.Module):
    """Element-wise minimum over dim 1, with masked positions pushed to 1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature minimum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by 1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, 1e4)
        return candidates.min(dim=1).values
# v2 vor latest
class CustomModel(DebertaV2PreTrainedModel):
    """DeBERTa-v2 backbone with a linear classification head and weighted cross-entropy.

    The head is selected by three flags that ``__init__`` hard-codes:
    ``bilstm_layer`` (off), ``mean_pool`` (off) and ``mult_sample_dpt`` (on).
    With those values the logits are the average of five classifier passes over
    the backbone output, each behind a different dropout rate.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    # Adapted from the upstream token-classification head:
    # https://github.com/huggingface/transformers/blob/f7ef7cec6c6c162087421f36a17eabdbb223579d/src/transformers/models/deberta/modeling_deberta.py#L1342

    def __init__(self, config):
        """Build the backbone, the optional BiLSTM, the dropouts and the classifier.

        Args:
            config: model configuration; ``num_labels``, ``hidden_size``,
                ``hidden_dropout_prob`` and ``initializer_range`` are read here
                or in ``_init_weights``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.deberta = DebertaV2Model(config)
        print(f"Num Labels {config.num_labels}")
        self.mean_pooling = MeanPooling()

        # Head configuration (fixed for this checkpoint family).
        self.bilstm_layer = False
        self.mult_sample_dpt = True
        self.mean_pool = False

        # Loss: every class has weight 1.0 except the last label index, which is
        # down-weighted to `o_weight`.
        o_weight = 0.05
        self.class_weights = torch.tensor([1.0] * (self.num_labels - 1) + [o_weight])
        self.loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)
        print(f"Class Weights {self.class_weights}")

        if self.bilstm_layer:
            print(f'Including LSTM layer hidden size {self.config.hidden_size} dropout {self.config.hidden_dropout_prob}')
            # Bidirectional with hidden_size // 2 per direction, so the output
            # width matches the classifier's input width.
            self.lstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2, num_layers=2, dropout=config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
            self.initialize_lstm(self.lstm)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout rates used by the multi-sample-dropout head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def initialize_lstm(self, lstm_layer):
        """Initialise an LSTM in place: Xavier-uniform input weights, orthogonal
        recurrent weights and zero biases."""
        for name, param in lstm_layer.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def _init_weights(self, module):
        """Initialise Linear, Embedding and LayerNorm modules; other module types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def get_input_embeddings(self):
        """Return the backbone's input embedding module."""
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Replace the backbone's input embedding module."""
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        """Run the backbone and the classification head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            inputs_embeds, output_attentions, output_hidden_states:
                forwarded unchanged to the DeBERTa backbone.
            labels: optional targets; when given, the weighted cross-entropy
                loss is computed against the logits.
            return_dict: when falsy (after defaulting to
                ``config.use_return_dict``) a tuple is returned instead of a
                ``TokenClassifierOutput``.
            **kwargs: accepted and ignored.

        Returns:
            ``TokenClassifierOutput`` (loss, logits, hidden_states, attentions),
            or ``(loss?, logits, *backbone_extras)`` in tuple mode.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output_backbone = outputs[0]
        # Input for the single-pass classifier branch; stays None unless the
        # BiLSTM or mean-pooling stage below produces one.
        output = None
        if self.bilstm_layer:
            output_backbone = self.dropout(output_backbone)
            output, _ = self.lstm(output_backbone)

        if self.mean_pool:
            output_backbone = self.dropout(output_backbone)
            output = self.mean_pooling(output_backbone, attention_mask)

        if self.mult_sample_dpt:
            # Multi-sample dropout: one shared classifier applied behind five
            # different dropout rates, logits averaged.
            output1 = self.classifier(self.dropout1(output_backbone))
            output2 = self.classifier(self.dropout2(output_backbone))
            output3 = self.classifier(self.dropout3(output_backbone))
            output4 = self.classifier(self.dropout4(output_backbone))
            output5 = self.classifier(self.dropout5(output_backbone))
            logits = (output1 + output2 + output3 + output4 + output5) / 5
        else:
            if output is None:
                # No BiLSTM / pooling stage ran: use the plain dropout ->
                # classifier head instead of reading an unbound variable.
                output = self.dropout(output_backbone)
            logits = self.classifier(output)

        loss = None

        if labels is not None:
            if self.mean_pool:
                loss = self.loss_fct(logits, labels.view(-1))
            else:
                # Flatten all leading dims so each position is one sample.
                loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            result = (logits,) + outputs[1:]
            return ((loss,) + result) if loss is not None else result
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

Writing custom_model_23.py


In [3]:
%%writefile custom_model_bilstm.py

from transformers import AutoTokenizer,AutoConfig, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import DebertaV2Config, DebertaV2ForTokenClassification
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput
from torch import nn
import torch
from transformers import Trainer
from torch.nn import CrossEntropyLoss
import numpy as np
from transformers.models.deberta.modeling_deberta import (
    DebertaPreTrainedModel,
    DebertaModel
)
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2Model,DebertaV2PreTrainedModel
)
## Pooling Strategies
class MeanPooling(nn.Module):
    """Average hidden states over dim 1, counting only positions the mask keeps."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: tensor to pool (presumably batch x tokens x features).
            attention_mask: mask with one entry per position of the first two
                dims; entries equal to 0 contribute nothing to the mean.

        Returns:
            Tensor with dim 1 reduced away.
        """
        # Broadcast the mask over the feature axis so it can weight every value.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of zero.
        kept_count = weights.sum(1).clamp(min=1e-9)
        return masked_total / kept_count

class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked positions pushed to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature maximum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by -1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, -1e4)
        return candidates.max(dim=1).values

class MinPooling(nn.Module):
    """Element-wise minimum over dim 1, with masked positions pushed to 1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature minimum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by 1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, 1e4)
        return candidates.min(dim=1).values
# v2 vor latest
class CustomModel(DebertaV2PreTrainedModel):
    """DeBERTa-v2 backbone followed by a single-layer BiLSTM and a linear classifier.

    ``__init__`` hard-codes ``bilstm_layer`` on and ``mult_sample_dpt`` off, so
    the logits come from ``classifier(bilstm(dropout(backbone_output)))``.
    The loss is unweighted cross-entropy.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    # Adapted from the upstream token-classification head:
    # https://github.com/huggingface/transformers/blob/f7ef7cec6c6c162087421f36a17eabdbb223579d/src/transformers/models/deberta/modeling_deberta.py#L1342

    def __init__(self, config):
        """Build the backbone, the BiLSTM, the dropouts and the classifier.

        Args:
            config: model configuration; ``num_labels``, ``hidden_size``,
                ``hidden_dropout_prob`` and ``initializer_range`` are read here
                or in ``_init_weights``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.deberta = DebertaV2Model(config)
        print(f"Num Labels {config.num_labels}")

        # Head configuration (fixed for this checkpoint family).
        self.bilstm_layer = True
        self.mult_sample_dpt = False
        # `forward` branches on this flag when computing the loss; it was never
        # set here, so any call with `labels` raised AttributeError.
        self.mean_pool = False

        self.loss_fct = torch.nn.CrossEntropyLoss()

        if self.bilstm_layer:
            print(f'Including LSTM layer hidden size {self.config.hidden_size} dropout {self.config.hidden_dropout_prob}')
            # Bidirectional with hidden_size // 2 per direction, so the output
            # width matches the classifier's input width.
            # NOTE(review): per the PyTorch docs, `dropout` only acts between
            # stacked layers, so with num_layers=1 it should be inert (and may
            # emit a UserWarning) - left unchanged.
            self.bilstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2, num_layers=1, dropout=config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
            # initialize_lstm() is intentionally not applied to this layer.

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Dropout rates for the (currently disabled) multi-sample-dropout head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def initialize_lstm(self, lstm_layer):
        """Initialise an LSTM in place: Xavier-uniform input weights, orthogonal
        recurrent weights and zero biases."""
        for name, param in lstm_layer.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def _init_weights(self, module):
        """Initialise Linear, Embedding and LayerNorm modules; other module types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def get_input_embeddings(self):
        """Return the backbone's input embedding module."""
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Replace the backbone's input embedding module."""
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        """Run the backbone, the BiLSTM and the classifier.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            inputs_embeds, output_attentions, output_hidden_states:
                forwarded unchanged to the DeBERTa backbone.
            labels: optional targets; when given, the cross-entropy loss is
                computed against the logits.
            return_dict: when falsy (after defaulting to
                ``config.use_return_dict``) a tuple is returned instead of a
                ``TokenClassifierOutput``.
            **kwargs: accepted and ignored.

        Returns:
            ``TokenClassifierOutput`` (loss, logits, hidden_states, attentions),
            or ``(loss?, logits, *backbone_extras)`` in tuple mode.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output_backbone = outputs[0]
        # Input for the single-pass classifier branch; stays None unless the
        # BiLSTM stage below produces one.
        output = None
        if self.bilstm_layer:
            output_backbone = self.dropout(output_backbone)
            self.bilstm.flatten_parameters()
            output, _ = self.bilstm(output_backbone)

        if self.mult_sample_dpt:
            # Multi-sample dropout: one shared classifier applied behind five
            # different dropout rates, logits averaged.
            output1 = self.classifier(self.dropout1(output_backbone))
            output2 = self.classifier(self.dropout2(output_backbone))
            output3 = self.classifier(self.dropout3(output_backbone))
            output4 = self.classifier(self.dropout4(output_backbone))
            output5 = self.classifier(self.dropout5(output_backbone))
            logits = (output1 + output2 + output3 + output4 + output5) / 5
        else:
            if output is None:
                # No BiLSTM stage ran: use the plain dropout -> classifier head
                # instead of reading an unbound variable.
                output = self.dropout(output_backbone)
            logits = self.classifier(output)

        loss = None

        if labels is not None:
            if self.mean_pool:
                loss = self.loss_fct(logits, labels.view(-1))
            else:
                # Flatten all leading dims so each position is one sample.
                loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            result = (logits,) + outputs[1:]
            return ((loss,) + result) if loss is not None else result
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

Writing custom_model_bilstm.py


In [4]:
%%writefile custom_model_distil.py

from transformers import AutoTokenizer,AutoConfig, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from transformers import DebertaV2Config, DebertaV2ForTokenClassification
from transformers.tokenization_utils_base import PreTrainedTokenizerBase
from transformers.utils import PaddingStrategy
from transformers import TrainingArguments, Trainer, EarlyStoppingCallback
from datasets import Dataset
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AutoConfig, AutoModel
from transformers.modeling_outputs import TokenClassifierOutput
from torch import nn
import torch
from transformers import Trainer
from torch.nn import CrossEntropyLoss
import numpy as np
from transformers.models.deberta.modeling_deberta import (
    DebertaPreTrainedModel,
    DebertaModel
)
from transformers.models.deberta_v2.modeling_deberta_v2 import (
    DebertaV2Model,DebertaV2PreTrainedModel
)
## Pooling Strategies
class MeanPooling(nn.Module):
    """Average hidden states over dim 1, counting only positions the mask keeps."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the mask-weighted mean of ``last_hidden_state`` along dim 1.

        Args:
            last_hidden_state: tensor to pool (presumably batch x tokens x features).
            attention_mask: mask with one entry per position of the first two
                dims; entries equal to 0 contribute nothing to the mean.

        Returns:
            Tensor with dim 1 reduced away.
        """
        # Broadcast the mask over the feature axis so it can weight every value.
        weights = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        masked_total = (last_hidden_state * weights).sum(1)
        # Clamp the count so a fully masked row divides by 1e-9 instead of zero.
        kept_count = weights.sum(1).clamp(min=1e-9)
        return masked_total / kept_count

class MaxPooling(nn.Module):
    """Element-wise maximum over dim 1, with masked positions pushed to -1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature maximum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by -1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, -1e4)
        return candidates.max(dim=1).values

class MinPooling(nn.Module):
    """Element-wise minimum over dim 1, with masked positions pushed to 1e4."""

    def __init__(self):
        super().__init__()

    def forward(self, last_hidden_state, attention_mask):
        """Return the per-feature minimum of ``last_hidden_state`` along dim 1.

        Positions whose ``attention_mask`` entry is 0 are replaced by 1e4 in a
        copy of the input before the reduction; the input tensor is not modified.
        """
        masked_out = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float() == 0
        # masked_fill is out-of-place, so the caller's tensor is left untouched.
        candidates = last_hidden_state.masked_fill(masked_out, 1e4)
        return candidates.min(dim=1).values
# v2 vor latest
class CustomModel(DebertaV2PreTrainedModel):
    """DeBERTa-v2 backbone with a linear classification head and weighted cross-entropy.

    The head is selected by three flags that ``__init__`` hard-codes:
    ``bilstm_layer`` (off), ``mean_pool`` (off) and ``mult_sample_dpt`` (on).
    With those values the logits are the average of five classifier passes over
    the backbone output, each behind a different dropout rate.
    """

    _keys_to_ignore_on_load_unexpected = [r"pooler"]
    # Adapted from the upstream token-classification head:
    # https://github.com/huggingface/transformers/blob/f7ef7cec6c6c162087421f36a17eabdbb223579d/src/transformers/models/deberta/modeling_deberta.py#L1342

    def __init__(self, config):
        """Build the backbone, the optional BiLSTM, the dropouts and the classifier.

        Args:
            config: model configuration; ``num_labels``, ``hidden_size``,
                ``hidden_dropout_prob`` and ``initializer_range`` are read here
                or in ``_init_weights``.
        """
        super().__init__(config)
        self.num_labels = config.num_labels
        self.deberta = DebertaV2Model(config)
        print(f"Num Labels {config.num_labels}")
        self.mean_pooling = MeanPooling()

        # Head configuration (fixed for this checkpoint family).
        self.bilstm_layer = False
        self.mult_sample_dpt = True
        self.mean_pool = False

        # Loss: every class has weight 1.0 except the last label index, which is
        # down-weighted to `o_weight`.
        o_weight = 0.05
        self.class_weights = torch.tensor([1.0] * (self.num_labels - 1) + [o_weight])
        self.loss_fct = torch.nn.CrossEntropyLoss(weight=self.class_weights)

        if self.bilstm_layer:
            print(f'Including LSTM layer hidden size {self.config.hidden_size} dropout {self.config.hidden_dropout_prob}')
            # Bidirectional with hidden_size // 2 per direction, so the output
            # width matches the classifier's input width.
            self.lstm = nn.LSTM(config.hidden_size, (config.hidden_size) // 2, num_layers=2, dropout=config.hidden_dropout_prob, batch_first=True,
                              bidirectional=True)
            self.initialize_lstm(self.lstm)

        self.dropout = nn.Dropout(config.hidden_dropout_prob)
        # Five dropout rates used by the multi-sample-dropout head.
        self.dropout1 = nn.Dropout(0.1)
        self.dropout2 = nn.Dropout(0.2)
        self.dropout3 = nn.Dropout(0.3)
        self.dropout4 = nn.Dropout(0.4)
        self.dropout5 = nn.Dropout(0.5)
        self.classifier = nn.Linear(config.hidden_size, config.num_labels)

        self.post_init()

    def initialize_lstm(self, lstm_layer):
        """Initialise an LSTM in place: Xavier-uniform input weights, orthogonal
        recurrent weights and zero biases."""
        for name, param in lstm_layer.named_parameters():
            if 'weight_ih' in name:
                torch.nn.init.xavier_uniform_(param.data)
            elif 'weight_hh' in name:
                torch.nn.init.orthogonal_(param.data)
            elif 'bias' in name:
                param.data.fill_(0)

    def _init_weights(self, module):
        """Initialise Linear, Embedding and LayerNorm modules; other module types are left untouched."""
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean = 0.0, std = self.config.initializer_range)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)

    def get_input_embeddings(self):
        """Return the backbone's input embedding module."""
        return self.deberta.get_input_embeddings()

    def set_input_embeddings(self, new_embeddings):
        """Replace the backbone's input embedding module."""
        self.deberta.set_input_embeddings(new_embeddings)

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        inputs_embeds=None,
        labels=None,
        output_attentions=None,
        output_hidden_states=None,
        return_dict=None,
        **kwargs
    ):
        """Run the backbone and the classification head.

        Args:
            input_ids, attention_mask, token_type_ids, position_ids,
            inputs_embeds, output_attentions, output_hidden_states:
                forwarded unchanged to the DeBERTa backbone.
            labels: optional targets; when given, the weighted cross-entropy
                loss is computed against the logits.
            return_dict: when falsy (after defaulting to
                ``config.use_return_dict``) a tuple is returned instead of a
                ``TokenClassifierOutput``.
            **kwargs: accepted and ignored.

        Returns:
            ``TokenClassifierOutput`` (loss, logits, hidden_states, attentions),
            or ``(loss?, logits, *backbone_extras)`` in tuple mode.
        """
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict
        outputs = self.deberta(
            input_ids,
            token_type_ids=token_type_ids,
            attention_mask=attention_mask,
            position_ids=position_ids,
            inputs_embeds=inputs_embeds,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
            return_dict=return_dict,
        )

        output_backbone = outputs[0]
        # Input for the single-pass classifier branch; stays None unless the
        # BiLSTM or mean-pooling stage below produces one.
        output = None
        if self.bilstm_layer:
            output_backbone = self.dropout(output_backbone)
            output, _ = self.lstm(output_backbone)

        if self.mean_pool:
            output_backbone = self.dropout(output_backbone)
            output = self.mean_pooling(output_backbone, attention_mask)

        if self.mult_sample_dpt:
            # Multi-sample dropout: one shared classifier applied behind five
            # different dropout rates, logits averaged.
            output1 = self.classifier(self.dropout1(output_backbone))
            output2 = self.classifier(self.dropout2(output_backbone))
            output3 = self.classifier(self.dropout3(output_backbone))
            output4 = self.classifier(self.dropout4(output_backbone))
            output5 = self.classifier(self.dropout5(output_backbone))
            logits = (output1 + output2 + output3 + output4 + output5) / 5
        else:
            if output is None:
                # No BiLSTM / pooling stage ran: use the plain dropout ->
                # classifier head instead of reading an unbound variable.
                output = self.dropout(output_backbone)
            logits = self.classifier(output)

        loss = None

        if labels is not None:
            if self.mean_pool:
                loss = self.loss_fct(logits, labels.view(-1))
            else:
                # Flatten all leading dims so each position is one sample.
                loss = self.loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        if not return_dict:
            result = (logits,) + outputs[1:]
            return ((loss,) + result) if loss is not None else result
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

Writing custom_model_distil.py


In [5]:
import json
import argparse
from pathlib import Path
from itertools import chain
import re

import numpy as np
import pandas as pd
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
from scipy.special import softmax
from collections import defaultdict
from typing import Dict
from piidd_postprocessing import label_postprocessing

class PRFScore:
    """Running tp / fp / fn tally with derived precision, recall, F1 and F5."""

    def __init__(self, *, tp: int = 0, fp: int = 0, fn: int = 0) -> None:
        self.tp, self.fp, self.fn = tp, fp, fn

    def __len__(self) -> int:
        """Total number of counted events (tp + fp + fn)."""
        return self.tp + self.fp + self.fn

    def __iadd__(self, other):
        """Fold ``other``'s counts into this tally in place."""
        for field in ("tp", "fp", "fn"):
            setattr(self, field, getattr(self, field) + getattr(other, field))
        return self

    def __add__(self, other):
        """Return a new tally holding the summed counts; operands are unchanged."""
        merged = PRFScore(tp=self.tp, fp=self.fp, fn=self.fn)
        merged += other
        return merged

    def score_set(self, cand: set, gold: set) -> None:
        """Count candidates against gold: shared items are tp, extras fp, misses fn."""
        hits = cand & gold
        self.tp += len(hits)
        self.fp += len(cand - gold)
        self.fn += len(gold - cand)

    # The 1e-100 terms below keep every ratio defined (and equal to 0.0) when
    # the counts are all zero.

    @property
    def precision(self) -> float:
        predicted = self.tp + self.fp + 1e-100
        return self.tp / predicted

    @property
    def recall(self) -> float:
        actual = self.tp + self.fn + 1e-100
        return self.tp / actual

    @property
    def f1(self) -> float:
        p, r = self.precision, self.recall
        ratio = (p * r) / (p + r + 1e-100)
        return 2 * ratio

    @property
    def f5(self) -> float:
        # F-beta with beta = 5.
        beta_sq = 5 ** 2
        p, r = self.precision, self.recall
        return (1 + beta_sq) * p * r / (beta_sq * p + r + 1e-100)

    def to_dict(self) -> Dict[str, float]:
        """Return precision, recall and F5 under the keys "p", "r" and "f5"."""
        return {"p": self.precision, "r": self.recall, "f5": self.f5}


def compute_metrics(pred_df, gt_df):
    """
    Compute the LB metric (lb) and other auxiliary metrics.

    Both frames are read through their ``document``, ``token`` and ``label``
    columns; each distinct (document, token, label) triple is scored once.
    A predicted triple that also appears in the ground truth is a true
    positive, any other predicted triple a false positive, and every
    ground-truth triple left unmatched a false negative.

    Returns a dict with the overall precision, recall and F5 ("ents_p",
    "ents_r", "ents_f5") and, under "ents_per_type", the same figures per
    entity type (the 'O' type is left out of that breakdown).
    """

    def _entity_type(tag):
        # 'O' is kept as-is; any other tag drops its two-character prefix (B- / I-).
        return tag if tag == 'O' else tag[2:]

    gold = {(row.document, row.token, row.label) for row in gt_df.itertuples()}
    predicted = {(row.document, row.token, row.label) for row in pred_df.itertuples()}

    # defaultdict creates the per-type tally the first time a type is seen.
    score_per_type = defaultdict(PRFScore)
    # Working copy of the gold triples; matched ones are removed as they are found.
    unmatched = set(gold)

    for triple in predicted:
        tally = score_per_type[_entity_type(triple[-1])]
        if triple in unmatched:
            tally.tp += 1
            unmatched.remove(triple)
        else:
            tally.fp += 1

    # Whatever is still unmatched was never predicted.
    for *_, gold_label in unmatched:
        score_per_type[_entity_type(gold_label)].fn += 1

    totals = sum(score_per_type.values(), PRFScore())

    per_type = {name: tally.to_dict() for name, tally in score_per_type.items() if name != 'O'}
    return {
        "ents_p": totals.precision,
        "ents_r": totals.recall,
        "ents_f5": totals.f5,
        "ents_per_type": per_type,
    }

def tokenize(example, tokenizer):
    """Rebuild an example's text from its tokens and run the tokenizer over it.

    Args:
        example: mapping with parallel sequences "tokens" and
            "trailing_whitespace" (truthy where a single space follows the token).
        tokenizer: callable invoked as
            ``tokenizer(text, return_offsets_mapping=True, truncation=False)``;
            its result must be a mapping.

    Returns:
        The tokenizer's fields plus "token_map": one entry per character of
        the rebuilt text, holding the index of the token that character came
        from, or -1 for an inserted space.
    """
    pieces = []
    char_to_token = []

    for token_index, (tok, has_space) in enumerate(
        zip(example["tokens"], example["trailing_whitespace"])
    ):
        pieces.append(tok)
        char_to_token += [token_index] * len(tok)
        if has_space:
            pieces.append(" ")
            char_to_token.append(-1)

    encoding = tokenizer("".join(pieces), return_offsets_mapping=True, truncation=False)
    return {**encoding, "token_map": char_to_token}

def parse_predictions(pred_softmax, ds, id2label, thresholds):
    """Turn class probabilities into one labelled row per predicted PII token.

    Args:
        pred_softmax: array indexed as ``[document, position, class]``.
        ds: mapping (or dataset) providing the parallel columns "token_map",
            "offset_mapping", "tokens" and "document".
        id2label: maps class ids given as strings ("0", "1", ...) to labels of
            the form ``B-<ENTITY>`` / ``I-<ENTITY>``. The highest id
            (``len(id2label) - 1``) is treated as the 'O' class.
        thresholds: maps an entity name to a threshold on the 'O' probability.
            Where 'O' scores below the threshold and the best non-'O' class
            belongs to that entity, that class replaces the argmax. Names with
            no matching label are ignored.

    Returns:
        DataFrame with columns ``document``, ``token`` (index into the
        document's tokens), ``label``, ``token_str`` and ``row_id``; at most
        one row per (document, token), taken from the first position that maps
        to it with a non-'O' label.
    """
    o_index = len(id2label) - 1
    preds = pred_softmax.argmax(-1)
    preds_without_O = pred_softmax[:, :, :o_index].argmax(-1)
    O_preds = pred_softmax[:, :, o_index]

    # Entity name -> class ids carrying that entity (its B- and/or I- labels).
    indexes = defaultdict(list)
    for k, v in id2label.items():
        if k != str(o_index):
            indexes[v.split("-")[1]].append(int(k))

    print(indexes)

    for label_name, label_threshold in thresholds.items():
        # np.isin accepts any number of class ids per entity, including none,
        # so an unknown threshold key is a no-op rather than an IndexError.
        belongs_to_entity = np.isin(preds_without_O, indexes.get(label_name, []))
        preds = np.where((O_preds < label_threshold) & belongs_to_entity, preds_without_O, preds)

    preds_final = preds

    pairs = set()
    document, token, label, token_str = [], [], [], []
    for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):
        for token_pred, (start_idx, end_idx) in zip(p, offsets):
            label_pred = id2label[str(token_pred)]

            # A (0, 0) offset carries no text, so there is nothing to label.
            if start_idx + end_idx == 0:
                continue

            # The span may begin on an inserted space; step onto the next character.
            if token_map[start_idx] == -1:
                start_idx += 1

            # Skip over tokens that consist only of whitespace.
            while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
                start_idx += 1

            if start_idx >= len(token_map):
                break

            token_id = token_map[start_idx]
            # Plain equality: the previous `in ("O")` was a substring test on a string.
            if label_pred == "O" or token_id == -1:
                continue

            # Keep only the first prediction for each (document, token).
            pair = (doc, token_id)
            if pair in pairs:
                continue

            document.append(doc)
            token.append(token_id)
            label.append(label_pred)
            token_str.append(tokens[token_id])
            pairs.add(pair)

    df = pd.DataFrame({
        "document": document,
        "token": token,
        "label": label,
        "token_str": token_str
    })
    df["row_id"] = list(range(len(df)))
    return df



In [6]:
# Weight attached to each model group's predictions (stored in the `weight`
# column of that group's frame).
weights = {
    'custom-model-23': 2.3136772671200951,
    'custom-model-distil': 8.7434000461426,
    'exp073': 4.958400473808574,
    'custom-model-bilstm': 2.0696284831356238,
    'exp076': 4.595871408325738,
    'basic-model-2': 9.942166951500617,   # deberta large
    'custom-model-2': 2.5511467213118255,  # deberta large
}

# NOTE(review): not used in this part of the notebook - presumably the minimum
# summed weight for a prediction to be kept; confirm against the voting cell.
voting_thr = 14.720990487155191

# Per-entity thresholds on the 'O' probability (see parse_predictions). Two
# settings are in use; every group gets its own copy so none share a dict.
_THRESHOLDS_BASE = {
    'EMAIL': 0.5,
    'ID_NUM': 0.6,
    'NAME_STUDENT': 0.8,
    'PHONE_NUM': 0.5,
    'STREET_ADDRESS': 0.5,
    'URL_PERSONAL': 0.5,
    'USERNAME': 0.8,
}
_THRESHOLDS_EXP = {
    'EMAIL': 0.5,
    'ID_NUM': 0.9,
    'NAME_STUDENT': 0.9,
    'PHONE_NUM': 0.6,
    'STREET_ADDRESS': 0.6,
    'URL_PERSONAL': 0.5,
    'USERNAME': 0.9,
}

# One entry per model group: (checkpoint directories, group name, thresholds).
# The group name is the key into `weights`; predictions of the checkpoints in
# one group are averaged.
models_paths = [
    (
        ['/kaggle/input/pii-detection-models/custom-model-23-fullfit-seed42/custom-model-23-fullfit-seed42'],
        'custom-model-23',
        dict(_THRESHOLDS_BASE),
    ),
    (
        [
            '/kaggle/input/distil-mpware-fp16-maxlen2048-lr7e-5/fold0-cv9758',
            '/kaggle/input/distil-mpware-fp16-maxlen2048-lr7e-5/fold1-cv9687',
        ],
        'custom-model-distil',
        dict(_THRESHOLDS_BASE),
    ),
    (
        ['/kaggle/input/pii-detect-exp073-0'],
        'exp073',
        dict(_THRESHOLDS_EXP),
    ),
    (
        ['/kaggle/input/bilstm1-nosafe-maxlen1650/fold3-lr7e-5-cv9709'],
        'custom-model-bilstm',
        dict(_THRESHOLDS_BASE),
    ),
    (
        ['/kaggle/input/pii-detect-exp076-0'],
        'exp076',
        dict(_THRESHOLDS_EXP),
    ),
    (
        [
            '/kaggle/input/pii-detection-models/basic-model-2-fullfit-seed42/basic-model-2-fullfit-seed42',
            '/kaggle/input/pii-detection-models/basic-model-2-fullfit-seed6543/basic-model-2-fullfit-seed6543',
        ],
        'basic-model-2',
        dict(_THRESHOLDS_BASE),
    ),
    (
        [
            '/kaggle/input/pii-detection-models/custom-model-2-fullfit-seed42/custom-model-2-fullfit-seed42',
            '/kaggle/input/pii-detection-models/custom-model-2-fullfit-seed6543/custom-model-2-fullfit-seed6543',
        ],
        'custom-model-2',
        dict(_THRESHOLDS_BASE),
    ),
]

with open("/kaggle/input/pii-detection-removal-from-educational-data/test.json", "r") as f:
    data = json.load(f)

# Document id (as a string) -> that document's token list.
doc2tokens = {str(row["document"]): row["tokens"] for row in data}

# Every group below is decoded with the label map from this one config, so it
# is read once here (and the file closed) instead of being re-opened per group.
with open('/kaggle/input/pii-detection-models/basic-model-2-fullfit-seed42/basic-model-2-fullfit-seed42/config.json') as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# One prediction frame per model group, each tagged with the group's weight.
dfs = []
for group_paths, group_name, thresholds in models_paths:
    print(f"creating predictions for {group_name}")
    ds = Dataset.from_dict({
        "full_text": [x["full_text"] for x in data],
        "document": [x["document"] for x in data],
        "tokens": [x["tokens"] for x in data],
        "trailing_whitespace": [x["trailing_whitespace"] for x in data],
    })
    # The tokenizer of the group's first checkpoint is used for the whole group.
    tokenizer = AutoTokenizer.from_pretrained(group_paths[0])
    collator = DataCollatorForTokenClassification(tokenizer)
    ds = ds.map(tokenize, fn_kwargs={"tokenizer": tokenizer}, num_proc=2)

    print(id2label)

    # Each checkpoint contributes an equal share to the group's average.
    fold_weight = 1 / len(group_paths)
    weighted_average_predictions = None
    for model_path in group_paths:

        if "custom" in group_name:
            # The custom heads live in the modules written by the %%writefile
            # cells above; pick the one matching this group.
            if "distil" in group_name:
                from custom_model_distil import CustomModel
                print("initializing distil model")
            elif "bilstm" in group_name:
                from custom_model_bilstm import CustomModel
                print("initializing bilstm model")
            else:
                from custom_model_23 import CustomModel
                print("initializing custom model")

            backbone = CustomModel.from_pretrained(model_path)
            backbone.resize_token_embeddings(len(tokenizer), pad_to_multiple_of=16)
            model = backbone
        else:
            model = AutoModelForTokenClassification.from_pretrained(model_path)
        args = TrainingArguments(
            ".",
            per_device_eval_batch_size=1,
            report_to="none",
        )
        trainer = Trainer(
            model=model,
            args=args,
            data_collator=collator,
            tokenizer=tokenizer,
        )
        predictions = trainer.predict(ds).predictions
        fold_probabilities = softmax(predictions, axis=-1) * fold_weight
        if weighted_average_predictions is None:
            weighted_average_predictions = fold_probabilities
        else:
            weighted_average_predictions += fold_probabilities

    df = parse_predictions(weighted_average_predictions, ds, id2label, thresholds)
    df['weight'] = weights[group_name]
    dfs.append(df)

creating predictions for custom-model-23


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}
initializing custom model
Num Labels 13
Class Weights tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0500])


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for custom-model-distil


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}
initializing distil model
Num Labels 13


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


initializing distil model
Num Labels 13


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for exp073
   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for custom-model-bilstm


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}
initializing bilstm model
Num Labels 13
Including LSTM layer hidden size 1024 dropout 0.1


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for exp076
   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for basic-model-2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})
creating predictions for custom-model-2


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


   

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

 

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

{'0': 'B-EMAIL', '1': 'B-ID_NUM', '2': 'B-NAME_STUDENT', '3': 'B-PHONE_NUM', '4': 'B-STREET_ADDRESS', '5': 'B-URL_PERSONAL', '6': 'B-USERNAME', '7': 'I-ID_NUM', '8': 'I-NAME_STUDENT', '9': 'I-PHONE_NUM', '10': 'I-STREET_ADDRESS', '11': 'I-URL_PERSONAL', '12': 'O'}
initializing custom model
Num Labels 13
Class Weights tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0500])


You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


initializing custom model
Num Labels 13
Class Weights tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0500])


defaultdict(<class 'list'>, {'EMAIL': [0], 'ID_NUM': [1, 7], 'NAME_STUDENT': [2, 8], 'PHONE_NUM': [3, 9], 'STREET_ADDRESS': [4, 10], 'URL_PERSONAL': [5, 11], 'USERNAME': [6]})


In [7]:
# Combine the per-group prediction frames and sum the group weights of every
# (document, token, label, token_str) combination: this is the weighted vote.
df = pd.concat(dfs)
df = df.groupby(['document', 'token', 'label', 'token_str'])['weight'].sum().reset_index()
print("weighted df")
display(df)

# Keep only predictions whose summed weight reaches the voting threshold.
df = df[df['weight'] >= voting_thr]

# When several labels survive for the same token, keep the one with the largest weight.
df = df.sort_values(['document', 'token', 'weight'], ascending=[True, True, False])
df = df.drop_duplicates(['document', 'token'], keep='first')

# Token lists of the test documents, keyed by document id. `setdefault` keeps the
# first occurrence should a document id appear more than once.
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json') as test_file:
    test_documents = json.load(test_file)
tokens_by_document = {}
for sample in test_documents:
    tokens_by_document.setdefault(sample['document'], sample['tokens'])


def _relabel_line_initial_name(document, token, label):
    """Return 'B-NAME_STUDENT' for an 'I-NAME_STUDENT' token that starts a new line.

    A token starts a new line when the token just before it contains a newline.
    Token 0 has no predecessor, so it is treated as a line start as well; indexing
    `token - 1` there would otherwise wrap around to the document's last token.
    Every other label is returned unchanged.

    Raises KeyError if `document` is not present in the test data.
    """
    if label != 'I-NAME_STUDENT':
        return label
    if token == 0 or '\n' in tokens_by_document[document][token - 1]:
        return 'B-NAME_STUDENT'
    return label


# Row order and index are preserved; an empty frame (nothing passed the threshold)
# simply yields an empty result instead of failing.
relabelled = [
    _relabel_line_initial_name(document, token, label)
    for document, token, label in zip(df['document'], df['token'], df['label'])
]
df = df.assign(label=relabelled).drop(columns=["weight"])

df

weighted df


Unnamed: 0,document,token,label,token_str,weight
0,7,6,B-NAME_STUDENT,Avril,2.069628
1,7,9,B-NAME_STUDENT,Nathalie,35.174291
2,7,10,I-NAME_STUDENT,Sylla,35.174291
3,7,479,B-NAME_STUDENT,Avril,2.069628
4,7,482,B-NAME_STUDENT,Nathalie,35.174291
5,7,483,I-NAME_STUDENT,Sylla,35.174291
6,7,738,B-NAME_STUDENT,Avril,2.069628
7,7,741,B-NAME_STUDENT,Nathalie,35.174291
8,7,742,I-NAME_STUDENT,Sylla,35.174291
9,10,0,B-NAME_STUDENT,Diego,35.174291


Unnamed: 0,document,token,label,token_str
1,7,9,B-NAME_STUDENT,Nathalie
2,7,10,I-NAME_STUDENT,Sylla
4,7,482,B-NAME_STUDENT,Nathalie
5,7,483,I-NAME_STUDENT,Sylla
7,7,741,B-NAME_STUDENT,Nathalie
8,7,742,I-NAME_STUDENT,Sylla
9,10,0,B-NAME_STUDENT,Diego
10,10,1,I-NAME_STUDENT,Estrada
11,10,464,B-NAME_STUDENT,Diego
12,10,465,I-NAME_STUDENT,Estrada


# LABEL BASED POSTPROCESSING

In [8]:
# Load the raw test documents; the context manager closes the file handle once parsed.
with open('/kaggle/input/pii-detection-removal-from-educational-data/test.json') as test_file:
    data = json.load(test_file)

# Apply the label-based postprocessing rules to the voted predictions.
# NOTE(review): `label_postprocessing` and `doc2tokens` are defined outside this cell
# (presumably from piidd_postprocessing.py) -- the returned frame is expected to carry
# a `row_id` column, which the submission below relies on.
df = label_postprocessing(df, doc2tokens, data)
display(df)

# Write the submission file with exactly these four columns and no index column.
df[["row_id", "document", "token", "label"]].to_csv("submission.csv", index=False)

postprocessing id and phone
postprocessing street address
postprocessing username
removing long ids (>25) and short urls (<10)
postprocessing id span
adding regex predictions


Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9
