# DeBERTa Inference Notebook with Partitioning 

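This notebook is the main run file for making inferences with DeBERTa models using partitioning: documents whose tokenized length exceeds `INFERENCE_MAX_LENGTH` are split into consecutive chunks before prediction. The main tunable parameter is `threshold`, the minimum probability of the non-PII ("O") class required for a token to be labelled non-PII; below it, the most likely PII label is used instead. Also make sure `model_path` points to the most up-to-date DeBERTa partitioning model.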

Note that we evaluated models by running inference and submitting the resulting predictions to Kaggle for scoring.

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Recursively walk '/kaggle/input' and print the full path of every file found,
# so the available datasets and model files are visible in the cell output.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/deberta-model-partition-usethis/__results__.html
/kaggle/input/deberta-model-partition-usethis/__notebook__.ipynb
/kaggle/input/deberta-model-partition-usethis/__output__.json
/kaggle/input/deberta-model-partition-usethis/custom.css
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/spm.model
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/config.json
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/trainer_state.json
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/training_args.bin
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/tokenizer.json
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/tokenizer_config.json
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/scheduler.pt
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/model.safetensors
/kaggle/input/deberta-model-partition-usethis/output/checkpoint-500/special_tokens

In [2]:
# Chunk size, in tokenizer tokens, used when partitioning documents: it is
# passed to split_files() as MAX_LENGTH and to the tokenizer as max_length.
INFERENCE_MAX_LENGTH = 2048

# Directory the tokenizer, model weights and config.json are loaded from;
# change to your local model path.
# NOTE(review): the file listing printed by the first cell shows the checkpoint
# files under '.../output/checkpoint-500/', not under '.../deberta_partition' --
# confirm this directory exists in the attached dataset before running.
model_path = '/kaggle/input/deberta-model-partition-usethis/deberta_partition'

In [3]:
# import modules
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

2024-05-02 19:39:40.493108: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 19:39:40.493211: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 19:39:40.583154: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Load the test data (assumed to be stored locally in the competition's JSON format).
# A context manager is used so the file handle is closed as soon as parsing
# finishes instead of being left open until garbage collection.
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    data = json.load(test_file)
df_test = pd.DataFrame(data)

In [5]:
def reconstruct(batch):
    """Rebuild a document's text pieces together with a character-to-token map.

    Args:
        batch: Mapping with two parallel sequences, ``"tokens"`` (strings) and
            ``"trailing_whitespace"`` (truthy when a space follows the token).

    Returns:
        A ``(text, token_map)`` tuple. ``text`` is the list of tokens with a
        single ``" "`` inserted after every token flagged as having trailing
        whitespace. ``token_map`` holds one entry per character of
        ``"".join(text)``: the index of the token that character belongs to,
        or ``-1`` for an inserted space.
    """
    pieces = []
    char_to_token = []

    pairs = zip(batch["tokens"], batch["trailing_whitespace"])
    for position, (word, has_space) in enumerate(pairs):
        pieces.append(word)
        # One map entry per character of the token.
        char_to_token += [position] * len(word)
        if has_space:
            pieces.append(" ")
            char_to_token.append(-1)

    return pieces, char_to_token

def tokenize(batch, tokenizer):
    """Tokenize one document's reconstructed text without truncating it.

    Args:
        batch: Record accepted by ``reconstruct`` (must provide ``"tokens"``
            and ``"trailing_whitespace"``).
        tokenizer: Callable tokenizer that accepts ``return_offsets_mapping``,
            ``truncation`` and ``max_length`` keyword arguments and returns an
            object exposing ``input_ids``, ``attention_mask`` and
            ``offset_mapping``.

    Returns:
        Tuple ``(input_ids, attention_mask, offset_mapping, token_map)`` where
        ``token_map`` is the character-to-token map from ``reconstruct``.
    """
    pieces, token_map = reconstruct(batch)
    document_text = "".join(pieces)

    # truncation=False keeps the full document; partitioning happens later.
    # NOTE(review): max_length presumably has no effect while truncation is
    # disabled -- confirm against the tokenizer documentation.
    encoding = tokenizer(
        document_text,
        return_offsets_mapping=True,
        truncation=False,
        max_length=INFERENCE_MAX_LENGTH,
    )

    return (
        encoding.input_ids,
        encoding.attention_mask,
        encoding.offset_mapping,
        token_map,
    )

In [6]:
def split_files(df, MAX_LENGTH):
    """Partition every tokenized document into chunks of at most MAX_LENGTH tokens.

    Each input row is expanded into ``ceil(len(input_ids) / MAX_LENGTH)`` output
    rows. ``input_ids``, ``attention_mask`` and ``offset_mapping`` are sliced
    per chunk; ``document``, ``full_text``, ``tokens``, ``trailing_whitespace``
    and ``token_map`` are copied unchanged onto every chunk. A row whose
    ``input_ids`` is empty produces no output rows.

    Args:
        df: DataFrame with the columns ``document``, ``full_text``, ``tokens``,
            ``trailing_whitespace``, ``input_ids``, ``attention_mask``,
            ``offset_mapping`` and ``token_map``.
        MAX_LENGTH: Maximum number of tokens per chunk; must be positive.

    Returns:
        A new DataFrame with the same eight columns, one row per chunk, and a
        fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``MAX_LENGTH`` is not positive.
    """
    if MAX_LENGTH <= 0:
        raise ValueError("MAX_LENGTH must be a positive integer")

    columns = [
        'document', 'full_text', 'tokens', 'trailing_whitespace',
        'input_ids', 'attention_mask', 'offset_mapping', 'token_map',
    ]

    # Collect plain dict records and build the frame once at the end; growing a
    # DataFrame row by row copies it on every append (quadratic).
    records = []

    # Read values from the yielded row rather than df.loc[index]: with a
    # non-unique index, df.loc[index] returns several rows instead of one.
    for _, row in df.iterrows():
        input_ids = row['input_ids']
        attention_mask = row['attention_mask']
        offset_mapping = row['offset_mapping']

        for start in range(0, len(input_ids), MAX_LENGTH):
            stop = start + MAX_LENGTH
            records.append({
                'document': row['document'],
                'full_text': row['full_text'],
                'tokens': row['tokens'],
                'trailing_whitespace': row['trailing_whitespace'],
                'input_ids': input_ids[start:stop],
                'attention_mask': attention_mask[start:stop],
                'offset_mapping': offset_mapping[start:stop],
                'token_map': row['token_map'],
            })

    return pd.DataFrame(records, columns=columns)

In [7]:
# Load the tokenizer stored at model_path.
tokenizer = AutoTokenizer.from_pretrained(model_path)
# Tokenize every document; the four values returned by tokenize() are expanded
# into four new columns on df_test (this mutates df_test in place).
df_test[['input_ids', 'attention_mask', 'offset_mapping', 'token_map']] = df_test.apply(lambda row: tokenize(row, tokenizer), axis='columns', result_type='expand')
# Partition each tokenized document into rows of at most INFERENCE_MAX_LENGTH tokens.
df_tt = split_files(df_test, INFERENCE_MAX_LENGTH)

# Make Predictions

In [8]:
# Convert the partitioned frame into a `datasets.Dataset`; it is later passed to
# trainer.predict() and read back column-wise when mapping predictions to tokens.
ds =  Dataset.from_pandas(df_tt)

In [9]:
# Load the fine-tuned token-classification model from model_path.
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Collator that pads each batch's sequences to a multiple of 16 tokens.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# The Trainer is used only as an inference harness (see trainer.predict below):
# outputs go to the current directory, one chunk per device per step, and no
# experiment-tracking integration is enabled.
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)


In [10]:
# Raw per-token logits for every chunk; treated below as a 3-D array whose last
# axis is the label axis.
predictions = trainer.predict(ds).predictions

# Numerically stable softmax over the label axis: subtracting the per-token
# maximum before exponentiating avoids overflow for large logits and leaves the
# resulting probabilities mathematically unchanged.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

# Read the label mapping from the model config; the context manager closes the
# file handle as soon as parsing is done.
with open(Path(model_path) / "config.json", encoding="utf-8") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# The slicing below hard-codes label id 12 as the non-PII class and ids 0-11 as
# the PII classes; fail loudly if the loaded model does not follow that layout.
assert id2label.get("12") == "O", 'expected label id 12 to be "O" in config.json id2label'

preds = predictions.argmax(-1)
# Most likely label when the "O" class is excluded from consideration.
preds_without_O = pred_softmax[:,:,:12].argmax(-1)
# Probability assigned to the "O" class for every token.
O_preds = pred_softmax[:,:,12]

# A token keeps its plain argmax label only when P("O") reaches the threshold;
# otherwise the best PII label is used, which favours flagging tokens as PII.
threshold = 0.99
preds_final = np.where(O_preds < threshold, preds_without_O , preds)

In [11]:
# Map every predicted model token back to the index of the original dataset
# token it starts in, and collect one (document, token, label) record per
# distinct PII prediction.
triplets = []
# Keys already emitted. The document id is part of the key so that an identical
# label / token index / token string in a different document is not discarded,
# while repeats within one document (several sub-word pieces of the same token,
# or several chunks of the same document) are still collapsed. A set keeps the
# membership test constant-time.
seen = set()
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        # Entries whose offsets are (0, 0) carry no text span; skip them.
        if start_idx + end_idx == 0: continue

        # If the span starts on an inserted space, step onto the next character.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # Ran past the end of the document text: nothing left to map.
        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            key = (doc, label_pred, token_id)

            if key not in seen:
                seen.add(key)
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append((label_pred, token_id, tokens[token_id]))

# Prepare Submission

In [12]:
# Assemble the collected predictions into a frame, number the rows
# consecutively from 0, and preview at most the first 100 rows.
df = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)
df["row_id"] = [*range(len(df))]
display(df.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [13]:
# Write only the four submission columns, in this order, without the frame index.
df.to_csv("submission.csv", columns=["row_id", "document", "token", "label"], index=False)