# DeBERTa Inference Notebook for Truncation
This notebook is the main runfile for running inference with DeBERTa models using truncation. Two parameters can be tuned: the probability threshold used to decide whether a token is non-PII (label `O`), and how over-long inputs are truncated. Also make sure `model_path` points to the most up-to-date DeBERTa truncation model.

Note that we evaluated models by running inference and submitting the resulting predictions to Kaggle for scoring.

To replicate the baseline, set `truncation=True` in the tokenizer call inside `tokenize` (in addition to the other changes made in the model).

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/roberta-model-partition/__results__.html
/kaggle/input/roberta-model-partition/__notebook__.ipynb
/kaggle/input/roberta-model-partition/__output__.json
/kaggle/input/roberta-model-partition/custom.css
/kaggle/input/roberta-model-partition/roberta_partition/config.json
/kaggle/input/roberta-model-partition/roberta_partition/merges.txt
/kaggle/input/roberta-model-partition/roberta_partition/training_args.bin
/kaggle/input/roberta-model-partition/roberta_partition/tokenizer.json
/kaggle/input/roberta-model-partition/roberta_partition/vocab.json
/kaggle/input/roberta-model-partition/roberta_partition/tokenizer_config.json
/kaggle/input/roberta-model-partition/roberta_partition/model.safetensors
/kaggle/input/roberta-model-partition/roberta_partition/special_tokens_map.json
/kaggle/input/roberta-model-partition/output/checkpoint-3000/config.json
/kaggle/input/roberta-model-partition/output/checkpoint-3000/merges.txt
/kaggle/input/roberta-model-partition/output/checkpoint-3000/

In [2]:
# Maximum number of tokenizer positions kept per document; `tokenize` cuts
# longer encodings down to this length.
INFERENCE_MAX_LENGTH = 2048
# Directory the tokenizer, the model and its config.json are loaded from.
model_path = '/kaggle/input/deberta-model-truncation/deberta_opttrunc1'

In [3]:
import json
import argparse
from itertools import chain
import pandas as pd
from pathlib import Path
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments, DataCollatorForTokenClassification
from datasets import Dataset
import numpy as np

2024-05-02 22:36:33.526981: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 22:36:33.527095: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 22:36:33.700420: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Read the competition test set. The context manager guarantees the file
# handle is closed once the JSON has been parsed.
with open(
    "/kaggle/input/pii-detection-removal-from-educational-data/test.json",
    encoding="utf-8",
) as test_file:
    data = json.load(test_file)
df_test = pd.DataFrame(data)

In [5]:
def reconstruct(batch):
    """Rebuild a document's text pieces and a per-character token index map.

    Args:
        batch: Mapping with "tokens" (sequence of strings) and
            "trailing_whitespace" (sequence of truthy/falsy flags), paired
            position by position.

    Returns:
        A tuple ``(pieces, char_to_token)``:
        - ``pieces``: the tokens in order, with a single " " inserted after
          every token whose whitespace flag is truthy.
        - ``char_to_token``: one entry per character of ``"".join(pieces)``;
          characters of a token carry that token's position, inserted spaces
          carry -1.
    """
    pieces, char_to_token = [], []
    pairs = zip(batch["tokens"], batch["trailing_whitespace"])

    for position, (word, has_space) in enumerate(pairs):
        pieces.append(word)
        char_to_token += [position] * len(word)

        if not has_space:
            continue

        # The separating space belongs to no token.
        pieces.append(" ")
        char_to_token.append(-1)

    return pieces, char_to_token

def tokenize(batch, tokenizer, head_length=1024):
    """Tokenize one document and cut over-long encodings to INFERENCE_MAX_LENGTH.

    The text is rebuilt with `reconstruct` and encoded without tokenizer-side
    truncation. If the encoding has more than INFERENCE_MAX_LENGTH positions,
    the first `head_length` positions and the last
    `INFERENCE_MAX_LENGTH - head_length` positions are kept and the middle is
    dropped. Every field of the encoding (including the offset mapping) is cut
    the same way, so the kept offsets still index into the full text.

    Args:
        batch: Mapping/row with "tokens" and "trailing_whitespace" entries.
        tokenizer: Tokenizer callable accepting `return_offsets_mapping`,
            `truncation` and `max_length`, whose result exposes `input_ids`,
            `attention_mask`, `offset_mapping` and `items()`.
        head_length: Number of leading positions to keep when truncating.
            Defaults to 1024, the cutoff this notebook found best for
            inference; 175 was the alternative used to mirror the training
            cutoff. Values outside [0, INFERENCE_MAX_LENGTH] are clamped so the
            result never exceeds INFERENCE_MAX_LENGTH.

    Returns:
        Tuple ``(input_ids, attention_mask, offset_mapping, token_map)``.
    """
    text, token_map = reconstruct(batch)
    # `max_length` has no effect while truncation=False; it is kept so that
    # switching to truncation=True (baseline setup) needs no other change.
    tokenized = tokenizer(
        "".join(text),
        return_offsets_mapping=True,
        truncation=False,
        max_length=INFERENCE_MAX_LENGTH,
    )

    length = len(tokenized.input_ids)
    if length > INFERENCE_MAX_LENGTH:
        head = max(0, min(head_length, INFERENCE_MAX_LENGTH))
        tail = INFERENCE_MAX_LENGTH - head
        for key, values in list(tokenized.items()):
            # Slice from an explicit start index: `values[-0:]` would return
            # the whole list when no tail is requested.
            tokenized[key] = values[:head] + values[len(values) - tail:]

    return tokenized.input_ids, tokenized.attention_mask, tokenized.offset_mapping, token_map

In [6]:
# Load the tokenizer that ships with the model, then tokenize every document.
tokenizer = AutoTokenizer.from_pretrained(model_path)

# `tokenize` returns a 4-tuple per row; result_type='expand' spreads it over
# the four new columns. Extra keyword arguments are forwarded to `tokenize`.
tokenized_columns = ['input_ids', 'attention_mask', 'offset_mapping', 'token_map']
df_test[tokenized_columns] = df_test.apply(
    tokenize,
    axis='columns',
    result_type='expand',
    tokenizer=tokenizer,
)

# Make Predictions

In [7]:
# Wrap the tokenized DataFrame in a Dataset so it can be passed to the trainer.
ds = Dataset.from_pandas(df_test)

In [8]:
# Load the token-classification model from the same directory as the tokenizer.
model = AutoModelForTokenClassification.from_pretrained(model_path)

# Pads each batch so its sequence length is a multiple of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# The Trainer is used for prediction only: outputs go to the current
# directory, one document per batch, no experiment-tracker reporting.
args = TrainingArguments(
    ".",
    per_device_eval_batch_size=1,
    report_to="none",
)
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)


In [9]:
# Raw logits; indexed below as (document, position, label).
predictions = trainer.predict(ds).predictions

# Numerically stable softmax over the label axis: subtracting the per-position
# maximum logit does not change the result but keeps exp() from overflowing.
shifted_logits = predictions - predictions.max(axis=-1, keepdims=True)
exp_logits = np.exp(shifted_logits)
pred_softmax = exp_logits / exp_logits.sum(axis=-1, keepdims=True)

# Read the label names from the model config; the context manager closes the file.
with open(Path(model_path) / "config.json") as config_file:
    config = json.load(config_file)
id2label = config["id2label"]

# Look up the index of the "O" (non-PII) label instead of hard-coding it.
o_label_id = next(int(idx) for idx, name in id2label.items() if name == "O")

preds = predictions.argmax(-1)

# Best non-"O" label per position: mask the "O" column, then take the argmax.
non_o_probs = pred_softmax.copy()
non_o_probs[:, :, o_label_id] = -1.0
preds_without_O = non_o_probs.argmax(-1)
O_preds = pred_softmax[:, :, o_label_id]

# Keep the plain argmax only when the model is at least `threshold` confident
# in "O"; otherwise fall back to the most likely non-"O" label.
threshold = 0.99
preds_final = np.where(O_preds < threshold, preds_without_O, preds)

In [10]:
# Map each predicted tokenizer position back to the index of the original
# document token it starts in, and collect one row per (document, token, label).
triplets = []
# Dedup keys include the document id, so identical (label, token index, text)
# combinations in different documents are all kept. A set gives O(1) lookups.
seen = set()
document, token, label, token_str = [], [], [], []
for p, token_map, offsets, tokens, doc in zip(preds_final, ds["token_map"], ds["offset_mapping"], ds["tokens"], ds["document"]):

    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]

        # Positions with a (0, 0) offset carry no text span; skip them.
        if start_idx + end_idx == 0: continue

        # The span starts on an inserted space: move to the next character.
        if token_map[start_idx] == -1:
            start_idx += 1

        # ignore "\n\n"
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        if start_idx >= len(token_map): break

        token_id = token_map[start_idx]

        # ignore "O" predictions and whitespace preds
        if label_pred != "O" and token_id != -1:
            key = (doc, label_pred, token_id)

            if key not in seen:
                seen.add(key)
                document.append(doc)
                token.append(token_id)
                label.append(label_pred)
                token_str.append(tokens[token_id])
                triplets.append((label_pred, token_id, tokens[token_id]))

# Prepare Submission

In [11]:
# Assemble the collected predictions into one table and number the rows.
prediction_columns = dict(
    document=document,
    token=token,
    label=label,
    token_str=token_str,
)
df = pd.DataFrame(prediction_columns).assign(
    row_id=lambda frame: list(range(len(frame)))
)
display(df.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [12]:
# Write the row_id/document/token/label columns to submission.csv, leaving
# out the DataFrame index.
submission_columns = ["row_id", "document", "token", "label"]
df.loc[:, submission_columns].to_csv("submission.csv", index=False)