### Please note: 
This is the submission version of the notebook: the internet connection is disabled and it only contains the results. For the full data analysis, please refer to this notebook: https://www.kaggle.com/code/jackren000/piidatadetection-dataanalysis

In [1]:
#### import libraries
import pandas as pd
import numpy as np
import os
import json
# import argparse library to handle command-line arguments
import argparse
# a Hugging Face Dataset used for natural language processing (NLP)
from datasets import Dataset
from pathlib import Path
# import the 'chain' function to combine multiple iterables into a single iterable
from itertools import chain
# tokenizer that can automatically find the model's required tokenization from the model name
from transformers import AutoTokenizer
# model class that can automatically find a token classification model from the model name
from transformers import AutoModelForTokenClassification
# class that provides an API for feature-complete training in PyTorch
from transformers import Trainer
# class to store hyperparameters for training
from transformers import TrainingArguments
# data collator that dynamically pads the inputs received, used for token classification tasks
from transformers import DataCollatorForTokenClassification
# use `partial` to create a new function with fixed arguments
from functools import partial

2024-02-15 23:27:32.054589: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-02-15 23:27:32.054719: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-02-15 23:27:32.199494: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
#### load the test file
# parse the competition's test JSON; the handle is closed as soon as the
# `with` block exits
with Path("/kaggle/input/pii-detection-removal-from-educational-data/test.json").open('r') as file:
    test_data = json.loads(file.read())

In [3]:
#### prepare the test dataset
# turn the list of essay records into a column-oriented Hugging Face Dataset:
# every listed field becomes a column holding one value per essay, in the
# same order as the records were loaded
test_data = Dataset.from_dict({
    column: [essay[column] for essay in test_data]
    for column in ('full_text', 'document', 'tokens', 'trailing_whitespace')
})

In [4]:
#### define the tokenize for test data
def tokenize(essay, tokenizer, max_length=None):
    '''
    Tokenizes the text data within an essay dictionary for inference.

    Parameters
    ----------
    essay : mapping
        One essay record. The keys read here are 'full_text' (the raw
        string), 'tokens' (the word-level tokens) and 'trailing_whitespace'
        (one flag per token telling whether a whitespace character follows
        it).
    tokenizer : callable
        Called as ``tokenizer(text, return_offsets_mapping=True,
        truncation=True, max_length=...)``; its result must support item
        assignment.
    max_length : int, optional
        Truncation length handed to the tokenizer. When omitted, the
        notebook-level ``INFERENCE_MAX_LENGTH`` is used, so existing calls
        that pass only ``essay`` and ``tokenizer`` behave as before.

    Returns
    -------
    The tokenizer output with one extra entry, 'token_map': a list with one
    element per character covered by the tokens, holding the index of the
    word-level token that character belongs to, or -1 for a trailing
    whitespace character.
    '''
    # resolve the truncation length lazily so the function can be defined
    # before the notebook-level constant exists
    if max_length is None:
        max_length = INFERENCE_MAX_LENGTH

    # extract the full text from the essay dictionary
    text = essay['full_text']
    # perform tokenization of the input text
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length)

    # build the character -> word-token lookup
    token_map = []
    pairs = zip(essay["tokens"], essay["trailing_whitespace"])
    for idx, (token, has_whitespace) in enumerate(pairs):
        # every character of the token points back at the token's index
        token_map.extend([idx] * len(token))
        # a trailing whitespace character belongs to no token: mark it -1
        if has_whitespace:
            token_map.append(-1)

    # add the 'token_map' key and store the token_map variable
    tokenized['token_map'] = token_map

    # return the tokenized input and the token_map
    return tokenized


In [5]:
#### load the fine-tuned tokenizer and model, then wrap them for inference
# checkpoint directory handed to both from_pretrained calls below
model_path = '/kaggle/input/piimodel/deberta3base_1024'
# truncation length that tokenize() passes to the tokenizer
INFERENCE_MAX_LENGTH = 2048

tokenizer = AutoTokenizer.from_pretrained(model_path)
# tokenize every essay; num_proc is the number of worker processes used by map
test_data = test_data.map(
    tokenize,
    fn_kwargs=dict(tokenizer=tokenizer),
    num_proc=2,
)

# token-classification model restored from the same checkpoint directory
model = AutoModelForTokenClassification.from_pretrained(model_path)

# collator that pads each batch, rounding the padded length up to a multiple of 16
collator = DataCollatorForTokenClassification(
    tokenizer=tokenizer,
    pad_to_multiple_of=16,
)

# Trainer settings; in this notebook the Trainer is only used for predict()
args = TrainingArguments(
    output_dir=".",  # directory the Trainer may write its artifacts to
    per_device_eval_batch_size=1,  # batch size per device during prediction
    report_to="none",  # do not report to any logging integration
)

# bundle model, settings, collator and tokenizer into a single Trainer
trainer = Trainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
)

    

#1:   0%|          | 0/5 [00:00<?, ?ex/s]

#0:   0%|          | 0/5 [00:00<?, ?ex/s]

  block_group = [InMemoryTable(cls._concat_blocks(list(block_group), axis=axis))]
  table = cls._concat_blocks(blocks, axis=0)


In [6]:
# call the predict method of the trainer object on test_data
# extract the .predictions attribute from the result (the raw class scores)
predictions = trainer.predict(test_data).predictions
# apply the softmax function over the class axis (the last of the three axes).
# The per-position maximum is subtracted first: this leaves the softmax value
# mathematically unchanged but keeps np.exp from overflowing to inf, which
# would otherwise turn the division into inf/inf = nan.
shifted_scores = predictions - predictions.max(axis=-1, keepdims=True)
exp_scores = np.exp(shifted_scores)
pred_softmax = exp_scores / exp_scores.sum(axis=-1, keepdims=True)
# loads the configuration JSON file; the context manager closes the handle
with open(os.path.join(model_path, "config.json")) as config_file:
    config = json.load(config_file)
# id2label is a dictionary extracted from the loaded configuration
# mapping numerical IDs (stored as strings in the JSON) to their labels
id2label = config["id2label"]
# locate the 'O' (non-PII) class from the configuration instead of assuming
# a fixed position; for a 13-label model whose last label is 'O' this is 12
o_class_ids = [int(class_id) for class_id, name in id2label.items() if name == "O"]
if len(o_class_ids) != 1:
    raise ValueError(
        f"expected exactly one 'O' label in id2label, found {len(o_class_ids)}"
    )
o_index = o_class_ids[0]
# find the index of the maximum value along the last axis
preds = predictions.argmax(-1)
# best class when 'O' is excluded: probabilities are never negative, so
# overwriting the 'O' column with -1 removes it from the argmax while keeping
# the original class indices of all other labels
non_o_scores = pred_softmax.copy()
non_o_scores[:, :, o_index] = -1.0
preds_without_O = non_o_scores.argmax(-1)
# extract the probabilities of the 'O' class
O_preds = pred_softmax[:, :, o_index]
# define the threshold for 'O' class predictions
threshold = 0.9
# decide final predictions based on the threshold:
# if the probability of 'O' is less than the threshold, use preds_without_O, else use preds
preds_final = np.where(O_preds < threshold, preds_without_O, preds)

In [7]:
# initialize lists to store the processed information
triplets = []
document, token, label, token_str = [], [], [], []
# (document, label, token index) keys that were already recorded.
# The document id is part of the key so that an identical label/token-index/
# token-text combination in a *different* essay is still reported; a set keeps
# the membership test constant-time instead of scanning a growing list.
seen_predictions = set()

# iterate through the predictions and supporting data, one essay at a time
for p, token_map, offsets, tokens, doc in zip(preds_final, test_data["token_map"], test_data["offset_mapping"], test_data["tokens"], test_data["document"]):
    map_length = len(token_map)

    # walk the model tokens together with their character offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        # skip if start and end indices are both 0
        if start_idx + end_idx == 0:
            continue

        label_pred = id2label[str(token_pred)]

        # adjust start index if token map is -1 (a trailing-whitespace
        # character); the bounds check avoids an IndexError when the offset
        # points past the characters covered by token_map
        if start_idx < map_length and token_map[start_idx] == -1:
            start_idx += 1

        # skip any whitespace tokens
        # NOTE(review): if token_map[start_idx] is -1 here, tokens[-1] (the
        # last token of the essay) is what gets tested - kept as in the
        # original, confirm this is intended
        while start_idx < map_length and tokens[token_map[start_idx]].isspace():
            start_idx += 1

        # break if start index goes beyond token map length
        if start_idx >= map_length:
            break

        token_id = token_map[start_idx]

        # process only non-"O" labels and valid token IDs
        if label_pred == "O" or token_id == -1:
            continue

        prediction_key = (doc, label_pred, token_id)
        # record each (document, label, token) combination only once
        if prediction_key in seen_predictions:
            continue
        seen_predictions.add(prediction_key)

        document.append(doc)
        token.append(token_id)
        label.append(label_pred)
        token_str.append(tokens[token_id])
        triplets.append((label_pred, token_id, tokens[token_id]))

In [8]:
# gather the collected predictions into a dataframe, one row per prediction
test_data = pd.DataFrame(
    dict(document=document, token=token, label=label, token_str=token_str)
)
# append a row id column that counts the rows from 0 upwards
test_data = test_data.assign(row_id=list(range(len(test_data))))
# preview at most the first 100 rows
display(test_data.head(100))

Unnamed: 0,document,token,label,token_str,row_id
0,7,9,B-NAME_STUDENT,Nathalie,0
1,7,10,I-NAME_STUDENT,Sylla,1
2,7,482,B-NAME_STUDENT,Nathalie,2
3,7,483,I-NAME_STUDENT,Sylla,3
4,7,741,B-NAME_STUDENT,Nathalie,4
5,7,742,I-NAME_STUDENT,Sylla,5
6,10,0,B-NAME_STUDENT,Diego,6
7,10,1,I-NAME_STUDENT,Estrada,7
8,10,464,B-NAME_STUDENT,Diego,8
9,10,465,I-NAME_STUDENT,Estrada,9


In [9]:
# write the submission file: only the listed columns, in this order, and
# without the dataframe index
test_data.to_csv(
    "submission.csv",
    columns=["row_id", "document", "token", "label"],
    index=False,
)
print('Completed!')

Completed!
