**Some references**

https://www.kaggle.com/code/minhsienweng/train-infer-pii-detection-deberta-v3

(no training) https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook

In [None]:
!pip install pytorch-ignite
!pip install datasets



In [None]:
!pip install peft



In [None]:
import json
import datasets
import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from ignite.metrics import Fbeta
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn.functional import softmax
from peft import get_peft_model, LoraConfig

In [None]:
# Install Kaggle API
!pip install kaggle
# Upload Kaggle API key
from google.colab import files
files.upload()
# Set up Kaggle API credentials
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!kaggle competitions download -c pii-detection-removal-from-educational-data
!unzip pii-detection-removal-from-educational-data.zip



Saving kaggle.json to kaggle (1).json
pii-detection-removal-from-educational-data.zip: Skipping, found more recently modified local copy (use --force to force download)
Archive:  pii-detection-removal-from-educational-data.zip
replace sample_submission.csv? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: sample_submission.csv   
replace test.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: test.json               
replace train.json? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: train.json              


# Data Preparation

In [None]:
#Finding out the number of labels
data = json.load(open('train.json'))

all_labels = set()

for d in data:
    all_labels = all_labels.union(set(d['labels']))

print(f"{len(list(all_labels))} labels, with the following labels:\n {list(all_labels)}")
del data

label2id = {label:index for index,label in enumerate(all_labels)}
id2label = {index:label for index,label in enumerate(all_labels)}


13 labels, with the following labels:
 ['B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'I-ID_NUM', 'O', 'I-URL_PERSONAL', 'I-PHONE_NUM', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'B-EMAIL', 'I-STREET_ADDRESS']


In [None]:
print(all_labels)

{'B-NAME_STUDENT', 'I-NAME_STUDENT', 'B-ID_NUM', 'I-ID_NUM', 'O', 'I-URL_PERSONAL', 'I-PHONE_NUM', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'B-EMAIL', 'I-STREET_ADDRESS'}


In [None]:
#Change to one-hot vector
def oh_encoder(labels):  #label: array of output for each sentence

    # unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-PHONE_NUM', 'I-PHONE_NUM','B-ID_NUM', 'I-ID_NUM',  'B-URL_PERSONAL','I-URL_PERSONAL',
    #                   'B-STREET_ADDRESS', 'I-STREET_ADDRESS',  'B-EMAIL', 'B-USERNAME']


    unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL', 'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                     'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']

    labels_oh = []
    for label in labels:    #label: str
        label_oh = [float(0)]*len(unique_labels)
        for k in range(len(unique_labels)):
            if unique_labels[k] == label:
                label_oh[k] = 1
                #labels_oh.append(torch.tensor(label_oh, requires_grad=True))
                labels_oh.append(label_oh)
                break


    #return torch.tensor(labels_oh, requires_grad=True)
    return torch.tensor(labels_oh, requires_grad=True, dtype=float)    #list of one-hot labels as tensors



In [None]:
def tokenize(example, tokenizer):
    import numpy as np
    # Preprocess the tokens and labels by adding trailing whitespace and labels
    tokens = []
    labels = []
    for token, label, t_ws in zip(example["tokens"],
                                  example["labels"],
                                  example["trailing_whitespace"]):
        tokens.append(token)
        labels.extend([label] * len(token))
        # Added trailing whitespace and label if true and
        if t_ws:
            tokens.append(" ")
            # labels.append(oh_encoder("O"))
            labels.append("O")

    text = "".join(tokens)
    # print(f"len(text)={len(text)}, len(tokens)={len(tokens)}")
    # tokenization without truncation
    tokenized = tokenizer(text, return_offsets_mapping=True,
                          truncation=False)
    #labels = np.array(labels)
    # Labels
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Added 'O'
        if start_idx == 0 and end_idx == 0:
            #token_labels.append(label2id["O"])
            #token_labels.append(oh_encoder("O"))
            token_labels.append("O")
        else:
            # case when the text starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1
            # Convert label to id (int)
            #label_id = label2id[labels[start_idx]]
            label_id = labels[start_idx]
            #token_labels.append(oh_encoder(label_id))
            token_labels.append(label_id)

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [None]:
def compute_metrics(outputs, labels, unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL',
                                                          'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                                                          'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']):
    try:
        #print("Compute metrics")
        predictions = torch.argmax(softmax(outputs, dim=2), dim=2)
        # Include prediction Remove ignored index (special tokens)
        true_preds = []
        true_labels = []
        for pred, label in zip(predictions, labels):
            true_preds.append([unique_labels[p] for p, l in zip(pred, label) if l[0] != -100])
            true_labels.append([unique_labels[torch.argmax(l)] for p, l in zip(pred, label) if l[0] != -100])

        mlb = MultiLabelBinarizer(classes=unique_labels)
        true_preds_bin = mlb.fit_transform(true_preds)
        true_labels_bin = mlb.transform(true_labels)
        # Compute recall, precision and f5 score
        recall = recall_score(true_labels_bin, true_preds_bin, average='samples')
        precision = precision_score(true_labels_bin, true_preds_bin, average='samples')
        # Use modified f5 score to measure the performance
        f5_score = (1 + 5*5) * (recall * precision / (5*5*precision + recall))
        result = {'f5': f5_score,
                  'recall': recall,
                  'precision': precision}
        # print(f"result = {result}")
        return result
    except Exception as e:
        print(e)

In [None]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, num_classes=len(all_labels)):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.num_classes = num_classes

    def forward(self, inputs, targets):
        mask = (targets != -100).float()
        targets = targets.clamp(min=0)
        #targets = F.one_hot(targets, num_classes=self.num_classes).float()

        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        # F_loss = F_loss.mean(dim=1)
        F_loss = torch.mul(F_loss, mask).mean(dim=1)

        return F_loss.sum()/mask.sum()


# Model: T5

Using a pretrained T5 (small), we will build a classifier head on top of it to predict the class at token level.

# Training

In [None]:
from transformers import T5TokenizerFast, T5ForTokenClassification
import torch

In [None]:
from transformers import BitsAndBytesConfig


tokenizer = T5TokenizerFast.from_pretrained("google-t5/t5-small")

#config = T5ForTokenClassification.from_pretrained("google-t5/t5-small").config

config = AutoModelForTokenClassification.from_pretrained("google-t5/t5-small").config

config.update({'num_labels': 13})

peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="FEATURE_EXTRACTION",
        inference_mode=False)
        #target_modules=["query_proj", "key_proj", "value_proj", "output.dense"])


# bnb_config = BitsAndBytesConfig(
#                 load_in_4bit=True,
#                 bnb_4bit_quant_type="nf4",
#                 bnb_4bit_use_double_quant=False,
#                 bnb_4bit_compute_dtype=torch.bfloat16,
#                 llm_int8_skip_modules=['classifier']
#             )

#bnb_config = BitsAndBytesConfig(
 #       load_in_8bit=True,
  #      llm_int8_skip_modules=['classifier']
#)

#model = T5ForTokenClassification.from_pretrained("google-t5/t5-small", ignore_mismatched_sizes=True)
model = AutoModelForTokenClassification.from_pretrained("google-t5/t5-small", config = config,
                                                        ignore_mismatched_sizes=True)
                                                        #quantization_config=bnb_config)
model = get_peft_model(model, peft_config)

# print(model)

for param in model.parameters():
        param.requires_grad = False

for param in model.classifier.parameters():
        param.requires_grad = True



print(f"Model's trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad): ,}")
print(f"Model's total parameters: {sum(p.numel() for p in model.parameters()): ,}")

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of T5ForTokenClassification were not initialized from the model checkpoint at google-t5/t5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model's trainable parameters:  6,669
Model's total parameters:  36,123,917


In [None]:
device = 'cuda'
model = model.to(device)
epochs = 5
##tokenizer = AutoTokenizer.from_pretrained(model_name)
# collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 32, max_length=3500)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.0005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)
#criterion = nn.CrossEntropyLoss().to(device)

In [None]:
#Preparing the datasets for token classification
data = json.load(open('train.json'))
##model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
##tokenizer = AutoTokenizer.from_pretrained(model_name)

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)
print('Data split completed')

# # Limit to 100 for testing
#train_data = train_data[:100]
#val_data = val_data[:100]

trainset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in train_data],
    'document': [x['document'] for x in train_data],
    'tokens': [x['tokens'] for x in train_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in train_data],
    'labels' :[x['labels'] for x in train_data]
    # 'labels' :[oh_encoder(x['labels']) for x in train_data]
})
print('trainset loaded')

trainset = trainset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
#train_labels = [oh_encoder(x['labels'] for x in train_data)]
print('trainset mapped')

# val_labels = [oh_encoder(x['labels']) for x in val_data]

valset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in val_data],
    'document': [x['document'] for x in val_data],
    'tokens': [x['tokens'] for x in val_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in val_data],
    'labels' :[x['labels'] for x in val_data]
    # 'labels' :[oh_encoder(x['labels']) for x in val_data]
})
print('valset loaded')

valset = valset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
print('valset mapped')

del data

Data split completed
trainset loaded


Map (num_proc=3):   0%|          | 0/5785 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (700 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (712 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (927 > 512). Running this sequence through the model will result in indexing errors


trainset mapped
valset loaded


Map (num_proc=3):   0%|          | 0/1022 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (603 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (859 > 512). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (771 > 512). Running this sequence through the model will result in indexing errors


valset mapped


In [None]:
#First item
print(f"Number of training data: {len(trainset)} || Number of validation data: {len(valset)}")

Number of training data: 5785 || Number of validation data: 1022


In [None]:
'''Not Required'''
# def to_dict(data):
#     dict_of_lists = {}
#     for d in data:
#         for key, value in d.items():
#             if key in dict_of_lists:
#                 dict_of_lists[key].append(value)
#             else:
#                 dict_of_lists[key] = [value]
#     return dict_of_lists

# trainset = to_dict(trainset)
# #trainset['labels'] = train_labels

# valset = to_dict(valset)
# #valset['labels'] = val_labels

'Not Required'

In [None]:
class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']
        self.doc_no = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.attention_mask[idx]), oh_encoder(self.labels[idx]) , torch.tensor(self.doc_no[idx])

custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)

In [None]:
def custom_collate(batch):
    '''
    For padding
    '''
    input_ids, attention_mask, one_hot_labels, document = zip(*batch)
    # Pad the input_ids and labels
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    padded_attention_mask = pad_sequence(attention_mask, batch_first = True, padding_value = 0)
    padded_labels = pad_sequence(one_hot_labels, batch_first=True, padding_value=-100)

    return padded_input_ids, padded_attention_mask, padded_labels, document

batch_size = 8
train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)



In [None]:
'''Uncomment to check'''
for i, (input_ids, attention_mask, labels, doc) in enumerate(val_dataloader):
     print(f'Batch {i + 1}:')
     print('Input IDs:', input_ids.size())
     print('Attention_mask:', attention_mask.size())
     print('Labels:', labels.size())
     print('Document:', doc)
     print()

Batch 1:
Input IDs: torch.Size([8, 1010])
Attention_mask: torch.Size([8, 1010])
Labels: torch.Size([8, 1010, 13])
Document: (tensor(17809), tensor(11144), tensor(16158), tensor(9980), tensor(17676), tensor(20173), tensor(10494), tensor(14012))

Batch 2:
Input IDs: torch.Size([8, 1395])
Attention_mask: torch.Size([8, 1395])
Labels: torch.Size([8, 1395, 13])
Document: (tensor(8593), tensor(22104), tensor(20242), tensor(10762), tensor(20529), tensor(4899), tensor(8849), tensor(13131))

Batch 3:
Input IDs: torch.Size([8, 2212])
Attention_mask: torch.Size([8, 2212])
Labels: torch.Size([8, 2212, 13])
Document: (tensor(15076), tensor(13209), tensor(8612), tensor(16882), tensor(16916), tensor(20336), tensor(21008), tensor(10877))

Batch 4:
Input IDs: torch.Size([8, 1524])
Attention_mask: torch.Size([8, 1524])
Labels: torch.Size([8, 1524, 13])
Document: (tensor(8229), tensor(17787), tensor(19041), tensor(16583), tensor(11136), tensor(17544), tensor(3686), tensor(13706))

Batch 5:
Input IDs: tor

In [None]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [None]:
def val(model, custom_val, batch_size, custom_collate, criterion, device):
    model.eval()

    avg_val_loss = 0

    with torch.no_grad():
        val_loss = 0
        val_score = {'f5': 0, 'recall': 0, 'precision': 0}
        val_dataloader = DataLoader(custom_val, batch_size = batch_size, collate_fn = custom_collate, shuffle = False)

        for batch, (input_ids, attention_mask, labels, doc) in enumerate(val_dataloader):

            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask).logits
            loss = criterion(outputs, labels)
            results = compute_metrics(outputs, labels)

            val_loss += loss
            val_score['f5'] += results['f5']
            val_score['recall'] += results['recall']
            val_score['precision'] += results['precision']

            if batch%200 == 0 or batch+1 == len(val_dataloader):
                print("********** For Validation Set **********")
                print(f"Completed {batch+1}/{len(val_dataloader)}, with current val_loss: {loss: .4e},\n current results:{results}")

    avg_val_loss = val_loss / len(custom_val)
    for k in val_score:
        val_score[k] /= len(val_dataloader)
    # avg_val_score = val_score/len(val_dataloader)

    print(f"Average val_loss: {avg_val_loss: .4e}, avgerage val_score = {val_score}")

    return avg_val_loss, val_score['f5']    #Return only f5



In [None]:
import time
def train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion,  device):

    model.train()
    best_val_loss = float('inf')
    best_val_score = -float('inf')

    for epoch in range(epochs):

        start_time = time.time()
        avg_train_loss = 0
        train_loss = 0
        train_score = {'f5': 0, 'recall': 0, 'precision': 0}


        train_dataloader = DataLoader(custom_train, batch_size = batch_size, collate_fn = custom_collate, shuffle = True)
        print('Starting training...')

        for batch, (input_ids, attention_mask, labels, doc) in enumerate(train_dataloader):

            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask).logits
            # print(outputs.size())
            # print(labels.size())
            # print(outputs)

            loss = criterion(outputs, labels)
            # print(f"Loss: {loss:.4e}")

            loss.backward()

            optimizer.step()
            scheduler.step()


            results = compute_metrics(outputs, labels)
            train_loss += loss
            train_score['f5']+= results['f5']
            train_score['recall'] += results['recall']
            train_score['precision'] += results['precision']



            if batch%200 == 0 or batch+1 == len(train_dataloader):
                results = compute_metrics(outputs, labels)
                print(f"Completed {batch+1}/{len(train_dataloader)}, with current train_loss: {loss: .4e},\n current results:{results}")

        avg_train_loss = train_loss / len(custom_train)
        for k in train_score:
            train_score[k] /= len(train_dataloader)
        #avg_train_score = train_score / len(train_dataloader)


        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)

        print()
        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss: .4e}, average train_score = {train_score}")

        print()
        print("Starting to validate")
        val_loss, val_score = val(model, custom_val, batch_size, custom_collate, criterion, device)

        print(f"Epoch time: {epoch_mins}m {epoch_secs}s")
        print()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'T5_small_LoRA_best_loss.pt')

        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), 'T5_small_LoRA_best_score.pt')


    torch.save(model.state_dict(), f'T5_small_LoRA_{epochs}.pt')




Running the function below will output the loss and results every 200 batches.

In [None]:
batch_size = 8
epochs = 5
criterion = FocalLoss()

train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion, device)

Starting training...
Completed 1/724, with current train_loss:  9.1803e-04,
 current results:{'f5': 0.7878787878787878, 'recall': 1.0, 'precision': 0.125}
Completed 201/724, with current train_loss:  1.2027e-03,
 current results:{'f5': 0.7546174142480211, 'recall': 1.0, 'precision': 0.10576923076923078}
Completed 401/724, with current train_loss:  1.0971e-03,
 current results:{'f5': 0.7344632768361582, 'recall': 1.0, 'precision': 0.09615384615384616}
Completed 601/724, with current train_loss:  1.1275e-03,
 current results:{'f5': 0.6842105263157894, 'recall': 1.0, 'precision': 0.07692307692307693}
Completed 724/724, with current train_loss:  1.0340e-03,
 current results:{'f5': 0.6842105263157894, 'recall': 1.0, 'precision': 0.07692307692307693}

Epoch 1/5: Average train_loss =  1.4279e-04, average train_score = {'f5': 0.7318610197008035, 'recall': 0.9997410220994475, 'precision': 0.09739893221419511}

Starting to validate
********** For Validation Set **********
Completed 1/128, with c

In [None]:
torch.save(model, 'T5LoRAmodel_full.pth')

In [None]:
from google.colab import files
files.download('T5LoRAmodel_full.pth')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
files.download('T5_small_LoRA_5.pt')
files.download('T5_small_LoRA_best_loss.pt')
files.download('T5_small_LoRA_best_score.pt')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# Converting from predictions to NER labels

In [None]:
# load model for testing
#model = torch.load('T5model_full.pth')
#model.eval()  # Set the model to evaluation mode