**Some references**

https://www.kaggle.com/code/minhsienweng/train-infer-pii-detection-deberta-v3

(no training) https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook

In [1]:
import json
import datasets
import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from ignite.metrics import Fbeta
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn.functional import softmax
from peft import get_peft_model, LoraConfig

  from .autonotebook import tqdm as notebook_tqdm


# Data Preparation

In [2]:
#Finding out the number of labels
data = json.load(open('data/train.json'))


all_labels = set()

for d in data:
    all_labels = all_labels.union(set(d['labels']))

print(f"{len(list(all_labels))} labels, with the following labels:\n {list(all_labels)}")
del data

label2id = {label:index for index,label in enumerate(all_labels)}
id2label = {index:label for index,label in enumerate(all_labels)}


13 labels, with the following labels:
 ['B-ID_NUM', 'B-STREET_ADDRESS', 'B-USERNAME', 'B-URL_PERSONAL', 'I-STREET_ADDRESS', 'B-NAME_STUDENT', 'I-URL_PERSONAL', 'I-ID_NUM', 'I-PHONE_NUM', 'O', 'B-PHONE_NUM', 'B-EMAIL', 'I-NAME_STUDENT']


In [3]:
print(all_labels)

{'B-ID_NUM', 'B-STREET_ADDRESS', 'B-USERNAME', 'B-URL_PERSONAL', 'I-STREET_ADDRESS', 'B-NAME_STUDENT', 'I-URL_PERSONAL', 'I-ID_NUM', 'I-PHONE_NUM', 'O', 'B-PHONE_NUM', 'B-EMAIL', 'I-NAME_STUDENT'}


In [4]:
#Change to one-hot vector
def oh_encoder(labels):  #label: array of output for each sentence

    # unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-PHONE_NUM', 'I-PHONE_NUM','B-ID_NUM', 'I-ID_NUM',  'B-URL_PERSONAL','I-URL_PERSONAL',
    #                   'B-STREET_ADDRESS', 'I-STREET_ADDRESS',  'B-EMAIL', 'B-USERNAME']
    
    
    unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL', 'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                     'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']
    
    labels_oh = []
    for label in labels:    #label: str
        label_oh = [float(0)]*len(unique_labels)
        for k in range(len(unique_labels)):
            if unique_labels[k] == label:
                label_oh[k] = 1
                #labels_oh.append(torch.tensor(label_oh, requires_grad=True))
                labels_oh.append(label_oh)
                break
                

    #return torch.tensor(labels_oh, requires_grad=True)
    return torch.tensor(labels_oh, requires_grad=True, dtype=float)    #list of one-hot labels as tensors



In [5]:
def tokenize(example, tokenizer):
    import numpy as np
    # Preprocess the tokens and labels by adding trailing whitespace and labels
    tokens = []
    labels = []
    for token, label, t_ws in zip(example["tokens"], 
                                  example["labels"],
                                  example["trailing_whitespace"]):
        tokens.append(token)
        labels.extend([label] * len(token))
        # Added trailing whitespace and label if true and 
        if t_ws:
            tokens.append(" ")
            # labels.append(oh_encoder("O"))
            labels.append("O")
    
    text = "".join(tokens)
    # print(f"len(text)={len(text)}, len(tokens)={len(tokens)}")
    # tokenization without truncation
    tokenized = tokenizer(text, return_offsets_mapping=True,
                          truncation=False)
    #labels = np.array(labels)
    # Labels
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Added 'O' 
        if start_idx == 0 and end_idx == 0:
            #token_labels.append(label2id["O"]) 
            #token_labels.append(oh_encoder("O"))
            token_labels.append("O")
        else:
            # case when the text starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1
            # Convert label to id (int)
            #label_id = label2id[labels[start_idx]]
            label_id = labels[start_idx]
            #token_labels.append(oh_encoder(label_id))
            token_labels.append(label_id)

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [6]:
def compute_metrics(outputs, labels, unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL', 
                                                          'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                                                          'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']):    
    try:
        #print("Compute metrics")
        predictions = torch.argmax(softmax(outputs, dim=2), dim=2)
        # Include prediction Remove ignored index (special tokens)
        true_preds = []
        true_labels = []
        for pred, label in zip(predictions, labels):
            true_preds.append([unique_labels[p] for p, l in zip(pred, label) if l[0] != -100])
            true_labels.append([unique_labels[torch.argmax(l)] for p, l in zip(pred, label) if l[0] != -100])
        
        mlb = MultiLabelBinarizer(classes=unique_labels)
        true_preds_bin = mlb.fit_transform(true_preds)
        true_labels_bin = mlb.transform(true_labels)
        # Compute recall, precision and f5 score
        recall = recall_score(true_labels_bin, true_preds_bin, average='samples')
        precision = precision_score(true_labels_bin, true_preds_bin, average='samples')
        # Use modified f5 score to measure the performance
        f5_score = (1 + 5*5) * (recall * precision / (5*5*precision + recall))
        result = {'f5': f5_score,  
                  'recall': recall,
                  'precision': precision}
        # print(f"result = {result}")
        return result
    except Exception as e: 
        print(e)

In [7]:
class FocalLoss(nn.Module):
    def __init__(self, alpha=1, gamma=2, num_classes=len(all_labels)):
        super(FocalLoss, self).__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.num_classes = num_classes

    def forward(self, inputs, targets):
        mask = (targets != -100).float()
        targets = targets.clamp(min=0)
        #targets = F.one_hot(targets, num_classes=self.num_classes).float()

        BCE_loss = F.binary_cross_entropy_with_logits(inputs, targets, reduction='none')
        pt = torch.exp(-BCE_loss)
        F_loss = self.alpha * (1-pt)**self.gamma * BCE_loss

        # F_loss = F_loss.mean(dim=1)
        F_loss = torch.mul(F_loss, mask).mean(dim=1)
        
        return F_loss.sum()/mask.sum()


# Model: DeBERTa (Custom)

Using a pretrained DeBERTa, we will build a classifier head on top of it to predict the class at token level.

In [8]:
# class Classifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim = 13):
#         super(Classifier, self).__init__()

#         self.dropout_prob = 0.3
#         self.final_activation = nn.Softmax(dim = -1)

#         self.linear = nn.Sequential(
#             nn.ReLU(nn.Linear(input_dim, hidden_dim)),
            
#             nn.Dropout(self.dropout_prob),
#             nn.ReLU(nn.Linear(hidden_dim, hidden_dim*2)),

#             nn.Dropout(self.dropout_prob),
#             nn.ReLU(nn.Linear(hidden_dim*2, hidden_dim)),

#             nn.Linear(hidden_dim, output_dim)
#         )
       

#     def forward(self, x):
#         logit = self.linear(x)
#         return self.final_activation(logit)


In [9]:
# class Classifier(nn.Module):
#     def __init__(self, input_dim, hidden_dim, output_dim = 13):
#         super(Classifier, self).__init__()

#         self.dropout_prob = 0.3
#         self.final_activation = nn.Softmax(dim = -1)

#         self.dropout = nn.Dropout(self.dropout_prob)
#         self.linear1 = nn.ReLU(nn.Linear(input_dim,hidden_dim))
#         self.linear2 = nn.ReLU(nn.Linear(hidden_dim, hidden_dim*2))                            
#         self.linear3 = nn.ReLU(nn.Linear(hidden_dim*2, hidden_dim))
#         self.output = nn.Linear(hidden_dim,output_dim)

#     def forward(self, x):
#         print('now start linear1')
#         print(x.size())
#         x = self.linear1(x)

#         print('now start linear2')
#         print(x.size())
#         x = self.linear2(self.dropout(x))

#         print('now start linear3')
#         print(x.size())
#         x = self.linear3(self.dropout(x))

#         print('now start output')
#         print(x.size())
#         logit= self.output(x)
        
#         print('now start activation')
#         output =  self.final_activation(logit)
        
#         return output


In [10]:
# class Deberta_Classif(Classifier):
#     def __init__(self, model_name, classif_input, classif_hidden, classif_output = 13, finetune = False):
#         super(Deberta_Classif, self).__init__(classif_input, classif_input)
#         self.ft = finetune

#         self.extractor = AutoModelForTokenClassification.from_pretrained(model_name).base_model

#         if not finetune:
#             for param in self.extractor.parameters():
#                 param.requires_grad = False

#         self.extractor_num_param =  sum(p.numel() for p in self.extractor.parameters())
#         self.extractor_num_param_grad = sum(p.numel() for p in self.extractor.parameters() if p.requires_grad)
#         self.extractor_name = "DeBERTa"
                

#         self.classifier = Classifier(input_dim=classif_input, hidden_dim=classif_hidden ,output_dim=classif_output)
#         self.classifier_num_param = sum(p.numel() for p in self.classifier.parameters() if p.requires_grad)

        
#     def count_param(self):
        
#         if self.ft:
#             type = 'finetuned'
#             num_param = self.extractor_num_param_grad + self.classifier_num_param
#         else:
#             type = 'non-finetuned'
#             num_param = self.extractor_num_param + self.classifier_num_param

#         print(f"Number of parameters in {type} {self.extractor_name} model is {num_param:,}")
    
    
#     def forward(self, input_ids, attention_mask):
#         x = self.extractor(input_ids, attention_mask).last_hidden_state
#         x = self.classifier(x)

# Training

In [11]:
from transformers import BitsAndBytesConfig
# model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
model_name = "microsoft/deberta-v3-small"
#model = Deberta_Classif(model_name, classif_input = 768, classif_hidden = 100, classif_output = 13, finetune=False)  #Extractor output has dim 768 
config = AutoModelForTokenClassification.from_pretrained(model_name).config

config.update({'num_labels': 13})

peft_config = LoraConfig(
        r=64,
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        task_type="FEATURE_EXTRACTION",
        inference_mode=False,
        target_modules=["query_proj", "key_proj", "value_proj", "output.dense"])

# bnb_config = BitsAndBytesConfig(
#                 load_in_4bit=True,
#                 bnb_4bit_quant_type="nf4",
#                 bnb_4bit_use_double_quant=False,
#                 bnb_4bit_compute_dtype=torch.bfloat16,
#                 llm_int8_skip_modules=['classifier']
#             )

bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
        llm_int8_skip_modules=['classifier']
)

model = AutoModelForTokenClassification.from_pretrained(model_name, config = config, 
                                                        ignore_mismatched_sizes=True,)
                                                        #quantization_config=bnb_config)
# model = get_peft_model(model, peft_config)

# print(model)

for param in model.parameters():
        param.requires_grad = False

for param in model.classifier.parameters():
        param.requires_grad = True



print(f"Model's trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad): ,}")
print(f"Model's total parameters: {sum(p.numel() for p in model.parameters()): ,}")

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model's trainable parameters:  9,997
Model's total parameters:  141,314,317


In [12]:
device = 'cuda' 
model = model.to(device)
epochs = 5
tokenizer = AutoTokenizer.from_pretrained(model_name)
# collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 32, max_length=3500)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.0005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)
criterion = nn.CrossEntropyLoss().to(device)



In [13]:
#Preparing the datasets for token classification
data = json.load(open('data/train.json'))
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)  
print('Data split completed')

# # Limit to 100 for testing
# train_data = train_data[:100]
# val_data = val_data[:100]

trainset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in train_data],
    'document': [x['document'] for x in train_data],
    'tokens': [x['tokens'] for x in train_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in train_data],
    'labels' :[x['labels'] for x in train_data]
    # 'labels' :[oh_encoder(x['labels']) for x in train_data] 
})
print('trainset loaded')

trainset = trainset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
#train_labels = [oh_encoder(x['labels'] for x in train_data)]
print('trainset mapped')

# val_labels = [oh_encoder(x['labels']) for x in val_data]

valset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in val_data],
    'document': [x['document'] for x in val_data],
    'tokens': [x['tokens'] for x in val_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in val_data],
    'labels' :[x['labels'] for x in val_data]
    # 'labels' :[oh_encoder(x['labels']) for x in val_data]
})
print('valset loaded')

valset = valset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
print('valset mapped')

del data

Data split completed
trainset loaded


Map (num_proc=3): 100%|██████████| 5785/5785 [00:11<00:00, 504.10 examples/s] 


trainset mapped
valset loaded


Map (num_proc=3): 100%|██████████| 1022/1022 [00:04<00:00, 222.41 examples/s]

valset mapped





In [14]:
#First item
print(f"Number of training data: {len(trainset)} || Number of validation data: {len(valset)}")

Number of training data: 5785 || Number of validation data: 1022


In [15]:
'''Not Required'''
# def to_dict(data):
#     dict_of_lists = {}
#     for d in data:
#         for key, value in d.items():
#             if key in dict_of_lists:
#                 dict_of_lists[key].append(value)
#             else:
#                 dict_of_lists[key] = [value]
#     return dict_of_lists

# trainset = to_dict(trainset)
# #trainset['labels'] = train_labels

# valset = to_dict(valset)
# #valset['labels'] = val_labels

'Not Required'

In [16]:
class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']
        self.doc_no = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.attention_mask[idx]), oh_encoder(self.labels[idx]) , torch.tensor(self.doc_no[idx])

custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)

In [17]:
def custom_collate(batch):
    '''
    For padding
    '''
    input_ids, attention_mask, one_hot_labels, document = zip(*batch)
    # Pad the input_ids and labels
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    padded_attention_mask = pad_sequence(attention_mask, batch_first = True, padding_value = 0)
    padded_labels = pad_sequence(one_hot_labels, batch_first=True, padding_value=-100)  
    
    return padded_input_ids, padded_attention_mask, padded_labels, document

batch_size = 8
train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)

'''Uncomment to check'''
# for i, (input_ids, attention_mask, labels, doc) in enumerate(val_dataloader):
#     print(f'Batch {i + 1}:')
#     print('Input IDs:', input_ids.size())
#     print('Attention_mask:', attention_mask.size())
#     print('Labels:', labels.size())
#     print('Document:', doc)
#     print()

'Uncomment to check'

In [18]:
def epoch_time(start_time, end_time):
    elapsed_time = end_time - start_time
    elapsed_mins = int(elapsed_time / 60)
    elapsed_secs = int(elapsed_time - (elapsed_mins * 60))
    return elapsed_mins, elapsed_secs

In [19]:
def val(model, custom_val, batch_size, custom_collate, criterion, device):
    model.eval()

    avg_val_loss = 0

    with torch.no_grad():
        val_loss = 0
        val_score = {'f5': 0, 'recall': 0, 'precision': 0}
        val_dataloader = DataLoader(custom_val, batch_size = batch_size, collate_fn = custom_collate, shuffle = False)

        for batch, (input_ids, attention_mask, labels, doc) in enumerate(val_dataloader):
            
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask).logits
            loss = criterion(outputs, labels)
            results = compute_metrics(outputs, labels)

            val_loss += loss
            val_score['f5'] += results['f5']
            val_score['recall'] += results['recall']
            val_score['precision'] += results['precision']

            if batch%200 == 0 or batch+1 == len(val_dataloader): 
                print("********** For Validation Set **********")                                                                       
                print(f"Completed {batch+1}/{len(val_dataloader)}, with current val_loss: {loss: .4e},\n current results:{results}") 

    avg_val_loss = val_loss / len(custom_val) 
    for k in val_score:
        val_score[k] /= len(val_dataloader)
    # avg_val_score = val_score/len(val_dataloader)

    print(f"Average val_loss: {avg_val_loss: .4e}, avgerage val_score = {val_score}")

    return avg_val_loss, val_score['f5']    #Return only f5
    


In [22]:
import time
def train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion,  device):
    
    model.train()
    best_val_loss = float('inf')
    best_val_score = -float('inf')
    
    for epoch in range(epochs):
        
        start_time = time.time()
        avg_train_loss = 0
        train_loss = 0
        train_score = {'f5': 0, 'recall': 0, 'precision': 0}

        
        train_dataloader = DataLoader(custom_train, batch_size = batch_size, collate_fn = custom_collate, shuffle = True)
        print('Starting training...')
        
        for batch, (input_ids, attention_mask, labels, doc) in enumerate(train_dataloader):

            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask).logits
            # print(outputs.size())
            # print(labels.size())
            # print(outputs)
            loss = criterion(outputs, labels)
            # print(f"Loss: {loss:.4e}")

            loss.backward()
            
            optimizer.step()
            scheduler.step()


            results = compute_metrics(outputs, labels)
            train_loss += loss 
            train_score['f5']+= results['f5']
            train_score['recall'] += results['recall']
            train_score['precision'] += results['precision']



            if batch%200 == 0 or batch+1 == len(train_dataloader):
                results = compute_metrics(outputs, labels)
                print(f"Completed {batch+1}/{len(train_dataloader)}, with current train_loss: {loss: .4e},\n current results:{results}")

        avg_train_loss = train_loss / len(custom_train)    
        for k in train_score:
            train_score[k] /= len(train_dataloader)
        #avg_train_score = train_score / len(train_dataloader)


        end_time = time.time()
        epoch_mins, epoch_secs = epoch_time(start_time, end_time)                
        
        print()
        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss: .4e}, average train_score = {train_score}")

        print()
        print("Starting to validate")
        val_loss, val_score = val(model, custom_val, batch_size, custom_collate, criterion, device)

        print(f"Epoch time: {epoch_mins}m {epoch_secs}s")
        print()

        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'deberta_small_best_loss.pt')

        if val_score > best_val_score:
            best_val_score = val_score
            torch.save(model.state_dict(), 'deberta_small_best_score.pt')
  
        
    torch.save(model.state_dict(), f'deberta_small_{epochs}.pt')


        

Running the function below will output the loss and results every 200 batches.

In [23]:
batch_size = 8
epochs = 5
criterion = FocalLoss()

train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion, device)

Starting training...
Completed 1/724, with current train_loss:  9.2220e-05,
 current results:{'f5': 0.7573015464427856, 'recall': 1.0, 'precision': 0.10715326340326341}
Completed 201/724, with current train_loss:  9.6862e-06,
 current results:{'f5': 1.0, 'recall': 1.0, 'precision': 1.0}
Completed 401/724, with current train_loss:  1.0302e-05,
 current results:{'f5': 0.7572815533980582, 'recall': 0.75, 'precision': 1.0}
Completed 601/724, with current train_loss:  9.7343e-06,
 current results:{'f5': 0.9196141479099676, 'recall': 0.9166666666666666, 'precision': 1.0}
Completed 724/724, with current train_loss:  1.2926e-05,
 current results:{'f5': 1.0, 'recall': 1.0, 'precision': 1.0}

Epoch 1/5: Average train_loss =  1.7887e-06, average train_score = {'f5': 0.9069239026625755, 'recall': 0.9136423802946586, 'precision': 0.9153994696985028}

Starting to validate
********** For Validation Set **********
Completed 1/128, with current val_loss:  1.1929e-05,
 current results:{'f5': 0.838709677

# Converting from predictions to NER labels

In [None]:
#Used to label each token at NER stage
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue
    
    return spans

In [None]:
## DON'T RUN ##
#### From KAGGLE: https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook ####

triplets = []
pairs = set()  # membership operation using set is faster O(1) than that of list O(n)
processed = []
emails = []
phone_nums = []
urls = []
streets = []

# For each prediction, token mapping, offsets, tokens, and document in the dataset
for p, token_map, offsets, tokens, doc, full_text in zip(
    preds_final, 
    ds["token_map"], 
    ds["offset_mapping"], 
    ds["tokens"], 
    ds["document"],
    ds["full_text"]
):

    # Iterate through each token prediction and its corresponding offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]  # Predicted label from token
        if start_idx + end_idx == 0:
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]  # Token ID at start index
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)
    
    # email
    for token_idx, token in enumerate(tokens):
        if re.fullmatch(email_regex, token) is not None:
            emails.append(
                {"document": doc, "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )
                
    # phone number
    matches = phone_num_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            phone_nums.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
            )
    
    # url
    matches = url_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            urls.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-URL_PERSONAL", "token_str": tokens[token_idx]}
            )
    
    # street
#     matches = street_regex.findall(full_text)
#     if not matches:
#         continue
#     for match in matches:
#         target = [t.text for t in nlp.tokenizer(match)]
#         matched_spans = find_span(target, tokens)
#     for matched_span in matched_spans:
#         for intermediate, token_idx in enumerate(matched_span):
#             prefix = "I" if intermediate else "B"
#             streets.append(
#                 {"document": doc, "token": token_idx, "label": f"{prefix}-STREET_ADDRESS", "token_str": tokens[token_idx]}
#             )