**Some references**

https://www.kaggle.com/code/minhsienweng/train-infer-pii-detection-deberta-v3

(no training) https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook

In [None]:
!pip install pytorch-ignite
!pip install datasets



In [None]:
import json
import datasets
import tqdm
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
import numpy as np
import torch.nn as nn
import torch.nn.functional as F
from ignite.metrics import Fbeta
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics import recall_score, precision_score
from sklearn.preprocessing import MultiLabelBinarizer
from torch.nn.functional import softmax



In [None]:
# Install Kaggle API
#!pip install kaggle
# Upload Kaggle API key
#from google.colab import files
#files.upload()
# Set up Kaggle API credentials
#!mkdir -p ~/.kaggle
#!cp kaggle.json ~/.kaggle/
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle competitions download -c pii-detection-removal-from-educational-data
!#unzip pii-detection-removal-from-educational-data.zip

# Data Preparation

In [None]:
#Finding out the number of labels
data = json.load(open('train.json'))

# Finding unique labels
all_labels = set()
for d in data:
    all_labels.update(d['labels'])

# Clear memory
del data

# Creating label-index mappings
label2id = {label: index for index, label in enumerate(all_labels)}
id2label = {index: label for index, label in enumerate(all_labels)}

# Inspecting mappings
print(f"Number of labels: {len(label2id)}")
for label, idx in list(label2id.items())[:5]:  # print first 5
    print(f"Label {label} has index {idx}")

Number of labels: 13
Label I-PHONE_NUM has index 0
Label B-ID_NUM has index 1
Label I-NAME_STUDENT has index 2
Label I-ID_NUM has index 3
Label B-STREET_ADDRESS has index 4


In [None]:
def oh_encoder(labels, unique_labels=['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL',
                                      'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                                      'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']):
    labels_oh = []
    for label in labels:  # label: str
        label_oh = [0.0] * len(unique_labels)
        if label in unique_labels:
            label_oh[unique_labels.index(label)] = 1.0
        labels_oh.append(label_oh)

    return torch.tensor(labels_oh, dtype=torch.float32)  # list of one-hot labels as tensors

    # Example usage
unique_labels = list(all_labels)
encoded_labels = oh_encoder(["B-NAME_STUDENT", "O"], unique_labels)
print(encoded_labels)

tensor([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1.],
        [0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.]])


In [None]:
def tokenize(example, tokenizer, label2id):
    tokens = []
    labels = []
    for token, label, t_ws in zip(example["tokens"], example["labels"], example["trailing_whitespace"]):
        tokens.append(token)
        labels.extend([label] * len(token))
        if t_ws:
            tokens.append(" ")
            labels.append("O")

    text = "".join(tokens)
    tokenized = tokenizer(text, return_offsets_mapping=True, truncation=False)
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
        else:
            if text[start_idx].isspace():
                start_idx += 1
            label_id = label2id[labels[start_idx]]
            token_labels.append(label_id)

    return {**tokenized, "labels": token_labels}

In [None]:
data = json.load(open('train.json'))

from transformers import BartTokenizerFast

tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large", add_prefix_space=True)

# Assume we are processing the first document for demonstration
d = data[0]
tokenized_output = tokenize(d, tokenizer, label2id)

# Convert tokenized input IDs to tensor for use with PyTorch models
input_ids_tensor = torch.tensor(tokenized_output['input_ids'])
tokens = tokenizer.convert_ids_to_tokens(input_ids_tensor)

# Debugging output to verify tokens
print(tokens)

['<s>', 'ĠDesign', 'ĠThinking', 'Ġfor', 'Ġinnovation', 'Ġreflex', 'ion', '-', 'Av', 'ril', 'Ġ2021', '-', 'N', 'ath', 'al', 'ie', 'ĠSy', 'lla', 'Ċ', 'Ċ', 'Chall', 'enge', 'Ġ&', 'Ġselection', 'Ċ', 'Ċ', 'The', 'Ġtool', 'ĠI', 'Ġuse', 'Ġto', 'Ġhelp', 'Ġall', 'Ġstakeholders', 'Ġfinding', 'Ġtheir', 'Ġway', 'Ġthrough', 'Ġthe', 'Ġcomplexity', 'Ġof', 'Ġa', 'Ġproject', 'Ġis', 'Ġthe', 'Ġ', 'Ġmind', 'Ġmap', '.', 'Ċ', 'Ċ', 'What', 'Ġexactly', 'Ġis', 'Ġa', 'Ġmind', 'Ġmap', '?', 'ĠAccording', 'Ġto', 'Ġthe', 'Ġdefinition', 'Ġof', 'ĠBu', 'zan', 'ĠT', '.', 'Ġand', 'ĠBu', 'zan', 'ĠB', '.', 'Ġ(', '1999', ',', 'ĠD', 'ess', 'ine', '-', 'mo', 'i', 'Ġ', 'Ġl', "'", 'intelligence', '.', 'ĠParis', ':', 'ĠLes', 'ĠÃī', 'd', 'itions', 'Ġd', "'", 'Organ', 'isation', '.),', 'Ġthe', 'Ġmind', 'Ġmap', 'Ġ(', 'or', 'Ġhe', 'uristic', 'Ġdiagram', ')', 'Ġis', 'Ġa', 'Ġgraphic', 'Ġ', 'Ġrepresentation', 'Ġtechnique', 'Ġthat', 'Ġfollows', 'Ġthe', 'Ġnatural', 'Ġfunctioning', 'Ġof', 'Ġthe', 'Ġmind', 'Ġand', 'Ġallows', 'Ġthe', 'Ġbra

# Model: BART

Using a pretrained BART, we will build a classifier head on top of it to predict the class at token level.

In [None]:
from transformers import BartModel
import torch
from torch import nn

class BartForTokenClassification(nn.Module):
    def __init__(self, num_labels):
        super().__init__()
        self.bart = BartModel.from_pretrained("facebook/bart-large")

        # Freeze the BART model parameters
        for param in self.bart.parameters():
            param.requires_grad = False

        self.num_labels = num_labels
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.bart.config.hidden_size, num_labels)

    def forward(self, input_ids, attention_mask=None, labels=None):
        outputs = self.bart(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        sequence_output = self.dropout(sequence_output)
        logits = self.classifier(sequence_output)

        if labels is not None:
            loss_fct = nn.CrossEntropyLoss(ignore_index=-100)  # Assuming -100 is used for padding
            if attention_mask is not None:
                active_loss = attention_mask.view(-1) == 1
                active_logits = logits.view(-1, num_labels)  # Corrected to num_labels
                active_labels = torch.where(
                    active_loss, labels.view(-1), torch.tensor(loss_fct.ignore_index).type_as(labels)
                )
                loss = loss_fct(active_logits, active_labels)
            else:
                loss = loss_fct(logits.view(-1, num_labels), labels.view(-1))  # Corrected to num_labels
            return loss, logits
        return logits

In [None]:
# Initialize the model with the number of labels for your classification task.
num_labels = len(label2id)
model = BartForTokenClassification(num_labels=num_labels)

In [None]:
# Calculate the total number of parameters
total_params = sum(p.numel() for p in model.parameters())

# Calculate the number of trainable parameters (parameters where gradients will be computed)
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Parameters: {total_params}")
print(f"Trainable Parameters: {trainable_params}")

Total Parameters: 406304781
Trainable Parameters: 13325


QUICK CHECK: Test model initialization

In [None]:
import torch

# Dummy inputs
input_ids = torch.randint(0, 50265, (1, 10))  # Example input ID for BART, sequence length 10
attention_mask = torch.ones(1, 10)

# Forward pass
with torch.no_grad():  # No need to compute gradients for a simple forward test
    logits = model(input_ids, attention_mask)
    print(f"Logits Shape: {logits.shape}")  # Should be (1, 10, num_labels)

# Check if output shapes are as expected
assert logits.shape == (1, 10, num_labels), "Output shape is incorrect!"

Logits Shape: torch.Size([1, 10, 13])


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
epochs = 5
tokenizer = BartTokenizerFast.from_pretrained("facebook/bart-large", add_prefix_space=True)
optimizer = torch.optim.Adam(model.classifier.parameters(), lr=0.001)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=epochs)
criterion = nn.CrossEntropyLoss().to(device)

In [None]:
from sklearn.preprocessing import LabelBinarizer
from sklearn.metrics import precision_recall_fscore_support

def compute_metrics(outputs, labels, label_map):
    # Assuming outputs are logits and labels are indices
    softmax = nn.Softmax(dim=2)
    predictions = torch.argmax(softmax(outputs), dim=2)

    true_labels = []
    true_preds = []

    # Flatten outputs and labels for evaluation ignoring padding index
    for i in range(len(labels)):
        label_mask = labels[i] != -100  # Assume -100 is used to mark padding labels
        true_labels.extend(labels[i][label_mask].cpu().numpy())
        true_preds.extend(predictions[i][label_mask].cpu().numpy())

    # Convert indices to names using label_map
    true_labels = [label_map[label] for label in true_labels]
    true_preds = [label_map[pred] for pred in true_preds]

    lb = LabelBinarizer()
    lb.fit(list(label_map.values()))  # Fit all possible labels

    true_labels_bin = lb.transform(true_labels)
    true_preds_bin = lb.transform(true_preds)

    precision, recall, _ = precision_recall_fscore_support(true_labels_bin, true_preds_bin, average='weighted', zero_division=1)[:3]

    # Compute the F5 score
    beta = 5  # Focus more on recall
    f5_score = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

    return {
        'f5_score': f5_score,
        'recall': recall,
        'precision': precision
    }

In [None]:
#Preparing the datasets for token classification
data = json.load(open('train.json'))

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)

# Slice the first 10 elements for debugging
train_data = train_data[:10]
val_data = val_data[:10]

trainset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in train_data],
    'document': [x['document'] for x in train_data],
    'tokens': [x['tokens'] for x in train_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in train_data],
    'labels' :[x['labels'] for x in train_data]
})

trainset = trainset.map(
    tokenize,
    fn_kwargs={"tokenizer": tokenizer, "label2id": label2id},
    num_proc=3
)

valset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in val_data],
    'document': [x['document'] for x in val_data],
    'tokens': [x['tokens'] for x in val_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in val_data],
    'labels' :[x['labels'] for x in val_data]
})


valset = valset.map(
    tokenize,
    fn_kwargs={"tokenizer": tokenizer, "label2id": label2id},
    num_proc=3
)

del data

  self.pid = os.fork()


Map (num_proc=3):   0%|          | 0/10 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1692 > 1024). Running this sequence through the model will result in indexing errors


Map (num_proc=3):   0%|          | 0/10 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1028 > 1024). Running this sequence through the model will result in indexing errors
Token indices sequence length is longer than the specified maximum sequence length for this model (1238 > 1024). Running this sequence through the model will result in indexing errors


In [None]:
# Simple checks to ensure dataset integrity
assert 'input_ids' in trainset.features, "Input IDs should be part of the dataset."
assert 'labels' in trainset.features, "Labels should be part of the dataset."

print("Training Set Sample:", trainset[0])
print("Validation Set Sample:", valset[0])

Training Set Sample: {'full_text': 'Challenge    Broadly speaking, the challenge is raising money for non-profit organizations. More specifically,  I recently joined the boards of two non-profit organizations for which I have or will have a fund  raising role. The first board I joined supports entrepreneurs and start-ups “from founding  through funding,” and I am a mentor to start-up companies as part of this work. The second  board I joined supports a school; starting in September, I will be leading an effort to raise  money for student travel fellowships.     As I joined the board, the entrepreneur support organization had just begun a $1 million fund  raising campaign that was essential to its sustainability. Then, the COVID-19 pandemic struck. I  was asked to solicit my 30 peers in the mentoring group. I would ask them to join me in  donating $5,000 for each of the next three years.     Selection    The tool I chose was storytelling, and this pre-dated my having begun this design t

In [None]:
print(f"Number of training data: {len(trainset)} || Number of validation data: {len(valset)}")

Number of training data: 10 || Number of validation data: 10


In [None]:
from torch.utils.data import Dataset, DataLoader
class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']
        self.document = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Directly create tensors without re-wrapping them
        return {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': oh_encoder(self.labels[idx]),  # Using the modified oh_encoder
            'document': torch.tensor(self.document[idx], dtype=torch.long)
        }

custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)

In [None]:
from torch.utils.data import Dataset, DataLoader
class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']
        self.document = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Ensure all elements are converted to tensors before passing to the DataLoader
        return {
            'input_ids': torch.tensor(self.data[idx], dtype=torch.long),
            'attention_mask': torch.tensor(self.attention_mask[idx], dtype=torch.long),
            'labels': oh_encoder(self.labels[idx]),  # Assuming oh_encoder correctly returns a tensor
            'document': torch.tensor(self.document[idx], dtype=torch.long)
        }

In [None]:
from torch.utils.data import Dataset, DataLoader

class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']
        self.document = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.attention_mask[idx]), oh_encoder(self.labels[idx]), torch.tensor(self.document[idx])

custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)


the following chunk for custom_collate is edited

In [None]:
from torch.nn.utils.rnn import pad_sequence
import torch

def custom_collate(batch, device):
    input_ids, attention_mask, labels, doc = zip(*batch)

    # Ensure the tensors are directly moved to device without re-wrapping them
    padded_input_ids = pad_sequence([ids.to(device) for ids in input_ids], batch_first=True, padding_value=0)
    padded_attention_mask = pad_sequence([mask.to(device) for mask in attention_mask], batch_first=True, padding_value=0)
    padded_labels = pad_sequence([l.to(device) for l in labels], batch_first=True, padding_value=-100)

    return {
        'input_ids': padded_input_ids,
        'attention_mask': padded_attention_mask,
        'labels': padded_labels,
        'doc': doc
    }


from torch.utils.data import DataLoader

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

batch_size = 1

# Now, modify the DataLoader to include a lambda function that passes the device to the custom collate function.
train_dataloader = DataLoader(
    custom_train,
    batch_size=batch_size,
    collate_fn=lambda x: custom_collate(x, device),
    shuffle=True
)

val_dataloader = DataLoader(
    custom_val,
    batch_size=batch_size,
    collate_fn=lambda x: custom_collate(x, device),
    shuffle=False
)

In [None]:
def val(model, custom_val, batch_size, custom_collate, criterion, device):
    model.eval()
    avg_val_loss = 0
    avg_val_score = 0
    val_loss = 0
    val_score = 0
    val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate, shuffle=False)

    with torch.no_grad():
        for batch, data in enumerate(val_dataloader):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits if isinstance(outputs, tuple) else outputs  # Adjust this line based on model output
            loss = criterion(logits.view(-1, model.num_labels), labels.view(-1))
            results = compute_metrics(logits, labels)

            val_loss += loss.item()
            val_score += results['f5']

            if batch % 200 == 0 or batch + 1 == len(val_dataloader):
                print("********** For Validation Set **********")
                print(f"Completed {batch+1}/{len(val_dataloader)}, with current val_loss: {loss.item()},\n current results: {results}")

        avg_val_loss = val_loss / len(val_dataloader)
        avg_val_score = val_score / len(val_dataloader)

    print(f"Average val_loss: {avg_val_loss}, average val_score = {avg_val_score}")

    return avg_val_loss, avg_val_score

train function is edited

In [None]:
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, epochs, device):
    for epoch in range(epochs):
        model.train()
        for batch, data in enumerate(train_dataloader):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)  # This should be of shape [batch_size, sequence_length]

            optimizer.zero_grad()
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits  # Ensure logits are [batch_size, sequence_length, num_classes]

            # Print shapes for debugging
            print(f"Logits shape: {logits.shape}, Labels shape: {labels.shape}")

            loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))  # Flatten both logits and labels
            loss.backward()
            optimizer.step()

            # Optionally add scheduler.step() here if using learning rate scheduler

        # Step the scheduler if it's part of the training configuration
        if scheduler:
            scheduler.step()

        # Validation loop
        model.eval()
        with torch.no_grad():
            for batch, data in enumerate(val_dataloader):
                input_ids = data['input_ids'].to(device)
                attention_mask = data['attention_mask'].to(device)
                labels = data['labels'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
                logits = outputs.logits

                # Calculate loss and metrics
                val_loss = criterion(logits.view(-1, logits.size(-1)), labels.view(-1))
                # calculate other metrics...

        # Optionally add learning rate scheduler step here, if needed

    return model  # Return the trained model


In [None]:
train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, epochs, device)


AttributeError: 'Tensor' object has no attribute 'logits'

In [144]:
export CUDA_LAUNCH_BLOCKING=1

SyntaxError: invalid syntax (<ipython-input-144-56efdc3ca8f4>, line 1)

In [143]:
def train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, epochs, device):
    for epoch in range(epochs):
        model.train()  # Set the model to training mode
        train_loss = 0
        train_scores = []

        for batch, data in enumerate(train_dataloader):
            input_ids = data['input_ids'].to(device)
            attention_mask = data['attention_mask'].to(device)
            labels = data['labels'].to(device)

            # Convert one-hot to class indices if necessary
            if labels.dim() == 3 and labels.size(-1) > 1:  # Assuming the last dimension is one-hot
                labels = torch.argmax(labels, dim=-1)  # Convert to class indices

            # Clear previously calculated gradients
            optimizer.zero_grad()

            # Forward pass: compute predicted outputs by passing inputs to the model
            outputs = model(input_ids, attention_mask=attention_mask)
            logits = outputs.logits if isinstance(outputs, tuple) else outputs  # Adjust this line based on model output

            # Print shapes for debugging
            print(f"Logits shape: {logits.shape}, Labels shape: {labels.shape}")

            # Calculate the batch loss
            loss = criterion(logits.view(-1, model.num_labels), labels.view(-1))
            train_loss += loss.item()

            # Perform a backward pass to calculate the gradients
            loss.backward()

            # Update the parameters
            optimizer.step()

            # Compute metrics and store them
            results = compute_metrics(logits, labels)
            train_scores.append(results['f5'])

            if batch % 200 == 0 or batch + 1 == len(train_dataloader):
                print(f"Epoch {epoch+1}, Batch {batch+1}/{len(train_dataloader)}, Loss: {loss.item()}")

        # Step the scheduler if it's part of the training configuration
        if scheduler:
            scheduler.step()

        # Validation step after completing all batches in an epoch
        avg_val_loss, avg_val_score = val(model, val_dataloader, batch_size, custom_collate, criterion, device)
        avg_train_loss = train_loss / len(train_dataloader)
        avg_train_score = sum(train_scores) / len(train_scores)

        print(f"Epoch {epoch+1}/{epochs} completed: Avg Train Loss: {avg_train_loss}, Avg Train F5 Score: {avg_train_score}")
        print(f"Validation: Avg Val Loss: {avg_val_loss}, Avg Val F5 Score: {avg_val_score}")

    print("Training completed.")

# Call the function to start training
train_model(model, train_dataloader, val_dataloader, optimizer, criterion, scheduler, epochs, device)


RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1.
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
from torch.utils.data import DataLoader

def train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion, device):
    model.train()

    # Modify DataLoaders to use lambda for passing device to collate function
    train_dataloader = DataLoader(custom_train, batch_size=batch_size,
                                  collate_fn=lambda x: custom_collate(x, device), shuffle=True)
    val_dataloader = DataLoader(custom_val, batch_size=batch_size,
                                collate_fn=lambda x: custom_collate(x, device), shuffle=False)

    print('Starting training...')
    for epoch in range(epochs):
        avg_train_loss = 0
        avg_train_score = 0
        train_loss = 0
        train_score = 0

        for batch, data in enumerate(train_dataloader):
            input_ids = data['input_ids']
            attention_mask = data['attention_mask']
            labels = data['labels']

            optimizer.zero_grad()
            #outputs = model(input_ids, attention_mask=attention_mask).logits
            outputs = model(input_ids, attention_mask=attention_mask)
            #loss = criterion(outputs, labels.view(-1))
            #loss = criterion(outputs.view(-1, outputs.shape[-1]), labels.view(-1))

            print(f"Outputs shape: {outputs.shape}")  # Should be [batch_size, sequence_length, num_classes]
            print(f"Labels shape: {labels.shape}")    # Should be [batch_size, sequence_length]


            outputs = outputs.view(-1, outputs.size(-1))  # Flatten output for cross-entropy which expects 2D logits
            labels = labels.view(-1)  # Flatten labels to match output



            loss = criterion(outputs, labels)


            loss.backward()
            optimizer.step()

            results = compute_metrics(outputs, labels)
            train_loss += loss.item()
            train_score += results['f5']

            if batch % 200 == 0 or batch + 1 == len(train_dataloader):
                print(f"Completed {batch+1}/{len(train_dataloader)}, with current train_loss: {loss},\n current results:{results}")

        avg_train_loss = train_loss / len(train_dataloader)
        avg_train_score = train_score / len(train_dataloader)

        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss}, average train_score = {avg_train_score}")
        print("Starting to validate")
        val_loss, val_score = val(model, custom_val, batch_size, custom_collate, criterion, device)

    return avg_train_loss, avg_train_score


In [None]:
import torch
torch.cuda.empty_cache()  # Clear unused memory

In [None]:
batch_size = 1
epochs = 2

train(model, custom_train, custom_val, batch_size, custom_collate, epochs, optimizer, criterion, device)

Starting training...


TypeError: oh_encoder() missing 1 required positional argument: 'unique_labels'

# Converting from predictions to NER labels

In [None]:
#Used to label each token at NER stage
def find_span(target: list[str], document: list[str]) -> list[list[int]]:

    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue

    return spans

In [None]:
## DON'T RUN ##
#### From KAGGLE: https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook ####

triplets = []
pairs = set()  # membership operation using set is faster O(1) than that of list O(n)
processed = []
emails = []
phone_nums = []
urls = []
streets = []

# For each prediction, token mapping, offsets, tokens, and document in the dataset
for p, token_map, offsets, tokens, doc, full_text in zip(
    preds_final,
    ds["token_map"],
    ds["offset_mapping"],
    ds["tokens"],
    ds["document"],
    ds["full_text"]
):

    # Iterate through each token prediction and its corresponding offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]  # Predicted label from token
        if start_idx + end_idx == 0:
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]  # Token ID at start index
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)

    # email
    for token_idx, token in enumerate(tokens):
        if re.fullmatch(email_regex, token) is not None:
            emails.append(
                {"document": doc, "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )

    # phone number
    matches = phone_num_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            phone_nums.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
            )

    # url
    matches = url_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            urls.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-URL_PERSONAL", "token_str": tokens[token_idx]}
            )

    # street
#     matches = street_regex.findall(full_text)
#     if not matches:
#         continue
#     for match in matches:
#         target = [t.text for t in nlp.tokenizer(match)]
#         matched_spans = find_span(target, tokens)
#     for matched_span in matched_spans:
#         for intermediate, token_idx in enumerate(matched_span):
#             prefix = "I" if intermediate else "B"
#             streets.append(
#                 {"document": doc, "token": token_idx, "label": f"{prefix}-STREET_ADDRESS", "token_str": tokens[token_idx]}
#             )