In [1]:
import json
import datasets
import tqdm
import torch
import numpy as np
import torch.nn as nn
import torch.nn.functional as F

from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer, AutoModelForTokenClassification, DataCollatorForTokenClassification, Trainer, TrainingArguments\
    , AutoModelForSequenceClassification, AutoConfig
from scipy.special import softmax
from sklearn.model_selection import train_test_split
from spacy.lang.en import English
from ignite.metrics import Fbeta
from functools import partial
from torch.nn.utils.rnn import pad_sequence
from collections import Counter

# Data Preparation

### Functions for data prep

In [2]:
# #Finding out the number of labels
# data = json.load(open('data/train.json'))

# all_labels = set()

# for d in data:
#     all_labels = all_labels.union(set(d['labels']))

# print(f"{len(list(all_labels))} labels, with the following labels:\n {list(all_labels)}")
# del data

# label2id = {label:index for index,label in enumerate(all_labels)}
# id2label = {index:label for index,label in enumerate(all_labels)}

In [3]:
#Change to one-hot vector
# def oh_encoder(labels):  #label: array of output for each sentence
#     unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL', 'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
#                      'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']
    
#     labels_oh = []
#     for label in labels:    #label: str
#         label_oh = [float(0)]*len(unique_labels)
#         for k in range(len(unique_labels)):
#             if unique_labels[k] == label:
#                 label_oh[k] = float(1)
#                 labels_oh.append(torch.tensor(label_oh, requires_grad=True))
#                 break
        
#     #return torch.tensor(labels_oh, requires_grad=True)
#     return labels_oh    #list of one-hot labels as tensors

#Change to one-hot vector
def oh_encoder(labels):  #label: array of output for each sentence

    # unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-PHONE_NUM', 'I-PHONE_NUM','B-ID_NUM', 'I-ID_NUM',  'B-URL_PERSONAL','I-URL_PERSONAL',
    #                   'B-STREET_ADDRESS', 'I-STREET_ADDRESS',  'B-EMAIL', 'B-USERNAME']
    
    
    unique_labels = ['O', 'B-NAME_STUDENT','I-NAME_STUDENT','B-URL_PERSONAL', 'B-ID_NUM','I-ID_NUM','B-EMAIL','I-STREET_ADDRESS',
                     'I-PHONE_NUM', 'B-USERNAME', 'B-PHONE_NUM','B-STREET_ADDRESS', 'I-URL_PERSONAL']
    
    labels_oh = []
    for label in labels:    #label: str
        label_oh = [float(0)]*len(unique_labels)
        for k in range(len(unique_labels)):
            if unique_labels[k] == label:
                label_oh[k] = float(1)
                labels_oh.append(torch.tensor(label_oh, requires_grad=True))
                break

    #return torch.tensor(labels_oh, requires_grad=True)
    return labels_oh    #list of one-hot labels as tensors

In [4]:
#Tokenizing sentences.
def tokenize(example, tokenizer):
    # Preprocess the tokens and labels by adding trailing whitespace and labels
    tokens = []
    labels = []
    for token, label, t_ws in zip(example["tokens"], 
                                  example["labels"],
                                  example["trailing_whitespace"]):
        tokens.append(token)
        labels.extend([label] * len(token))
        # Added trailing whitespace and label if true and 
        if t_ws:
            tokens.append(" ")
            labels.append("O")  
    
    text = "".join(tokens)
    # print(f"len(text)={len(text)}, len(tokens)={len(tokens)}")
    # tokenization without truncation
    tokenized = tokenizer(text, return_offsets_mapping=True,
                          truncation=False)
    # labels = np.array(labels)
    # Labels
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Added 'O' 
        if start_idx == 0 and end_idx == 0:
            #token_labels.append(label2id["O"]) 
            token_labels.append("O") 
        else:
            # case when the text starts with whitespace
            if text[start_idx].isspace():
                start_idx += 1
            # Convert label to id (int)
            #label_id = label2id[labels[start_idx]]
            label_id = labels[start_idx]
            token_labels.append(label_id)

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}
    
# def tokenize(example, tokenizer, INFERENCE_MAX_LENGTH=3500):
#     ''' 
#     Arguments:
#     example: sentence
#     tokenizer: following DeBERTa's
#     INFERENCE_MAX_LENGTH: for truncation if needed

#     Returns:
#     dictionary of tokenized word id, with token_map, which maps characters to its initial idx
#     '''
#     text = []
#     token_map = []
#     idx = 0
    
#     for t, ws in zip(example["tokens"], example["trailing_whitespace"]):
#         text.append(t)
#         token_map.extend([idx]*len(t))
#         if ws:
#             text.append(" ")
#             token_map.append(-1)
#         idx += 1
#     tokenized = tokenizer("".join(text), return_offsets_mapping=True, truncation=False, max_length=INFERENCE_MAX_LENGTH, return_tensors='pt'
#                          , add_special_tokens=False)
#     return {
#         **tokenized,
#         "token_map": token_map,
#     }

In [5]:
def to_dict(data):
    '''
    Change from a list of dictionary to a dictionary
    '''
    dict_of_lists = {}
    for d in data:
        for key, value in d.items():
            if key in dict_of_lists:
                dict_of_lists[key].append(value)
            else:
                dict_of_lists[key] = [value]
    return dict_of_lists

### Load Data

In [6]:
# data = json.load(open('../data/train.json'))
# d = data[0]

# input_ids = tokenize(d, tokenizer)['input_ids'].numpy()[0]
# # print(input_ids)
# print(len(input_ids))
# tokens = tokenizer.convert_ids_to_tokens(input_ids)
# # print(tokens)
# # print(d['tokens'])
# print(len(d['tokens']))

In [7]:
#Preparing the datasets for token classification
data = json.load(open('../data/train.json'))
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
tokenizer = AutoTokenizer.from_pretrained(model_name)

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)  
print('Data split completed')
# # train_labels = [oh_encoder(x['labels']) for x in train_data]

trainset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in train_data],
    'document': [x['document'] for x in train_data],
    'tokens': [x['tokens'] for x in train_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in train_data],
    'labels' :[x['labels'] for x in train_data]
    # 'labels' :[oh_encoder(x['labels']) for x in train_data] 
})
print('trainset loaded')

trainset = trainset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
#train_labels = [oh_encoder(x['labels'] for x in train_data)]
print('trainset mapped')

# val_labels = [oh_encoder(x['labels']) for x in val_data]

valset = datasets.Dataset.from_dict({
    'full_text': [x['full_text'] for x in val_data],
    'document': [x['document'] for x in val_data],
    'tokens': [x['tokens'] for x in val_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in val_data],
    'labels' :[x['labels'] for x in val_data]
    # 'labels' :[oh_encoder(x['labels']) for x in val_data]
})
print('valset loaded')

valset = valset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
print('valset mapped')

#val_labels = [oh_encoder(x['labels'] for x in val_data)]

del data

Data split completed
trainset loaded


Map (num_proc=3):   0%|          | 0/5785 [00:00<?, ? examples/s]

trainset mapped
valset loaded


Map (num_proc=3):   0%|          | 0/1022 [00:00<?, ? examples/s]

valset mapped


In [9]:
# Convert from a list of dict to 1 big dict
trainset = to_dict(trainset)
#trainset['labels'] = train_labels

valset = to_dict(valset)
# #valset['labels'] = val_labels

# Apply one hot encoding to labels
trainset['one_hot_labels'] = [[oh_encoder(item) for item in sublist] for sublist in trainset['labels']]
valset['one_hot_labels'] = [[oh_encoder(item) for item in sublist] for sublist in valset['labels']]

In [16]:
class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['one_hot_labels']
        self.doc_no = data_dict['document']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx]), torch.tensor(self.attention_mask[idx]), self.labels[idx] , torch.tensor(self.doc_no[idx])

# Assuming custom_train is properly initialized as Custom_data
custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)

# Largely ok but sth is broken here. 

In [10]:
def custom_collate(batch):
    '''
    For padding
    '''
    input_ids, attention_mask, one_hot_labels, document = zip(*batch)
    # Pad the input_ids and labels
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    
    padded_attention_mask = pad_sequence(attention_mask, batch_first = True, padding_value = 0)
    
    padded_labels = pad_sequence(one_hot_labels, batch_first=True, padding_value=0)  # Padding for labels in token classification
    
    padded_doc = pad_sequence(document, batch_first=True, padding_value=0)
    
    return padded_input_ids, padded_attention_mask, padded_labels, padded_doc

batch_size = 2
train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)

# Rough workings

In [20]:
*custom_train[0]

SyntaxError: can't use starred expression here (4150190818.py, line 1)

In [14]:
len(custom_train[0])

4

In [11]:
len(train_dataloader)

2893

In [63]:
for b in train_dataloader:
    a,b,c,d = batch
    break

TypeError: expected Tensor as element 0 in argument 0, but got list

# Model: DeBerta

### Functions to train and val

In [28]:
def val(model, valset, batch_size, collator, criterion,  f5, device):
    model.eval()

    avg_val_loss = 0
    avg_val_score = 0
    with torch.no_grad:
        val_loss = 0
        val_score = 0
        val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)

        for (input_ids, attention_mask, labels) in val_dataloader:                
            
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            val_loss += criterion(outputs, labels)

            val_score += f5(outputs, labels)

    avg_val_loss = val_loss / len(valset) 
    avg_val_score = val_score/len(valset)

    print(f"Average val_loss: {avg_val_loss}, avgerage val_score = {avg_val_score}")

    return avg_val_loss, avg_val_score

In [29]:
def train(model, trainset, batch_size, collator, epochs, optimizer, criterion,  f5, device):
    model.train()

    for epoch in range(epochs):
        avg_train_loss = 0
        avg_train_score = 0
        train_loss = 0
        train_score = 0

        train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
        
        for (input_ids, attention_mask, labels) in train_dataloader:     
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).logits
            #print(outputs.logits)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss
            train_score += f5(outputs, labels)

        avg_train_loss = train_loss / len(trainset)    
        avg_train_score = train_score / len(trainset)
        val_loss, val_score = val(model, valset, batch_size, collator, criterion,  f5, device)

        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss}, average train_score = {avg_train_score}")
        

### Transfer Learning

In [24]:
model = AutoModelForSequenceClassification.from_pretrained(model_name)
config = AutoConfig.from_pretrained(model_name)

for param in model.parameters():
    param.requires_grad = False


In [26]:
in_features = model.classifier.in_features
model.classifier = nn.Sequential( 
    nn.Linear(in_features, 13)
)

# Unfreeze parameters in classifier layer
for param in model.classifier.parameters():
    param.requires_grad = True

In [36]:
# # Check which layers are frozen
# for name, param in model.named_parameters():
#     print(f"Layer: {name} | Frozen: {'NO' if param.requires_grad else 'YES'}")

In [33]:
device = 'cuda' 
model = model.to(device)
epochs = 5
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.0005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)
criterion = nn.CrossEntropyLoss().to(device)
f5 = Fbeta(beta=5).to(device)

In [None]:
def train(model, trainset, batch_size, collator, epochs, optimizer, criterion,  f5, device):
    model.train()

    for epoch in range(epochs):
        avg_train_loss = 0
        avg_train_score = 0
        train_loss = 0
        train_score = 0

        train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
        
        for (input_ids, attention_mask, labels) in train_dataloader:     
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).logits
            #print(outputs.logits)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss
            train_score += f5(outputs, labels)

        avg_train_loss = train_loss / len(trainset)    
        avg_train_score = train_score / len(trainset)
        val_loss, val_score = val(model, valset, batch_size, collator, criterion,  f5, device)

        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss}, average train_score = {avg_train_score}")
        

In [34]:
batch_size = 2
train(model, trainset, batch_size, custom_collate, epochs, optimizer, criterion, f5, device)

RuntimeError: Expected floating point type for target with class probabilities, got Long

# Model: DeBERTa

Using a pretrained DeBERTa, we will build a classifier head on top of it to predict the class at token level.

In [4]:
class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim = 13):
        super(Classifier, self).__init__()

        self.dropout_prob = 0.3
        self.final_activation = nn.Softmax(dim = -1)

        self.linear = nn.Sequential(
            nn.ReLU(nn.Linear(input_dim, hidden_dim)),
            
            nn.Dropout(self.dropout_prob),
            nn.ReLU(nn.Linear(hidden_dim, hidden_dim*2)),

            nn.Dropout(self.dropout_prob),
            nn.ReLU(nn.Linear(hidden_dim*2, hidden_dim)),

            nn.Linear(hidden_dim, output_dim)
        )
       

    def forward(self, x):
        logit = self.linear(x)
        return self.final_activation(logit)


In [5]:
class Classifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim = 13):
        super(Classifier, self).__init__()

        self.dropout_prob = 0.3
        self.final_activation = nn.Softmax(dim = -1)

        self.dropout = nn.Dropout(self.dropout_prob)
        self.linear1 = nn.ReLU(nn.Linear(input_dim,hidden_dim))
        self.linear2 = nn.ReLU(nn.Linear(hidden_dim, hidden_dim*2))                            
        self.linear3 = nn.ReLU(nn.Linear(hidden_dim*2, hidden_dim))
        self.output = nn.Linear(hidden_dim,output_dim)

    def forward(self, x):
        print('now start linear1')
        print(x.size())
        x = self.linear1(x)

        print('now start linear2')
        print(x.size())
        x = self.linear2(self.dropout(x))

        print('now start linear3')
        print(x.size())
        x = self.linear3(self.dropout(x))

        print('now start output')
        print(x.size())
        logit= self.output(x)
        
        print('now start activation')
        output =  self.final_activation(logit)
        
        return output


In [6]:
class Deberta_Classif(Classifier):
    def __init__(self, model_name, classif_input, classif_hidden, classif_output = 13, finetune = False):
        super(Deberta_Classif, self).__init__(classif_input, classif_input)
        self.ft = finetune

        self.extractor = AutoModelForTokenClassification.from_pretrained(model_name).base_model

        if not finetune:
            for param in self.extractor.parameters():
                param.requires_grad = False

        self.extractor_num_param =  sum(p.numel() for p in self.extractor.parameters())
        self.extractor_num_param_grad = sum(p.numel() for p in self.extractor.parameters() if p.requires_grad)
        self.extractor_name = "DeBERTa"
                

        self.classifier = Classifier(input_dim=classif_input, hidden_dim=classif_hidden ,output_dim=classif_output)
        self.classifier_num_param = sum(p.numel() for p in self.classifier.parameters() if p.requires_grad)

        
    def count_param(self):
        
        if self.ft:
            type = 'finetuned'
            num_param = self.extractor_num_param_grad + self.classifier_num_param
        else:
            type = 'non-finetuned'
            num_param = self.extractor_num_param + self.classifier_num_param

        print(f"Number of parameters in {type} {self.extractor_name} model is {num_param:,}")
    
    
    def forward(self, input_ids, attention_mask):
        x = self.extractor(input_ids, attention_mask).last_hidden_state
        x = self.classifier(x)

In [7]:
model_name = "MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli"
#model = Deberta_Classif(model_name, classif_input = 768, classif_hidden = 100, classif_output = 13, finetune=False)  #Extractor output has dim 768 
config = AutoModelForTokenClassification.from_pretrained(model_name).config

config.update({
            'num_labels': 13,
            'ignore_mismatched_sizes': True,
        })
model = AutoModelForTokenClassification.from_pretrained(model_name, config = config, ignore_mismatched_sizes=True)

for param in model.base_model.parameters():
    param.requires_grad = False

print(f"Model's trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad): ,}")
print(f"Model's total parameters: {sum(p.numel() for p in model.parameters()): ,}")

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at MoritzLaurer/DeBERTa-v3-base-mnli-fever-anli and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([3]) in the checkpoint and torch.Size([13]) in the model instantiated
- classifier.weight: found shape torch.Size([3, 768]) in the checkpoint and torch.Size([13, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model's trainable parameters:  9,997
Model's total parameters:  183,841,549


In [8]:
device = 'cuda' 
model = model.to(device)
epochs = 5
tokenizer = AutoTokenizer.from_pretrained(model_name)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of = 32, max_length=3500)
optimizer = torch.optim.Adam(model.parameters(), lr = 0.001, weight_decay=0.0005)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max = epochs)
criterion = nn.CrossEntropyLoss().to(device)
f5 = Fbeta(beta=5).to(device)



In [12]:
#Preparing the datasets for token classification
data = json.load(open('../data/train.json'))

train_data, val_data = train_test_split(data, test_size=0.15, random_state=42)  
# train_labels = [oh_encoder(x['labels']) for x in train_data]

trainset = Dataset.from_dict({
    # 'full_text': [x['full_text'] for x in train_data],
    # 'document': [x['document'] for x in train_data],
    'tokens': [x['tokens'] for x in train_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in train_data],
    'labels' :[oh_encoder(x['labels']) for x in train_data]
})

print('trainset loaded')
trainset = trainset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
#train_labels = [oh_encoder(x['labels'] for x in train_data)]
print('trainset mapped')

# val_labels = [oh_encoder(x['labels']) for x in val_data]

valset = Dataset.from_dict({
    # 'full_text': [x['full_text'] for x in val_data],
    # 'document': [x['document'] for x in val_data],
    'tokens': [x['tokens'] for x in val_data],
    'trailing_whitespace': [x['trailing_whitespace'] for x in val_data],
    'labels' :[oh_encoder(x['labels']) for x in val_data]
})
print('valset loaded')
valset = valset.map(tokenize, fn_kwargs = {"tokenizer": tokenizer}, num_proc=3)
print('valset mapped')

#val_labels = [oh_encoder(x['labels'] for x in val_data)]

del data

Map (num_proc=3):   0%|          | 0/5785 [00:00<?, ? examples/s]

Map (num_proc=3):   0%|          | 0/1022 [00:00<?, ? examples/s]

In [13]:
#First item
print(f"Number of training data: {len(trainset)} || Number of validation data: {len(valset)}")

Number of training data: 5785 || Number of validation data: 1022


In [14]:
def to_dict(data):
    dict_of_lists = {}
    for d in data:
        for key, value in d.items():
            if key in dict_of_lists:
                dict_of_lists[key].append(value)
            else:
                dict_of_lists[key] = [value]
    return dict_of_lists

trainset = to_dict(trainset)
#trainset['labels'] = train_labels

valset = to_dict(valset)
#valset['labels'] = val_labels

In [15]:
from torch.utils.data import Dataset, DataLoader
from torch.nn.utils.rnn import pad_sequence


class Custom_data(Dataset):
    def __init__(self, data_dict):
        self.data = data_dict['input_ids']
        self.attention_mask = data_dict['attention_mask']
        self.labels = data_dict['labels']

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return torch.tensor(self.data[idx][0]), torch.tensor(self.attention_mask[idx][0]), torch.tensor(self.labels[idx][0])

# Assuming custom_train is properly initialized as Custom_data
custom_train = Custom_data(trainset)
custom_val = Custom_data(valset)


In [16]:
def custom_collate(batch):
    input_ids, attention_mask, labels = zip(*batch)
    # Pad the input_ids and labels
    padded_input_ids = pad_sequence(input_ids, batch_first=True, padding_value=0)
    padded_attention_mask = pad_sequence(attention_mask, batch_first = True, padding_value = 0)
    padded_labels = pad_sequence(labels, batch_first=True, padding_value=-100)  # Padding for labels in token classification
    return padded_input_ids, padded_attention_mask, padded_labels

batch_size = 2
train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)

# for i, (input_ids, attention_mask, labels) in enumerate(train_dataloader):
#     print(f'Batch {i + 1}:')
#     print('Input IDs:', input_ids)
#     print('Attention_mask:', attention_mask)
#     print('Labels:', labels)
#     print()

In [19]:
for batch in train_dataloader:
    data, attention, labels = batch
    break

In [22]:
data

tensor([[    1,  6738, 78580,  ..., 31401,   260,     2],
        [    1,  1391,   367,  ...,     0,     0,     0]])

In [95]:
def val(model, valset, batch_size, collator, criterion,  f5, device):
    model.eval()

    avg_val_loss = 0
    avg_val_score = 0
    with torch.no_grad:
        val_loss = 0
        val_score = 0
        val_dataloader = DataLoader(custom_val, batch_size=batch_size, collate_fn=custom_collate)

        for (input_ids, attention_mask, labels) in val_dataloader:                
            
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device)

            outputs = model(input_ids, attention_mask)
            val_loss += criterion(outputs, labels)

            val_score += f5(outputs, labels)

    avg_val_loss = val_loss / len(valset) 
    avg_val_score = val_score/len(valset)

    print(f"Average val_loss: {avg_val_loss}, avgerage val_score = {avg_val_score}")

    return avg_val_loss, avg_val_score
    


In [123]:
def train(model, trainset, batch_size, collator, epochs, optimizer, criterion,  f5, device):
    
    model.train()

    for epoch in range(epochs):
        avg_train_loss = 0
        avg_train_score = 0
        train_loss = 0
        train_score = 0

        
        train_dataloader = DataLoader(custom_train, batch_size=batch_size, collate_fn = custom_collate)
        
        for (input_ids, attention_mask, labels) in train_dataloader:     
            optimizer.zero_grad()
            input_ids = input_ids.to(device)
            attention_mask = attention_mask.to(device)
            labels = labels.to(device).long()

            outputs = model(input_ids, attention_mask).logits
            #print(outputs.logits)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            scheduler.step()

            train_loss += loss
            train_score += f5(outputs, labels)

        avg_train_loss = train_loss / len(trainset)    
        avg_train_score = train_score / len(trainset)
        val_loss, val_score = val(model, valset, batch_size, collator, criterion,  f5, device)


        print(f"Epoch {epoch+1}/{epochs}: Average train_loss = {avg_train_loss}, average train_score = {avg_train_score}")
        

In [124]:
batch_size = 2
train(model, trainset, batch_size, collator, epochs, optimizer, criterion, f5, device)

KeyboardInterrupt: 

# Converting from predictions to NER labels

In [None]:
#Used to label each token at NER stage
def find_span(target: list[str], document: list[str]) -> list[list[int]]:
    
    idx = 0
    spans = []
    span = []

    for i, token in enumerate(document):
        if token != target[idx]:
            idx = 0
            span = []
            continue
        span.append(i)
        idx += 1
        if idx == len(target):
            spans.append(span)
            span = []
            idx = 0
            continue
    
    return spans

In [None]:
## DON'T RUN ##
#### From KAGGLE: https://www.kaggle.com/code/manavtrivedi/0-967-nlp-sakura/notebook ####

triplets = []
pairs = set()  # membership operation using set is faster O(1) than that of list O(n)
processed = []
emails = []
phone_nums = []
urls = []
streets = []

# For each prediction, token mapping, offsets, tokens, and document in the dataset
for p, token_map, offsets, tokens, doc, full_text in zip(
    preds_final, 
    ds["token_map"], 
    ds["offset_mapping"], 
    ds["tokens"], 
    ds["document"],
    ds["full_text"]
):

    # Iterate through each token prediction and its corresponding offsets
    for token_pred, (start_idx, end_idx) in zip(p, offsets):
        label_pred = id2label[str(token_pred)]  # Predicted label from token
        if start_idx + end_idx == 0:
            continue
        if token_map[start_idx] == -1:
            start_idx += 1
        while start_idx < len(token_map) and tokens[token_map[start_idx]].isspace():
            start_idx += 1
        if start_idx >= len(token_map):
            break
        token_id = token_map[start_idx]  # Token ID at start index
        if label_pred in ("O", "B-EMAIL", "B-PHONE_NUM", "I-PHONE_NUM") or token_id == -1:
            continue
        pair = (doc, token_id)
        if pair not in pairs:
            processed.append({"document": doc, "token": token_id, "label": label_pred, "token_str": tokens[token_id]})
            pairs.add(pair)
    
    # email
    for token_idx, token in enumerate(tokens):
        if re.fullmatch(email_regex, token) is not None:
            emails.append(
                {"document": doc, "token": token_idx, "label": "B-EMAIL", "token_str": token}
            )
                
    # phone number
    matches = phone_num_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            phone_nums.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-PHONE_NUM", "token_str": tokens[token_idx]}
            )
    
    # url
    matches = url_regex.findall(full_text)
    if not matches:
        continue
    for match in matches:
        target = [t.text for t in nlp.tokenizer(match)]
        matched_spans = find_span(target, tokens)
    for matched_span in matched_spans:
        for intermediate, token_idx in enumerate(matched_span):
            prefix = "I" if intermediate else "B"
            urls.append(
                {"document": doc, "token": token_idx, "label": f"{prefix}-URL_PERSONAL", "token_str": tokens[token_idx]}
            )
    
    # street
#     matches = street_regex.findall(full_text)
#     if not matches:
#         continue
#     for match in matches:
#         target = [t.text for t in nlp.tokenizer(match)]
#         matched_spans = find_span(target, tokens)
#     for matched_span in matched_spans:
#         for intermediate, token_idx in enumerate(matched_span):
#             prefix = "I" if intermediate else "B"
#             streets.append(
#                 {"document": doc, "token": token_idx, "label": f"{prefix}-STREET_ADDRESS", "token_str": tokens[token_idx]}
#             )