# DeBERTa Model with Truncation
This notebook serves as the main run file for DeBERTa models that use truncation to preprocess the data.
There are several things that one can change in this file for each run: 
1. Penalty for loss function 
2. Can alter truncate function to change the truncation of tokenization
3. Training parameters such as epochs, learning rate, etc. 
4. Training on data which has been split into train/test sets, or training on the full dataset (i.e. after the final model has been chosen). To train on the full dataset, change `evaluation_strategy` to `no`, and in the `trainer = CustomTrainer(...)` call, delete `eval_dataset=...` and change `train_dataset=ds_train` to `train_dataset=ds`. 

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/data-exploration/deberta_test.csv
/kaggle/input/data-exploration/roberta_test.csv
/kaggle/input/data-exploration/roberta_train.csv
/kaggle/input/data-exploration/__results__.html
/kaggle/input/data-exploration/docs2cut.txt
/kaggle/input/data-exploration/deberta_train.csv
/kaggle/input/data-exploration/__notebook__.ipynb
/kaggle/input/data-exploration/__output__.json
/kaggle/input/data-exploration/train.csv
/kaggle/input/data-exploration/test.csv
/kaggle/input/data-exploration/custom.css
/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv
/kaggle/input/pii-detection-removal-from-educational-data/train.json
/kaggle/input/pii-detection-removal-from-educational-data/test.json


In [2]:
!pip install seqeval evaluate -q
import json
import argparse
from itertools import chain
from functools import partial
import torch
from transformers import AutoTokenizer,AutoModelForTokenClassification,Trainer, TrainingArguments, AutoConfig, DataCollatorForTokenClassification, TrainerCallback
import evaluate
from datasets import Dataset, features
import copy
from ast import literal_eval

2024-05-02 20:13:58.154914: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-02 20:13:58.155044: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-02 20:13:58.245823: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Preparations for Model

In [3]:
# Model Settings from Hugging Face: Path, Length, Output Directory
# Checkpoint identifier passed to the tokenizer and model `from_pretrained` calls
TRAINING_MODEL_PATH = "microsoft/deberta-v3-base"
# Maximum number of tokens kept per document by `truncate`
TRAINING_MAX_LENGTH = 1024
# Passed to TrainingArguments as `output_dir`
OUTPUT_DIR = "output"

In [4]:
# Columns this run needs from the CSVs in the data-exploration input directory.
DEBERTA_COLUMNS = ['document', 'deberta_input_ids',
                   'deberta_attention_mask', 'deberta_offset_mapping',
                   'deberta_token_labels', 'deberta_length']

# Hand the path to pandas so it opens *and closes* the file itself; wrapping
# the path in a bare open() left the file handle unclosed.
# .copy() gives each frame its own data, so the column assignments in later
# cells do not write into a slice of the full CSV frame.
train = pd.read_csv('/kaggle/input/data-exploration/train.csv')[DEBERTA_COLUMNS].copy()
test = pd.read_csv('/kaggle/input/data-exploration/test.csv')[DEBERTA_COLUMNS].copy()
print(train.columns)

Index(['document', 'deberta_input_ids', 'deberta_attention_mask',
       'deberta_offset_mapping', 'deberta_token_labels', 'deberta_length'],
      dtype='object')


In [5]:
# These columns come out of the CSV as text; parse each cell back into the
# Python literal it represents, for both splits.
for frame in (train, test):
    for column in ('deberta_input_ids', 'deberta_attention_mask',
                   'deberta_offset_mapping', 'deberta_token_labels'):
        frame[column] = frame[column].apply(literal_eval)

In [6]:
def truncate(ls, max_length=None, cutoff=175):
    """Shorten a sequence to at most ``max_length`` items by dropping its middle.

    Sequences that already fit are returned unchanged. Longer ones keep the
    first ``cutoff`` items and the last ``max_length - cutoff`` items.
    For plain head-only truncation, pass ``cutoff=max_length``.

    Args:
        ls: Sequence supporting ``len``, slicing and ``+`` (e.g. a list).
        max_length: Maximum length of the result. Defaults to the
            notebook-level ``TRAINING_MAX_LENGTH``.
        cutoff: Number of leading items to keep when truncating. Values
            outside ``0..max_length`` are clamped into that range.

    Returns:
        ``ls`` itself when ``len(ls) <= max_length``; otherwise a new
        sequence of exactly ``max_length`` items.
    """
    if max_length is None:
        max_length = TRAINING_MAX_LENGTH
    if len(ls) <= max_length:
        return ls

    head = max(0, min(cutoff, max_length))
    tail = max_length - head
    # Use an explicit start index for the tail: a negative-index slice breaks
    # when tail == 0, because ls[-0:] is the whole sequence, not an empty one.
    return ls[:head] + ls[len(ls) - tail:]

# Map each model-facing column to the precomputed DeBERTa column it is
# truncated from.
truncated_columns = {
    'input_ids': 'deberta_input_ids',
    'attention_mask': 'deberta_attention_mask',
    'offset_mapping': 'deberta_offset_mapping',
    'labels': 'deberta_token_labels',
}
for frame in (train, test):
    for target, source in truncated_columns.items():
        frame[target] = frame[source].apply(truncate)

# The untruncated originals (and the length column) are no longer needed.
obsolete_columns = list(truncated_columns.values()) + ['deberta_length']
train = train.drop(columns=obsolete_columns)
test = test.drop(columns=obsolete_columns)

In [7]:
# DOWNSAMPLING: COMMENT ALL IF NOT DOWNSAMPLING

# file = open("/kaggle/input/data-exploration/docs2cut.txt","r")
# downsample_docs = [int(i) for i in file.read().split('\n')[:-1]]
# print("Number of docs removed: ", len(downsample_docs))

# print("Length of training data: ", len(train))
# print("Length of testing data: ", len(test))

# for d in downsample_docs: 
#     train = train[train.document != d]
#     test = test[test.document != d]
    
# print("After downsampling length of training data: ", len(train))
# print("After downsampling length of testing data: ", len(test))

In [8]:
ds_train = Dataset.from_pandas(train)
ds_test = Dataset.from_pandas(test)

# For final model: train and test rows combined into one dataset.
# DataFrame.append was removed in pandas 2.0; pd.concat builds the same frame
# and works on both older and newer pandas versions.
all_data = pd.concat([train, test], ignore_index=True)
ds = Dataset.from_pandas(all_data)

In [9]:
# Sanity check: length of `input_ids` for the third training row after truncation
len(train.iloc[2].input_ids)

1024

In [10]:
# Label vocabulary for token classification; a label's id is its position in
# this list.
LABELS = [
    'B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS',
    'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM',
    'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O',
]

# Lookup tables in both directions, built from the list order above.
id2label = dict(enumerate(LABELS))
label2id = {name: idx for idx, name in id2label.items()}
print(LABELS)

['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']


# Metrics


In [11]:
# METRICS

# Investigate adding some metrics to compute finer accuracies

# now let's define the metrics we care about 
from seqeval.metrics import recall_score, precision_score

def compute_metrics(p, all_labels):
    """Compute recall, precision and F5 for a batch of token predictions.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-class
            scores and is arg-maxed over axis 2; ``labels`` holds the gold
            label ids, with -100 marking positions to ignore.
        all_labels: Sequence mapping a label id to its string tag.

    Returns:
        Dict with keys ``'recall'``, ``'precision'`` and ``'f5'``.
    """
    scores, label_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Convert ids to tags, skipping ignored positions (label == -100).
    true_predictions = [
        [all_labels[pred] for pred, gold in zip(pred_row, gold_row) if gold != -100]
        for pred_row, gold_row in zip(pred_ids, label_ids)
    ]
    true_labels = [
        [all_labels[gold] for gold in gold_row if gold != -100]
        for gold_row in label_ids
    ]

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta with beta = 5. Precision and recall can both be 0 (e.g. when no
    # entity is predicted correctly), which would make the formula divide
    # 0 by 0; report 0 in that case instead of failing the evaluation.
    denominator = 5 * 5 * precision + recall
    if denominator > 0:
        f5_score = (1 + 5 * 5) * recall * precision / denominator
    else:
        f5_score = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f5': f5_score,
    }

# Custom Loss


In [12]:
# Implement custom loss function 
import torch.nn.functional as F

# Loss weight of the 'O' (non-PII) class, which is the last entry of LABELS.
penalty = .05 # raise this to penalize guesses of PII for non-PII labels more
wts = torch.ones(len(LABELS))
wts[-1] = penalty # weight last class
# Use the GPU when one is present, but fall back to CPU instead of raising on
# machines without CUDA.
wts = wts.to(device='cuda' if torch.cuda.is_available() else 'cpu')
print('weights:', wts)

def custom_loss(preds, trues, weight=None):
    """Class-weighted cross-entropy over token logits.

    Each token's cross-entropy term is multiplied by the weight of its true
    class, so with the notebook's weights a non-PII ('O') token contributes
    ``penalty`` times its usual loss. The result is averaged by
    ``F.cross_entropy``'s default reduction, and targets equal to its
    default ignore index (-100) are skipped.

    Args:
        preds: Logits of shape bs x max_seq_len x num_labels.
        trues: Label ids of shape bs x max_seq_len.
        weight: Optional 1-D tensor of per-class weights. Defaults to the
            notebook-level ``wts``.

    Returns:
        Scalar loss tensor.
    """
    if weight is None:
        weight = wts
    # cross_entropy wants the class dimension second: bs x num_labels x seq_len.
    # Move the weights to the logits' device so a device mismatch cannot occur.
    return F.cross_entropy(preds.permute(0, 2, 1), trues,
                           weight=weight.to(preds.device))

# subclass Trainer 
class CustomTrainer(Trainer):
    """Trainer that computes its loss with ``custom_loss`` on the model logits."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run the model without labels and score its logits with ``custom_loss``.

        Args:
            model: Model to call with the remaining ``inputs``.
            inputs: Dict of batch tensors; must contain ``'labels'``, which
                is removed from the dict before the forward pass.
            return_outputs: When True, return ``(loss, outputs)``.
            num_items_in_batch: Accepted for compatibility with newer
                transformers releases that pass this keyword; unused here.

        Returns:
            The loss, or ``(loss, outputs)`` when ``return_outputs`` is True.
        """
        # Pop the labels so the model does not compute its own loss.
        labels = inputs.pop('labels')
        outputs = model(**inputs)
        # The first output element is used as the logits — presumably because
        # no loss is returned when labels are withheld.
        logits = outputs[0]
        loss = custom_loss(logits, labels)

        return (loss, outputs) if return_outputs else loss

weights: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0500], device='cuda:0')


# Trainer Class

In [13]:
# Tokenizer and batch collator
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# Optimisation hyperparameters, consumed by TrainingArguments
lr = 2e-5    # learning rate
epochs = 5   # number of training epochs
bs = 4       # per-device training batch size
wd = 0.01    # weight decay

# Token-classification model sized to the notebook's label vocabulary
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(LABELS),
    label2id=label2id,
    id2label=id2label,
    ignore_mismatched_sizes=True,
)

class CustomCallback(TrainerCallback):
    """Callback that also evaluates on the training set at the end of an epoch.

    The extra evaluation runs only when the trainer has already flagged that
    an evaluation should happen; its metrics are reported under the
    ``train`` prefix.
    """

    def __init__(self, trainer) -> None:
        super().__init__()
        # Keep a handle on the trainer so the callback can call evaluate().
        self._trainer = trainer

    def on_epoch_end(self, args, state, control, **kwargs):
        """Evaluate on the training dataset and return the pre-evaluation control."""
        if not control.should_evaluate:
            return None

        # Snapshot the control object before evaluate() runs and hand the
        # snapshot back — presumably so flags changed during this extra
        # evaluation do not suppress the regular one.
        snapshot = copy.deepcopy(control)
        self._trainer.evaluate(
            eval_dataset=self._trainer.train_dataset,
            metric_key_prefix="train",
        )
        return snapshot

# Training configuration; lr, epochs, bs and wd come from the hyperparameter
# assignments earlier in this cell.
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    # NOTE(review): do_eval=False looks contradictory with
    # evaluation_strategy="epoch"; the recorded run still logged eval metrics
    # every epoch, so the strategy appears to win — confirm.
    do_eval=False,
    save_total_limit=1,
    # NOTE(review): presumably unused while logging_strategy="epoch" — confirm.
    logging_steps=20,
    lr_scheduler_type='cosine',
    # NOTE(review): load_best_model_at_end is not set here, so these two
    # settings may have no effect on which weights are kept — confirm.
    metric_for_best_model="f5",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=wd
)

# Bind the label vocabulary up front so the metric function can be called
# with the predictions alone.
metrics_fn = partial(compute_metrics, all_labels=LABELS)

trainer = CustomTrainer(
    model=model,
    args=args,
    data_collator=collator,
    tokenizer=tokenizer,
    train_dataset=ds_train,
    eval_dataset=ds_test,
    compute_metrics=metrics_fn,
)

# Register the callback that also evaluates on the training split.
train_eval_callback = CustomCallback(trainer)
trainer.add_callback(train_eval_callback)

tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/579 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]



pytorch_model.bin:   0%|          | 0.00/371M [00:00<?, ?B/s]

  return self.fget.__get__(instance, owner)()
Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Train and Save Results

In [14]:
# Run training and keep the returned result object in train_result.
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Recall,Precision,F5
0,0.3686,0.010856,0.86435,0.572383,0.847719
2,0.0218,0.010075,0.912556,0.685185,0.901056
4,0.0066,0.011208,0.939462,0.771639,0.931669


In [15]:
# Show everything the trainer logged during the run (train_*, eval_* and loss entries).
trainer.state.log_history

[{'train_loss': 0.014266738668084145,
  'train_recall': 0.8384118190212373,
  'train_precision': 0.7071651090342679,
  'train_f5': 0.8324694100638246,
  'train_runtime': 423.7117,
  'train_samples_per_second': 12.048,
  'train_steps_per_second': 0.755,
  'epoch': 1.0,
  'step': 319},
 {'loss': 0.3686,
  'grad_norm': 555.7958374023438,
  'learning_rate': 1.9400249635128575e-05,
  'epoch': 1.0,
  'step': 319},
 {'eval_loss': 0.01085563562810421,
  'eval_recall': 0.8643497757847534,
  'eval_precision': 0.5723830734966593,
  'eval_f5': 0.8477185266630016,
  'eval_runtime': 140.6644,
  'eval_samples_per_second': 12.1,
  'eval_steps_per_second': 0.761,
  'epoch': 1.0,
  'step': 319},
 {'train_loss': 0.017717411741614342,
  'train_recall': 0.8325638658048631,
  'train_precision': 0.9080228264518295,
  'train_f5': 0.8352334805947461,
  'train_runtime': 424.2324,
  'train_samples_per_second': 12.033,
  'train_steps_per_second': 0.754,
  'epoch': 2.0,
  'step': 639},
 {'loss': 0.0315,
  'grad_no

In [16]:
# Write the model and the tokenizer files into the same directory.
save_dir = "deberta_opttrunc1"
trainer.save_model(save_dir)
tokenizer.save_pretrained(save_dir)

('deberta_opttrunc1/tokenizer_config.json',
 'deberta_opttrunc1/special_tokens_map.json',
 'deberta_opttrunc1/spm.model',
 'deberta_opttrunc1/added_tokens.json',
 'deberta_opttrunc1/tokenizer.json')