# RoBERTa Model with Partitioning
This notebook serves as the main run file for RoBERTa models using partitioning to preprocess the data
There are several things that one can change in this file for each run: 
1. Penalty for loss function 
2. Choice to downsample or not (uncomment box on downsampling if choosing to do so)
3. Training parameters such as epochs, learning rate, etc. 
4. If training on combined training+testing data, change "evaluation_strategy" to "no", and in the trainer, change 

train_dataset = ds_train 

test_dataset = ds_test 

to just 

train_dataset = ds

(+ delete the line about test_dataset)

In [1]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/pii-detection-removal-from-educational-data/sample_submission.csv
/kaggle/input/pii-detection-removal-from-educational-data/train.json
/kaggle/input/pii-detection-removal-from-educational-data/test.json
/kaggle/input/data-exploration/deberta_test.csv
/kaggle/input/data-exploration/roberta_test.csv
/kaggle/input/data-exploration/roberta_train.csv
/kaggle/input/data-exploration/__results__.html
/kaggle/input/data-exploration/docs2cut.txt
/kaggle/input/data-exploration/deberta_train.csv
/kaggle/input/data-exploration/__notebook__.ipynb
/kaggle/input/data-exploration/__output__.json
/kaggle/input/data-exploration/train.csv
/kaggle/input/data-exploration/test.csv
/kaggle/input/data-exploration/custom.css


In [2]:
!pip install seqeval evaluate -q
import json
import argparse
from itertools import chain
from functools import partial
import torch
from transformers import AutoTokenizer,AutoModelForTokenClassification,Trainer, TrainingArguments, AutoConfig, DataCollatorForTokenClassification, TrainerCallback
import evaluate
from datasets import Dataset, features
import copy
from ast import literal_eval
import ast

2024-05-01 14:26:20.541462: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-01 14:26:20.541590: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-01 14:26:20.695208: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


# Prepararations for Model

In [3]:
# Model Settings from Hugging Face: Path, Length, Output Directory
TRAINING_MODEL_PATH = "FacebookAI/roberta-base"
TRAINING_MAX_LENGTH = 512
OUTPUT_DIR = "output"

In [4]:
train = pd.read_csv(open('/kaggle/input/data-exploration/roberta_train.csv'))
test = pd.read_csv(open('/kaggle/input/data-exploration/roberta_test.csv'))
train = train.drop(['full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT', 'B-EMAIL'], axis = 1)
test = test.drop(['full_text', 'tokens', 'trailing_whitespace', 'provided_labels', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-URL_PERSONAL', 'I-NAME_STUDENT', 'B-EMAIL'], axis = 1)
print(train.columns)

Index(['document', 'input_ids', 'attention_mask', 'offset_mapping', 'labels'], dtype='object')


In [5]:
train['input_ids'] = train['input_ids'].apply(literal_eval)
train['attention_mask'] = train['attention_mask'].apply(literal_eval)
train['offset_mapping'] = train['offset_mapping'].apply(literal_eval)
train['labels'] = train['labels'].apply(literal_eval)
test['input_ids'] = test['input_ids'].apply(literal_eval)
test['attention_mask'] = test['attention_mask'].apply(literal_eval)
test['offset_mapping'] = test['offset_mapping'].apply(literal_eval)
test['labels'] = test['labels'].apply(literal_eval)

In [6]:
# DOWNSAMPLING: COMMENT ALL IF NOT DOWNSAMPLING

# file = open("/kaggle/input/data-exploration/docs2cut.txt","r")
# downsample_docs = [int(i) for i in file.read().split('\n')[:-1]]
# print("Number of docs removed: ", len(downsample_docs))

# print("Length of training data: ", len(train))
# print("Length of testing data: ", len(test))

# for d in downsample_docs: 
#     train = train[train.document != d]
#     test = test[test.document != d]
    
# print("After downsampling length of training data: ", len(train))
# print("After downsampling length of testing data: ", len(test))

In [7]:
ds_train = Dataset.from_pandas(train)
ds_test = Dataset.from_pandas(test)

# For final model
all_data = pd.concat([train, test], ignore_index=True, sort=False)
ds = Dataset.from_pandas(all_data)

In [8]:
# Obtain all possible labels, along with a map between them and their ids
LABELS = ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 
          'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 
          'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']

label2id = {l:i for i, l in enumerate(LABELS)}
id2label = {i:l for i, l in enumerate(LABELS)}
print(LABELS)

['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']


# Metrics


In [9]:
# METRICS

# Investigate adding some metrics to compute finer accuracies

# now let's define the metrics we care about 
from seqeval.metrics import recall_score, precision_score

def compute_metrics(p, all_labels):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [all_labels[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [all_labels[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    
    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)
    f5_score = (1 + 5*5) * recall * precision / (5*5*precision + recall)
    
    results = {
        'recall': recall,
        'precision': precision,
        'f5': f5_score
    }
    return results

# Custom Loss


In [10]:
# Implement custom loss function 
import torch.nn.functional as F

penalty = 0.05 # raise this to penalize guesses of PII for non-PII labels more
wts = torch.ones(len(LABELS))
wts[-1] = penalty # weight last class
wts = wts.to(device='cuda')
print('weights:', wts)

def custom_loss(preds, trues, epsilon=1):
    """
    cross-entropy loss, but if a token is NOT PII, the loss for the 
    corresponding prediction, instead of being -logsoftmax(logit), 
    is equal to epsilon. 
    
    This function will be applied to (logits, labels), i.e. we will
    call custom_loss(logits, labels) in the training step; logits will 
    have shape bs x max_seq_len x num_labels, labels will have shape 
    bs x max_seq_len. 
    
    It seems we can use just the built-in 'weight' parameter passed
    to the cross_entropy function
    """
    loss = F.cross_entropy(preds.permute(0,2,1), trues, 
                           weight=wts)
    
    return loss

# subclass Trainer 
class CustomTrainer(Trainer):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    def compute_loss(self, model, inputs, return_outputs=False):
        # inputs is a dict with keys 'input_ids', a tensor representing the 
        # input sequence as a str of ints which are the vocabulary indices; 
        # token_type_ids and attention_mask, which seem irrelevant for the 
        # compute_loss function; and 'labels' which is the true values       
        labels = inputs.pop('labels') # removes labels from inputs and assigns to new variable
        outputs = model(**inputs)
        logits = outputs[0]
        loss = custom_loss(logits, labels)
        
        return (loss, outputs) if return_outputs else loss

weights: tensor([1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000, 1.0000,
        1.0000, 1.0000, 1.0000, 0.0500], device='cuda:0')


# Trainer Class

In [11]:
# COLLATE: For batches
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

# training args, args setup
lr, epochs = 2e-5, 5
bs, wd = 4, 0.01

# model import and training 
# define model
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(LABELS),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)

class CustomCallback(TrainerCallback):
    
    def __init__(self, trainer) -> None:
        super().__init__()
        self._trainer = trainer
    
    def on_epoch_end(self, args, state, control, **kwargs):
        if control.should_evaluate:
            control_copy = copy.deepcopy(control)
            self._trainer.evaluate(eval_dataset=self._trainer.train_dataset, metric_key_prefix="train")
            return control_copy

args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    learning_rate=lr,
    num_train_epochs=epochs,
    per_device_train_batch_size=bs,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    lr_scheduler_type='cosine',
    metric_for_best_model="f5",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=wd
)

trainer = CustomTrainer(
    model=model, 
    args=args, 
    train_dataset=ds_train,
    eval_dataset=ds_test,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=LABELS),
)

trainer.add_callback(CustomCallback(trainer)) 

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


# Train and save results

In [12]:
train_result = trainer.train()

Epoch,Training Loss,Validation Loss,Recall,Precision,F5
1,0.2068,0.04159,0.806287,0.898941,0.809496
2,0.0361,0.025215,0.877193,0.738007,0.870876
3,0.0171,0.036569,0.85307,0.915294,0.855307
4,0.0093,0.03061,0.853801,0.869048,0.854378
5,0.0046,0.019228,0.914474,0.791271,0.90903


In [13]:
trainer.state.log_history

[{'train_loss': 0.04136740788817406,
  'train_recall': 0.7683929762169371,
  'train_precision': 0.8951320559295701,
  'train_f5': 0.7726002905352555,
  'train_runtime': 255.0129,
  'train_samples_per_second': 41.578,
  'train_steps_per_second': 2.6,
  'epoch': 1.0,
  'step': 663},
 {'loss': 0.2068,
  'grad_norm': 0.6823705434799194,
  'learning_rate': 1.9398526084229788e-05,
  'epoch': 1.0,
  'step': 663},
 {'eval_loss': 0.04158961400389671,
  'eval_recall': 0.8062865497076024,
  'eval_precision': 0.8989405052974735,
  'eval_f5': 0.8094955824653514,
  'eval_runtime': 86.2062,
  'eval_samples_per_second': 40.937,
  'eval_steps_per_second': 2.564,
  'epoch': 1.0,
  'step': 663},
 {'train_loss': 0.017581135034561157,
  'train_recall': 0.9199822182707268,
  'train_precision': 0.8986105080330005,
  'train_f5': 0.9191414490822594,
  'train_runtime': 254.3394,
  'train_samples_per_second': 41.688,
  'train_steps_per_second': 2.607,
  'epoch': 2.0,
  'step': 1326},
 {'loss': 0.0361,
  'grad_no

In [14]:
#trainer.save_model("roberta_partition_downsample")
#tokenizer.save_pretrained("roberta_partition_downsample")

trainer.save_model("roberta_partition")
tokenizer.save_pretrained("roberta_partitione")

('roberta_partitione/tokenizer_config.json',
 'roberta_partitione/special_tokens_map.json',
 'roberta_partitione/vocab.json',
 'roberta_partitione/merges.txt',
 'roberta_partitione/added_tokens.json',
 'roberta_partitione/tokenizer.json')

In [15]:
#df_log_history = pd.DataFrame(trainer.state.log_history)
#df_log_history.to_csv("downsamplepenalty.07epoch6.csv", index=False)