In [8]:
import argparse
import json
import os
import sys
from functools import partial
from itertools import chain

import evaluate
import numpy as np
import pandas as pd
import sentencepiece
import torch
from datasets import Dataset, features
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import AutoTokenizer, Trainer, TrainingArguments

# Make the project-local `utils` package importable from the notebook directory.
sys.path.append('../')
from utils.data_handler import *

In [9]:
# Notebook-wide configuration.
# NOTE(review): both directories are absolute paths on a single machine;
# they must be adjusted before the notebook can run anywhere else.
# Directory handed to do_load_data() in the data-loading cell.
DATA_PATH = '/home/kd/Documents/pii_detection/data/'
# Trainer output directory and base path for the saved model/tokenizer.
OUTPUT_DIR = '/home/kd/Documents/pii_detection/models/'
# Checkpoint name used for both the tokenizer and the model.
TRAINING_MODEL_PATH = "microsoft/deberta-v3-large"
# NOTE(review): not referenced by the visible cells -- the dataset cell passes
# 512 explicitly to do_hf_dataset(); confirm which maximum length is intended.
TRAINING_MAX_LENGTH = 1024

In [10]:
# Load the raw documents from DATA_PATH.
data = do_load_data(DATA_PATH)

# Label vocabulary: every distinct label seen in the documents, sorted so the
# label <-> id mapping is deterministic across runs.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

# PII label names (everything except "O").
target = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
]

tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# NOTE(review): 512 is hard-coded here although the config cell defines
# TRAINING_MAX_LENGTH = 1024 -- confirm which value is intended.
ds = do_hf_dataset(
    tokenizer,
    label2id,
    data,
    TRAINING_MAX_LENGTH=512,
)

original datapoints:  6807
external datapoints:  4434
moredata datapoints:  2000
combined:  7333


Downloading tokenizer_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/580 [00:00<?, ?B/s]

Downloading spm.model:   0%|          | 0.00/2.46M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Map (num_proc=3):   0%|          | 0/7333 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [11]:
def tokenize(example, tokenizer, label2id, max_length):
    """Tokenize one document and project its word-level labels onto tokens.

    The document text is rebuilt from ``example["tokens"]``, inserting a single
    space after every token whose ``example["trailing_whitespace"]`` flag is
    truthy. In parallel a per-character label list is built from
    ``example["provided_labels"]`` (inserted spaces are labelled ``"O"``).
    Each tokenizer token then receives the label of the first character it
    covers, skipping one leading whitespace character if present.

    Args:
        example: Mapping with parallel ``"tokens"``, ``"provided_labels"`` and
            ``"trailing_whitespace"`` sequences.
        tokenizer: Callable accepting ``return_offsets_mapping``,
            ``truncation`` and ``max_length``; its result must be a mapping
            that exposes ``offset_mapping`` and ``input_ids`` attributes.
        label2id: Mapping from label string to integer id.
        max_length: Maximum number of tokens kept for the document.

    Returns:
        dict: All fields of the tokenizer output plus ``"labels"`` (one label
        id per token) and ``"length"`` (number of input ids).
    """
    # Rebuild the text and a character-aligned label list in one pass.
    pieces = []
    char_labels = []
    for token, label, has_space in zip(
        example["tokens"], example["provided_labels"], example["trailing_whitespace"]
    ):
        pieces.append(token)
        char_labels.extend([label] * len(token))

        if has_space:
            pieces.append(" ")
            char_labels.append("O")

    text = "".join(pieces)

    # Request truncation explicitly: passing max_length on its own only
    # truncated implicitly and emitted a warning for every worker.
    tokenized = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
    )

    last_char = len(char_labels) - 1
    token_labels = []
    for start_idx, end_idx in tokenized.offset_mapping:
        # Special tokens carry an empty (0, 0) span and are labelled "O".
        if start_idx == 0 and end_idx == 0:
            token_labels.append(label2id["O"])
            continue

        # A token may begin with whitespace; use the following character's
        # label instead, but never step past the last character (a token that
        # covers only the final trailing space would otherwise raise IndexError).
        if text[start_idx].isspace() and start_idx < last_char:
            start_idx += 1

        token_labels.append(label2id[char_labels[start_idx]])

    return {**tokenized, "labels": token_labels, "length": len(tokenized.input_ids)}

In [12]:
# Display the dataset summary (column names and row count) as the cell output.
ds

Dataset({
    features: ['full_text', 'document', 'tokens', 'trailing_whitespace', 'provided_labels', 'input_ids', 'token_type_ids', 'attention_mask', 'offset_mapping', 'labels', 'length'],
    num_rows: 7333
})

In [13]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels, beta=5.0):
    """Compute entity-level recall, precision and F-beta for token predictions.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` holds per-token class
            scores and is arg-maxed over axis 2; ``labels`` holds the gold label
            ids, with ``-100`` marking positions to ignore.
        all_labels: Sequence mapping a label id to its label string.
        beta: Weight of recall relative to precision in the F-beta score.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        dict: ``{"recall": ..., "precision": ..., "f1": ...}``. The ``"f1"``
        key holds the F-beta score; the key name is kept because the training
        arguments reference it via ``metric_for_best_model="f1"``.
    """
    logits, label_ids = p
    predicted_ids = np.argmax(logits, axis=2)

    # Drop ignored positions (label id -100) and map ids to label strings.
    true_predictions = []
    true_labels = []
    for pred_row, label_row in zip(predicted_ids, label_ids):
        kept = [(pred, gold) for pred, gold in zip(pred_row, label_row) if gold != -100]
        true_predictions.append([all_labels[pred] for pred, _ in kept])
        true_labels.append([all_labels[gold] for _, gold in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta = (1 + beta^2) * P * R / (beta^2 * P + R). When both precision and
    # recall are zero (e.g. the model predicts only "O") the ratio is 0/0, so
    # report 0.0 instead of raising or returning NaN.
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    if denominator > 0:
        f_beta = (1 + beta_sq) * recall * precision / denominator
    else:
        f_beta = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f_beta,
    }

In [14]:
# Load the pretrained checkpoint with a token-classification head sized to the
# label vocabulary, passing both label <-> id mappings. The classifier weights
# are newly initialised (see the warning below) and are learned during training.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    num_labels=len(all_labels),
    id2label=id2label,
    label2id=label2id,
    ignore_mismatched_sizes=True
)
# Batch collator for token classification; pad_to_multiple_of=16 requests
# padded sequence lengths that are multiples of 16.
collator = DataCollatorForTokenClassification(tokenizer, pad_to_multiple_of=16)

Downloading pytorch_model.bin:   0%|          | 0.00/874M [00:00<?, ?B/s]

Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [15]:
# No validation split is used: this configuration trains the model used for
# submission on the full dataset, so evaluation is switched off below.
# NOTE(review): with evaluation disabled, metric_for_best_model /
# greater_is_better and the compute_metrics hook appear to have no effect in
# this run -- confirm before relying on them.
args = TrainingArguments(
    output_dir=OUTPUT_DIR, 
    fp16=True,
    learning_rate=2e-5,
    num_train_epochs=3,
    # 4 sequences per step x 2 accumulation steps = 8 sequences per optimizer
    # update on each device.
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    report_to="none",
    evaluation_strategy="no",
    do_eval=False,
    save_total_limit=1,
    logging_steps=20,
    # Cosine schedule with the first 10% of steps used for warm-up.
    lr_scheduler_type='cosine',
    metric_for_best_model="f1",
    greater_is_better=True,
    warmup_ratio=0.1,
    weight_decay=0.01
)

# Wire the model, arguments, dataset and collator together. all_labels is bound
# into compute_metrics up front so the hook takes the prediction pair only.
trainer = Trainer(
    model=model, 
    args=args, 
    train_dataset=ds,
    data_collator=collator, 
    tokenizer=tokenizer,
    compute_metrics=partial(compute_metrics, all_labels=all_labels),
)

In [16]:
%%time
# Run the full fine-tuning loop; the %%time cell magic (which must stay on the
# first line of the cell) reports how long the run took.
trainer.train()

  0%|          | 0/2751 [00:00<?, ?it/s]

You're using a DebertaV2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.8193, 'learning_rate': 1.1594202898550726e-06, 'epoch': 0.02}
{'loss': 1.9754, 'learning_rate': 2.6086956521739132e-06, 'epoch': 0.04}
{'loss': 0.4636, 'learning_rate': 4.057971014492754e-06, 'epoch': 0.07}
{'loss': 0.1061, 'learning_rate': 5.507246376811595e-06, 'epoch': 0.09}
{'loss': 0.0533, 'learning_rate': 6.956521739130435e-06, 'epoch': 0.11}
{'loss': 0.037, 'learning_rate': 8.405797101449275e-06, 'epoch': 0.13}
{'loss': 0.0268, 'learning_rate': 9.855072463768118e-06, 'epoch': 0.15}
{'loss': 0.0169, 'learning_rate': 1.1304347826086957e-05, 'epoch': 0.17}
{'loss': 0.0091, 'learning_rate': 1.2753623188405797e-05, 'epoch': 0.2}
{'loss': 0.0136, 'learning_rate': 1.420289855072464e-05, 'epoch': 0.22}
{'loss': 0.0182, 'learning_rate': 1.565217391304348e-05, 'epoch': 0.24}
{'loss': 0.012, 'learning_rate': 1.710144927536232e-05, 'epoch': 0.26}
{'loss': 0.0156, 'learning_rate': 1.8550724637681162e-05, 'epoch': 0.28}
{'loss': 0.015, 'learning_rate': 2e-05, 'epoch': 0.31}
{'loss'

TrainOutput(global_step=2751, training_loss=0.04437256334346128, metrics={'train_runtime': 3556.0537, 'train_samples_per_second': 6.186, 'train_steps_per_second': 0.774, 'train_loss': 0.04437256334346128, 'epoch': 3.0})

In [17]:
# Save the fine-tuned model and its tokenizer into the same directory.
# os.path.join is used because OUTPUT_DIR already ends with "/", so plain
# string concatenation produced a doubled slash ("models//deberta_large").
model_save_dir = os.path.join(OUTPUT_DIR, "deberta_large")
trainer.save_model(model_save_dir)
tokenizer.save_pretrained(model_save_dir)

('/home/kd/Documents/pii_detection/models//deberta_large/tokenizer_config.json',
 '/home/kd/Documents/pii_detection/models//deberta_large/special_tokens_map.json',
 '/home/kd/Documents/pii_detection/models//deberta_large/spm.model',
 '/home/kd/Documents/pii_detection/models//deberta_large/added_tokens.json',
 '/home/kd/Documents/pii_detection/models//deberta_large/tokenizer.json')

In [13]:
# Release cached GPU memory that PyTorch is holding but no longer using.
torch.cuda.empty_cache()