In [1]:
import argparse
import json
import os
from functools import partial
from itertools import chain

import torch
from transformers import AutoTokenizer, Trainer, TrainingArguments
from transformers import AutoModelForTokenClassification, DataCollatorForTokenClassification
from transformers import pipeline
import evaluate
from datasets import Dataset, features
import numpy as np
import pandas as pd
import sentencepiece
# Import the data-handling helpers from utils (one directory above this notebook)
import sys
sys.path.append('../')
from utils.data_handler import *

  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  from pandas.core import (


In [2]:
# Filesystem locations. These are absolute paths on the author's machine and
# must be adjusted before running the notebook anywhere else.
# Note that OUTPUT_DIR already ends with a trailing slash.
DATA_PATH = '/home/kd/Documents/pii_detection/data/'
OUTPUT_DIR = '/home/kd/Documents/pii_detection/models/'
# Hugging Face checkpoint to fine-tune. The smaller base variant is kept
# commented out as an alternative.
# TRAINING_MODEL_PATH = 'dslim/bert-base-NER'
TRAINING_MODEL_PATH = 'dslim/bert-large-NER'

In [3]:
# nlp = pipeline("ner", model=model, tokenizer=tokenizer)
# example = "My name is Wolfgang and I live in Berlin"

In [4]:
# Load the documents via the project helper; each record exposes a "labels"
# sequence (that is the only field this cell reads directly).
data = do_load_data(DATA_PATH)

# Build a deterministic label vocabulary: every distinct tag seen in the data,
# sorted so that the id assignment is stable across runs.
all_labels = sorted(set(chain.from_iterable(doc["labels"] for doc in data)))
label2id = {label: idx for idx, label in enumerate(all_labels)}
id2label = dict(enumerate(all_labels))

# Tag names listed explicitly by the author. Not referenced by any other
# visible cell -- presumably the PII classes of interest (TODO confirm usage).
target = [
    'B-EMAIL',
    'B-ID_NUM',
    'B-NAME_STUDENT',
    'B-PHONE_NUM',
    'B-STREET_ADDRESS',
    'B-URL_PERSONAL',
    'B-USERNAME',
    'I-ID_NUM',
    'I-NAME_STUDENT',
    'I-PHONE_NUM',
    'I-STREET_ADDRESS',
    'I-URL_PERSONAL',
]

# Tokenizer matching the checkpoint that will be fine-tuned.
tokenizer = AutoTokenizer.from_pretrained(TRAINING_MODEL_PATH)

# Project helper that turns the records into the training dataset; the
# 512 value is passed through as its TRAINING_MAX_LENGTH argument.
ds = do_hf_dataset(tokenizer, label2id, data, TRAINING_MAX_LENGTH=512)

original datapoints:  6807
external datapoints:  4434
moredata datapoints:  2000
combined:  7333


Map (num_proc=3):   0%|          | 0/7333 [00:00<?, ? examples/s]

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-s

In [5]:
from seqeval.metrics import recall_score, precision_score
from seqeval.metrics import classification_report
from seqeval.metrics import f1_score

def compute_metrics(p, all_labels, beta=5.0):
    """Score token-classification predictions with seqeval recall/precision and F-beta.

    Args:
        p: ``(predictions, labels)`` pair. ``predictions`` is arg-maxed over
            axis 2 to obtain one class id per position; ``labels`` holds the
            gold class ids, where ``-100`` marks positions to be ignored.
        all_labels: Sequence mapping a class id (its index) to the tag string.
        beta: Weight of recall relative to precision in the F-beta score.
            Defaults to 5, the value that was previously hard-coded.

    Returns:
        dict with keys ``'recall'``, ``'precision'`` and ``'f1'``. The ``'f1'``
        entry actually holds the F-beta score; the key name is kept so that
        existing references to the metric name keep working. It is ``0.0``
        when precision and recall are both zero instead of raising
        ``ZeroDivisionError``.
    """
    raw_scores, labels = p
    predicted_ids = np.argmax(raw_scores, axis=2)

    # Drop ignored positions (label == -100) and map ids to tag strings,
    # keeping predictions and gold labels aligned row by row.
    true_predictions = []
    true_labels = []
    for predicted_row, label_row in zip(predicted_ids, labels):
        kept = [
            (pred_id, gold_id)
            for pred_id, gold_id in zip(predicted_row, label_row)
            if gold_id != -100
        ]
        true_predictions.append([all_labels[pred_id] for pred_id, _ in kept])
        true_labels.append([all_labels[gold_id] for _, gold_id in kept])

    recall = recall_score(true_labels, true_predictions)
    precision = precision_score(true_labels, true_predictions)

    # F-beta = (1 + b^2) * P * R / (b^2 * P + R). The denominator is zero only
    # when both precision and recall are zero; report 0.0 in that case.
    beta_sq = beta * beta
    denominator = beta_sq * precision + recall
    if denominator > 0:
        f_beta = (1 + beta_sq) * recall * precision / denominator
    else:
        f_beta = 0.0

    return {
        'recall': recall,
        'precision': precision,
        'f1': f_beta,
    }

In [6]:
# Load the pretrained checkpoint with a classification head sized for this
# notebook's label set. The checkpoint's own head has a different number of
# classes, so mismatched head weights are allowed to be re-initialised.
model = AutoModelForTokenClassification.from_pretrained(
    TRAINING_MODEL_PATH,
    ignore_mismatched_sizes=True,
    num_labels=len(all_labels),
    label2id=label2id,
    id2label=id2label,
)

# Batch collator for token classification; padded lengths are rounded up to a
# multiple of 16.
collator = DataCollatorForTokenClassification(
    tokenizer,
    pad_to_multiple_of=16,
)

  torch.utils._pytree._register_pytree_node(
Some weights of the model checkpoint at dslim/bert-large-NER were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at dslim/bert-large-NER and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([9]) in the checkpoint and torch.Size([13]) in the model instantiated
- clas

In [7]:
# Submission run: no validation split is used, so evaluation is switched off
# and the whole dataset is passed as training data.
args = TrainingArguments(
    # Output / bookkeeping
    output_dir=OUTPUT_DIR,
    save_total_limit=1,
    logging_steps=20,
    report_to="none",
    # Optimisation schedule
    num_train_epochs=3,
    learning_rate=2e-5,
    lr_scheduler_type='cosine',
    warmup_ratio=0.1,
    weight_decay=0.01,
    # Batching and precision: 5 examples per device, accumulated over 2 steps
    per_device_train_batch_size=5,
    gradient_accumulation_steps=2,
    fp16=True,
    # Evaluation is disabled. The two best-model settings below are kept from
    # the original configuration -- NOTE(review): they look inert while
    # evaluation is off; confirm against the TrainingArguments documentation.
    evaluation_strategy="no",
    do_eval=False,
    metric_for_best_model="f1",
    greater_is_better=True,
)

# compute_metrics is bound to the label list up front so the Trainer can call
# it with the prediction object alone.
metrics_fn = partial(compute_metrics, all_labels=all_labels)

trainer = Trainer(
    model=model,
    args=args,
    train_dataset=ds,
    data_collator=collator,
    tokenizer=tokenizer,
    compute_metrics=metrics_fn,
)

In [8]:
%%time
# Run the fine-tuning loop with the Trainer configured above; the %%time cell
# magic reports how long the cell took. The call's return value is displayed
# as the cell output.
trainer.train()

  0%|          | 0/2199 [00:00<?, ?it/s]

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'loss': 2.4989, 'learning_rate': 1.8181818181818183e-06, 'epoch': 0.03}
{'loss': 0.8248, 'learning_rate': 3.6363636363636366e-06, 'epoch': 0.05}
{'loss': 0.1712, 'learning_rate': 5.4545454545454545e-06, 'epoch': 0.08}
{'loss': 0.0863, 'learning_rate': 7.272727272727273e-06, 'epoch': 0.11}
{'loss': 0.0643, 'learning_rate': 9.090909090909091e-06, 'epoch': 0.14}
{'loss': 0.0584, 'learning_rate': 1.0909090909090909e-05, 'epoch': 0.16}
{'loss': 0.0385, 'learning_rate': 1.2727272727272728e-05, 'epoch': 0.19}
{'loss': 0.0389, 'learning_rate': 1.4545454545454546e-05, 'epoch': 0.22}
{'loss': 0.0371, 'learning_rate': 1.6363636363636366e-05, 'epoch': 0.25}
{'loss': 0.0323, 'learning_rate': 1.8181818181818182e-05, 'epoch': 0.27}
{'loss': 0.0353, 'learning_rate': 2e-05, 'epoch': 0.3}
{'loss': 0.0276, 'learning_rate': 1.9994960334973702e-05, 'epoch': 0.33}
{'loss': 0.0301, 'learning_rate': 1.997984641953951e-05, 'epoch': 0.35}
{'loss': 0.0257, 'learning_rate': 1.995467348751164e-05, 'epoch': 0.38}


TrainOutput(global_step=2199, training_loss=0.047519769893960014, metrics={'train_runtime': 2226.734, 'train_samples_per_second': 9.879, 'train_steps_per_second': 0.988, 'train_loss': 0.047519769893960014, 'epoch': 3.0})

In [9]:
# Persist the fine-tuned model and its tokenizer into the same directory so the
# pair can be reloaded together. os.path.join is used because OUTPUT_DIR
# already ends with a slash; plain concatenation produced a "models//..." path.
save_dir = os.path.join(OUTPUT_DIR, "bert_large_ner")
trainer.save_model(save_dir)
# Last expression of the cell: the tuple of written tokenizer files is shown.
tokenizer.save_pretrained(save_dir)

('/home/kd/Documents/pii_detection/models//bert_large_ner/tokenizer_config.json',
 '/home/kd/Documents/pii_detection/models//bert_large_ner/special_tokens_map.json',
 '/home/kd/Documents/pii_detection/models//bert_large_ner/vocab.txt',
 '/home/kd/Documents/pii_detection/models//bert_large_ner/added_tokens.json',
 '/home/kd/Documents/pii_detection/models//bert_large_ner/tokenizer.json')