In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [19]:
!pip install -q transformers datasets seqeval

In [12]:
cd /content/drive/MyDrive/deid/transformer-deid

/content/drive/MyDrive/deid/transformer-deid


In [13]:
ls

convert_data_to_gs.py  README.md  [0m[01;34mtests[0m/                [01;34mtransformer_deid[0m/
environment.yml        setup.py   train_transformer.py


In [15]:
from datetime import datetime
import logging
from pathlib import Path
import os
import json

import numpy as np

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric

# local packages
from transformer_deid.data import DeidDataset, DeidTask
from transformer_deid.evaluation import compute_metrics
from transformer_deid.tokenization import assign_tags, encode_tags, split_sequences
from transformer_deid.utils import convert_dict_to_native_types

logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
    datefmt='%m/%d/%Y %H:%M:%S',
    level=logging.INFO
)
logger = logging.getLogger(__name__)


Downloading:   0%|          | 0.00/2.48k [00:00<?, ?B/s]

# Train transformer (adapted from train_transformer.py)

In [16]:
# specify dataset arguments
task_name = 'i2b2_2014'
split_long_sequences = True
label_transform = 'base'

deid_task = DeidTask(
    task_name,
    #data_dir=f'/home/alistairewj/git/deid-gs/{task_name}',
    data_dir=f'../{task_name}',
    label_transform=label_transform
)

train_texts, train_labels = deid_task.train['text'], deid_task.train['ann']
test_texts, test_labels = deid_task.test['text'], deid_task.test['ann']

tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')

# split text/labels into multiple examples
# (1) tokenize text
# (2) identify split points
# (3) output text as it was originally
if split_long_sequences:
    train_texts, train_labels = split_sequences(
        train_texts, train_labels, tokenizer
    )
    test_texts, test_labels = split_sequences(
        test_texts, test_labels, tokenizer
    )

train_encodings = tokenizer(
    train_texts,
    is_split_into_words=False,
    return_offsets_mapping=True,
    padding=True,
    truncation=True
)
test_encodings = tokenizer(
    test_texts,
    is_split_into_words=False,
    return_offsets_mapping=True,
    padding=True,
    truncation=True
)

# use the offset mappings in train_encodings to assign labels to tokens
train_tags = assign_tags(train_encodings, train_labels)
test_tags = assign_tags(test_encodings, test_labels)

# encodings are dicts with three elements:
#   'input_ids', 'attention_mask', 'offset_mapping'
# these are used as kwargs to model training later
train_labels = encode_tags(train_tags, train_encodings, deid_task.label2id)
test_labels = encode_tags(test_tags, test_encodings, deid_task.label2id)

# prepare a dataset compatible with Trainer module
train_encodings.pop("offset_mapping")
test_encodings.pop("offset_mapping")
train_dataset = DeidDataset(train_encodings, train_labels)
test_dataset = DeidDataset(test_encodings, test_labels)

model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(deid_task.labels)
)

training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", training_args.num_train_epochs)

# log top 5 examples
for i in range(min(len(train_dataset), 5)):
    input_ids, attention_mask, token_type_ids, label_ids = train_dataset.get_example(
        i, deid_task.id2label
    )

    # convert ids into human interpretable values
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    labels = [
        '-100' if l == -100 else deid_task.id2label[l] for l in label_ids
    ]

    logger.info("*** Example %d ***", i)
    logger.info("tokens: %s", " ".join(tokens))
    logger.info("labels: %s", " ".join(labels))
    logger.info("input_ids: %s", " ".join(map(str, input_ids)))
    logger.info("label_ids: %s", " ".join(map(str, label_ids)))
    logger.info("input_mask: %s", " ".join(map(str, attention_mask)))

trainer.train()

trainer.save_model(f'results/{task_name}_DistilBert_Model')

trainer.evaluate()

predictions, labels, _ = trainer.predict(test_dataset)
predicted_label = np.argmax(predictions, axis=2)


10/28/2021 21:28:43 - DEBUG - filelock -   Attempting to acquire lock 139788886242320 on /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
10/28/2021 21:28:43 - DEBUG - filelock -   Lock 139788886242320 acquired on /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

10/28/2021 21:28:43 - DEBUG - filelock -   Attempting to release lock 139788886242320 on /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
10/28/2021 21:28:43 - DEBUG - filelock -   Lock 139788886242320 released on /root/.cache/huggingface/transformers/ba377304984dc63e3ede0e23a938bbbf04d5c3835b66d5bb48343aecca188429.437aa611e89f6fc6675a049d2b5545390adbc617e7d655286421c191d2be2791.lock
10/28/2021 21:28:44 - DEBUG - filelock -   Attempting to acquire lock 139788889564816 on /root/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
10/28/2021 21:28:44 - DEBUG - filelock -   Lock 139788889564816 acquired on /root/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

10/28/2021 21:28:45 - DEBUG - filelock -   Attempting to release lock 139788889564816 on /root/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
10/28/2021 21:28:45 - DEBUG - filelock -   Lock 139788889564816 released on /root/.cache/huggingface/transformers/acb5c2138c1f8c84f074b86dafce3631667fccd6efcb1a7ea1320cf75c386a36.3dab63143af66769bbb35e3811f75f7e16b2320e12b7935e216bd6159ce6d9a6.lock
10/28/2021 21:28:46 - DEBUG - filelock -   Attempting to acquire lock 139788894909520 on /root/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
10/28/2021 21:28:46 - DEBUG - filelock -   Lock 139788894909520 acquired on /root/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8f

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

10/28/2021 21:28:47 - DEBUG - filelock -   Attempting to release lock 139788894909520 on /root/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
10/28/2021 21:28:47 - DEBUG - filelock -   Lock 139788894909520 released on /root/.cache/huggingface/transformers/81e970e5e6ec68be12da0f8f3b2f2469c78d579282299a2ea65b4b7441719107.ec5c189f89475aac7d8cbd243960a0655cfadc3d0474da8ff2ed0bf1699c2a5f.lock
10/28/2021 21:28:47 - DEBUG - filelock -   Attempting to acquire lock 139788894908688 on /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a.lock
10/28/2021 21:28:47 - DEBUG - filelock -   Lock 139788894908688 acquired on /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f975

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

10/28/2021 21:28:47 - DEBUG - filelock -   Attempting to release lock 139788894908688 on /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a.lock
10/28/2021 21:28:47 - DEBUG - filelock -   Lock 139788894908688 released on /root/.cache/huggingface/transformers/ebe1ea24d11aa664488b8de5b21e33989008ca78f207d4e30ec6350b693f073f.302bfd1b5e031cc1b17796e0b6e5b242ba2045d31d00f97589e12b458ebff27a.lock
Token indices sequence length is longer than the specified maximum sequence length for this model (1305 > 512). Running this sequence through the model will result in indexing errors
10/28/2021 21:28:49 - INFO - transformer_deid.tokenization -   Determining offsets for splitting long segments.
100%|██████████| 790/790 [01:22<00:00,  9.57it/s]
10/28/2021 21:30:12 - INFO - transformer_deid.tokenization -   Splitting text.
100%|██████████| 790/790 [00:01<00:00, 746.78it/s]
10/28/2021 21:

Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

10/28/2021 21:32:34 - DEBUG - filelock -   Attempting to release lock 139789130699920 on /root/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434eb9f1bc85a23a0.06b428c87335c1bb22eae46fdab31c8286efa0aa09e898a7ac42ddf5c3f5dc19.lock
10/28/2021 21:32:34 - DEBUG - filelock -   Lock 139789130699920 released on /root/.cache/huggingface/transformers/9c9f39769dba4c5fe379b4bc82973eb01297bd607954621434eb9f1bc85a23a0.06b428c87335c1bb22eae46fdab31c8286efa0aa09e898a7ac42ddf5c3f5dc19.lock
Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_projector.bias', 'vocab_projector.weight']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassificati

Step,Training Loss
10,2.3668
20,2.2179
30,1.9066
40,1.3977
50,0.6782
60,0.3301
70,0.2797
80,0.2635
90,0.1801
100,0.2125




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to results/i2b2_2014_DistilBert_Model
Configuration saved in results/i2b2_2014_DistilBert_Model/config.json
Model weights saved in results/i2b2_2014_DistilBert_Model/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1489
  Batch size = 64


***** Running Prediction *****
  Num examples = 1489
  Batch size = 64


# Eval

In [18]:
# curr_dir = Path(__file__).parent
# metric_dir = str(
#     (curr_dir / "transformer_deid/token_evaluation.py").absolute()
# )
metric_dir = "transformer_deid/token_evaluation.py"
metric = load_metric(metric_dir)
results = compute_metrics(
    predictions, labels, deid_task.labels, metric=metric
)

print(results)

# convert values within results dict from numpy types to python types
# makes it easier to serialize to json
results = convert_dict_to_native_types(results)

# output results to a log file
if not os.path.exists(training_args.output_dir):
    os.mkdirs(training_args.output_dir)

# add a few arguments to the results json
result_time = datetime.now().strftime('%Y-%m-%dT%H%M%S')
results['params'] = {
    'task_name': task_name,
    'label_transform': label_transform,
    'split_long_sequences': split_long_sequences,
}
with open(
    os.path.join(
        training_args.output_dir,
        f'{result_time}_{task_name}_DistilBert.json'
    ), 'w'
) as fp:
    json.dump(results, fp)

{'AGE': {'precision': 0.9818181818181818, 'recall': 0.8150943396226416, 'f1': 0.8907216494845361, 'number': 795}, 'CONTACT': {'precision': 0.9402697495183044, 'recall': 0.8442906574394463, 'f1': 0.8896991795806746, 'number': 578}, 'DATE': {'precision': 0.9527717300284709, 'recall': 0.9464315421726834, 'f1': 0.9495910532465364, 'number': 18033}, 'ID': {'precision': 0.8798586572438163, 'recall': 0.6812585499316005, 'f1': 0.7679259830377795, 'number': 1462}, 'LOCATION': {'precision': 0.9230416857535502, 'recall': 0.6886534518113465, 'f1': 0.7888040712468193, 'number': 2926}, 'NAME': {'precision': 0.8840293453724605, 'recall': 0.6920698034018113, 'f1': 0.7763598067154008, 'number': 4527}, 'PROFESSION': {'precision': 0.9361702127659575, 'recall': 0.3848396501457726, 'f1': 0.5454545454545454, 'number': 343}, 'overall_precision': 0.938180285144872, 'overall_recall': 0.8539980463298912, 'overall_f1': 0.8941120607787275, 'overall_accuracy': 0.9879510090146593}
