# Initializations

In [None]:
import torch
from tqdm import tqdm
from google.colab import drive
drive.mount('/content/drive')
from google.colab import auth
auth.authenticate_user()

# https://cloud.google.com/resource-manager/docs/creating-managing-projects
project_id = 'kind-lab'
!gcloud config set project {project_id}

!pip install -q transformers datasets seqeval

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Updated property [core/project].


# Copy from google cloud

In [None]:
!mkdir i2b2_2014
!gsutil -m -q cp -r gs://deid-data/i2b2_2014/train/ i2b2_2014/
!gsutil -m -q cp -r gs://deid-data/i2b2_2014/test/ i2b2_2014/

mkdir: cannot create directory ‘i2b2_2014’: File exists


# Clone repo

In [None]:
!git clone https://xhuang98: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@github.com/alistairewj/transformer-deid.git


Cloning into 'transformer-deid'...
remote: Enumerating objects: 181, done.[K
remote: Counting objects: 100% (181/181), done.[K
remote: Compressing objects: 100% (90/90), done.[K
remote: Total 181 (delta 113), reused 153 (delta 90), pack-reused 0[K
Receiving objects: 100% (181/181), 68.32 KiB | 9.76 MiB/s, done.
Resolving deltas: 100% (113/113), done.


In [None]:
!git checkout xidev

Branch 'xidev' set up to track remote branch 'xidev' from 'origin'.
Switched to a new branch 'xidev'


In [None]:
cd transformer-deid

/content/transformer-deid/transformer-deid


In [None]:
from datetime import datetime
import logging
from pathlib import Path
import os
import json

import numpy as np

from transformers import DistilBertTokenizerFast
from transformers import DistilBertForTokenClassification, BertForTokenClassification
from transformers import Trainer, TrainingArguments
from datasets import load_metric

# local packages
from transformer_deid.data import DeidDataset, DeidTask
from transformer_deid.evaluation import compute_metrics
from transformer_deid.tokenization import assign_tags, encode_tags, split_sequences
from transformer_deid.utils import convert_dict_to_native_types

# Root logging configuration: INFO and above, each record prefixed with a
# month/day/year timestamp, the level name and the emitting logger's name.
logging.basicConfig(
    level=logging.INFO,
    datefmt='%m/%d/%Y %H:%M:%S',
    format='%(asctime)s - %(levelname)s - %(name)s -   %(message)s',
)

# Logger used by the notebook's own cells.
logger = logging.getLogger(__name__)


# Load data

In [None]:
# Dataset configuration.
task_name = 'i2b2_2014'
split_long_sequences = True   # consulted by the preprocessing cell
label_transform = 'base'

# The dataset folder sits next to the repository checkout, hence '../'.
deid_task = DeidTask(
    task_name,
    data_dir=f'../{task_name}',
    label_transform=label_transform,
)

# Pull the raw text and its annotations out of each split.
train_texts = deid_task.train['text']
train_labels = deid_task.train['ann']
test_texts = deid_task.test['text']
test_labels = deid_task.test['ann']

# Fast tokenizer for the cased DistilBERT checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained('distilbert-base-cased')


Downloading:   0%|          | 0.00/208k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/426k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/29.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/411 [00:00<?, ?B/s]

# Data preprocessing

In [None]:

# Optionally break each document (and its labels) into several examples:
#   (1) tokenize the text
#   (2) identify split points
#   (3) output the text as it was originally
if split_long_sequences:
    train_texts, train_labels = split_sequences(tokenizer, train_texts, train_labels)
    test_texts, test_labels = split_sequences(tokenizer, test_texts, test_labels)

# Tokenizer options shared by both splits; offsets are requested because they
# are needed to assign labels to tokens below.
encode_kwargs = dict(
    is_split_into_words=False,
    return_offsets_mapping=True,
    padding=True,
    truncation=True,
)

# Each result is a transformers.tokenization_utils_base.BatchEncoding.
train_encodings = tokenizer(train_texts, **encode_kwargs)
test_encodings = tokenizer(test_texts, **encode_kwargs)

# Use the offset mappings in the encodings to assign labels to tokens.
train_tags = assign_tags(train_encodings, train_labels)
test_tags = assign_tags(test_encodings, test_labels)

# Convert the assigned tags using the task's label2id mapping.
train_tags = encode_tags(train_tags, train_encodings, deid_task.label2id)
test_tags = encode_tags(test_tags, test_encodings, deid_task.label2id)

# The encodings carry 'input_ids', 'attention_mask' and 'offset_mapping'.
# 'offset_mapping' is removed here, before the datasets are built; the
# remaining entries are used as kwargs to model training later.
train_encodings.pop("offset_mapping")
test_encodings.pop("offset_mapping")

# Datasets compatible with the Trainer module.
train_dataset = DeidDataset(train_encodings, train_tags)
test_dataset = DeidDataset(test_encodings, test_tags)


Token indices sequence length is longer than the specified maximum sequence length for this model (1125 > 512). Running this sequence through the model will result in indexing errors
12/18/2021 07:26:25 - INFO - transformer_deid.tokenization -   Determining offsets for splitting long segments.
100%|██████████| 790/790 [01:22<00:00,  9.63it/s]
12/18/2021 07:27:47 - INFO - transformer_deid.tokenization -   Splitting text.
100%|██████████| 790/790 [00:00<00:00, 865.92it/s]
12/18/2021 07:27:49 - INFO - transformer_deid.tokenization -   Determining offsets for splitting long segments.
100%|██████████| 514/514 [00:46<00:00, 10.96it/s]
12/18/2021 07:28:36 - INFO - transformer_deid.tokenization -   Splitting text.
100%|██████████| 514/514 [00:00<00:00, 927.04it/s]


# Train transformer (skip if loading model)

In [None]:
from transformers import set_seed

# Hyper-parameters for fine-tuning; checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir='./results',
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
    warmup_steps=500,
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=10,
)

# The token-classification head is newly initialized rather than loaded from
# the checkpoint, so seed the RNGs *before* building the model to make runs
# repeatable. The seed is taken from the training arguments to keep a single
# source of truth.
set_seed(training_args.seed)

model = DistilBertForTokenClassification.from_pretrained(
    'distilbert-base-cased', num_labels=len(deid_task.labels)
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset
)

logger.info("***** Running training *****")
logger.info("  Num examples = %d", len(train_dataset))
logger.info("  Num Epochs = %d", training_args.num_train_epochs)

# log top 5 examples
for i in range(min(len(train_dataset), 5)):
    # the third element (token type ids) is not needed for logging
    input_ids, attention_mask, _token_type_ids, label_ids = train_dataset.get_example(
        i, deid_task.id2label
    )

    # convert ids into human interpretable values; label id -100 has no entry
    # in id2label and is printed verbatim
    tokens = tokenizer.convert_ids_to_tokens(input_ids)
    label_names = [
        '-100' if label_id == -100 else deid_task.id2label[label_id]
        for label_id in label_ids
    ]

    logger.info("*** Example %d ***", i)
    logger.info("tokens: %s", " ".join(tokens))
    logger.info("labels: %s", " ".join(label_names))
    logger.info("input_ids: %s", " ".join(map(str, input_ids)))
    logger.info("label_ids: %s", " ".join(map(str, label_ids)))
    logger.info("input_mask: %s", " ".join(map(str, attention_mask)))

trainer.train()

trainer.save_model(f'results/{task_name}_DistilBert_Model')

# Keep and report the evaluation metrics instead of discarding the return value.
eval_metrics = trainer.evaluate()
logger.info("eval metrics: %s", eval_metrics)

# Raw model outputs and gold label ids for the test set; the predicted class
# is the arg-max over axis 2 of the predictions.
predictions, labels, _ = trainer.predict(test_dataset)
predicted_label = np.argmax(predictions, axis=2)


Downloading:   0%|          | 0.00/251M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-cased were not used when initializing DistilBertForTokenClassification: ['vocab_layer_norm.bias', 'vocab_projector.weight', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

Step,Training Loss
10,1.9629
20,1.8171
30,1.5547
40,1.1777
50,0.6982
60,0.3665
70,0.2567
80,0.2139
90,0.2019
100,0.1466




Training completed. Do not forget to share your model on huggingface.co/models =)


Saving model checkpoint to results/i2b2_2014_DistilBert_Model
Configuration saved in results/i2b2_2014_DistilBert_Model/config.json
Model weights saved in results/i2b2_2014_DistilBert_Model/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 1489
  Batch size = 64


***** Running Prediction *****
  Num examples = 1489
  Batch size = 64


# Load model

In [None]:

# Load a previously saved checkpoint instead of training. The path is relative
# to the current working directory and points into the mounted drive folder.
saved_model_dir = '../drive/MyDrive/deid/transformer-deid/results/i2b2_2014_DistilBert_Model'
model = DistilBertForTokenClassification.from_pretrained(
    saved_model_dir,
    num_labels=len(deid_task.labels),
)


# Run dataset through model

In [None]:

# Wrap the current `model` in a Trainer and run it over the test set.
# No training happens in this cell; the Trainer is only used for prediction.
training_args = TrainingArguments(
    output_dir='./results',
    logging_dir='./logs',
    logging_steps=10,
    num_train_epochs=3,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=64,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# predict() returns three values; the third one is not used here.
predictions, labels, _ = trainer.predict(test_dataset)

# Predicted class per token = index of the largest value along axis 2.
predicted_label = np.argmax(predictions, axis=2)

PyTorch: setting up devices
The default value for the training argument `--report_to` will change in v5 (from all installed integrations to none). In v5, you will need to use `--report_to all` to get the same behavior as now. You should start updating your code and make this info disappear :-).
***** Running Prediction *****
  Num examples = 1489
  Batch size = 64


# Eval

In [None]:
import pprint

# Token-level metric script shipped with the repository (path is relative to
# the repository root, which is the current working directory at this point).
metric_dir = "transformer_deid/token_evaluation.py"
metric = load_metric(metric_dir)

# Score the predicted label ids against the gold label ids.
results = compute_metrics(
    predicted_label,
    labels,
    deid_task.labels,
    metric=metric,
)

pprint.pprint(results)

{'AGE': {'f1': 0.9216944801026957,
         'number': 795,
         'precision': 0.9410222804718218,
         'recall': 0.9031446540880503},
 'CONTACT': {'f1': 0.8563162970106076,
             'number': 578,
             'precision': 0.9673202614379085,
             'recall': 0.7681660899653979},
 'DATE': {'f1': 0.9459564757611177,
          'number': 18033,
          'precision': 0.9494441651304396,
          'recall': 0.9424943159762658},
 'ID': {'f1': 0.7676198147402334,
        'number': 1462,
        'precision': 0.9333986287952988,
        'recall': 0.6518467852257182},
 'LOCATION': {'f1': 0.8175895765472313,
              'number': 2926,
              'precision': 0.8688461538461538,
              'recall': 0.7720437457279562},
 'NAME': {'f1': 0.8084411798997319,
          'number': 4527,
          'precision': 0.8560493827160494,
          'recall': 0.7658493483543185},
 'PROFESSION': {'f1': 0.7018121911037891,
                'number': 343,
                'precision': 0.80681

# Test example

In [None]:
cd transformer_deid/

In [None]:
# Import only the helper that is used: a wildcard import could silently
# overwrite notebook variables (e.g. `model`, `tokenizer`) with names from
# the module.
from predict import deid_example

# De-identify the first test example with the current model and print the
# original text followed by the de-identified version.
text = test_texts[0]
deid_text = deid_example(text, model)

print(text)
print('===================')
print(deid_text)