## STEPS

* This notebook walks through the 4 steps required to de-identify a dataset (i.e., run the forward pass on the dataset using a trained model).

## STEP 0: LIBRARIES

In [20]:
import json

# ***Modified***

In [21]:
from transformers import HfArgumentParser, TrainingArguments

*   regex~=2021.11.2
*   typing~=3.10.0.0
*   setuptools~=58.5.2
*   configparser~=5.1.0
*   transformers==4.11.3
*   tensorflow==2.7.0
*   torch==1.10.0
*   numpy==1.21.2
*   scipy==1.7.2
*   pandas==1.3.3
*   scikit-learn==1.0.1
*   jupyterlab==3.2.9
*   ipywidgets==7.6.5
*   matplotlib==3.4.3
*   beautifulsoup4==4.10.0
*   spacy==3.0.6
*   nodejs==0.1.1
*   stanza==1.3.0
*   tensorflow-addons==0.15.0
*   wget==3.2
*   seqeval
*   scispacy==0.4.0
*   datasets==1.18.3
*   allennlp==2.9.0
*   pycorenlp==0.3.0
*   https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
*   https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz
   



# ***Modified***

### Note: the GitHub repository requires spacy==3.0.6, but installing that version causes a number of dependency conflicts. The en_core_sci models also need spacy <= 3.0.8, so spacy 3.0.7 is installed instead.


In [None]:
!pip install -q pycorenlp
!pip install -q datasets
!pip install -q robust-deid
!pip install -q load_metric
!pip uninstall -q datasets
!pip install -q datasets==1.18.3
!pip install -q scispacy==0.4.0
!pip install -q seqeval
# Note that the github requires spacy==3.0.6 but downloading it gives bunch of conflicts in dependencies + also need <= 3.0.8 to match en_core_sci version so downloading 3.0.7
!pip install -q spacy==3.0.7
!pip install -q https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_sm-0.4.0.tar.gz
!pip install -q https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.4.0/en_core_sci_lg-0.4.0.tar.gz

  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for pycorenlp (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m30.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m179.3/179.3 kB[0m [31m16.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m19.4 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
gcsfs 2024.10.0 requires fsspec==2024.10.0, but you have fsspec 2024.9.0 which is incompatibl

In [22]:
from robust_deid.ner_datasets import DatasetCreator
from robust_deid.sequence_tagging import SequenceTagger
from robust_deid.sequence_tagging.arguments import (
    ModelArguments,
    DataTrainingArguments,
    EvaluationArguments,
)
from robust_deid.deid import TextDeid
from google.colab import files

## STEP 1: INITIALIZE

In [None]:
# # Initialize the path where the dataset is located (input_file).
# # Input dataset
# input_file = '../../data/notes/notes.jsonl'
# # Initialize the location where we will store the sentencized and tokenized dataset (ner_dataset_file)
# ner_dataset_file = '../../data/ner_datasets/test.jsonl'
# # Initialize the location where we will store the model predictions (predictions_file)
# # Verify this file location - Ensure it's the same location that you will pass in the json file
# # to the sequence tagger model. i.e. output_predictions_file in the json file should have the same
# # value as below
# predictions_file = '../../data/predictions/predictions.jsonl'
# # Initialize the file that will contain the original note text and the de-identified note text
# deid_file = '../../data/notes/deid.jsonl'
# # Initialize the model config. This config file contains the various parameters of the model.
# model_config = './run/i2b2/predict_i2b2.json'

# ***Modified***

In [23]:
# Upload the input notes file through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with a clear message
    # instead of an IndexError from indexing an empty list below.
    raise ValueError('No input file was uploaded. Re-run this cell and select the notes file.')
# Only the first uploaded file is used as the input dataset.
input_filename = list(uploaded.keys())[0]

# Input dataset
input_file = input_filename
# Location where the sentencized and tokenized dataset is stored
ner_dataset_file = 'test.jsonl'
# Location where the model predictions are stored
predictions_file = 'predictions.jsonl'
# File that will contain the original note text and the de-identified note text
deid_file = 'deid.jsonl'

Saving output1113.jsonl to output1113 (1).jsonl


## STEP 2: NER DATASET
* Sentencize and tokenize the raw text. We used sentences of length 128, which includes an additional 32 context tokens on either side of the sentence. These 32 tokens (from the previous and next sentences) serve as additional context for the current sentence.
* We used the en_core_sci_lg sentencizer and a custom tokenizer (can be found in the preprocessing module). Note that the cell below is configured with the smaller en_core_sci_sm sentencizer instead.
* The dataset stored in the ner_dataset_file will be used as input to the sequence tagger model

In [24]:
# Settings for the dataset creator: sequences of at most 128 tokens, with up to
# 32 tokens of context taken from the previous and from the next sentence.
dataset_creator_options = {
    'sentencizer': 'en_core_sci_sm',
    'tokenizer': 'clinical',
    'max_tokens': 128,
    'max_prev_sentence_token': 32,
    'max_next_sentence_token': 32,
    'default_chunk_size': 32,
    'ignore_label': 'NA',
}
# Build the dataset creator object from the settings above
dataset_creator = DatasetCreator(**dataset_creator_options)

In [25]:
# Keys and options used to read the input notes; mode='predict' because we are
# only running the forward pass on this dataset.
creation_options = dict(
    input_file=input_file,
    mode='predict',
    notation='BILOU',
    token_text_key='text',
    metadata_key='meta',
    note_id_key='note_id',
    label_key='label',
    span_text_key='spans',
)
# Sentencize and tokenize the dataset. The result is iterated once below,
# yielding one sequence at a time.
ner_notes = dataset_creator.create(**creation_options)
# Store every sequence as one JSON object per line in ner_dataset_file
with open(ner_dataset_file, 'w') as ner_out:
    ner_out.writelines(json.dumps(sequence) + '\n' for sequence in ner_notes)

## STEP 3: SEQUENCE TAGGING
* Run the sequence model - specify parameters to the sequence model in the config file (model_config). The model will be run with the specified parameters. For more information on these parameters, please refer to the Hugging Face documentation (or use the docs provided).
* This file uses the argmax output. To use the recall threshold models (running the forward pass with a recall biased threshold for aggressively removing PHI) use the other config files.
* The config files in the i2b2 directory specify the model trained on only the i2b2 dataset. The config files in the mgb_i2b2 directory are for the model trained on both the MGB and I2B2 datasets.
* You can manually pass in the parameters instead of using the config file. The config file option is recommended. In our example we are passing the parameters through a config file. If you do not want to use the config file, skip the next code block and manually enter the values in the following code blocks. You will still need to read in the training args using huggingface and change values in the training args according to your needs.

# ***Modified***

In [26]:
# Upload the model config (JSON file with the sequence tagger parameters) through the Colab file picker.
uploaded = files.upload()
if not uploaded:
    # Nothing was uploaded (e.g. the dialog was dismissed): fail with a clear message
    # instead of an IndexError from indexing an empty list below.
    raise ValueError('No model config was uploaded. Re-run this cell and select the config JSON file.')
# Only the first uploaded file is used as the model config.
model_config = list(uploaded.keys())[0]

Saving predict_i2b2.json to predict_i2b2.json


In [27]:
# Argument groups that are populated from the JSON config, in this order.
argument_classes = (
    ModelArguments,
    DataTrainingArguments,
    EvaluationArguments,
    TrainingArguments,
)
parser = HfArgumentParser(argument_classes)
# All arguments come from the uploaded JSON config file (model_config);
# the parsed objects are unpacked in the same order as argument_classes.
parsed_args = parser.parse_json_file(json_file=model_config)
model_args, data_args, evaluation_args, training_args = parsed_args

[INFO|training_args.py:2169] 2024-11-13 21:35:40,816 >> PyTorch: setting up devices


In [28]:
# Initialize the sequence tagger from the parsed config.
# Every keyword argument has the same name as the attribute it is read from,
# so the arguments are collected per source object and passed on together.
tagger_argument_sources = (
    (data_args, ('task_name', 'notation', 'ner_types', 'do_lower_case')),
    (
        model_args,
        (
            'model_name_or_path',
            'config_name',
            'tokenizer_name',
            'post_process',
            'cache_dir',
            'model_revision',
            'use_auth_token',
            'threshold',
        ),
    ),
    (training_args, ('fp16', 'seed', 'local_rank')),
)
tagger_kwargs = {}
for source, field_names in tagger_argument_sources:
    for field_name in field_names:
        tagger_kwargs[field_name] = getattr(source, field_name)
sequence_tagger = SequenceTagger(**tagger_kwargs)



config.json:   0%|          | 0.00/2.39k [00:00<?, ?B/s]

[INFO|configuration_utils.py:679] 2024-11-13 21:35:42,331 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/config.json
[INFO|configuration_utils.py:746] 2024-11-13 21:35:42,337 >> Model config BertConfig {
  "_name_or_path": "obi/deid_bert_i2b2",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "ner",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-AGE",
    "1": "B-DATE",
    "2": "B-EMAIL",
    "3": "B-HOSP",
    "4": "B-ID",
    "5": "B-LOC",
    "6": "B-OTHERPHI",
    "7": "B-PATIENT",
    "8": "B-PATORG",
    "9": "B-PHONE",
    "10": "B-STAFF",
    "11": "I-AGE",
    "12": "I-DATE",
    "13": "I-EMAIL",
    "14": "I-HOSP",
    "15": "I-ID",
    "16": "I-LOC",
    "17": "I-OTHERPH

tokenizer_config.json:   0%|          | 0.00/272 [00:00<?, ?B/s]

[INFO|configuration_utils.py:679] 2024-11-13 21:35:42,886 >> loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/config.json
[INFO|configuration_utils.py:746] 2024-11-13 21:35:42,890 >> Model config BertConfig {
  "_name_or_path": "obi/deid_bert_i2b2",
  "architectures": [
    "BertForTokenClassification"
  ],
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "finetuning_task": "ner",
  "gradient_checkpointing": false,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "B-AGE",
    "1": "B-DATE",
    "2": "B-EMAIL",
    "3": "B-HOSP",
    "4": "B-ID",
    "5": "B-LOC",
    "6": "B-OTHERPHI",
    "7": "B-PATIENT",
    "8": "B-PATORG",
    "9": "B-PHONE",
    "10": "B-STAFF",
    "11": "I-AGE",
    "12": "I-DATE",
    "13": "I-EMAIL",
    "14": "I-HOSP",
    "15": "I-ID",
    "16": "I-LOC",
    "17": "I-OTHERPH

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

[INFO|tokenization_utils_base.py:2211] 2024-11-13 21:35:45,298 >> loading file vocab.txt from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/vocab.txt
[INFO|tokenization_utils_base.py:2211] 2024-11-13 21:35:45,300 >> loading file tokenizer.json from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/tokenizer.json
[INFO|tokenization_utils_base.py:2211] 2024-11-13 21:35:45,302 >> loading file added_tokens.json from cache at None
[INFO|tokenization_utils_base.py:2211] 2024-11-13 21:35:45,304 >> loading file special_tokens_map.json from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/special_tokens_map.json
[INFO|tokenization_utils_base.py:2211] 2024-11-13 21:35:45,306 >> loading file tokenizer_config.json from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d

pytorch_model.bin:   0%|          | 0.00/431M [00:00<?, ?B/s]

[INFO|modeling_utils.py:3937] 2024-11-13 21:35:49,143 >> loading weights file pytorch_model.bin from cache at /root/.cache/huggingface/hub/models--obi--deid_bert_i2b2/snapshots/d2c629e162c16be3a8ccc6e8e4a61405dde93cc2/pytorch_model.bin
[INFO|modeling_utils.py:4800] 2024-11-13 21:35:49,230 >> All model checkpoint weights were used when initializing BertForTokenClassification.

[INFO|modeling_utils.py:4808] 2024-11-13 21:35:49,232 >> All the weights of BertForTokenClassification were initialized from the model checkpoint at obi/deid_bert_i2b2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use BertForTokenClassification for predictions without further training.


In [29]:
# Load the required functions of the sequence tagger.
# NOTE(review): presumably this has to run before set_predict() / setup_trainer()
# in the following cells - confirm against the robust_deid documentation.
sequence_tagger.load()

In [30]:
# Point the sequence tagger at the NER dataset written in STEP 2.
# data_args.test_file can be used instead of ner_dataset_file, as long as both
# refer to the same file.
prediction_inputs = dict(
    test_file=ner_dataset_file,
    max_test_samples=data_args.max_predict_samples,
    preprocessing_num_workers=data_args.preprocessing_num_workers,
    overwrite_cache=data_args.overwrite_cache,
)
sequence_tagger.set_predict(**prediction_inputs)

[INFO|safetensors_conversion.py:61] 2024-11-13 21:35:49,389 >> Attempting to create safetensors variant
INFO:datasets.builder:Generating dataset json (/root/.cache/huggingface/datasets/json/default-925f179cbf4a4d51/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b)


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-925f179cbf4a4d51/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:datasets.utils.download_manager:Downloading took 0.0 min
INFO:datasets.utils.download_manager:Checksum Computation took 0.0 min


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:datasets.utils.info_utils:Unable to verify checksums.
INFO:datasets.builder:Generating split test
[INFO|safetensors_conversion.py:24] 2024-11-13 21:35:50,146 >> Attempting to convert .bin model on the fly to safetensors.
INFO:datasets.utils.info_utils:Unable to verify splits sizes.


Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-925f179cbf4a4d51/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

INFO:datasets.fingerprint:Parameter 'function'=<function DatasetTokenizer.tokenize_and_align_labels at 0x7af76476caf0> of the transform datasets.arrow_dataset.Dataset._map_single couldn't be hashed properly, a random hash was used instead.


  0%|          | 0/5 [00:00<?, ?ba/s]

INFO:datasets.arrow_dataset:Caching processed dataset at /root/.cache/huggingface/datasets/json/default-925f179cbf4a4d51/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b/cache-1c80317fa3b1799d.arrow


In [31]:
# Initialize the huggingface trainer, using the TrainingArguments that were
# parsed from the JSON config file (model_config)
sequence_tagger.setup_trainer(training_args=training_args)

INFO:robust_deid.sequence_tagging.sequence_tagger:Training/evaluation parameters TrainingArguments(
_n_gpu=1,
accelerator_config={'split_batches': False, 'dispatch_batches': None, 'even_batches': True, 'use_seedable_sampler': True, 'non_blocking': False, 'gradient_accumulation_kwargs': None, 'use_configured_state': False},
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
average_tokens_across_devices=False,
batch_eval_metrics=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_persistent_workers=False,
dataloader_pin_memory=True,
dataloader_prefetch_factor=None,
ddp_backend=None,
ddp_broadcast_buffers=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
dispatch_batches=None,
do_eval=False,
do_predict=True,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_do_concat_batches=True,


In [32]:
# Run the forward pass of the sequence tagger on the test data
predictions = sequence_tagger.predict()
# Save the predictions to predictions_file, one JSON object per line
with open(predictions_file, 'w') as predictions_out:
    predictions_out.writelines(json.dumps(item) + '\n' for item in predictions)

INFO:robust_deid.sequence_tagging.sequence_tagger:*** Predict ***
[INFO|trainer.py:4128] 2024-11-13 21:35:53,650 >> 
***** Running Prediction *****
[INFO|trainer.py:4130] 2024-11-13 21:35:53,650 >>   Num examples = 4481
[INFO|trainer.py:4133] 2024-11-13 21:35:53,653 >>   Batch size = 16
[INFO|trainer_utils.py:830] 2024-11-13 21:35:53,671 >> The following columns in the test set don't have a corresponding argument in `BertForTokenClassification.forward` and have been ignored: current_sent_info, tokens, note_sent_info. If current_sent_info, tokens, note_sent_info are not expected by `BertForTokenClassification.forward`,  you can safely ignore this message.


model.safetensors:   0%|          | 0.00/431M [00:00<?, ?B/s]

***** test metrics *****
  predict_loss                   =     0.1238
  predict_model_preparation_time =     0.0032
  predict_runtime                = 0:00:29.12
  predict_samples_per_second     =    153.859
  predict_steps_per_second       =      9.648


## STEP 4: DE-IDENTIFY TEXT

* This step uses the predictions from the previous step to de-id the text. We pass the original input file where the original text is present. We look at this text and the predictions and use both of these to de-id the text.

In [33]:
# Initialize the text deid object.
# notation='BILOU' is the same notation that was passed to dataset_creator.create() in STEP 2.
# NOTE(review): span_constraint='super_strict' presumably controls how predicted spans are
# validated/aligned - confirm against the robust_deid documentation.
text_deid = TextDeid(notation='BILOU', span_constraint='super_strict')

# ***Modified***

In [34]:
# De-identify the text using the original notes (input_file) together with the
# model predictions (predictions_file).
# With deid_strategy='replace_informative' the PHI is not dropped from the text;
# it is labelled instead, so it can be dropped or processed further afterwards.
# To drop the PHI automatically, set 'deid_strategy' to 'remove'.
deid_options = {
    'input_file': input_file,
    'predictions_file': predictions_file,
    'deid_strategy': 'replace_informative',
    'keep_age': False,
    'metadata_key': 'meta',
    'note_id_key': 'note_id',
    'tokens_key': 'tokens',
    'predictions_key': 'predictions',
    'text_key': 'text',
}
deid_notes = text_deid.run_deid(**deid_options)

In [35]:
# Save the de-identified notes to deid_file, one JSON object per line
with open(deid_file, 'w') as deid_out:
    deid_out.writelines(json.dumps(note) + '\n' for note in deid_notes)

# Roberta

In [None]:
# Pretty-print every de-identified note stored in deid_file (one JSON object per line)
with open(deid_file, 'r') as deid_in:
    for record in map(json.loads, deid_in):
        print(json.dumps(record, indent=2))

{
  "text": "Trip to New York City to see a Knicks Game\n\nHe calls his friend Mike to see if he wants to coordinate a day that they can meet in NYC to watch a Knicks game\nThey agree on a date (about a month from now)\nHe purchases tickets for the game (online)\nHe plans out a budget for his trip\nHe speaks to his wife and lets her know of the plan. She gives him the OK\nHe is now riding the train toward New York City (specifies it takes about 1 hour)\nHe arrives at Grand Central Station and meets his friend\nThey both walk over to Madison Square Garden together\nThey enter the stadium together and immediately feel the \"buzz of the atmosphere\"\nThey are both having a great time enjoying the game\nHe orders several hot dogs throughout the course of the game (he specifies 3-4)\nThe Knicks win!\nHe walks back to the train station, parts ways with his friend, and rides back home\nHe arrives back home very late at night\nHe hops into his bed with a smile on his face\n\nMadison Square Gar

# Bert

In [36]:
# Pretty-print every de-identified note stored in deid_file (one JSON object per line)
with open(deid_file, 'r') as deid_in:
    for record in map(json.loads, deid_in):
        print(json.dumps(record, indent=2))

{
  "text": "Trip to New York City to see a Knicks Game\n\nHe calls his friend Mike to see if he wants to coordinate a day that they can meet in NYC to watch a Knicks game\nThey agree on a date (about a month from now)\nHe purchases tickets for the game (online)\nHe plans out a budget for his trip\nHe speaks to his wife and lets her know of the plan. She gives him the OK\nHe is now riding the train toward New York City (specifies it takes about 1 hour)\nHe arrives at Grand Central Station and meets his friend\nThey both walk over to Madison Square Garden together\nThey enter the stadium together and immediately feel the \"buzz of the atmosphere\"\nThey are both having a great time enjoying the game\nHe orders several hot dogs throughout the course of the game (he specifies 3-4)\nThe Knicks win!\nHe walks back to the train station, parts ways with his friend, and rides back home\nHe arrives back home very late at night\nHe hops into his bed with a smile on his face\n\nMadison Square Gar