In [1]:
pip install -q datasets transformers

In [2]:
import re
def get_tokens_with_entities(raw_text: str):
    # split the text by spaces only if the space does not occur between square brackets
    # we do not want to split "multi-word" entity value yet
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)

    # a regex for matching the annotation according to our notation [entity_value](entity_name)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            # we prefix the name of entity differently
            # B- indicates beginning of an entity
            # I- indicates the token is not a new entity itself but rather a part of existing one
            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities

In [3]:
print(get_tokens_with_entities("[physical therapy](procedure) reimburse physical therapy at $[322.00 for initial visit](initial_visit)  [subsequent visits at $212](subsequent_visit) per visit, per type of therapy. [initial evaluation: 97161-97164](initial_visit_cpt_codes) [subsequent visits: 97010-97036, 97110-97140, 97530, 97750-97763](subsequent_visit_cpt_codes)"))


print(get_tokens_with_entities("[physical therapy](procedure) reimburse at [$70.00](amount) per visit, per day, per type of therapy billed. any associated ancillary services are to be included in the per visit rate."))


[('physical', 'B-procedure'), ('therapy', 'I-procedure'), ('reimburse', 'O'), ('physical', 'O'), ('therapy', 'O'), ('at', 'O'), ('$[322.00 for initial visit](initial_visit)', 'O'), ('', 'O'), ('subsequent', 'B-subsequent_visit'), ('visits', 'I-subsequent_visit'), ('at', 'I-subsequent_visit'), ('$212', 'I-subsequent_visit'), ('per', 'O'), ('visit,', 'O'), ('per', 'O'), ('type', 'O'), ('of', 'O'), ('therapy.', 'O'), ('initial', 'B-initial_visit_cpt_codes'), ('evaluation:', 'I-initial_visit_cpt_codes'), ('97161-97164', 'I-initial_visit_cpt_codes'), ('subsequent', 'B-subsequent_visit_cpt_codes'), ('visits:', 'I-subsequent_visit_cpt_codes'), ('97010-97036,', 'I-subsequent_visit_cpt_codes'), ('97110-97140,', 'I-subsequent_visit_cpt_codes'), ('97530,', 'I-subsequent_visit_cpt_codes'), ('97750-97763', 'I-subsequent_visit_cpt_codes')]
[('physical', 'B-procedure'), ('therapy', 'I-procedure'), ('reimburse', 'O'), ('at', 'O'), ('$70.00', 'B-amount'), ('per', 'O'), ('visit,', 'O'), ('per', 'O'), ('

In [4]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")

In [5]:
sample_input = "[physical therapy](procedure) reimburse at [$70.00](amount) per visit, per day, per type of therapy billed. any associated ancillary services are to be included in the per visit rate."
tokens, entities = list(zip(*get_tokens_with_entities(sample_input)))
tokenized_input = tokenizer(tokens, is_split_into_words=True)
print("Original tokens           : ", tokens)
print("After subword tokenization: ", tokenizer.convert_ids_to_tokens(tokenized_input['input_ids']))

Original tokens           :  ('physical', 'therapy', 'reimburse', 'at', '$70.00', 'per', 'visit,', 'per', 'day,', 'per', 'type', 'of', 'therapy', 'billed.', 'any', 'associated', 'ancillary', 'services', 'are', 'to', 'be', 'included', 'in', 'the', 'per', 'visit', 'rate.')
After subword tokenization:  ['[CLS]', 'physical', 'therapy', 'rei', '##mb', '##urse', 'at', '$', '70', '.', '00', 'per', 'visit', ',', 'per', 'day', ',', 'per', 'type', 'of', 'therapy', 'billed', '.', 'any', 'associated', 'an', '##ci', '##llary', 'services', 'are', 'to', 'be', 'included', 'in', 'the', 'per', 'visit', 'rate', '.', '[SEP]']


In [6]:
import re
def get_tokens_with_entities(raw_text: str):
    raw_tokens = re.split(r"\s(?![^\[]*\])", raw_text)
    entity_value_pattern = r"\[(?P<value>.+?)\]\((?P<entity>.+?)\)"
    entity_value_pattern_compiled = re.compile(entity_value_pattern, flags=re.I|re.M)

    tokens_with_entities = []

    for raw_token in raw_tokens:
        match = entity_value_pattern_compiled.match(raw_token)
        if match:
            raw_entity_name, raw_entity_value = match.group("entity"), match.group("value")

            for i, raw_entity_token in enumerate(re.split("\s", raw_entity_value)):
                entity_prefix = "B" if i == 0 else "I"
                entity_name = f"{entity_prefix}-{raw_entity_name}"
                tokens_with_entities.append((raw_entity_token, entity_name))
        else:
            tokens_with_entities.append((raw_token, "O"))

    return tokens_with_entities


class NERDataMaker:
    def __init__(self, texts):
        self.unique_entities = []
        self.processed_texts = []

        temp_processed_texts = []
        for text in texts:
            tokens_with_entities = get_tokens_with_entities(text)
            for _, ent in tokens_with_entities:
                if ent not in self.unique_entities:
                    self.unique_entities.append(ent)
            temp_processed_texts.append(tokens_with_entities)

        self.unique_entities.sort(key=lambda ent: ent if ent != "O" else "")

        for tokens_with_entities in temp_processed_texts:
            self.processed_texts.append([(t, self.unique_entities.index(ent)) for t, ent in tokens_with_entities])

    @property
    def id2label(self):
        return dict(enumerate(self.unique_entities))

    @property
    def label2id(self):
        return {v:k for k, v in self.id2label.items()}

    def __len__(self):
        return len(self.processed_texts)

    def __getitem__(self, idx):
        def _process_tokens_for_one_text(id, tokens_with_encoded_entities):
            ner_tags = []
            tokens = []
            for t, ent in tokens_with_encoded_entities:
                ner_tags.append(ent)
                tokens.append(t)

            return {
                "id": id,
                "ner_tags": ner_tags,
                "tokens": tokens
            }

        tokens_with_encoded_entities = self.processed_texts[idx]
        if isinstance(idx, int):
            return _process_tokens_for_one_text(idx, tokens_with_encoded_entities)
        else:
            return [_process_tokens_for_one_text(i+idx.start, tee) for i, tee in enumerate(tokens_with_encoded_entities)]

    def as_hf_dataset(self, tokenizer):
        from datasets import Dataset, Features, Value, ClassLabel, Sequence
        def tokenize_and_align_labels(examples):
            tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

            labels = []
            for i, label in enumerate(examples[f"ner_tags"]):
                word_ids = tokenized_inputs.word_ids(batch_index=i)  # Map tokens to their respective word.
                previous_word_idx = None
                label_ids = []
                for word_idx in word_ids:  # Set the special tokens to -100.
                    if word_idx is None:
                        label_ids.append(-100)
                    elif word_idx != previous_word_idx:  # Only label the first token of a given word.
                        label_ids.append(label[word_idx])
                    else:
                        label_ids.append(-100)
                    previous_word_idx = word_idx
                labels.append(label_ids)

            tokenized_inputs["labels"] = labels
            return tokenized_inputs

        ids, ner_tags, tokens = [], [], []
        for i, pt in enumerate(self.processed_texts):
            ids.append(i)
            pt_tokens,pt_tags = list(zip(*pt))
            ner_tags.append(pt_tags)
            tokens.append(pt_tokens)
        data = {
            "id": ids,
            "ner_tags": ner_tags,
            "tokens": tokens
        }
        features = Features({
            "tokens": Sequence(Value("string")),
            "ner_tags": Sequence(ClassLabel(names=dm.unique_entities)),
            "id": Value("int32")
        })
        ds = Dataset.from_dict(data, features)
        tokenized_ds = ds.map(tokenize_and_align_labels, batched=True)
        return tokenized_ds

# usage
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
# dm = NERDataMaker(["I come from [Kathmanduu valley,](location) [Nepal](location)"])
# test_dm.as_hf_dataset(tokenizer=tokenizer)

In [7]:
raw_text = """
[physical therapy](procedure) reimburse physical therapy at [$322.00 for initial visit](initial_visit)  [subsequent visits at $212](subsequent_visit) per visit, per type of therapy. [initial evaluation: 97161-97164](initial_visit_cpt_codes) [subsequent visits: 97010-97036, 97110-97140, 97530, 97750-97763](subsequent_visit_cpt_codes)
[physical therapy](procedure) reimburse at [$70.00](amount) per visit, per day, per type of therapy billed. any associated ancillary services are to be included in the per visit rate.
[physical therapy](procedure)  [treatment visit at 30.00](initial_visit_cpt_codes) per visit and [97750 eval visit 41.00](initial_visit_cpt_codes) per visit
[physical therapy](procedure) therapy is per visit per type of therapy per day - reimburse therapy services [initial eval $75.00](initial_visit) or [subsequent eval $65.00.](subsequent_visit)
[physical therapy](procedure)  reimburse physical therapy at [$ 135.00 for initial visit](initial_visit)  [subsequent visits at $75.00](subsequent_visit) per visit per type of therapy.
[speech therapy](procedure) therapy is per visit per type of therapy per day - reimburse therapy services [initial eval $75.00](initial_visit) or [subsequent eval $65.00.](subsequent_visit)
[occupational therapy](procedure) therapy is per visit per type of therapy per day - reimburse therapy services [initial eval $75.00](initial_visit) or [subsequent eval $65.00.](subsequent_visit)
[occupational therapy](procedure)  [treatment visit at 30.00](initial_visit_cpt_codes) per visit and [97750 eval visit 41.00](initial_visit_cpt_codes) per visit
[speech therapy](procedure)  [92506 diagnostic eval 40.00](initial_visit_cpt_codes) per visit and [92507 tratment visit 30.00](initial_visit_cpt_codes) per visit
[occupational therapy](procedure) reimburse occupational therapy at [$322.00 for initial visit](initial_visit)  [subsequent visits at $212](subsequent_visit) per visit, per type of therapy [initial evaluation: 97165-97168](initial_visit_cpt_codes) [subsequent visits: 97127, 97530-97546, 97750-97763, g0129, g0515](subsequent_visit_cpt_codes)
[speech therapy](procedure) reimburse at [$70.00](amount) per visit, per day, per type of therapy billed. any associated ancillary services are to be included in the per visit rate.
[occupational therapy](procedure)  reimburse occupational therapy at [$ 135.00 for initial visit](initial_visit)  [subsequent visits at $75.00](subsequent_visit) per visit per type of therapy.
[speech therapy](procedure) reimburse speech therapy at [$322 for initial visit](initial_visit)  [subsequent visits at $212](subsequent_visit per visit, per type of therapy [initial evaluation: 92521-92524, 92610](initial_visit_cpt_codes) [subsequent visits: 92507, 92526, 92609](subsequent_visit_cpt_codes)
[speech therapy](procedure)  all inclusive per visit rate for therapies. [initial subsequent visit](initial_visit [97001, 97002, 97003, 97004, 92506, 92610 allow at $135.00.](initial_visit_cpt_codes) [all other coding allowed at $75.00.](other_visit) pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.
[physical therapy](procedure)  all inclusive per visit rate for therapies. [initial subsequent visit](initial_visit [97001, 97002, 97003, 97004, 92506, 92610 allow at $135.00.](initial_visit_cpt_codes) [all other coding allowed at $75.00.](other_visit) pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.
[occupational therapy](procedure)  all inclusive per visit rate for therapies. [initial subsequent visit](initial_visit) [97001, 97002, 97003, 97004, 92506, 92610 allow at $135.00.](initial_visit_cpt_codes) [all other coding allowed at $75.00.](other_visit) pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.
[occupational therapy](procedure) all inclusive per visit rate for therapies. [initial   subsequent visit](initial_visit) [97165-97168 allow at $135.00](initial_visit_cpt_codes)  [modality only visits allowed at $100.00.](other_visit) pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.
[speech therapy](procedure) all inclusive per visit rate for therapies. [initial   subsequent visit](initial_visit) [92506 allow at $135.00](initial_visit_cpt_codes  [modality only visits allowed at $100.00.](other_visit) pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.
[speech therapy](procedure) for [initial visit and subsequent visits](initial_visit) (per visit, per type of therapy) reimburse speech therapy and additional codes [at code rates 92507 85.00 92521 150.00 92522 150.00 92523 150.00 92524 150.00 92526 85.00 92610 85.00 92605 111.00 92606 94.00 92607 162.00 92608 54.00 92609 95.00 92618 62.00](initial_visit_cpt_codes)
[speech therapy](procedure) FOR [INITIAL VISIT AND SUBSEQUENT VISITS](initial_visit (PER VISIT, PER TYPE OF THERAPY). REIMBURSE SPEECH THERAPY [ATCODE RATE 92521 $125.00 92522 $125.00 92523 $125.00 92524 $125.00 92507 $70.00 92526 $70.00 92610 $70.00](initial_visit_cpt_codes)
"""

dm = NERDataMaker(raw_text.split("\n"))
print(f"total examples = {len(dm)}")
print(dm[0:3])


total examples = 22
[{'id': 0, 'ner_tags': [0], 'tokens': ['']}, {'id': 1, 'ner_tags': [5, 11, 0, 0, 0, 0, 2, 8, 8, 8, 0, 6, 12, 12, 12, 0, 0, 0, 0, 0, 0, 3, 9, 9, 7, 13, 13, 13, 13, 13], 'tokens': ['physical', 'therapy', 'reimburse', 'physical', 'therapy', 'at', '$322.00', 'for', 'initial', 'visit', '', 'subsequent', 'visits', 'at', '$212', 'per', 'visit,', 'per', 'type', 'of', 'therapy.', 'initial', 'evaluation:', '97161-97164', 'subsequent', 'visits:', '97010-97036,', '97110-97140,', '97530,', '97750-97763']}, {'id': 2, 'ner_tags': [5, 11, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0], 'tokens': ['physical', 'therapy', 'reimburse', 'at', '$70.00', 'per', 'visit,', 'per', 'day,', 'per', 'type', 'of', 'therapy', 'billed.', 'any', 'associated', 'ancillary', 'services', 'are', 'to', 'be', 'included', 'in', 'the', 'per', 'visit', 'rate.']}]


In [8]:
from transformers import AutoTokenizer, DataCollatorForTokenClassification, AutoModelForTokenClassification, TrainingArguments, Trainer
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased")
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)
model = AutoModelForTokenClassification.from_pretrained("distilbert-base-uncased", num_labels=len(dm.unique_entities), id2label=dm.id2label, label2id=dm.label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
!pip install transformers[torch]
!pip install accelerate -U

Collecting accelerate>=0.20.3 (from transformers[torch])
  Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m258.1/258.1 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: accelerate
Successfully installed accelerate-0.23.0


In [9]:
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=40,
    weight_decay=0.01,
)

train_ds = dm.as_hf_dataset(tokenizer=tokenizer)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=train_ds, # eval on training set! ONLY for DEMO!!
    tokenizer=tokenizer,
    data_collator=data_collator,
)

trainer.train()

Map:   0%|          | 0/22 [00:00<?, ? examples/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss
1,No log,2.310295
2,No log,2.038906
3,No log,1.784371
4,No log,1.560076
5,No log,1.418466
6,No log,1.359078
7,No log,1.290694
8,No log,1.198398
9,No log,1.113725
10,No log,1.051698


TrainOutput(global_step=80, training_loss=0.8063291549682617, metrics={'train_runtime': 596.2326, 'train_samples_per_second': 1.476, 'train_steps_per_second': 0.134, 'total_flos': 22936125625848.0, 'train_loss': 0.8063291549682617, 'epoch': 40.0})

In [None]:
model.to('cuda')

In [13]:
from transformers import pipeline
pipe = pipeline("ner", model=model, tokenizer=tokenizer, aggregation_strategy="simple") # pass device=0 if using gpu
pipe("""physical therapy  all inclusive per visit rate for therapies. initial subsequent visit 97001, 97002, 97003, 97004, 92506, 92610 allow at $135.00. all other coding allowed at $75.00. pay only one per visit per day for all therapy. if more than one cpt is billed for a specific dos, roll all payable charges into highest per visit allowable.""")

[{'entity_group': 'procedure',
  'score': 0.7974425,
  'word': 'physical therapy',
  'start': 0,
  'end': 16},
 {'entity_group': 'initial_visit',
  'score': 0.5583614,
  'word': 'initial subsequent visit',
  'start': 62,
  'end': 86},
 {'entity_group': 'initial_visit_cpt_codes',
  'score': 0.8787856,
  'word': '97001, 97002, 97003, 97004, 92506, 92610 allow at $ 135. 00',
  'start': 87,
  'end': 144},
 {'entity_group': 'other_visit',
  'score': 0.6606693,
  'word': '. all other coding allowed at $ 75. 00',
  'start': 144,
  'end': 180}]