In [1]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [2]:
from transformers import TFAutoModelForTokenClassification, AutoTokenizer

In [3]:
import tensorflow as tf
from transformers import AutoTokenizer, TFAutoModelForTokenClassification

In [4]:
import sys

sys.path.append('../')
from config import entity_to_acronyms, acronyms_to_entities

In [22]:
model_dir = '../models'

## Model Definition

In [5]:
model_name = "d4data/biomedical-ner-all"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = TFAutoModelForTokenClassification.from_pretrained(model_name, from_pt=True)

All PyTorch model weights were used when initializing TFDistilBertForTokenClassification.

All the weights of TFDistilBertForTokenClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertForTokenClassification for predictions without further training.


## Define training parameters

In [6]:
BATCH_SIZE = 32
NUM_EPOCHS = 2
LEARNING_RATE = 1e-5

## Prepare the dataset to fine tune the Pretrained DistilBERT base uncased

In [7]:
MAX_LENGTH = 200

In [8]:
bio_files_dir = '../data/bio_data_files'

In [9]:
import os
import numpy as np

def read_file(file_path):
    """Helper function to read data from a single file."""
    with open(file_path, "r", encoding="utf-8") as f:
        lines = f.readlines()
        sentences = []
        labels = []
        sentence = []
        label = []
        for line in lines:
            if line == '\n':
                if sentence:
                    sentences.append(" ".join(sentence))
                    labels.append(" ".join(label))
                    sentence = []
                    label = []
            else:
                word, tag = line.strip().split("\t")
                sentence.append(word)
                if tag != 'O':
                    tag = tag[:2] + acronyms_to_entities[tag[2:]]
                label.append(tag)
        if sentence:
            sentences.append(" ".join(sentence))
            labels.append(" ".join(label))
        return sentences, labels

def prepare_data(directory_path):
    """Read data from all files in the given directory and prepare for fine-tuning."""
    train_sentences = []
    train_labels = []
    val_sentences = []
    val_labels = []
    test_sentences = []
    test_labels = []
    for i, filename in enumerate(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        sentences, labels = read_file(file_path)
        if i % 5 == 0:  # 20% of data for validation
            val_sentences.extend(sentences)
            val_labels.extend(labels)
        elif i % 5 == 1:  # 20% of data for testing
            test_sentences.extend(sentences)
            test_labels.extend(labels)
        else:  # 60% of data for training
            train_sentences.extend(sentences)
            train_labels.extend(labels)
            
    train_data = {"input_ids": tokenizer(train_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                  "attention_mask": tokenizer(train_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                  "labels": np.array([[model.config.label2id[token] for token in label.split()] for label in train_labels], dtype='object')}
    
    val_data = {"input_ids": tokenizer(val_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                "attention_mask": tokenizer(val_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                "labels": np.array([[model.config.label2id[token] for token in label.split()] for label in val_labels], dtype='object')}
    
    test_data = {"input_ids": tokenizer(test_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["input_ids"],
                 "attention_mask": tokenizer(test_sentences, truncation=True, max_length=MAX_LENGTH, padding='max_length', return_tensors="tf")["attention_mask"],
                 "labels": np.array([[model.config.label2id[token] for token in label.split()] for label in test_labels], dtype='object')}

    
    # assuming train_labels is a list of lists of integer-encoded labels
    padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
        train_data['labels'],
        maxlen=MAX_LENGTH,
        padding='post',
        truncating='post',
        value=0  # or any other value to use for padding
    )

    # Convert to tensor
    train_data['labels'] = tf.convert_to_tensor(padded_labels)
    
    # assuming train_labels is a list of lists of integer-encoded labels
    padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
        val_data['labels'],
        maxlen=MAX_LENGTH,
        padding='post',
        truncating='post',
        value=0  # or any other value to use for padding
    )

    # Convert to tensor
    val_data['labels'] = tf.convert_to_tensor(padded_labels)
    
    # assuming train_labels is a list of lists of integer-encoded labels
    padded_labels = tf.keras.preprocessing.sequence.pad_sequences(
        test_data['labels'],
        maxlen=MAX_LENGTH,
        padding='post',
        truncating='post',
        value=0  # or any other value to use for padding
    )

    # Convert to tensor
    test_data['labels'] = tf.convert_to_tensor(padded_labels)
    
    return train_data, val_data, test_data

train_data, val_data, test_data =  prepare_data(bio_files_dir)

In [10]:

print("TRAINING DATA")
print(f"The shape of input ids tensor of train data is {train_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of train data is {train_data['attention_mask'].shape}")
print(f"The shape of labels tensor of train data is {train_data['labels'].shape}")

print("\nVALIDATION DATA")
print(f"The shape of input ids tensor of validation data is {val_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of validation data is {val_data['attention_mask'].shape}")
print(f"The shape of labels tensor of validation data is {val_data['labels'].shape}")

print("\nTEST DATA")
print(f"The shape of input ids tensor of test data is {test_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of test data is {test_data['attention_mask'].shape}")
print(f"The shape of labels tensor of test data is {test_data['labels'].shape}")

TRAINING DATA
The shape of input ids tensor of train data is (2696, 200)
The shape of attention masks tensor of train data is (2696, 200)
The shape of labels tensor of train data is (2696, 200)

VALIDATION DATA
The shape of input ids tensor of validation data is (907, 200)
The shape of attention masks tensor of validation data is (907, 200)
The shape of labels tensor of validation data is (907, 200)

TEST DATA
The shape of input ids tensor of test data is (938, 200)
The shape of attention masks tensor of test data is (938, 200)
The shape of labels tensor of test data is (938, 200)


## Creating Tensorflow Datasets from preprocessed data

In [11]:
# Create TensorFlow datasets
train_dataset = tf.data.Dataset.from_tensor_slices((train_data["input_ids"], train_data["attention_mask"], train_data["labels"])).batch(BATCH_SIZE)
val_dataset = tf.data.Dataset.from_tensor_slices((val_data["input_ids"], val_data["attention_mask"], val_data["labels"])).batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((test_data["input_ids"], test_data["attention_mask"], test_data["labels"])).batch(BATCH_SIZE)

In [12]:
print("TRAINING DATASET")
print(f"Number of batches in train dataset: {len(train_dataset)}")
print(f"Shape of the batches: {train_dataset.element_spec}", )

print("\nVALIDATION DATASET")
print(f"Number of batches in validation dataset: {len(val_dataset)}")
print(f"Shape of the batches: {val_dataset.element_spec}", )

print("\nTEST DATASET")
print(f"Number of batches in test dataset: {len(test_dataset)}")
print(f"Shape of the batches: {test_dataset.element_spec}", )

TRAINING DATASET
Number of batches in train dataset: 85
Shape of the batches: (TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None))

VALIDATION DATASET
Number of batches in validation dataset: 29
Shape of the batches: (TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None))

TEST DATASET
Number of batches in test dataset: 30
Shape of the batches: (TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None), TensorSpec(shape=(None, 200), dtype=tf.int32, name=None))


In [64]:
# Define the optimizer, loss function and metrics for training
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metrics = [tf.keras.metrics.SparseCategoricalAccuracy("accuracy")]

## Compile the model

In [65]:
model.compile(optimizer=optimizer, loss=loss, metrics=metrics)

## Train the model

In [17]:
history = model.fit(
    x = [train_data['input_ids'], train_data['attention_mask']],
    y = train_data['labels'],
    validation_data=([val_data['input_ids'], val_data['attention_mask']], val_data['labels']), 
    epochs=NUM_EPOCHS
)

Epoch 1/2


2023-04-09 11:10:51.904811: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/2


<keras.callbacks.History at 0x2c408c8e0>

In [None]:
import matplotlib.pyplot as plt

plt.plot(history.history['accuracy'])
plt.show()

In [19]:
# Evaluate the model on the train data
model.evaluate(
    x = [train_data['input_ids'], train_data['attention_mask']],
    y = train_data['labels']
)



[0.3379139304161072, 0.9468490481376648]

In [20]:
# Evaluate the model on the train data
model.evaluate(
    x = [test_data['input_ids'], test_data['attention_mask']],
    y = test_data['labels']
)



[0.3348983824253082, 0.9477131962776184]

## Save the Model

In [66]:
model.save(os.path.join(model_dir, 'model_9'))

























INFO:tensorflow:Assets written to: ../models/model_9/assets


INFO:tensorflow:Assets written to: ../models/model_9/assets


In [67]:
# Load the model
loaded_model = tf.keras.models.load_model(os.path.join(model_dir, 'model_9'))

In [72]:
# Extract the model from the Loader object
loaded_model_1 = loaded_model.signatures['serving_default']

## Prediction

In [68]:
loaded_model.summary()

Model: "tf_distil_bert_for_token_classification_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 distilbert (TFDistilBertMai  multiple                 66362880  
 nLayer)                                                         
                                                                 
 dropout_58 (Dropout)        multiple                  0         
                                                                 
 classifier (Dense)          multiple                  64596     
                                                                 
Total params: 66,427,476
Trainable params: 66,427,476
Non-trainable params: 0
_________________________________________________________________


In [84]:
import spacy
from spacy import displacy

def display_pred(text, entities):
    nlp = spacy.load("en_core_web_sm", disable=['ner'])
    # Generate the entities in Spacy format
    doc = nlp(text)
    # Add the predicted named entities to the Doc object
    for start, end, label in entities:
        span = doc.char_span(start, end, label=label)
        if span is not None:
            doc.ents += tuple([span])

    colors = {"Activity": "#f9d5e5",
              "Administration": "#f7a399",
              "Age": "#f6c3d0",
              "Area": "#fde2e4",
              "Biological_attribute": "#d5f5e3",
              "Biological_structure": "#9ddfd3",
              "Clinical_event": "#77c5d5",
              "Color": "#a0ced9",
              "Coreference": "#e3b5a4",
              "Date": "#f1f0d2",
              "Detailed_description": "#ffb347",
              "Diagnostic_procedure": "#c5b4e3",
              "Disease_disorder": "#c4b7ea",
              "Distance": "#bde0fe",
              "Dosage": "#b9e8d8",
              "Duration": "#ffdfba",
              "Family_history": "#e6ccb2",
              "Frequency": "#e9d8a6",
              "Height": "#f2eecb",
              "History": "#e2f0cb",
              "Lab_value": "#f4b3c2",
              "Mass": "#f4c4c3",
              "Medication": "#f9d5e5",
              "Nonbiological_location": "#f7a399",
              "Occupation": "#f6c3d0",
              "Other_entity": "#d5f5e3",
              "Other_event": "#9ddfd3",
              "Outcome": "#77c5d5",
              "Personal_background": "#a0ced9",
              "Qualitative_concept": "#e3b5a4",
              "Quantitative_concept": "#f1f0d2",
              "Severity": "#ffb347",
              "Sex": "#c5b4e3",
              "Shape": "#c4b7ea",
              "Sign_symptom": "#bde0fe",
              "Subject": "#b9e8d8",
              "Texture": "#ffdfba",
              "Therapeutic_procedure": "#e6ccb2",
              "Time": "#e9d8a6",
              "Volume": "#f2eecb",
              "Weight": "#e2f0cb"}
    options = {"compact": True, "bg": "#F8F8F8",
               "ents": list(colors.keys()),
               "colors": colors}

    # Generate the HTML visualization
    html = displacy.render(doc, style="ent", options=options)

In [83]:
text = "A 57-year-old man presented to the emergency department with a 2-day history of worsening shortness of breath and chest pain. He reported no recent travel or sick contacts. His medical history was significant for hypertension, dyslipidemia, and type 2 diabetes mellitus. On examination, he was tachycardic and tachypneic, with oxygen saturation of 88% on room air. Chest radiography revealed bilateral opacities consistent with pulmonary edema. The patient was admitted to the intensive care unit for management of acute decompensated heart failure. He was started on intravenous diuretics and inotropic support with dobutamine. Over the next several days, his symptoms improved and he was discharged to home with instructions to follow up with his primary care provider in 1 week."
# Tokenize the input sentence
encoded = tokenizer.encode_plus(text, return_tensors="tf", return_offsets_mapping=True)

input_ids = encoded['input_ids']
attention_mask = encoded['attention_mask']

inputs = {
    'input_ids': input_ids,
    'attention_mask': attention_mask
}

offsets = encoded['offset_mapping'][0].numpy()


# Get the model predictions
outputs = loaded_model_1(input_ids=input_ids, attention_mask=attention_mask)['logits']
predictions = tf.argmax(outputs, axis=-1)

# # Convert the predicted label ids to label names

predicted_labels = [model.config.id2label[prediction] for prediction in predictions[0].numpy()]

entities = []
prev_tag = None
prev_end = -1

for start_end, label in zip(offsets, predicted_labels):
    start = start_end[0]
    end = start_end[1]
    if label != 'O':
        tag = label[2:]
        if len(entities) > 0:
                prev_end = entities[-1][1]
                prev_start = entities[-1][0]
                prev_tag = entities[-1][2]
        if prev_tag == tag and (prev_end == start or  prev_end+1 == start):
            entities[-1] = (prev_start, end, tag)
        else:
            entities.append((start, end, tag))

In [85]:
display_pred(text, entities)