In [29]:
import os
import numpy as np
import random
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense

In [30]:
bio_files_dir = '../data/NEW_BIO_FILES'

## Defining Parameters

In [31]:
VOCAB_SIZE = 500000
EMBEDDING_DIM = 512
MAX_LENGTH = 200
LSTM_UNITS = 64
NUM_EPOCHS = 10

TEST_SIZE = 0.2

## Data Cleaning

In [32]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

## Preprocess and Load Data

In [33]:

def load_data(data_dir, max_length=None):

    # Load all files in the data directory
    all_files = os.listdir(data_dir)

    # Filter only the files with the .bio extension
    bio_files = [f for f in all_files if f.endswith('.bio')]

    # Initialize lists to hold sentences and labels
    sentences = []
    labels = []

    # Loop through each file and read the sentences and labels
    for file in bio_files:
        with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
            current_sentences = []
            current_labels = []
            for line in f:
                if line.strip() == '':
                    # If we encounter a blank line, it means we've reached the end of a sentence
                    if len(current_sentences) > 0:
                        # Add the current sentence and labels to the list
                        sentences.append(current_sentences)
                        labels.append(current_labels)
                        # Reset the current sentence and labels lists
                        current_sentences = []
                        current_labels = []
                else:
                    # Otherwise, split the line into its word and label components
                    word, label = line.strip().split('\t')
                    current_sentences.append(clean_text(word))
                    current_labels.append(label)

    # Shuffle the sentences and labels
    combined = list(zip(sentences, labels))
    random.shuffle(combined)
    sentences[:], labels[:] = zip(*combined)

    # Split the data into training, validation, and test sets

    num_sentences = len(sentences)
    num_train = int(num_sentences * (1 - TEST_SIZE - 0.1))
    num_valid = int(num_sentences * 0.1)

    train_sentences = sentences[:num_train]
    train_labels = labels[:num_train]

    valid_sentences = sentences[num_train:num_train+num_valid]
    valid_labels = labels[num_train:num_train+num_valid]

    test_sentences = sentences[num_train+num_valid:]
    test_labels = labels[num_train+num_valid:]

    # Convert the labels to one-hot encoding
    unique_labels = set(element for sublist in labels for element in sublist)
    label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
    index_to_label = {id: label for label, id in label_to_index.items()}

    # Add the new label and ID to the dictionaries
    label_to_index['<PAD>'] = 0
    index_to_label[0] = '<PAD>'

    num_classes = len(index_to_label) - 1

    train_labels = [[label_to_index[label] for label in labels] for labels in train_labels]
    train_labels = pad_sequences(train_labels, maxlen=max_length, padding='post', value=num_classes)
    train_labels = to_categorical(train_labels, num_classes=num_classes+1)

    valid_labels = [[label_to_index[label] for label in labels] for labels in valid_labels]
    valid_labels = pad_sequences(valid_labels, maxlen=max_length, padding='post', value=num_classes)
    valid_labels = to_categorical(valid_labels, num_classes=num_classes+1)

    test_labels = [[label_to_index[label] for label in labels] for labels in test_labels]
    test_labels = pad_sequences(test_labels, maxlen=max_length, padding='post', value=num_classes)
    test_labels = to_categorical(test_labels, num_classes=num_classes+1)

    return (train_sentences, train_labels), (valid_sentences, valid_labels), (test_sentences, test_labels), label_to_index, index_to_label

In [34]:
(train_sentences, train_labels), (val_sentences, val_labels), (test_sentences, test_labels), label2id, id2label = load_data(bio_files_dir, MAX_LENGTH)

In [35]:
NUM_CLASSES = len(id2label)

In [36]:
len(train_sentences), train_labels.shape, len(val_sentences), val_labels.shape, len(test_sentences), test_labels.shape

(3178, (3178, 200, 35), 454, (454, 200, 35), 909, (909, 200, 35))

## Creating Sequence and Padding

In [40]:
# Convert the input sentences to sequences of word indices
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad the sequences to a fixed length
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

## Building Model

In [49]:
# Define the model architecture
model = tf.keras.models.Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),
    Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),
    Dense(NUM_CLASSES, activation='softmax')
])

#
# model = tf.keras.models.Sequential([
#     Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),
#     Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),
#     Dense(64, activation='relu'),
#     Dense(NUM_CLASSES, activation='softmax')
# ])


## Compile the model

In [50]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [51]:
model.summary()

Model: "sequential_10"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 512)          256000512 
                                                                 
 bidirectional_10 (Bidirecti  (None, 200, 128)         295424    
 onal)                                                           
                                                                 
 dense_10 (Dense)            (None, 200, 35)           4515      
                                                                 
Total params: 256,300,451
Trainable params: 256,300,451
Non-trainable params: 0
_________________________________________________________________


In [52]:
# Train the model
model.fit(train_sequences_padded, train_labels, epochs=NUM_EPOCHS, validation_data=(val_sequences_padded, val_labels))

# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences_padded, test_labels)

# Print the test accuracy
print('Test accuracy:', test_acc)

Epoch 1/10
 15/100 [===>..........................] - ETA: 53s - loss: 1.6075 - accuracy: 0.8995

KeyboardInterrupt: 

## Prediction

In [48]:
import re
import string
def tokenize_text(text):
    # Tokenize the text into a list of words
    tokens = []
    for sentence in text.split('\n'):
        for word in sentence.split():
            # Remove trailing punctuation marks from the word
            while word and word[-1] in string.punctuation:
                word = word[:-1]
            tokens.append(word)
    return tokens

def predict(text):
    # tokens = re.findall(r'\b\w+\b', text)
    tokens = [clean_text(token) for token in tokenize_text(text)]

    sequence = tokenizer.texts_to_sequences([' '.join(token for token in tokens)])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post')

    # Make the prediction
    prediction = model.predict(np.array(padded_sequence))

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)
    predicted_labels = [id2label[i] for i in predicted_labels[0]]

    # Print the predicted named entities
    print("Predicted Named Entities:")
    for i in range(len(tokens)):
        print(f"{tokens[i]}: {predicted_labels[i]}")


In [49]:
predict("The patient is a 55-year-old male with a history of hypertension and diabetes. He presented to the emergency department with complaints of chest pain, shortness of breath, and dizziness. The patient's blood pressure was 180/110 mmHg and his heart rate was 110 beats per minute.")

Predicted Named Entities:
the: O
patient: O
is: O
a: O
55yearold: O
male: O
with: O
a: O
history: O
of: O
hypertension: O
and: O
diabetes: O
he: O
presented: O
to: O
the: O
emergency: O
department: O
with: O
complaints: O
of: O
chest: O
pain: O
shortness: O
of: O
breath: O
and: O
dizziness: O
the: O
patients: O
blood: O
pressure: O
was: O
180110: O
mmhg: O
and: O
his: O
heart: O
rate: O
was: O
110: O
beats: O
per: O
minute: O


In [50]:
predict("The patient's cranial nerves were intact during the physical exam.")

Predicted Named Entities:
the: O
patients: O
cranial: O
nerves: O
were: O
intact: O
during: O
the: O
physical: O
exam: O


In [51]:
predict("The patient presented with acute abdominal pain, nausea, and vomiting, and was diagnosed with acute appendicitis.")

Predicted Named Entities:
the: O
patient: O
presented: O
with: O
acute: O
abdominal: O
pain: O
nausea: O
and: O
vomiting: O
and: O
was: O
diagnosed: O
with: O
acute: O
appendicitis: O


In [52]:
predict("The biopsies revealed the presence of malignancy in the patient's tissue samples.")

Predicted Named Entities:
the: O
biopsies: O
revealed: O
the: O
presence: O
of: O
malignancy: O
in: O
the: O
patients: O
tissue: O
samples: O


In [53]:
predict("The patient was prescribed prednisone to help manage their autoimmune disorder.")

Predicted Named Entities:
the: O
patient: O
was: O
prescribed: O
prednisone: O
to: O
help: O
manage: O
their: O
autoimmune: O
disorder: O


In [54]:
predict("The patient underwent successful removal of a nodule from their thyroid gland.")

Predicted Named Entities:
the: O
patient: O
underwent: O
successful: O
removal: O
of: O
a: O
nodule: O
from: O
their: O
thyroid: O
gland: O


In [55]:
id2label

{1: 'B-MedicalCondition',
 2: 'B-Medicine',
 3: 'B-Pathogen',
 4: 'I-MedicalCondition',
 5: 'I-Medicine',
 6: 'I-Pathogen',
 7: 'O',
 0: '<PAD>'}

In [65]:
num_examples, max_seq_length, num_classes = train_labels.shape

class_counts = np.sum(train_labels, axis=(0, 1))  # sum along first two axes to get class counts

max_class_index = np.argmax(class_counts)

print(f"The class with index {max_class_index} has the maximum number of examples: {class_counts[max_class_index]}")


The class with index 7 has the maximum number of examples: 15076.0


In [67]:
id2label[7]

'O'

In [16]:
# get the shape of the array
shape = train_labels.shape

# loop over the first axis and count the number of examples
count = np.zeros(shape[2])
for i in range(shape[0]):
    for j in range(shape[1]):
        for k in range(shape[2]):
            if train_labels[i, j, k] == 1:
                count[k] += 1

# find the num_class with the maximum number of examples
max_class = np.argmax(count)

print("Num_class with maximum examples:", max_class)
print("Number of examples:", count[max_class])

# print the indices in sorting order of which has maximum examples
print("Indices in sorting order of which has maximum examples:")
for i in np.argsort(-count):
    print("Num_class", i, ":", count[i])

Num_class with maximum examples: 34
Number of examples: 613156.0
Indices in sorting order of which has maximum examples:
Num_class 34 : 613156.0
Num_class 5 : 3231.0
Num_class 22 : 2440.0
Num_class 15 : 2328.0
Num_class 3 : 2041.0
Num_class 11 : 1922.0
Num_class 20 : 1654.0
Num_class 28 : 1459.0
Num_class 31 : 981.0
Num_class 6 : 905.0
Num_class 27 : 892.0
Num_class 13 : 752.0
Num_class 16 : 667.0
Num_class 23 : 555.0
Num_class 4 : 425.0
Num_class 24 : 390.0
Num_class 32 : 383.0
Num_class 25 : 313.0
Num_class 10 : 245.0
Num_class 7 : 238.0
Num_class 30 : 194.0
Num_class 1 : 140.0
Num_class 14 : 132.0
Num_class 8 : 67.0
Num_class 21 : 40.0
Num_class 18 : 29.0
Num_class 2 : 6.0
Num_class 17 : 3.0
Num_class 33 : 3.0
Num_class 29 : 2.0
Num_class 12 : 2.0
Num_class 9 : 2.0
Num_class 26 : 2.0
Num_class 19 : 1.0
Num_class 0 : 0.0


In [19]:
id2label[5]

'B-Diagnostic_procedure'

In [10]:
# Define the CRF model
def crf_model(features, labels, mode):
    # Create a word embeddings layer
    word_embeddings = tf.keras.layers.Embedding(input_dim=VOCAB_SIZE, output_dim=50)(features['word'])

    # Create a Bidirectional LSTM layer
    lstm = tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=50, return_sequences=True))(word_embeddings)

    # Create a CRF layer
    crf = tf.keras.layers.CRF(VOCAB_SIZE, name='crf_layer')
    output = crf(lstm)

    # Compile the model
    model = tf.keras.Model(inputs=features, outputs=output)
    model.compile(optimizer='adam', loss=crf.loss, metrics=[crf.accuracy])

    return model

In [11]:
train_sequences_padded.shape, train_labels.shape

((3178, 200), (3178, 200, 35))

In [44]:
import tensorflow as tf
import numpy as np
import sklearn_crfsuite
from sklearn_crfsuite import metrics

# Define the CRF model
def crf_model(input_shape, num_labels):
    model = tf.keras.Sequential([
        tf.keras.layers.Masking(mask_value=0, input_shape=input_shape),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(units=50, return_sequences=True)),
        tf.keras.layers.Dense(num_labels),
        tf.keras.layers.Activation('softmax')
    ])

    # crf = sklearn_crfsuite.metrics.flat_f1_score
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

    return model

# Remove the extra dimension and add the timesteps dimension to train_sequences_padded
# train_sequences_padded = np.expand_dims(train_sequences_padded, axis=-1)

# Define the input shape and number of labels
input_shape = train_sequences_padded.shape[1:]
num_labels = train_labels.shape[-1]

# Define the CRF model
model = crf_model(input_shape, num_labels)

# Train the model
model.fit(train_sequences_padded, train_labels, batch_size=32, epochs=10, validation_split=0.2)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x296d37310>

In [41]:
train_sequences_padded.shape

(3178, 200)

In [42]:
train_sequences_padded.shape, train_labels.shape

((3178, 200), (3178, 200, 35))

In [45]:
import re
import string


In [46]:
def tokenize_text(text):
    # Tokenize the text into a list of words
    tokens = []
    for sentence in re.split(r'\n', text):
        for word in sentence.split():

            # Remove the format [%d]
            word = re.sub(r'\[?\d+\]', '', word)

            word = word.strip()

            # Remove trailing punctuation marks from the word
            while word and word[-1] in string.punctuation:
                word = word[:-1]

            # Remove leading punctutation marks from the word
            while word and word[0] in string.punctuation:
                word = word[1:]
            if word:
                tokens.append(word)

        if tokens[-1] != "<NEWL>":
            tokens.append("<NEWL>")

    return tokens

In [47]:

def predict(text):
    # tokens = re.findall(r'\b\w+\b', text)
    tokens = [clean_text(token) for token in tokenize_text(text)]

    sequence = tokenizer.texts_to_sequences([' '.join(token for token in tokens)])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post')

    # Add an extra dimension to match the input shape of the model
    padded_sequence_with_batch_size = np.expand_dims(padded_sequence, axis=-1)

    # Make the prediction
    prediction = model.predict(padded_sequence_with_batch_size)

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)
    predicted_labels = [id2label[i] for i in predicted_labels[0]]

    # Print the predicted named entities
    print("Predicted Named Entities:")
    for i in range(len(tokens)):
        print(f"{tokens[i]}: {predicted_labels[i]}")


In [48]:
predict("A 54-year-old man with a history of hypertension, hyperlipidemia, and a previous myocardial infarction presents to the emergency department with severe chest pain. He reports the pain began suddenly and has been getting progressively worse over the last hour. He also reports shortness of breath and nausea. On physical exam, his blood pressure is 180/100 mmHg, heart rate is 120 beats per minute, and respiratory rate is 24 breaths per minute. An electrocardiogram reveals ST-segment elevation in leads II, III, and aVF. The patient is immediately started on aspirin, heparin, and nitroglycerin, and is taken to the cardiac catheterization lab for emergent angiography and possible percutaneous coronary intervention.")

Predicted Named Entities:
a: O
54yearold: O
man: O
with: O
a: O
history: O
of: O
hypertension: O
hyperlipidemia: O
and: O
a: O
previous: O
myocardial: O
infarction: O
presents: O
to: O
the: O
emergency: O
department: O
with: O
severe: O
chest: O
pain: O
he: O
reports: O
the: O
pain: O
began: O
suddenly: O
and: O
has: O
been: O
getting: O
progressively: O
worse: O
over: O
the: O
last: O
hour: O
he: O
also: O
reports: O
shortness: O
of: O
breath: O
and: O
nausea: O
on: O
physical: O
exam: O
his: O
blood: O
pressure: O
is: O
180100: O
mmhg: O
heart: O
rate: O
is: O
120: O
beats: O
per: O
minute: O
and: O
respiratory: O
rate: O
is: O
24: O
breaths: O
per: O
minute: O
an: O
electrocardiogram: O
reveals: O
stsegment: O
elevation: O
in: O
leads: O
ii: O
iii: O
and: O
avf: O
the: O
patient: O
is: O
immediately: O
started: O
on: O
aspirin: O
heparin: O
and: O
nitroglycerin: O
and: O
is: O
taken: O
to: O
the: O
cardiac: O
catheterization: O
lab: O
for: O
emergent: O
angiography: O
and: O
possibl

In [None]:
# Assume you have a new input sequence stored in a variable called 'new_sequence'
# It should be of shape (1, sequence_length), where sequence_length is the length of the sequence you want to label

# Pad the new sequence to make it the same length as the training sequences
new_sequence_padded = pad_sequences([new_sequence], maxlen=MAX_LENGTH, padding='post', truncating='post')

# Add an extra dimension to match the input shape of the model
new_sequence_padded = np.expand_dims(new_sequence_padded, axis=-1)

# Use the model to predict the labels for the new sequence
predicted_labels = model.predict(new_sequence_padded)

# The predicted_labels variable will be an array of shape (1, sequence_length, num_labels)
# You can use np.argmax to get the index of the highest probability label for each token in the sequence
predicted_labels = np.argmax(predicted_labels, axis=-1)

# The predicted_labels variable will now be an array of shape (1, sequence_length) containing the predicted labels for each token in the sequence
