In [6]:
import os
import numpy as np
import random
import re

import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense

In [1]:
bio_files_dir = '../data/BIO_FILES'

## Defining Parameters

In [2]:
VOCAB_SIZE = 100000
EMBEDDING_DIM = 128
MAX_LENGTH = 200
NUM_CLASSES = 35
LSTM_UNITS = 64
NUM_EPOCHS = 10

TEST_SIZE = 0.2

## Data Cleaning

In [7]:
def clean_text(text):
    # Remove URLs
    text = re.sub(r'http\S+', '', text)
    # Remove punctuation and convert to lowercase
    text = re.sub(r'[^\w\s]', '', text).lower()
    # Remove extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

## Preprocess and Load Data

In [8]:

def load_data(data_dir, max_length=None):

    # Load all files in the data directory
    all_files = os.listdir(data_dir)

    # Filter only the files with the .bio extension
    bio_files = [f for f in all_files if f.endswith('.bio')]

    # Initialize lists to hold sentences and labels
    sentences = []
    labels = []

    # Loop through each file and read the sentences and labels
    for file in bio_files:
        with open(os.path.join(data_dir, file), 'r', encoding='utf-8') as f:
            current_sentences = []
            current_labels = []
            for line in f:
                if line.strip() == '':
                    # If we encounter a blank line, it means we've reached the end of a sentence
                    if len(current_sentences) > 0:
                        # Add the current sentence and labels to the list
                        sentences.append(current_sentences)
                        labels.append(current_labels)
                        # Reset the current sentence and labels lists
                        current_sentences = []
                        current_labels = []
                else:
                    # Otherwise, split the line into its word and label components
                    word, label = line.strip().split('\t')
                    current_sentences.append(clean_text(word))
                    current_labels.append(label)

    # Shuffle the sentences and labels
    combined = list(zip(sentences, labels))
    random.shuffle(combined)
    sentences[:], labels[:] = zip(*combined)

    # Split the data into training, validation, and test sets

    num_sentences = len(sentences)
    num_train = int(num_sentences * (1 - TEST_SIZE - 0.1))
    num_valid = int(num_sentences * 0.1)

    train_sentences = sentences[:num_train]
    train_labels = labels[:num_train]

    valid_sentences = sentences[num_train:num_train+num_valid]
    valid_labels = labels[num_train:num_train+num_valid]

    test_sentences = sentences[num_train+num_valid:]
    test_labels = labels[num_train+num_valid:]

    # Convert the labels to one-hot encoding
    unique_labels = set(element for sublist in labels for element in sublist)
    label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
    index_to_label = {id: label for label, id in label_to_index.items()}

    # Add the new label and ID to the dictionaries
    label_to_index['<PAD>'] = 0
    index_to_label[0] = '<PAD>'

    num_classes = len(index_to_label) - 1

    train_labels = [[label_to_index[label] for label in labels] for labels in train_labels]
    train_labels = pad_sequences(train_labels, maxlen=max_length, padding='post', value=num_classes)
    train_labels = to_categorical(train_labels, num_classes=num_classes+1)

    valid_labels = [[label_to_index[label] for label in labels] for labels in valid_labels]
    valid_labels = pad_sequences(valid_labels, maxlen=max_length, padding='post', value=num_classes)
    valid_labels = to_categorical(valid_labels, num_classes=num_classes+1)

    test_labels = [[label_to_index[label] for label in labels] for labels in test_labels]
    test_labels = pad_sequences(test_labels, maxlen=max_length, padding='post', value=num_classes)
    test_labels = to_categorical(test_labels, num_classes=num_classes+1)

    return (train_sentences, train_labels), (valid_sentences, valid_labels), (test_sentences, test_labels), label_to_index, index_to_label

In [9]:
(train_sentences, train_labels), (val_sentences, val_labels), (test_sentences, test_labels), label2id, id2label = load_data(bio_files_dir, MAX_LENGTH)

In [10]:
len(train_sentences), train_labels.shape, len(val_sentences), val_labels.shape, len(test_sentences), test_labels.shape

(3178, (3178, 200, 35), 454, (454, 200, 35), 909, (909, 200, 35))

## Creating Sequence and Padding

In [11]:
# Convert the input sentences to sequences of word indices
tokenizer = Tokenizer(num_words=VOCAB_SIZE)
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(val_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad the sequences to a fixed length
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

## Building Model

In [12]:
# Define the model architecture
model = tf.keras.models.Sequential([
    Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),
    Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),
    Dense(NUM_CLASSES, activation='softmax')
])

#
# model = tf.keras.models.Sequential([
#     Embedding(input_dim=VOCAB_SIZE, output_dim=EMBEDDING_DIM, input_length=MAX_LENGTH),
#     Bidirectional(LSTM(units=LSTM_UNITS, return_sequences=True)),
#     Dense(64, activation='relu'),
#     Dense(NUM_CLASSES, activation='softmax')
# ])


## Compile the model

In [13]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [14]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 128)          12800000  
                                                                 
 bidirectional (Bidirectiona  (None, 200, 128)         98816     
 l)                                                              
                                                                 
 dense (Dense)               (None, 200, 35)           4515      
                                                                 
Total params: 12,903,331
Trainable params: 12,903,331
Non-trainable params: 0
_________________________________________________________________


In [15]:
# Train the model
model.fit(train_sequences_padded, train_labels, epochs=NUM_EPOCHS, validation_data=(val_sequences_padded, val_labels))

# Evaluate the model
test_loss, test_acc = model.evaluate(test_sequences_padded, test_labels)

# Print the test accuracy
print('Test accuracy:', test_acc)

Epoch 1/10


2023-04-03 18:47:42.570335: W tensorflow/core/platform/profile_utils/cpu_utils.cc:128] Failed to get CPU frequency: 0 Hz


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.9680582880973816


## Prediction

In [16]:
import re
import string
def tokenize_text(text):
    # Tokenize the text into a list of words
    tokens = []
    for sentence in text.split('\n'):
        for word in sentence.split():
            # Remove trailing punctuation marks from the word
            while word and word[-1] in string.punctuation:
                word = word[:-1]
            tokens.append(word)
    return tokens

def predict(text):
    # tokens = re.findall(r'\b\w+\b', text)
    tokens = [clean_text(token) for token in tokenize_text(text)]

    sequence = tokenizer.texts_to_sequences([' '.join(token for token in tokens)])
    padded_sequence = pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post')

    # Make the prediction
    prediction = model.predict(np.array(padded_sequence))

    # Decode the prediction
    predicted_labels = np.argmax(prediction, axis=-1)
    predicted_labels = [id2label[i] for i in predicted_labels[0]]

    # Print the predicted named entities
    print("Predicted Named Entities:")
    for i in range(len(tokens)):
        print(f"{tokens[i]}: {predicted_labels[i]}")


In [40]:
predict("The patient is a 55-year-old male with a history of hypertension and diabetes. He presented to the emergency department with complaints of chest pain, shortness of breath, and dizziness. The patient's blood pressure was 180/110 mmHg and his heart rate was 110 beats per minute.")

Predicted Named Entities:
the: O
patient: O
is: O
a: O
55yearold: B-Disease_disorder
male: B-Medication
with: O
a: O
history: O
of: O
hypertension: I-History
and: O
diabetes: I-History
he: O
presented: I-History
to: O
the: O
emergency: O
department: O
with: O
complaints: O
of: O
chest: B-Biological_structure
pain: B-Sign_symptom
shortness: B-Sign_symptom
of: O
breath: I-Sign_symptom
and: O
dizziness: B-Sign_symptom
the: O
patients: O
blood: B-Diagnostic_procedure
pressure: I-Diagnostic_procedure
was: O
180110: I-Lab_value
mmhg: O
and: O
his: B-Diagnostic_procedure
heart: I-Diagnostic_procedure
rate: O
was: O
110: I-Lab_value
beats: I-Lab_value
per: I-Lab_value
minute: O


In [41]:
predict("The patient's cranial nerves were intact during the physical exam.")

Predicted Named Entities:
the: O
patients: O
cranial: B-Biological_structure
nerves: I-Biological_structure
were: O
intact: O
during: O
the: O
physical: B-Diagnostic_procedure
exam: O


In [42]:
predict("The patient presented with acute abdominal pain, nausea, and vomiting, and was diagnosed with acute appendicitis.")

Predicted Named Entities:
the: O
patient: O
presented: O
with: O
acute: O
abdominal: B-Biological_structure
pain: B-Sign_symptom
nausea: B-Sign_symptom
and: O
vomiting: B-Sign_symptom
and: O
was: O
diagnosed: O
with: O
acute: O
appendicitis: B-Disease_disorder


In [43]:
predict("The biopsies revealed the presence of malignancy in the patient's tissue samples.")

Predicted Named Entities:
the: O
biopsies: B-Diagnostic_procedure
revealed: O
the: O
presence: O
of: O
malignancy: B-Sign_symptom
in: O
the: O
patients: O
tissue: O
samples: O


In [44]:
predict("The patient was prescribed prednisone to help manage their autoimmune disorder.")

Predicted Named Entities:
the: O
patient: O
was: O
prescribed: O
prednisone: B-Medication
to: O
help: O
manage: O
their: B-Medication
autoimmune: B-Medication
disorder: O


In [45]:
predict("The patient underwent successful removal of a nodule from their thyroid gland.")

Predicted Named Entities:
the: O
patient: O
underwent: O
successful: O
removal: O
of: O
a: O
nodule: B-Sign_symptom
from: O
their: O
thyroid: B-Disease_disorder
gland: O


In [21]:
id2label

{1: 'B-Age',
 2: 'B-Biological_attribute',
 3: 'B-Biological_structure',
 4: 'B-Clinical_event',
 5: 'B-Diagnostic_procedure',
 6: 'B-Disease_disorder',
 7: 'B-Dosage',
 8: 'B-Family_history',
 9: 'B-Height',
 10: 'B-History',
 11: 'B-Lab_value',
 12: 'B-Mass',
 13: 'B-Medication',
 14: 'B-Sex',
 15: 'B-Sign_symptom',
 16: 'B-Therapeutic_procedure',
 17: 'B-Weight',
 18: 'I-Age',
 19: 'I-Biological_attribute',
 20: 'I-Biological_structure',
 21: 'I-Clinical_event',
 22: 'I-Diagnostic_procedure',
 23: 'I-Disease_disorder',
 24: 'I-Dosage',
 25: 'I-Family_history',
 26: 'I-Height',
 27: 'I-History',
 28: 'I-Lab_value',
 29: 'I-Mass',
 30: 'I-Medication',
 31: 'I-Sign_symptom',
 32: 'I-Therapeutic_procedure',
 33: 'I-Weight',
 34: 'O',
 0: '<PAD>'}