In [1]:
import os
import re
import nltk
import spacy
import string
import random
import pickle

import numpy as np
import matplotlib.pyplot as plt

from pprint import pprint
from nltk.corpus import stopwords
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

In [2]:
nltk.download('punkt')
nltk.download('stopwords')

nlp = spacy.load('en_core_web_sm')

STOP_WORDS = stopwords.words('english')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dever\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\dever\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import os

bio_files_dir = os.path.join("..", "annotated_dict")

# Check if the directory exists and list all .bio files
if os.path.exists(bio_files_dir):
    bio_files = [os.path.join(bio_files_dir, f) for f in os.listdir(bio_files_dir) if f.endswith('.bio')]
    pprint("Bio files identified")
else:
    pprint("Directory not found")

pprint(f"The total number of .bio files available in the dataset is {len(bio_files)}")

'Bio files identified'
'The total number of .bio files available in the dataset is 200'


In [4]:
STOP_WORDS = set(stopwords.words('english'))

# Check if any of the stopwords contain B-tag
for bio_file in bio_files:
    try:
        with open(bio_file, "r", encoding='utf-8') as f:
            for line in f:
                if line.strip() == '':
                    continue
                word, tag = line.strip().split('\t')
                if word.lower() in STOP_WORDS and tag.startswith('B'):
                    print(f"Stopword tagged as an entity in {os.path.basename(bio_file)}: {line.strip()}")
    except Exception as e:
        print(f"Error reading file {bio_file}: {e}")

Stopword tagged as an entity in 15939911.bio: up	B-Detailed_description
Stopword tagged as an entity in 15939911.bio: to	B-Sign_symptom
Stopword tagged as an entity in 15939911.bio: up	B-Clinical_event
Stopword tagged as an entity in 16778410.bio: both	B-Biological_structure
Stopword tagged as an entity in 16778410.bio: S	B-Diagnostic_procedure
Stopword tagged as an entity in 16778410.bio: no	B-Sign_symptom
Stopword tagged as an entity in 16778410.bio: for	B-Duration
Stopword tagged as an entity in 16778410.bio: both	B-Biological_structure
Stopword tagged as an entity in 16778410.bio: some	B-Detailed_description
Stopword tagged as an entity in 16778410.bio: few	B-Detailed_description
Stopword tagged as an entity in 16778410.bio: S	B-Diagnostic_procedure
Stopword tagged as an entity in 16778410.bio: doing	B-Outcome
Stopword tagged as an entity in 16778410.bio: no	B-Sign_symptom
Stopword tagged as an entity in 17803823.bio: no	B-History
Stopword tagged as an entity in 17803823.bio: no	B-

In [5]:
def clean_word(word: str) -> str:
    """
    Cleans and lemmatizes a word.

    Args:
        word (str): The word to clean and lemmatize.

    Returns:
        str: The lemmatized form of the word if it is not a stop word, otherwise an empty string.
    """
    # Remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]', '', word)
    word = re.sub(r'\s+', ' ', word).strip()

    word = word.lower()

    # Lemmatize the word
    doc = nlp(word)
    if not doc:
        return ''
    
    lemma = doc[0].lemma_

    # Check if the lemma is a stop word
    if lemma not in STOP_WORDS:
        return lemma

    return ''

In [6]:
def parse_data_from_file(bio_file):
    """
    Parses sentences and their corresponding BIO tags from a BIO-formatted file.

    Args:
        bio_file (str): Path to the BIO file containing token-tag pairs per line.

    Returns:
        list: A list of sentences, where each sentence is a list of tokens.
        list: A list of tag lists corresponding to the sentences.
    """
    if not os.path.exists(bio_file):
        raise FileNotFoundError(f"The file {bio_file} does not exist.")

    sentences = list()
    labels = list()
    current_sentences = list()
    current_labels = list()

    with open(bio_file, "r", encoding='utf-8') as file:
        for line in file:
            stripped_line = line.strip()
            if not stripped_line:
                if current_sentences:
                    sentences.append(current_sentences)
                    labels.append(current_labels)
                    current_sentences = list()
                    current_labels = list()
            else:
                word, tag = stripped_line.split('\t')
                cleaned_word = clean_word(word)
                if cleaned_word:
                    current_sentences.append(cleaned_word)
                    current_labels.append(tag)
        if current_sentences:  
            sentences.append(current_sentences)
            labels.append(current_labels)

    return sentences, labels

In [7]:
def parse_bio_files(bio_files):
    """
    Parses multiple BIO-formatted files to extract sentences and corresponding BIO tags.
    
    Args:
        bio_files (list): A list of paths to BIO-formatted files.
    
    Returns:
        tuple: Two lists, the first containing lists of words for each sentence across all files,
               the second containing lists of corresponding BIO tags for each sentence.
    """
    sentences = list()
    labels = list()
    
    for idx, bio_file in enumerate(bio_files):
        try:
            curr_sentences, curr_labels = parse_data_from_file(bio_file)
            if curr_sentences:
                sentences.extend(curr_sentences)
                labels.extend(curr_labels)
        except Exception as e:
            print(f"Error processing {bio_file}: {e}")
        
        if (idx + 1) % 20 == 0:
            print(f'{idx + 1} files processed.')

    # Log completion of all files
    if idx + 1 < 20 or (idx + 1) % 20 != 0:
        print(f'All {idx + 1} files processed.')

    return sentences, labels

In [8]:
sentences, labels = parse_bio_files(bio_files)

20 files processed.
40 files processed.
60 files processed.
80 files processed.
100 files processed.
120 files processed.
140 files processed.
160 files processed.
180 files processed.
200 files processed.


In [9]:
pprint(f"Dataset contains {len(sentences)} examples.")

'Dataset contains 4541 examples.'


In [10]:
data_indices = np.arange(len(sentences))
np.random.shuffle(data_indices)

TEST_SIZE = 0.2

# Splitting indices for training, validation, and test sets
num_train = int(len(sentences) * (1 - TEST_SIZE - 0.1))
num_valid = int(len(sentences) * 0.1)

train_indices = data_indices[:num_train]
valid_indices = data_indices[num_train:num_train+num_valid]
test_indices = data_indices[num_train+num_valid:]

# Using indices to create the datasets
train_sentences = [sentences[i] for i in train_indices]
train_labels = [labels[i] for i in train_indices]

valid_sentences = [sentences[i] for i in valid_indices]
valid_labels = [labels[i] for i in valid_indices]

test_sentences = [sentences[i] for i in test_indices]
test_labels = [labels[i] for i in test_indices]

print(f"Training set: {len(train_sentences)} samples")
print(f"Validation set: {len(valid_sentences)} samples")
print(f"Test set: {len(test_sentences)} samples")

Training set: 3178 samples
Validation set: 454 samples
Test set: 909 samples


In [11]:
unique_labels = set(element for sublist in labels for element in sublist)

label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
index_to_label = {id: label for label, id in label_to_index.items()}

# Adding padding to the dictionary
label_to_index['<PAD>'] = 79
index_to_label[79] = '<PAD>'

NUM_CLASSES = len(index_to_label)
NUM_CLASSES

pprint(label_to_index)

{'<PAD>': 79,
 'B-Activity': 1,
 'B-Administration': 2,
 'B-Age': 3,
 'B-Area': 4,
 'B-Biological_attribute': 5,
 'B-Biological_structure': 6,
 'B-Clinical_event': 7,
 'B-Color': 8,
 'B-Coreference': 9,
 'B-Date': 10,
 'B-Detailed_description': 11,
 'B-Diagnostic_procedure': 12,
 'B-Disease_disorder': 13,
 'B-Distance': 14,
 'B-Dosage': 15,
 'B-Duration': 16,
 'B-Family_history': 17,
 'B-Frequency': 18,
 'B-Height': 19,
 'B-History': 20,
 'B-Lab_value': 21,
 'B-Mass': 22,
 'B-Medication': 23,
 'B-Nonbiological_location': 24,
 'B-Occupation': 25,
 'B-Other_entity': 26,
 'B-Other_event': 27,
 'B-Outcome': 28,
 'B-Personal_background': 29,
 'B-Qualitative_concept': 30,
 'B-Quantitative_concept': 31,
 'B-Severity': 32,
 'B-Sex': 33,
 'B-Shape': 34,
 'B-Sign_symptom': 35,
 'B-Subject': 36,
 'B-Texture': 37,
 'B-Therapeutic_procedure': 38,
 'B-Time': 39,
 'B-Volume': 40,
 'B-Weight': 41,
 'I-Activity': 42,
 'I-Administration': 43,
 'I-Age': 44,
 'I-Area': 45,
 'I-Biological_attribute': 46,
 

In [12]:
MAX_LENGTH = 100  # Maximum length of sequences
NUM_CLASSES = len(label_to_index) + 1  # Total number of classes

def prepare_labels(labels_data, label_to_index, max_length, num_classes):
    """
    Convert text labels into padded and one-hot encoded numerical format.
    
    Args:
        labels_data (list of lists): The labels for each data point (each list of labels corresponds to a sequence).
        label_to_index (dict): Mapping from label names to indices.
        max_length (int): The maximum length for padding sequences.
        num_classes (int): The total number of classes.
    
    Returns:
        numpy array: Padded and one-hot encoded labels.
    """
    # Convert labels to index
    indexed_labels = [[label_to_index.get(label, num_classes-1) for label in sequence] for sequence in labels_data]
    
    # Pad sequences
    padded_labels = pad_sequences(indexed_labels, maxlen=max_length, padding='post', value=num_classes-1)
    
    one_hot_labels = to_categorical(padded_labels, num_classes=num_classes)
    
    return one_hot_labels

train_labels = prepare_labels(train_labels, label_to_index, MAX_LENGTH, NUM_CLASSES)
valid_labels = prepare_labels(valid_labels, label_to_index, MAX_LENGTH, NUM_CLASSES)
test_labels = prepare_labels(test_labels, label_to_index, MAX_LENGTH, NUM_CLASSES)

pprint(train_labels[0])
pprint(valid_labels[0])
pprint(test_labels[0])

array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])
array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])
array([[0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.],
       [0., 0., 0., ..., 0., 0., 1.]])


In [13]:
MAX_LENGTH = 100  # Assuming MAX_LENGTH is defined

# Initialize tokenizer with out-of-vocabulary token
tokenizer = Tokenizer(oov_token="<OOV>")

# Fit tokenizer on training sentences
tokenizer.fit_on_texts(train_sentences)

# Convert sentences to sequences
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(valid_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad sequences
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

In [14]:
pprint(train_sequences_padded)
pprint(val_sequences_padded)
pprint(test_sequences_padded)

array([[  39,  363,  842, ...,    0,    0,    0],
       [2569,    4, 1199, ...,    0,    0,    0],
       [ 932, 3694,  108, ...,    0,    0,    0],
       ...,
       [7035, 1579, 1112, ...,    0,    0,    0],
       [7037,   56,  246, ...,    0,    0,    0],
       [   7, 7038,    3, ...,    0,    0,    0]])
array([[ 460,    5,   24, ...,    0,    0,    0],
       [ 218,  111,  232, ...,    0,    0,    0],
       [ 300,  912,  426, ...,    0,    0,    0],
       ...,
       [ 299,  479,  267, ...,    0,    0,    0],
       [1367, 4484,  227, ...,    0,    0,    0],
       [ 238,  137, 1341, ...,    0,    0,    0]])
array([[3163,  110,   19, ...,    0,    0,    0],
       [ 157,    5,   24, ...,    0,    0,    0],
       [1681, 2860,  179, ...,    0,    0,    0],
       ...,
       [ 849, 2390,  600, ...,    0,    0,    0],
       [ 168,  171,   34, ...,    0,    0,    0],
       [ 123, 2576,    4, ...,    0,    0,    0]])


###  Saving transformed data as a .npz file:

In [15]:
file_path = os.path.join("..", "final_npz.npz")

# Save the datasets and label mappings into a compressed .npz file
np.savez(
    file_path,
    train_sequences_padded=train_sequences_padded,
    train_labels=train_labels,
    val_sequences_padded=val_sequences_padded,
    val_labels=valid_labels,
    test_sequences_padded=test_sequences_padded,
    test_labels=test_labels,
    label_to_index=label_to_index,
    index_to_label=index_to_label
)

if os.path.exists(file_path):
    print(f"File saved successfully at {file_path}")
else:
    print("Failed to save the file.")

print("Shape of train_labels:", train_labels.shape)

File saved successfully at ..\final_npz.npz
Shape of train_labels: (3178, 100, 84)


In [16]:
# Load the .npz file, ensuring it is allowed to load pickled objects
data = np.load(file_path, allow_pickle=True)

# List the arrays stored in the .npz file
array_names = data.files
print("Arrays in the .npz file:", array_names)

# Access and view the contents of individual arrays
for array_name in array_names:
    array_data = data[array_name]
    print(f"Array '{array_name}':")
    if isinstance(array_data, np.ndarray) and array_data.size < 10:
        pprint(array_data)
    else:
        pprint(f"Data too large to display, shape: {array_data.shape}")

Arrays in the .npz file: ['train_sequences_padded', 'train_labels', 'val_sequences_padded', 'val_labels', 'test_sequences_padded', 'test_labels', 'label_to_index', 'index_to_label']
Array 'train_sequences_padded':
'Data too large to display, shape: (3178, 100)'
Array 'train_labels':
'Data too large to display, shape: (3178, 100, 84)'
Array 'val_sequences_padded':
'Data too large to display, shape: (454, 100)'
Array 'val_labels':
'Data too large to display, shape: (454, 100, 84)'
Array 'test_sequences_padded':
'Data too large to display, shape: (909, 100)'
Array 'test_labels':
'Data too large to display, shape: (909, 100, 84)'
Array 'label_to_index':
array({'B-Activity': 1, 'B-Administration': 2, 'B-Age': 3, 'B-Area': 4, 'B-Biological_attribute': 5, 'B-Biological_structure': 6, 'B-Clinical_event': 7, 'B-Color': 8, 'B-Coreference': 9, 'B-Date': 10, 'B-Detailed_description': 11, 'B-Diagnostic_procedure': 12, 'B-Disease_disorder': 13, 'B-Distance': 14, 'B-Dosage': 15, 'B-Duration': 16, 'B-