In [2]:
import os
import random
import numpy as np
import pickle

import matplotlib.pyplot as plt

In [3]:
import spacy
import nltk
import string
#nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
STOP_WORDS = stopwords.words('english')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10060] A
[nltk_data]     connection attempt failed because the connected party
[nltk_data]     did not properly respond after a period of time, or
[nltk_data]     established connection failed because connected host
[nltk_data]     has failed to respond>


In [4]:
bio_files_dir = r'/Users/saikiran/Documents/NLP/MedNer-main/annotated_dictionary'
bio_files = [os.path.join(bio_files_dir, f) for f in os.listdir(r'/Users/saikiran/Documents/NLP/MedNer-main/annotated_dictionary') if f.endswith('.bio')]

In [5]:
print(f"The number of .bio files is {len(bio_files)}")

The number of .bio files is 200


In [6]:
# Check if any of the stopwords contain B-tag
for bio_file in bio_files:
    with open(bio_file, "r", encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            word, tag = line.strip().split('\t')
            if word in STOP_WORDS and tag.startswith('B'):
                print(line)

In [8]:
nlp = spacy.load('en_core_web_sm')
def clean_word(word):
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]','',word)
    word = re.sub(r'\s+',' ',word)
    
    # convert to lowercase
    word = word.lower()

    # lemmatize the word
    lemma = nlp(word)[0].lemma_
    
    # check if the lemma is a stop word
    if lemma not in STOP_WORDS:
        return lemma
    
    return ''


In [9]:
def parse_data_from_file(bio_file):
    """
      Reads the input file, which is assumed to have one token per line with tab-separated word and tag, 
      and extracts the sentences and corresponding BIO tags. 
      The returned tuple allows for easy access to both the words and their corresponding tags for further processing.
    """
    sentences = []
    labels = []    
    with open(bio_file, "r", encoding='utf-8') as f:
        current_sentences = []
        current_labels = []
        for line in f:
            if line.strip() == '':                  
                if len(current_sentences) > 0:                 
                    sentences.append(current_sentences)
                    labels.append(current_labels)
                    # Reset the current sentence and labels lists
                    current_sentences = []
                    current_labels = []
                    continue          
            word, tag = line.strip().split('\t')
            word = clean_word(word) 
            if word.strip():
                current_sentences.append(word)
                if len(current_labels) > 0:
                    if tag[2:] == current_labels[-1][2:] and tag[:2] == "B-":
                        tag = f"I-{tag[2:]}"
                current_labels.append(tag)      
    return sentences, labels

In [10]:
def parse_bio_files(bio_files): 
    sentences = []
    labels = []    
    for idx, bio_file in enumerate(bio_files):        
        curr_sentences, curr_labels = parse_data_from_file(bio_file)        
        if len(curr_sentences) > 0:
            sentences.extend(curr_sentences)
            labels.extend(curr_labels)            
        if (idx+1) % 20 == 0:
            print(f'{idx+1} completed')
    return sentences, labels

In [11]:
sentences,labels = parse_bio_files(bio_files)

20 completed
40 completed
60 completed
80 completed
100 completed
120 completed
140 completed
160 completed
180 completed
200 completed


In [12]:
print(f"Dataset contains {len(sentences)} examples\n")

Dataset contains 4341 examples



In [13]:
# Split the data into training, validation, and test sets

TEST_SIZE = 0.2

num_sentences = len(sentences)
num_train = int(num_sentences * (1 - TEST_SIZE - 0.1))
num_valid = int(num_sentences * 0.1)

In [14]:
num_train

3038

In [15]:
train_sentences = sentences[:num_train]
train_labels = labels[:num_train]



In [16]:
valid_sentences = sentences[num_train:num_train+num_valid]
valid_labels = labels[num_train:num_train+num_valid]

test_sentences = sentences[num_train+num_valid:]
test_labels = labels[num_train+num_valid:]

In [17]:
unique_labels = set(element for sublist in labels for element in sublist)

In [18]:
label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
index_to_label = {id: label for label, id in label_to_index.items()}

In [19]:
# Adding padding to the dictionary
label_to_index['<PAD>'] = 79
index_to_label[79] = '<PAD>'

In [20]:
NUM_CLASSES = len(index_to_label)
NUM_CLASSES

79

In [21]:
print(label_to_index)

{'B-ACT': 1, 'B-ADM': 2, 'B-AGE': 3, 'B-ARA': 4, 'B-BAT': 5, 'B-BST': 6, 'B-CLE': 7, 'B-COL': 8, 'B-COR': 9, 'B-DAT': 10, 'B-DET': 11, 'B-DIA': 12, 'B-DIS': 13, 'B-DOS': 14, 'B-DUR': 15, 'B-FAM': 16, 'B-FRE': 17, 'B-HEI': 18, 'B-HIS': 19, 'B-LAB': 20, 'B-MAS': 21, 'B-MED': 22, 'B-NBL': 23, 'B-OCC': 24, 'B-OTE': 25, 'B-OTH': 26, 'B-OUT': 27, 'B-PER': 28, 'B-QUC': 29, 'B-SEV': 30, 'B-SEX': 31, 'B-SHA': 32, 'B-SIG': 33, 'B-SUB': 34, 'B-TEX': 35, 'B-THP': 36, 'B-TIM': 37, 'B-VOL': 38, 'B-WEI': 39, 'I-ACT': 40, 'I-ADM': 41, 'I-AGE': 42, 'I-ARA': 43, 'I-BAT': 44, 'I-BST': 45, 'I-CLE': 46, 'I-COL': 47, 'I-COR': 48, 'I-DAT': 49, 'I-DET': 50, 'I-DIA': 51, 'I-DIS': 52, 'I-DOS': 53, 'I-DUR': 54, 'I-FAM': 55, 'I-FRE': 56, 'I-HEI': 57, 'I-HIS': 58, 'I-LAB': 59, 'I-MAS': 60, 'I-MED': 61, 'I-NBL': 62, 'I-OCC': 63, 'I-OTE': 64, 'I-OTH': 65, 'I-OUT': 66, 'I-PER': 67, 'I-QUC': 68, 'I-SEV': 69, 'I-SHA': 70, 'I-SIG': 71, 'I-SUB': 72, 'I-TEX': 73, 'I-THP': 74, 'I-TIM': 75, 'I-VOL': 76, 'I-WEI': 77, 'O': 78

In [22]:
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
MAX_LENGTH = 100
#transforming the labels to numerical representation
train_labels = [[label_to_index[label] for label in labels] for labels in train_labels]
train_labels = pad_sequences(train_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
train_labels = to_categorical(train_labels, num_classes=NUM_CLASSES)

valid_labels = [[label_to_index[label] for label in labels] for labels in valid_labels]
valid_labels = pad_sequences(valid_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
valid_labels = to_categorical(valid_labels, num_classes=NUM_CLASSES)

test_labels = [[label_to_index[label] for label in labels] for labels in test_labels]
test_labels = pad_sequences(test_labels, maxlen=MAX_LENGTH, padding='post', value=NUM_CLASSES-1)
test_labels = to_categorical(test_labels, num_classes=NUM_CLASSES)

In [23]:
print(train_labels[0],valid_labels[0],test_labels[0])

[[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]] [[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]] [[0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]


In [24]:
# Convert the input sentences to sequences of word indices
from keras.preprocessing.text import Tokenizer
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(valid_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# Pad the sequences to a fixed length
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')

In [25]:
test_sequences_padded[0]

array([   1, 2726,   32,  448, 2018, 1420,    5,   81,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0])

###  save to a .npz file:

In [26]:
np.savez(
    r'/Users/saikiran/Documents/NLP/MedNer-main/final_npz',
     train_sequences_padded=train_sequences_padded,
     train_labels=train_labels,
     val_sequences_padded=val_sequences_padded,
     val_labels=valid_labels,
     test_sequences_padded=test_sequences_padded,
     test_labels=test_labels,
     label_to_index=label_to_index,
     index_to_label=index_to_label
)

In [27]:
train_labels.shape

(3038, 100, 79)

In [28]:
import numpy as np

# Load the .npz file
data = np.load(r'/Users/saikiran/Documents/NLP/MedNer-main/final_npz.npz',allow_pickle=True)

# List the arrays stored in the .npz file
array_names = data.files
print("Arrays in the .npz file:", array_names)

# Access and view the contents of individual arrays
for array_name in array_names:
    array_data = data[array_name]
    print(f"Array '{array_name}':")
    print(array_data)


Arrays in the .npz file: ['train_sequences_padded', 'train_labels', 'val_sequences_padded', 'val_labels', 'test_sequences_padded', 'test_labels', 'label_to_index', 'index_to_label']
Array 'train_sequences_padded':
[[ 301  522    5 ...    0    0    0]
 [ 102  614  910 ...    0    0    0]
 [ 673  285  523 ...    0    0    0]
 ...
 [ 122  177    2 ...    0    0    0]
 [  91  179 1566 ...    0    0    0]
 [ 156 1948 2528 ...    0    0    0]]
Array 'train_labels':
[[[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 0.]
  [0. 0. 0. ... 0. 0. 0.]
  ...
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. ... 0. 0. 1.]]

 ...

 [[0. 0. 0. ... 0. 0. 1.]
  [0. 0. 0. 

In [29]:
print("TRAINING DATA")
print(f"The shape of input ids tensor of train data is {train_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of train data is {train_data['attention_mask'].shape}")
print(f"The shape of labels tensor of train data is {train_data['labels'].shape}")

print("\nVALIDATION DATA")
print(f"The shape of input ids tensor of validation data is {val_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of validation data is {val_data['attention_mask'].shape}")
print(f"The shape of labels tensor of validation data is {val_data['labels'].shape}")

print("\nTEST DATA")
print(f"The shape of input ids tensor of test data is {test_data['input_ids'].shape}")
print(f"The shape of attention masks tensor of test data is {test_data['attention_mask'].shape}")
print(f"The shape of labels tensor of test data is {test_data['labels'].shape}")

TRAINING DATA
The shape of input ids tensor of train data is (2696, 200)
The shape of attention masks tensor of train data is (2696, 200)
The shape of labels tensor of train data is (2696, 200)

VALIDATION DATA
The shape of input ids tensor of validation data is (955, 200)
The shape of attention masks tensor of validation data is (955, 200)
The shape of labels tensor of validation data is (955, 200)

TEST DATA
The shape of input ids tensor of test data is (929, 200)
The shape of attention masks tensor of test data is (929, 200)
The shape of labels tensor of test data is (929, 200)
