In [55]:
import os
import random
import numpy as np
import pickle

import matplotlib.pyplot as plt

In [56]:
import re
import spacy

import nltk
import string
#nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer

STOP_WORDS = stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ajaykarthicksenthilkumar/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [57]:
bio_files_dir = '../data/bio_data_files'
bio_files = [os.path.join(bio_files_dir, f) for f in os.listdir('../data/bio_data_files') if f.endswith('.bio')]

In [58]:
print(f"The number of .bio files is {len(bio_files)}")

The number of .bio files is 200


## Check if any of the stopwords contain B-tag

In [59]:
for bio_file in bio_files:
    with open(bio_file, "r", encoding='utf-8') as f:
        for line in f:
            if line.strip() == '':
                continue
            word, tag = line.strip().split('\t')
            if word in STOP_WORDS and tag.startswith('B'):
                print(line)

## Data Cleaning

In [60]:
def clean_word(word):
    
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]','',word)
    word = re.sub(r'\s+',' ',word)
    
    # convert to lowercase
    word = word.lower()
    
    if word not in STOP_WORDS:
        return word
    
    return ''

In [61]:
# Load a pre-trained Spacy model and set the stop words
nlp = spacy.load('en_core_web_sm')

def clean_word(word):
    # remove non-alphanumeric characters and extra whitespaces
    word = re.sub(r'[^\w\s]','',word)
    word = re.sub(r'\s+',' ',word)
    
    # convert to lowercase
    word = word.lower()

    # lemmatize the word
    lemma = nlp(word)[0].lemma_
    
    # check if the lemma is a stop word
    if lemma not in STOP_WORDS:
        return lemma
    
    return ''


In [62]:
def parse_data_from_file(bio_file):
    """
    Reads a file in BIO format (one token per line, with tab-separated word and tag),
    and extracts the sentences and labels as lists of lists. Each inner list represents
    a sentence, and contains the words of the sentence in order. Each corresponding inner
    list in the 'labels' list contains the BIO tags for the words in the corresponding
    sentence, in the same order.
    
    Args:
    - bio_file (str): the path to the BioNLP file to read
    
    Returns:
    - A tuple containing:
        - sentences (List[List[str]]): a list of lists, where each inner list represents
          a sentence and contains the words of the sentence in order
        - labels (List[List[str]]): a list of lists, where each inner list corresponds
          to a sentence in the 'sentences' list and contains the BIO tags for the words
          in the corresponding sentence, in the same order.
    """
    sentences = []
    labels = []
    
    with open(bio_file, "r", encoding='utf-8') as f:
        
        current_sentences = []
        current_labels = []
        
        for line in f:
            
            if line.strip() == '':
                # If we encounter a blank line, it means we've reached the end of a sentence
                if len(current_sentences) > 0:
                    
                    # Add the current sentence and labels to the list
                    sentences.append(current_sentences)
                    labels.append(current_labels)
                    
                    # Reset the current sentence and labels lists
                    current_sentences = []
                    current_labels = []
                    continue
                    
            word, tag = line.strip().split('\t')
            word = clean_word(word)
            
            if word.strip():
                current_sentences.append(word)
                
                if len(current_labels) > 0:
                    if tag[2:] == current_labels[-1][2:] and tag[:2] == "B-":
                        tag = f"I-{tag[2:]}"
                current_labels.append(tag)
        
    return sentences, labels

In [63]:
def parse_bio_files(bio_files):
    
    sentences = []
    labels = []
    
    for idx, bio_file in enumerate(bio_files):
        
        curr_sentences, curr_labels = parse_data_from_file(bio_file)
        
        if len(curr_sentences) > 0:
            sentences.extend(curr_sentences)
            labels.extend(curr_labels)
            
        if (idx+1) % 20 == 0:
            print(f'{idx+1} completed')

    return sentences, labels

In [64]:
sentences, labels = parse_bio_files(bio_files)

20 completed
40 completed
60 completed
80 completed
100 completed
120 completed
140 completed
160 completed
180 completed
200 completed


In [65]:
print(f"Dataset contains {len(sentences)} examples\n")

Dataset contains 4341 examples



## Shuffle the sentences and labels

In [66]:
combined = list(zip(sentences, labels))
random.shuffle(combined)
sentences[:], labels[:] = zip(*combined)

## Train Test Validation Split

In [67]:
# Split the data into training, validation, and test sets

TEST_SIZE = 0.2

num_sentences = len(sentences)
num_train = int(num_sentences * (1 - TEST_SIZE - 0.1))
num_valid = int(num_sentences * 0.1)

In [68]:
train_sentences = sentences[:num_train]
train_labels = labels[:num_train]

valid_sentences = sentences[num_train:num_train+num_valid]
valid_labels = labels[num_train:num_train+num_valid]

test_sentences = sentences[num_train+num_valid:]
test_labels = labels[num_train+num_valid:]

## Tokenization - Sequences and padding

In [69]:
unique_labels = set(element for sublist in labels for element in sublist)

In [70]:
label_to_index = {label: id+1 for id, label in enumerate(sorted(unique_labels))}
index_to_label = {id: label for label, id in label_to_index.items()}

In [71]:
# Add the new label and ID to the dictionaries
label_to_index['<PAD>'] = 0
index_to_label[0] = '<PAD>'

In [72]:
NUM_CLASSES = len(index_to_label)

In [73]:
index_to_label

{1: 'B-ACT',
 2: 'B-ADM',
 3: 'B-AGE',
 4: 'B-ARA',
 5: 'B-BAT',
 6: 'B-BST',
 7: 'B-CLE',
 8: 'B-COL',
 9: 'B-COR',
 10: 'B-DAT',
 11: 'B-DET',
 12: 'B-DIA',
 13: 'B-DIS',
 14: 'B-DOS',
 15: 'B-DUR',
 16: 'B-FAM',
 17: 'B-FRE',
 18: 'B-HEI',
 19: 'B-HIS',
 20: 'B-LAB',
 21: 'B-MAS',
 22: 'B-MED',
 23: 'B-NBL',
 24: 'B-OCC',
 25: 'B-OTE',
 26: 'B-OTH',
 27: 'B-OUT',
 28: 'B-PER',
 29: 'B-QUC',
 30: 'B-SEV',
 31: 'B-SEX',
 32: 'B-SHA',
 33: 'B-SIG',
 34: 'B-SUB',
 35: 'B-TEX',
 36: 'B-THP',
 37: 'B-TIM',
 38: 'B-VOL',
 39: 'B-WEI',
 40: 'I-ACT',
 41: 'I-ADM',
 42: 'I-AGE',
 43: 'I-ARA',
 44: 'I-BAT',
 45: 'I-BST',
 46: 'I-CLE',
 47: 'I-COL',
 48: 'I-COR',
 49: 'I-DAT',
 50: 'I-DET',
 51: 'I-DIA',
 52: 'I-DIS',
 53: 'I-DOS',
 54: 'I-DUR',
 55: 'I-FAM',
 56: 'I-FRE',
 57: 'I-HEI',
 58: 'I-HIS',
 59: 'I-LAB',
 60: 'I-MAS',
 61: 'I-MED',
 62: 'I-NBL',
 63: 'I-OCC',
 64: 'I-OTE',
 65: 'I-OTH',
 66: 'I-OUT',
 67: 'I-PER',
 68: 'I-QUC',
 69: 'I-SEV',
 70: 'I-SHA',
 71: 'I-SIG',
 72: 'I-SUB',
 

In [74]:
MAX_LENGTH = 100

# Convert the labels to indices
train_labels_indices = [[label_to_index[label] for label in labels] for labels in train_labels]
valid_labels_indices = [[label_to_index[label] for label in labels] for labels in valid_labels]
test_labels_indices = [[label_to_index[label] for label in labels] for labels in test_labels]

# Pad the sequences to a fixed length with the new label ('<PAD>' value = 0)
train_labels_padded = pad_sequences(train_labels_indices, maxlen=MAX_LENGTH, padding='post', value=0)
valid_labels_padded = pad_sequences(valid_labels_indices, maxlen=MAX_LENGTH, padding='post', value=0)
test_labels_padded = pad_sequences(test_labels_indices, maxlen=MAX_LENGTH, padding='post', value=0)


# Convert the labels to categorical format
train_labels_categorical = to_categorical(train_labels_padded, num_classes=NUM_CLASSES)
valid_labels_categorical = to_categorical(valid_labels_padded, num_classes=NUM_CLASSES)
test_labels_categorical = to_categorical(test_labels_padded, num_classes=NUM_CLASSES)

In [75]:
# Convert the input sentences to sequences of word indices
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train_sentences)

tokenizer.word_index['<PAD>'] = 0
tokenizer.index_word[0] = '<PAD>'

# Calculate the vocabulary size
VOCAB_SIZE = len(tokenizer.word_index)
print(f"Vocabulary size: {VOCAB_SIZE}")



Vocabulary size: 6860


In [76]:
# Convert the input sentences to sequences of word indices
train_sequences = tokenizer.texts_to_sequences(train_sentences)
val_sequences = tokenizer.texts_to_sequences(valid_sentences)
test_sequences = tokenizer.texts_to_sequences(test_sentences)

# oov_words_train = [word for sentence in train_sentences for word in sentence if word not in tokenizer.word_index]
# oov_words_val = [word for sentence in valid_sentences for word in sentence if word not in tokenizer.word_index]
# oov_words_test = [word for sentence in test_sentences for word in sentence if word not in tokenizer.word_index]

# print(f"Number of OOV words in train: {len(oov_words_train)}")
# print(f"Number of OOV words in val: {len(oov_words_val)}")
# print(f"Number of OOV words in test: {len(oov_words_test)}")


In [77]:
# pad the sequences
train_sequences_padded = pad_sequences(train_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
val_sequences_padded = pad_sequences(val_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')
test_sequences_padded = pad_sequences(test_sequences, maxlen=MAX_LENGTH, padding='post', truncating='post')


###  save to a .npz file:

In [78]:
np.savez(
    '../data/data.npz',
     train_sequences_padded=train_sequences_padded,
     train_labels=train_labels_categorical,
     val_sequences_padded=val_sequences_padded,
     val_labels=valid_labels_categorical,
     test_sequences_padded=test_sequences_padded,
     test_labels=test_labels_categorical,
     label_to_index=label_to_index,
     index_to_label=index_to_label
)

In [54]:
with open('../data/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)