## CHEMDNER Data Preprocessing

In [27]:
from nltk.tokenize import punkt
from nltk.tokenize import word_tokenize
import numpy as np
import itertools
import pickle

* Paths to raw files provided by the dataset; Do NOT modify

In [2]:
# Training split: abstracts file and the matching annotations file.
FILE_TRAINING_TEXT = './chemdner_corpus/training.abstracts.txt'
FILE_TRAINING_LBL = './chemdner_corpus/training.annotations.txt'

# Development split.
FILE_DEV_TEXT = './chemdner_corpus/development.abstracts.txt'
FILE_DEV_LBL = './chemdner_corpus/development.annotations.txt'

# Evaluation split.
FILE_EVAL_TEXT = './chemdner_corpus/evaluation.abstracts.txt'
FILE_EVAL_LBL = './chemdner_corpus/evaluation.annotations.txt'

* A string of all the raw texts in the CHEMDNER dataset, used to train the NLTK tokenizer.

In [3]:
# Module-level accumulator: read_abstracts() appends every title and abstract
# it reads to this string, and the result is later used to train the sentence
# tokenizer. Re-running this cell resets the accumulated text.
all_texts = ''

* Load the abstracts into a dictionary `{id: [id, title, abstract]}`

In [4]:
def read_abstracts(file: str) -> dict:
    """Parse a tab-separated abstracts file into ``{id: [id, title, abstract]}``.

    Every non-empty line must contain exactly three tab-separated columns
    (article identifier, title, abstract). Completely empty lines, such as a
    trailing blank line at the end of the file, are skipped. If an identifier
    occurs more than once, the last record wins.

    Side effect: the title and abstract of every record, each followed by a
    single space, are appended to the module-level string ``all_texts`` (it is
    created if it does not exist yet). The global is updated once, after the
    whole file has been parsed successfully.

    Args:
        file: Path of the abstracts file to read.

    Returns:
        Dictionary mapping the identifier (as a string) to the full list of
        the three columns of its line.

    Raises:
        AssertionError: If a non-empty line does not have three columns.
    """
    global all_texts
    ret = dict()
    pieces = list()
    # Decode explicitly as UTF-8 instead of relying on the platform's default
    # locale encoding, so the text is decoded the same way on every machine.
    with open(file, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.rstrip('\n')  # removing the EOL character
            if not line:
                continue
            line_list = line.split('\t')
            assert len(line_list) == 3, f"ERROR1: This line dose not have 3 columns:\nFILE: {file}\n{line_list}"
            pieces.append(line_list[1] + ' ' + line_list[2] + ' ')
            ret[line_list[0]] = line_list
    # Join once rather than growing the global string line by line, which
    # copies the whole accumulated text on every iteration.
    all_texts = globals().get('all_texts', '') + ''.join(pieces)
    return ret

In [5]:
# Read the three corpus splits in order (training, development, evaluation).
train_txt, dev_txt, eval_txt = (
    read_abstracts(split_file)
    for split_file in (FILE_TRAINING_TEXT, FILE_DEV_TEXT, FILE_EVAL_TEXT)
)
print(f"Done!\n# of sentences read: Train: {len(train_txt)}, Dev: {len(dev_txt)}, Eval: {len(eval_txt)}")

Done!
# of sentences read: Train: 3500, Dev: 3500, Eval: 3000


* Read the annotations and group them by article identifier

In [6]:
 def read_annotations(file: str) -> dict:
     """Parse a tab-separated annotations file, grouped by article identifier.

     Every non-empty line must contain exactly six tab-separated columns.
     Column 0 is the article identifier, column 1 selects the group the
     record is stored under (``'T'`` or ``'A'``), and columns 2 and 3 are
     converted to ``int``. Completely empty lines are skipped.

     Args:
         file: Path of the annotations file to read.

     Returns:
         ``{id: {'T': [record, ...], 'A': [record, ...]}}`` where each record
         is the list of the six columns of its line, in file order.

     Raises:
         AssertionError: If a non-empty line does not have six columns.
         ValueError: If column 2 or 3 is not an integer.
         KeyError: If column 1 is neither ``'T'`` nor ``'A'``.
     """
     ret = dict()
     # Decode explicitly as UTF-8 instead of relying on the platform's default
     # locale encoding, so the text is decoded the same way on every machine.
     with open(file, 'r', encoding='utf-8') as f:
         for line in f:
             line = line.rstrip('\n')  # removing EOL
             if not line:
                 continue
             line_list = line.split('\t')
             assert len(line_list) == 6, f"ERROR2: This line dose not have 6 columns:\n{line_list}"
             line_list[2] = int(line_list[2])
             line_list[3] = int(line_list[3])
             groups = ret.setdefault(line_list[0], {'T': list(), 'A': list()})
             groups[line_list[1]].append(line_list)
     return ret

In [7]:
# Read the annotations of the three corpus splits in order
# (training, development, evaluation).
train_anno, dev_anno, eval_anno = (
    read_annotations(label_file)
    for label_file in (FILE_TRAINING_LBL, FILE_DEV_LBL, FILE_EVAL_LBL)
)
print("Done!")

Done!


* Train NLTK tokenizer

In [8]:
# Build the sentence splitter from the text accumulated in `all_texts`.
# `all_texts` is filled by read_abstracts(), so the abstracts must have been
# loaded before this cell runs.
punkt_tokenizer = punkt.PunktSentenceTokenizer(all_texts)

* Generate Labels

In [9]:
def _generate_labels(sentence: str, anno_list: list) -> tuple:
    """Tokenise ``sentence`` and assign one label to every token.

    The text between annotated mentions and the mentions themselves are
    tokenised separately with ``word_tokenize``, so a mention boundary is
    always a token boundary.

    Args:
        sentence: Text that the annotation offsets refer to.
        anno_list: Annotation records. Index 2 and 3 of a record hold the
            start and end character offsets of the mention in ``sentence``;
            index 5 holds the value stored in the label (the entity type).
            The list is not modified; a sorted copy is iterated.

    Returns:
        ``(tokens, labels)``: two lists of equal length. A label is
        ``('B', type)`` for the first token of a mention, ``('I', type)`` for
        its remaining tokens and ``('O', '')`` for every other token.

    Raises:
        AssertionError: If the token and label lists end up with different
            lengths (internal consistency check).
    """
    sentence_lst = list()
    label_lst = list()
    last_pos = 0
    # sorted() works on a copy, so the caller's list keeps its order.
    for item in sorted(anno_list, key=lambda x: x[2]):
        # Clamp the span so that it never starts before text that has already
        # been emitted (overlapping or negative offsets would otherwise
        # duplicate parts of the sentence) and never ends before it starts.
        start_pos = max(item[2], last_pos)
        end_pos = max(item[3], start_pos)
        part = word_tokenize(sentence[last_pos:start_pos])
        sentence_lst.extend(part)
        label_lst.extend([('O', '')] * len(part))
        part = word_tokenize(sentence[start_pos:end_pos])
        # A span that yields no token (empty or whitespace only) must not
        # receive a 'B' label, otherwise tokens and labels get out of step.
        if part:
            sentence_lst.extend(part)
            label_lst.extend([('B', item[5])] + [('I', item[5])] * (len(part) - 1))
        last_pos = end_pos
    part = word_tokenize(sentence[last_pos:])
    sentence_lst.extend(part)
    label_lst.extend([('O', '')] * len(part))
    assert len(sentence_lst) == len(label_lst), f"ERROR3: Label and tokenized sentence length mismatch!\n" \
                    f"{sentence}\n{list(itertools.zip_longest(sentence_lst, label_lst))}\n{anno_list}"
    return (sentence_lst, label_lst)
    
def get_labels(text: dict, annotations: dict) -> tuple:
    """Build tokenised sentences and token labels for all annotated articles.

    Only articles that appear in ``annotations`` are processed. For each of
    them the title is labelled as a single sentence when it has title
    annotations (group ``'T'``), and the abstract is split into sentences and
    labelled sentence by sentence when it has abstract annotations (group
    ``'A'``). Abstract annotation offsets are converted from abstract-relative
    to sentence-relative before labelling.

    Neither ``text`` nor ``annotations`` is modified, so the function can be
    called repeatedly with the same arguments and returns the same result.

    Args:
        text: ``{id: [id, title, abstract]}``.
        annotations: ``{id: {'T': [record, ...], 'A': [record, ...]}}`` where
            index 2 and 3 of a record are the start and end character offsets.

    Returns:
        ``(sentences, labels)``: two parallel lists holding, per sentence, the
        token list and the label list produced by ``_generate_labels``.
    """
    sentence_lst = list()
    label_lst = list()
    for pmid in annotations:
        dct = annotations[pmid]
        if dct['T']:
            # Hand over copies so the caller's records are never touched.
            title_annos = [list(anno) for anno in dct['T']]
            lst1, lst2 = _generate_labels(text[pmid][1], title_annos)
            sentence_lst.append(lst1)
            label_lst.append(lst2)
        if dct['A']:
            full_abstract = text[pmid][2]
            # Sorted copy plus a read index instead of deleting from the
            # caller's list while consuming it.
            pending = sorted(dct['A'], key=lambda x: x[2])
            next_idx = 0
            # Sentences come back in document order, so each one is searched
            # for after the end of the previous one. A plain find() returns
            # the first occurrence and mislocates repeated sentences.
            cursor = 0
            for sentence in punkt_tokenizer.tokenize(full_abstract):
                sentence_offset_in_abstract = full_abstract.find(sentence, cursor)
                if sentence_offset_in_abstract < 0:
                    # Best effort if the sentence is not a verbatim substring:
                    # assume it starts where the previous one ended.
                    sentence_offset_in_abstract = cursor
                sentence_end = sentence_offset_in_abstract + len(sentence)
                cursor = sentence_end
                anno_list_for_this_sentence = list()
                while next_idx < len(pending) and pending[next_idx][2] < sentence_end:
                    anno = list(pending[next_idx])
                    next_idx += 1
                    # Shift to sentence-relative offsets; a start that falls
                    # before the sentence is clamped to its first character.
                    anno[2] = max(anno[2] - sentence_offset_in_abstract, 0)
                    anno[3] = anno[3] - sentence_offset_in_abstract
                    # Skip spans that do not cover any character of this sentence.
                    if anno[3] > anno[2]:
                        anno_list_for_this_sentence.append(anno)
                lst1, lst2 = _generate_labels(sentence, anno_list_for_this_sentence)
                sentence_lst.append(lst1)
                label_lst.append(lst2)
    return (sentence_lst, label_lst)

In [10]:
# Tokenise and label each corpus split; every call yields the token lists
# and the parallel label lists of that split.
train_tokenized_txt, train_label = get_labels(text=train_txt, annotations=train_anno)
dev_tokenized_txt, dev_label = get_labels(text=dev_txt, annotations=dev_anno)
eval_tokenized_txt, eval_label = get_labels(text=eval_txt, annotations=eval_anno)

Check the generated label of a random training example.

In [11]:
# Spot check: show the label sequence of one training sentence
# (8754 is simply the index picked for inspection).
print(train_label[8754])

[('B', 'ABBREVIATION'), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', ''), ('O', '')]


Put the tokens and the labels together to make sure the labels are correctly assigned.

In [12]:
# Pair every token of the same training sentence with its label so the
# alignment can be checked by eye.
print(list(zip(train_tokenized_txt[8754], train_label[8754])))

[('PBDE', ('B', 'ABBREVIATION')), ('-mediated', ('O', '')), ('MD', ('O', '')), ('per', ('O', '')), ('se', ('O', '')), ('or', ('O', '')), ('enhanced', ('O', '')), ('by', ('O', '')), ('a', ('O', '')), ('background', ('O', '')), ('that', ('O', '')), ('confers', ('O', '')), ('susceptibility', ('O', '')), ('to', ('O', '')), ('this', ('O', '')), ('exposure', ('O', '')), ('may', ('O', '')), ('have', ('O', '')), ('profound', ('O', '')), ('implications', ('O', '')), ('in', ('O', '')), ('the', ('O', '')), ('energy', ('O', '')), ('balance', ('O', '')), ('of', ('O', '')), ('brain', ('O', '')), ('.', ('O', ''))]


* Determine the max length of a sentence that is allowed in our model.

In [13]:
# Number of tokens in every sentence, one array per corpus split.
train_lengths = np.array(list(map(len, train_tokenized_txt)))
dev_lengths = np.array(list(map(len, dev_tokenized_txt)))
eval_lengths = np.array(list(map(len, eval_tokenized_txt)))

In [14]:
# Row 1: number of sentences per split (train, dev, eval).
# Rows 2 and 3: 99th and 98th percentile of the sentence length per split.
print(*map(len, (train_lengths, dev_lengths, eval_lengths)))
print(*(np.percentile(lengths, 99) for lengths in (train_lengths, dev_lengths, eval_lengths)))
print(*(np.percentile(lengths, 98) for lengths in (train_lengths, dev_lengths, eval_lengths)))

24732 24799 21167
69.0 70.0 69.34000000000015
60.38000000000102 60.0 61.0


From the results of the code cells above, we can see that if the maximum allowed sentence length is set to 75 tokens, fewer than 1% of the sentences exceed the limit.

In [15]:
# Target sentence length (in tokens) passed to padding_2D() below.
max_len = 75

In [16]:
def padding_2D(pylist: list, max_len: int, padding_val) -> None:
    """Force every row of ``pylist`` to exactly ``max_len`` items, in place.

    Rows shorter than ``max_len`` are extended with ``padding_val``; longer
    rows are cut after ``max_len`` items. The row objects themselves are
    modified, so the change is visible through ``pylist``.

    Args:
        pylist: List of lists to pad or truncate.
        max_len: Length every row has afterwards.
        padding_val: Value appended to rows that are too short.

    Returns:
        None. The input is modified in place.
    """
    for row in pylist:
        shortfall = max_len - len(row)
        if shortfall > 0:
            row.extend([padding_val] * shortfall)
        else:
            # Slice deletion mutates the row itself. Assigning a slice to the
            # loop variable would only rebind the name and leave the row in
            # ``pylist`` at its original length.
            del row[max_len:]

In [17]:
# Token rows are padded with the empty string.
padding_2D(train_tokenized_txt, max_len=max_len, padding_val='')
padding_2D(dev_tokenized_txt, max_len=max_len, padding_val='')
padding_2D(eval_tokenized_txt, max_len=max_len, padding_val='')
# Label rows are padded with the dedicated ('P', '') label.
padding_2D(train_label, max_len=max_len, padding_val=('P', ''))
padding_2D(dev_label, max_len=max_len, padding_val=('P', ''))
padding_2D(eval_label, max_len=max_len, padding_val=('P', ''))

* Save to pickle file

In [28]:
def save_to_pickle(obj, file):
    """Serialise ``obj`` with pickle and write it to the path ``file``.

    The target is opened in binary write mode, so an existing file at that
    path is overwritten.
    """
    with open(file, 'wb') as sink:
        pickle.Pickler(sink).dump(obj)

In [31]:
# Token sequences (model inputs), one file per split.
save_to_pickle(obj=train_tokenized_txt, file='./x_train.pickle')
save_to_pickle(obj=dev_tokenized_txt, file='./x_dev.pickle')
save_to_pickle(obj=eval_tokenized_txt, file='./x_eval.pickle')
# Label sequences (model targets), one file per split.
save_to_pickle(obj=train_label, file='./y_train.pickle')
save_to_pickle(obj=dev_label, file='./y_dev.pickle')
save_to_pickle(obj=eval_label, file='./y_eval.pickle')