In [1]:
import json
import os
import re
import glob
from tqdm import tqdm
from string import punctuation

import copy

from bioc import pubtator
import pandas as pd

## Build Ontologies for Disease
- Mainly constructed from MEDIC vocab
- Add train samples to the Dev ontology, and train+dev samples to the Test ontology

In [2]:
class TextPreprocess():
    """
    Normalizes mention strings.

    `run` optionally lowercases the text, replaces known typos token by
    token, and turns every run of whitespace/punctuation into a single space.
    """
    def __init__(self, 
            lowercase=True, 
            remove_punctuation=True,
            ignore_punctuations="",
            typo_path=None):
        """
        Parameters
        ==========
        lowercase : bool
            lowercase the text in `run`
        remove_punctuation : bool
            replace punctuation/whitespace runs with one space in `run`
        ignore_punctuations : str
            punctuation characters that must be kept
        typo_path : str
            path of known typo dictionary ("typo||correction" per line)
        """
        self.lowercase = lowercase
        self.typo_path = typo_path
        self.rmv_puncts = remove_punctuation

        # Start from string.punctuation and drop whatever the caller wants kept.
        kept = punctuation
        for keep_char in ignore_punctuations:
            kept = kept.replace(keep_char, "")
        self.punctuation = kept
        self.rmv_puncts_regex = re.compile(r'[\s{}]+'.format(re.escape(self.punctuation)))

        self.typo2correction = self.load_typo2correction(typo_path) if typo_path else {}

    def load_typo2correction(self, typo_path):
        """Read "typo||correction" lines; a line without "||" maps to ""."""
        mapping = {}
        with open(typo_path, mode='r', encoding='utf-8') as f:
            for raw_line in f:
                fields = raw_line.strip().split("||")
                mapping[fields[0]] = fields[1] if len(fields) != 1 else ""
        return mapping

    def remove_punctuation(self,phrase):
        """Split on punctuation/whitespace runs and re-join with single spaces."""
        pieces = self.rmv_puncts_regex.split(phrase)
        return ' '.join(pieces).strip()

    def correct_spelling(self, phrase):
        """Replace each whitespace-separated token found in the typo table."""
        fixed_tokens = [
            self.typo2correction.get(token, token) for token in phrase.split()
        ]
        return ' '.join(fixed_tokens).strip()

    def run(self, text):
        """Apply the enabled steps in order: lowercase, typo fix, punctuation removal."""
        if self.lowercase:
            text = text.lower()
        # Typo correction is gated on typo_path, not on the table contents.
        if self.typo_path:
            text = self.correct_spelling(text)
        if self.rmv_puncts:
            text = self.remove_punctuation(text)
        return text.strip()

In [3]:
# load MEDIC: the ontology for Disease
# Index 27 holds the column header ("# DiseaseName", ...), index 28 is skipped,
# and the records start at index 29.
with open("./data/ResCNN/CTD_diseases.tsv", 'r', encoding='utf-8') as f:
    lines = f.readlines()
lines = [lines[27]] + lines[29:]
# Drop only the line terminator: otherwise the last column name/value keeps a
# trailing "\n". Empty trailing fields are preserved so every row keeps its width.
lines = [line.rstrip('\n').split('\t') for line in lines]
df = pd.DataFrame(lines[1:], columns = lines[0])
print(df.shape)

(13300, 9)


In [5]:
def build_ontologies_disease(df, text_preprocessor, query_dataset=None, training=True):
    """
    Build a dictionary {concept id: [[kind, normalized mention], ...]} from the
    MEDIC table, where kind is 'primary' or 'synonym/secondary'.

    Parameters
    ----------
    df : pd.DataFrame
        table with the columns '# DiseaseName', 'DiseaseID', 'AltDiseaseIDs'
        and 'Synonyms' ('|'-separated strings)
    text_preprocessor : object
        anything with a `run(str) -> str` method used to normalize mentions
    query_dataset : list of str, optional
        directories holding "*.concept" files; only read when `training` is False
    training : bool
        True  -> one key per ID (main ID and every MESH/OMIM alternative ID)
        False -> one key per record, IDs chained as "main|alt1|alt2"

    Returns
    -------
    result : dict
        concept id -> list of [kind, mention] pairs
    """
    result = {}
    mentions = set()

    for entry in df.to_dict('records'):
        disease_id = entry['DiseaseID']
        if disease_id.startswith('MESH'):
            pid = disease_id.replace('MESH:', '')
        elif disease_id.startswith('OMIM'):
            pid = disease_id.replace('OMIM:', '')
        else:
            continue

        # Keep only MESH/OMIM alternative IDs, without their prefix
        alt_ids = []
        if entry['AltDiseaseIDs']:
            alt_ids = [
                alt.replace('MESH:', '').replace('OMIM:', '')
                for alt in entry['AltDiseaseIDs'].split('|')
                if alt.startswith('MESH') or alt.startswith('OMIM')
            ]

        # Primary
        name_normalized = text_preprocessor.run(entry['# DiseaseName'])

        # Synonym/secondary
        synonym_list_normalized = []
        if entry['Synonyms']:
            synonym_list_normalized = [
                text_preprocessor.run(syn) for syn in entry['Synonyms'].split('|')
            ]

        if training:  # Separate main id and alt ids
            keys = [pid] + alt_ids
        else:  # Linearized IDs(chained format)
            keys = ['|'.join([pid] + alt_ids)]

        for key in keys:
            # Assignment (not extension): a key seen in an earlier record is overwritten.
            result[key] = [['primary', name_normalized]]
            result[key].extend(
                ['synonym/secondary', syn_normalized]
                for syn_normalized in synonym_list_normalized
            )
        mentions.add(name_normalized)
        mentions.update(synonym_list_normalized)

    # For DEV ontologies: incorporate id-mention from Train set
    # For TEST ontologies: incorporate id-mention from Train+Dev set
    if query_dataset and not training:
        concept_files = []
        for data_dir in query_dataset:
            # glob order is arbitrary; sort so the first-come mention assignment
            # below is reproducible across machines.
            concept_files.extend(sorted(glob.glob(os.path.join(data_dir, "*.concept"))))

        concept_files = {
            'bc8biored': [fn for fn in concept_files if 'bc8biored' in fn],
            'additional': [fn for fn in concept_files if not 'bc8biored' in fn]
        }

        print(len(concept_files['bc8biored']), len(concept_files['additional']))

        for dataset_name in ['bc8biored', 'additional']:  # prioritize bc8 first
            for concept_file in concept_files[dataset_name]:
                with open(concept_file, 'r', encoding='utf-8') as f:
                    doc_queries = f.readlines()

                for query in doc_queries:
                    _, _, _, name, cui = query.strip().split("||")
                    cui = cui.replace('+', '|')
                    name = text_preprocessor.run(name)

                    # A mention already known (from MEDIC or an earlier file) is never re-assigned
                    if name in mentions:
                        continue
                    kind = 'synonym/secondary' if cui in result else 'primary'
                    result.setdefault(cui, []).append([kind, name])
                    mentions.add(name)

    print('Number of Keys >>', len(result))
    print('Number of Unique Mentions >>', len(mentions))

    return result

In [6]:
# Train ontology: MEDIC only, one key per main/alternative ID
text_preprocessor = TextPreprocess()
ontology_train = build_ontologies_disease(df, text_preprocessor, training=True)

Number of Keys >> 17335
Number of Unique Mentions >> 85388


In [7]:
# DEV ontology: MEDIC plus id-mention pairs from the BC8BioRED train split
# and from every split of NCBI-disease and BC5CDR-disease
query_dataset = [
    './data/biosyn-processed-bc8biored-disease/train/',
    './data/biosyn-processed-ncbi-disease/processed_train/',
    './data/biosyn-processed-ncbi-disease/processed_dev/',
    './data/biosyn-processed-ncbi-disease/processed_test/',
    './data/biosyn-processed-bc5cdr-disease/processed_train/',
    './data/biosyn-processed-bc5cdr-disease/processed_dev/',
    './data/biosyn-processed-bc5cdr-disease/processed_test/',
]
ontology_dev = build_ontologies_disease(
    df,
    text_preprocessor,
    query_dataset=query_dataset,
    training=False,
)

490 2292
Number of Keys >> 13780
Number of Unique Mentions >> 88471


In [8]:
# TEST ontology: same sources as DEV plus the BC8BioRED dev split.
# Note: this grows `query_dataset` in place, so re-running the cell adds the path again.
query_dataset += ['./data/biosyn-processed-bc8biored-disease/dev/']
ontology_test = build_ontologies_disease(
    df,
    text_preprocessor,
    query_dataset=query_dataset,
    training=False,
)

590 2292
Number of Keys >> 13786
Number of Unique Mentions >> 88577


In [74]:
# Write the three disease ontologies; create the target folder first so the
# cell also works on a fresh checkout.
os.makedirs('./ontologies', exist_ok=True)
with open('./ontologies/bc8biored-disease-aio_train.json', 'w') as f:
    json.dump(ontology_train, f)
with open('./ontologies/bc8biored-disease-aio_dev.json', 'w') as f:
    json.dump(ontology_dev, f)
with open('./ontologies/bc8biored-disease-aio.json', 'w') as f:
    json.dump(ontology_test, f)

## Build Ontologies for Chemical
- Mainly constructed from MeSH vocab
- Add train samples to the Dev ontology, and train+dev samples to the Test ontology (same process as for the Disease type)

In [None]:
# load CTD: the ontology for Chemical
# NOTE: this rebinds `df`, replacing the disease table loaded earlier in the notebook.
with open("./data/CTD_chemicals.tsv", 'r') as f:
    lines = f.readlines()
# Keep the header row (index 27) followed by the records (index 29 onwards)
lines = [lines[27], *lines[29:]]
# Only the first 8 tab-separated fields of each stripped line are used
lines = [line.strip().split('\t')[:8] for line in lines]
df = pd.DataFrame(data=lines[1:], columns=lines[0])
print(df.shape)

In [None]:
def build_ontologies_chemical(df, text_preprocessor, query_dataset=None):
    """
    Build a dictionary {MeSH id: [[kind, normalized mention], ...]} from the
    CTD chemical table, where kind is 'primary' or 'synonym/secondary'.

    Parameters
    ----------
    df : pd.DataFrame
        table with the columns '# ChemicalName', 'ChemicalID' and 'Synonyms'
        ('|'-separated string, possibly missing or empty)
    text_preprocessor : object
        anything with a `run(str) -> str` method used to normalize mentions
    query_dataset : list of str, optional
        directories holding "*.concept" files whose id-mention pairs are added

    Returns
    -------
    result : dict
        concept id -> list of [kind, mention] pairs
    """
    result = {}
    mentions = set()

    for entry in df.to_dict('records'):
        # Only MeSH-identified chemicals are kept
        if not entry['ChemicalID'].startswith('MESH'):
            continue
        pid = entry['ChemicalID'].replace('MESH:', '')

        # Primary
        name_normalized = text_preprocessor.run(entry['# ChemicalName'])

        # Synonym/secondary
        # Truthiness (as in the disease builder) skips both a missing value and
        # an empty string; `''.split('|')` would otherwise yield an empty synonym.
        synonym_list_normalized = []
        if entry['Synonyms']:
            synonym_list_normalized = [
                text_preprocessor.run(syn) for syn in entry['Synonyms'].split('|')
            ]

        # Assignment (not extension): a pid seen in an earlier record is overwritten.
        result[pid] = [['primary', name_normalized]]
        result[pid].extend(
            ['synonym/secondary', syn_normalized]
            for syn_normalized in synonym_list_normalized
        )
        mentions.add(name_normalized)
        mentions.update(synonym_list_normalized)

    # For DEV ontologies: incorporate id-mention from Train set
    # For TEST ontologies: incorporate id-mention from Train+Dev set
    if query_dataset:
        concept_files = []
        for data_dir in query_dataset:
            # glob order is arbitrary; sort so the first-come mention assignment
            # below is reproducible across machines.
            concept_files.extend(sorted(glob.glob(os.path.join(data_dir, "*.concept"))))

        concept_files = {
            'bc8biored': [fn for fn in concept_files if 'bc8biored' in fn],
            'additional': [fn for fn in concept_files if not 'bc8biored' in fn]
        }

        print(len(concept_files['bc8biored']), len(concept_files['additional']))

        for dataset_name in ['bc8biored', 'additional']:  # prioritize bc8 first
            for concept_file in concept_files[dataset_name]:
                with open(concept_file, 'r', encoding='utf-8') as f:
                    doc_queries = f.readlines()

                for query in doc_queries:
                    _, _, _, name, cui = query.strip().split("||")
                    cui = cui.replace('+', '|')
                    name = text_preprocessor.run(name)

                    # A mention already known (from CTD or an earlier file) is never re-assigned
                    if name in mentions:
                        continue
                    kind = 'synonym/secondary' if cui in result else 'primary'
                    result.setdefault(cui, []).append([kind, name])
                    mentions.add(name)

    print('Number of Keys >>', len(result))
    print('Number of Unique Mentions >>', len(mentions))

    return result

In [None]:
# Train ontology for chemicals: CTD vocabulary only.
# `build_ontologies` is not defined in this notebook; the chemical builder is
# `build_ontologies_chemical`.
# NOTE: this rebinds `ontology_train`, which the disease query-data section
# further down also reads - keep the execution order in mind.
text_preprocessor = TextPreprocess()
ontology_train = build_ontologies_chemical(
    df, text_preprocessor
)

In [None]:
# DEV ontology for chemicals: CTD plus id-mention pairs from the BC8BioRED
# train split and every BC5CDR-chemical split.
query_dataset = [
    './data/biosyn-processed-bc8biored-chemical/train/',
    './data/biosyn-processed-bc5cdr-chemical/processed_train/',
    './data/biosyn-processed-bc5cdr-chemical/processed_dev/',
    './data/biosyn-processed-bc5cdr-chemical/processed_test/',   
]
# `build_ontologies` is not defined in this notebook; call the chemical builder.
ontology_dev = build_ontologies_chemical(
    df, text_preprocessor, query_dataset=query_dataset
)

In [None]:
# TEST ontology for chemicals: DEV sources plus the BC8BioRED dev split.
# Note: `append` mutates `query_dataset`, so re-running the cell adds the path again.
query_dataset.append('./data/biosyn-processed-bc8biored-chemical/dev/')
# `build_ontologies` is not defined in this notebook; call the chemical builder.
ontology_test = build_ontologies_chemical(
    df, text_preprocessor, query_dataset=query_dataset
)

In [None]:
# Write the three chemical ontologies; create the target folder first so the
# cell also works on a fresh checkout.
os.makedirs('./ontologies', exist_ok=True)
with open('./ontologies/bc8biored-chemical-aio_train.json', 'w') as f:
    json.dump(ontology_train, f)
with open('./ontologies/bc8biored-chemical-aio_dev.json', 'w') as f:
    json.dump(ontology_dev, f)
with open('./ontologies/bc8biored-chemical-aio.json', 'w') as f:
    json.dump(ontology_test, f)

## Generate Query Data for Disease concept

In [9]:
# Load BioRED processed datasets from BioSyn package
# then, create pairs of mention-ID

def load_data(data_dir, filter_composite=True, filter_duplicate=True, filter_cuiless=True):
    """
    Collect (mention, cui) pairs from every "*.concept" file in a directory.

    Each non-blank line is split on "||"; field 3 is the mention and field 4
    the CUI.

    Parameters
    ----------
    data_dir : str
        a path of data
    filter_composite : bool
        filter composite mentions (CUI containing "+" or "|")
    filter_duplicate : bool
        filter duplicate queries, keeping first occurrences in order
    filter_cuiless : bool
        remove samples with cuiless ('-1', '-' or empty CUI)

    Returns
    -------
    data : list of tuple
        (mention, cui) pairs
    """
    data = []

    # glob returns files in arbitrary order; sort so the resulting pair order
    # (and hence de-duplication order) is reproducible.
    concept_files = sorted(glob.glob(os.path.join(data_dir, "*.concept")))
    for concept_file in tqdm(concept_files):
        with open(concept_file, "r", encoding='utf-8') as f:
            concepts = f.readlines()

        for concept in concepts:
            # A blank line (e.g. trailing newline at end of file) has no fields
            if not concept.strip():
                continue
            fields = concept.split("||")
            mention = fields[3].strip()
            cui = fields[4].strip()
            is_composite = "+" in cui or "|" in cui

            # filter composite cui
            if filter_composite and is_composite:
                continue
            # filter cuiless
            if filter_cuiless and (cui in ['-1', '-'] or not cui):
                continue

            data.append((mention, cui))

    if filter_duplicate:
        data = list(dict.fromkeys(data))

    return data

In [10]:
# Here, input data is already processed with Biosyn

data_dir = './data/biosyn-processed-bc8biored-disease/'
pairs_disease = {}
pairs_disease['train'] = load_data(os.path.join(data_dir, 'train'))
pairs_disease['dev'] = load_data(os.path.join(data_dir, 'dev'))
# The test split keeps CUI-less mentions. Alternative sources for 'test':
#     load_data(os.path.join(data_dir, 'dev_pred'), filter_cuiless=False)  # For dev preds (= BioRED Test set)
#     load_data(os.path.join(data_dir, 'test_pred'), filter_cuiless=False)  # For test preds (No gold standard)
pairs_disease['test'] = load_data(os.path.join(data_dir, 'test'), filter_cuiless=False)
# train followed by dev, exact duplicate pairs removed (first occurrence kept)
pairs_disease['traindev'] = list(
    dict.fromkeys(pairs_disease['train'] + pairs_disease['dev'])
)

100%|█████████████████████████████████████████| 490/490 [00:00<00:00, 7872.13it/s]
100%|████████████████████████████████████████| 100/100 [00:00<00:00, 12814.08it/s]
100%|███████████████████████████████████████| 9668/9668 [00:01<00:00, 6971.13it/s]


In [12]:
# Number of (mention, cui) pairs per split
for split_name in pairs_disease:
    print(split_name, len(pairs_disease[split_name]))

train 1313
dev 379
test 14191
traindev 1559


In [13]:
# Collect extra disease mentions from every split of the NCBI-disease corpus
additional_pairs = []
ncbi_d_dir = './data/ResCNN/biosyn-processed-ncbi-disease/'
for folder_name in ('train', 'dev', 'test'):
    result = load_data(os.path.join(ncbi_d_dir, f'processed_{folder_name}'))
    print(f"NCBI-D-{folder_name}", len(result))
    additional_pairs += result

100%|█████████████████████████████████████████| 592/592 [00:00<00:00, 3053.26it/s]


NCBI-D-train 1404


100%|█████████████████████████████████████████| 100/100 [00:00<00:00, 3511.99it/s]


NCBI-D-dev 305


100%|█████████████████████████████████████████| 100/100 [00:00<00:00, 3444.25it/s]

NCBI-D-test 340





In [14]:
# Report the number of disease pairs in each BC5CDR-disease split.
# NOTE(review): unlike the NCBI cell, the loaded pairs are NOT appended to
# `additional_pairs` here - confirm whether leaving CDR out is intentional.
cdr_d_dir = './data/ResCNN/biosyn-processed-bc5cdr-disease/'
for folder_name in ('train', 'dev', 'test'):
    result = load_data(os.path.join(cdr_d_dir, f'processed_{folder_name}'))
    print(f"CDR-D-{folder_name}", len(result))

100%|████████████████████████████████████████| 500/500 [00:00<00:00, 10121.63it/s]


CDR-D-train 1254


100%|████████████████████████████████████████| 500/500 [00:00<00:00, 10166.78it/s]


CDR-D-dev 1138


100%|████████████████████████████████████████| 500/500 [00:00<00:00, 11689.94it/s]

CDR-D-test 1195





In [15]:
# Drop exact duplicate (mention, cui) pairs from the additional dataset,
# keeping the first occurrence; print the size before and after
print(len(additional_pairs))
additional_pairs = [*dict.fromkeys(additional_pairs)]
print(len(additional_pairs))

2049
1779


In [16]:
# Merge the additional pairs into the train and train+dev splits.
# NOTE: `ontology_train` must be the DISEASE train ontology here; the chemical
# ontology section rebinds the same name, so execution order matters.
#
# The lookups are built once and kept in sync with the lists, instead of
# rebuilding dict(pairs) several times per iteration.
train_cui_by_mention = dict(pairs_disease['train'])
traindev_mentions = {mention for mention, _ in pairs_disease['traindev']}

for mention, cui in additional_pairs:
    # skip additional mentions whose CUI is not in the train ontology
    if cui not in ontology_train:
        continue
    if mention not in train_cui_by_mention:
        pairs_disease['train'].append((mention, cui))
        train_cui_by_mention[mention] = cui
    elif cui != train_cui_by_mention[mention]:
        # Same mention, different CUI: keep the BC8BioRED label and report the conflict
        print('Additional:', (mention, cui))
        print('BC8BioRED:', (mention, train_cui_by_mention[mention]))

    if mention not in traindev_mentions:
        pairs_disease['traindev'].append((mention, cui))
        traindev_mentions.add(mention)

Additional: ('carcinomas', 'D009369')
BC8BioRED: ('carcinomas', 'D002277')
Additional: ('neurological abnormalities', 'D009461')
BC8BioRED: ('neurological abnormalities', 'D009422')
Additional: ('neurologic deterioration', 'D009461')
BC8BioRED: ('neurologic deterioration', 'D009422')
Additional: ('colon carcinoma', 'D003110')
BC8BioRED: ('colon carcinoma', 'D015179')
Additional: ('neuronal dysfunction', 'D009461')
BC8BioRED: ('neuronal dysfunction', 'D009410')
Additional: ('neurodegeneration', 'D019636')
BC8BioRED: ('neurodegeneration', 'D009422')
Additional: ('pelizaeus merzbacher disease', 'D020371')
BC8BioRED: ('pelizaeus merzbacher disease', '312080')
Additional: ('wolfram syndrome', '222300')
BC8BioRED: ('wolfram syndrome', 'D014929')
Additional: ('hemochromatosis', 'D016399')
BC8BioRED: ('hemochromatosis', 'D006432')
Additional: ('polyposis', 'D011125')
BC8BioRED: ('polyposis', 'D044483')
Additional: ('infertility', 'D007246')
BC8BioRED: ('infertility', 'D007247')
Additional: ('c

In [17]:
for split_name in pairs_disease:
    print(split_name, len(pairs_disease[split_name]))

# Given the 1779 extra mention-cui pairs (collected from the NCBI-disease
# corpus; the CDR cell does not extend `additional_pairs`),
# Training set extends from 1313 -> 2842
# Train+dev from 1559 -> 3062

train 2842
dev 379
test 14191
traindev 3062


In [18]:
# Fraction of distinct dev mentions that never occur in the train split
train_mentions = {mention for mention, _ in pairs_disease['train']}
dev_mentions = {mention for mention, _ in pairs_disease['dev']}
mentions_dev_only = len(dev_mentions.difference(train_mentions)) / len(dev_mentions)
print(f'Mentions only in Dev set: {mentions_dev_only:.4f}')

Mentions only in Dev set: 0.5726


In [95]:
output_dir = './bc8biored-disease-aio'
os.makedirs(output_dir, exist_ok=True)

# Exactly one target file is active; the commented lines are the alternatives.
# NOTE(review): the active name is 'data-dev-predicted.json' - confirm it matches
# the directory chosen for the 'test' split when `pairs_disease` was built.
# with open(os.path.join(output_dir, 'data.json'), 'w') as f:
with open(os.path.join(output_dir, 'data-dev-predicted.json'), 'w') as f:
# with open(os.path.join(output_dir, 'data-test-predicted.json'), 'w') as f:
    json.dump(pairs_disease, f)

## Generate Query Data for Chemical concept

In [None]:
data_dir = './data/biosyn-processed-bc8biored-chemical/'
pairs_chemical = {}
pairs_chemical['train'] = load_data(os.path.join(data_dir, 'train'))
pairs_chemical['dev'] = load_data(os.path.join(data_dir, 'dev'))
# The test split keeps CUI-less mentions. Alternative sources for 'test':
#     load_data(os.path.join(data_dir, 'dev_pred'), filter_cuiless=False)  # For dev preds (= BioRED Test set)
#     load_data(os.path.join(data_dir, 'test_pred'), filter_cuiless=False)  # For test preds (No gold standard)
pairs_chemical['test'] = load_data(os.path.join(data_dir, 'test'), filter_cuiless=False)
# train followed by dev, exact duplicate pairs removed (first occurrence kept)
pairs_chemical['traindev'] = list(
    dict.fromkeys(pairs_chemical['train'] + pairs_chemical['dev'])
)

In [None]:
# Number of (mention, cui) pairs per split
for split_name in pairs_chemical:
    print(split_name, len(pairs_chemical[split_name]))

In [None]:
# Collect extra chemical mentions from every split of the BC5CDR-chemical corpus
additional_pairs = []
cdr_c_dir = './data/biosyn-processed-bc5cdr-chemical/'
for folder_name in ('train', 'dev', 'test'):
    result = load_data(os.path.join(cdr_c_dir, f'processed_{folder_name}'))
    print(f"CDR-C-{folder_name}", len(result))
    additional_pairs += result

In [None]:
# Drop exact duplicate (mention, cui) pairs from the additional dataset,
# keeping the first occurrence; print the size before and after
print(len(additional_pairs))
additional_pairs = [*dict.fromkeys(additional_pairs)]
print(len(additional_pairs))

In [None]:
# Merge the additional pairs into the train and train+dev splits.
# NOTE: `ontology_train` must be the CHEMICAL train ontology here; the disease
# ontology section binds the same name, so execution order matters.
#
# The lookups are built once and kept in sync with the lists, instead of
# rebuilding dict(pairs) several times per iteration.
train_cui_by_mention = dict(pairs_chemical['train'])
traindev_mentions = {mention for mention, _ in pairs_chemical['traindev']}

for mention, cui in additional_pairs:
    # remove additional mentions that are not in the train ontology
    if cui not in ontology_train:
        continue
    if mention not in train_cui_by_mention:
        pairs_chemical['train'].append((mention, cui))
        train_cui_by_mention[mention] = cui
    elif cui != train_cui_by_mention[mention]:
        # Same mention, different CUI: keep the BC8BioRED label and report the conflict
        print('Additional:', (mention, cui))
        print('BC8BioRED:', (mention, train_cui_by_mention[mention]))

    if mention not in traindev_mentions:
        pairs_chemical['traindev'].append((mention, cui))
        traindev_mentions.add(mention)

In [None]:
for split_name in pairs_chemical:
    print(split_name, len(pairs_chemical[split_name]))

# Counts noted by the author (no output is saved for this cell):
# Given 1737 extra mention-cui pairs from the CDR chemical corpus,
# Training set extends from 723 -> 2008
# Train+dev from 825 -> 2047

In [None]:
# Fraction of distinct dev mentions that never occur in the train split
train_mentions = {mention for mention, _ in pairs_chemical['train']}
dev_mentions = {mention for mention, _ in pairs_chemical['dev']}
mentions_dev_only = len(dev_mentions.difference(train_mentions)) / len(dev_mentions)
print(f'Mentions only in Dev set: {mentions_dev_only:.4f}')

In [None]:
output_dir = './bc8biored-chemical-aio'
os.makedirs(output_dir, exist_ok=True)
# Exactly one target file is active; the commented lines are the alternatives.
with open(os.path.join(output_dir, 'data.json'), 'w') as f:
# with open(os.path.join(output_dir, 'data-dev-predicted.json'), 'w') as f:
# with open(os.path.join(output_dir, 'data-test-predicted.json'), 'w') as f:
    json.dump(pairs_chemical, f)