In [4]:
import os
import re 

# Directory containing the OMIM input files.
omim_dir = '../data/omim/'

# Directory that receives one gene-list file per disease; created if missing.
output_dir = '../output/omim_psychiatric_disease_genes/'
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

In [5]:
# Regular expressions used to recognise psychiatric disease terms in OMIM
# phenotype descriptions.
#
# `diseases` fixes the processing order; every entry must be a key of
# `regexes` (it is also used to name that disease's output file).
diseases = ['autism', 'schizophrenia', 'bipolar', 'mdd', 'adhd']

autism_re = re.compile(r'[Aa]utism')
sczc_re = re.compile(r'[Ss]chizophrenia')
bipo_re = re.compile(r'([Bb]ipolar)|([Mm]anic)|(Major affective disorder)')
# The '[Dd]epres' stem covers both "depression" and "depressive", so it also
# matches everything the first alternative does.
depress_re = re.compile(r'([Mm]ajor depressive disorder)|([Dd]epres)')
# Spell out "attention" in full: a shorter stem such as 'atte' would also hit
# unrelated phenotype words like "attenuated" or "patterned".
adhd_re = re.compile(r'ADHD|([Aa]ttention)')

regexes = {
    'autism': autism_re,
    'schizophrenia': sczc_re,
    'bipolar': bipo_re,
    'mdd': depress_re,
    'adhd': adhd_re,
}

In [6]:
def search_omim_for_disease(omim_db, disease_fn, regex):
    """
    Write the genes whose OMIM phenotype matches `regex` to `disease_fn`.

    The input is read as tab-separated text. Lines starting with '#' and
    blank lines are skipped. On every other line the first column is taken
    as the phenotype description and the second as a comma-separated list
    of gene symbols; any further columns are ignored.

    Parameters
    ----------
    omim_db : str
        Path to the OMIM file to scan.
        NOTE(review): the column order is assumed to be that of OMIM's
        morbidmap.txt (phenotype, gene symbols, ...) -- confirm against
        the downloaded file.
    disease_fn : str
        Path of the output file. It is overwritten with the matching gene
        symbols, de-duplicated and sorted, one per line.
    regex : compiled regular expression
        Pattern searched for (via ``regex.search``) in each phenotype.

    Raises
    ------
    ValueError
        If a non-blank, non-comment line has fewer than two columns.
    """

    disease_genes = set()
    with open(omim_db) as f:
        for line_no, line in enumerate(f, 1):

            # Skipping the header / comment lines and blank lines
            if line.startswith('#') or not line.strip():
                continue

            # Only strip the line terminator: stripping all whitespace would
            # also drop trailing tabs and shift the columns when the last
            # field is empty.
            fields = line.rstrip('\r\n').split('\t')
            if len(fields) < 2:
                raise ValueError(
                    '{}:{}: expected at least 2 tab-separated columns, got {}'
                    .format(omim_db, line_no, len(fields)))
            phenotype, genes = fields[0], fields[1]

            # Saving a match; empty symbols (e.g. from a trailing comma)
            # are dropped so they do not end up as blank output lines.
            if regex.search(phenotype):
                disease_genes.update(
                    gene for gene in genes.replace(' ', '').split(',') if gene)

    # Write to the path given by the caller rather than to a notebook global.
    with open(disease_fn, 'w') as fw:
        for gene in sorted(disease_genes):
            fw.write('{}\n'.format(gene))
    

In [7]:
# Scan the OMIM morbid map once per disease and write one gene list each.
omim_db = os.path.join(omim_dir, 'morbidmap.txt')
for disease, regex in ((name, regexes[name]) for name in diseases):
    # Each disease gets its own '<disease>.txt' file in the output directory.
    disease_genes_fn = os.path.join(output_dir, '{}.txt'.format(disease))
    search_omim_for_disease(omim_db, disease_genes_fn, regex)