In [33]:
import sys 
import os
import subprocess
import numpy as np 
import time
from itertools import product 
import glob
import re
import pandas as pd
import pysynapse
from ddot import Ontology
import multiprocessing as mp
from itertools import chain


output_dir = '../output/analyzing_clixo_ontologies'; 
if not os.path.exists(output_dir): os.makedirs(output_dir)

In [2]:
def get_alpha_beta(l):
    """
    Yield function to get the correct alpha and beta names.
    
    """
    for x in l: 
        alpha = '{:.1f}'.format(x[0])
        beta =  '{:.1f}'.format(x[1])
        yield((alpha, beta))
        
    yield(None, None)

def read_clixo_stats(fn):
    dic = {'num_valid_clusters': np.nan, 'largest_cluster': np.nan, 
          'num_edges_in_clustergraph': np.nan, 'num_clusters': np.nan}
    with open(fn) as f: 
        for line in f: 
            if line.startswith('#'):
                if 'Num valid clusters' in line: 
                    dic['num_valid_clusters'] = int(line.strip().split()[-1])
                elif 'Largest cluster' in line:
                    dic['largest_cluster'] = int(line.strip().split()[-1])
                elif 'Num edges in clusterGraph' in line:
                    dic['num_edges_in_clustergraph'] = int(line.strip().split()[-1])
                elif 'Num clusters' in line:
                    dic['num_clusters'] = int(line.strip().split()[-1])

    return(dic)

## Clean the CliXo ontology for Enrichment Analysis 

In [8]:
def generate_clean_ont_file(fn, out_fn): 
    
    with open(fn) as fr, open(out_fn, 'w') as fw: 
        
        fw.write('Parent\tChild\tEdgeType\n')
        for line in fr:
            if line.startswith('#'):
                continue
            else:
                line = line.strip().split('\t')
                line[2] = line[2].replace('default', 'Child-Parent')
                line[2] = line[2].replace('gene', 'Gene-Term')
                line = '\t'.join(line[0:3]) + '\n'
                fw.write(line)
    return(clean_fn)

In [48]:
for fn in sorted(glob.glob('../output/run_clixo/option*/*')):
    clean_fn = os.path.basename(fn)
    clean_fn = os.path.join(output_dir, clean_fn)
    if not os.path.exists(clean_fn):
        clean_fn = generate_clean_ont_file(fn, clean_fn) 

## Calculate CliXo statistics 

In [64]:
clixo_stats = {}
pattern = 'option([0-9]).*alpha([0-9]*\.[0-9]*)_beta([0-9]*\.[0-9]*)'
for fn in sorted(glob.glob('../output/run_clixo/option*/*')):
        
    params = re.search(pattern, fn)
    option, alpha, beta = [float(x) for x in params.groups()]    
    if alpha == None:
        break

    # Getting CliXo stats 
    stats = read_clixo_stats(fn) 
    
    clean_fn = os.path.basename(fn)
    clean_fn = os.path.join(output_dir, clean_fn)
    # Getting GO enrichment stats 
    if os.path.getsize(fn) > 15000: 
        go_enrichment = pysynapse.compare_to_go(clean_fn)
    else:
        go_enrichment = np.nan
    stats['go_enrichment'] = go_enrichment    
    clixo_stats[(option, alpha, beta)] = stats 

In [65]:
clixo_stats_df = pd.DataFrame.from_dict(clixo_stats, orient='index')
clixo_stats_df.reset_index(inplace=True)
clixo_stats_df.rename(columns={'level_0': 'option', 
                               'level_1': 'alpha',
                               'level_2': 'beta'}, inplace=True)

In [66]:
clixo_stats_df.go_enrichment.describe()

count    140.000000
mean      42.271429
std       18.032681
min       10.000000
25%       28.000000
50%       40.000000
75%       50.000000
max       86.000000
Name: go_enrichment, dtype: float64

## Analyze disease enrichment with Multiprocessing 

In [44]:
def analyze_gene_enrichment_test(ont_fn, disease_gene_fn, output):

    ## We read in our ontology
    #Generate custom ontology from Chromatin branch from human GO
    #ontology_file=Generate_Ontology_File('GO:0000785')
    ont = Ontology.from_table(ont_fn)
    translated = pysynapse.Find_GO_Focus_GeneDict(ont)

    #Test genes: autism
    text_file = open(disease_gene_fn, "r")
    test_gene_list = text_file.read().splitlines()
    text_file.close()

    #print("Number of autism genes in our ontology:" ,  len(set(ont.genes).intersection(set(test_gene_list))))
    num_ont_disease_genes = len(set(ont.genes).intersection(set(test_gene_list)))

    #Find number of test genes in enriched modules:
    num_enriched_disease_genes = pysynapse.Find_num_genes_in_enriched(ont, translated, test_gene_list)

    output.put((num_ont_disease_genes, num_enriched_disease_genes))

In [71]:
diseases = ['adhd', 'autism', 'bipolar', 'mdd', 'schizophrenia']

In [61]:
disease_stats = {}

# Define an output queue
output = mp.Queue()

for fn in sorted(glob.glob('../output/run_clixo/option*/*')):
          
    params = re.search(pattern, fn)
    option, alpha, beta = [float(x) for x in params.groups()]    
    
    clean_fn = os.path.basename(fn)
    clean_fn = os.path.join(output_dir, clean_fn)
    
    # Setup a list of processes that we want to run
    processes = []
    for disease in diseases: 
        disease_gene_fn = '../output/omim_psychiatric_disease_genes/{}.txt'.format(disease)
        p = mp.Process(target=analyze_gene_enrichment_test, 
                       args=(clean_fn, disease_gene_fn, output))
        processes.append(p)

    # Run processes
    for p in processes:
        p.start()

    # Exit the completed processes
    for p in processes:
        p.join()

    # Get process results from the output queue
    results = [output.get() for p in processes]
    
    disease_stats[(option, alpha, beta)] = results 
    time.sleep(0.1)

In [63]:
disease_stats

{(2.0, 0.1, 0.5): [(14, 5), (12, 0), (1, 0), (3, 0), (16, 0)],
 (2.0, 0.1, 0.6): [(14, 5), (3, 0), (1, 0), (12, 7), (16, 7)],
 (2.0, 0.1, 0.7): [(12, 6), (1, 0), (14, 5), (3, 0), (16, 9)],
 (2.0, 0.1, 0.8): [(3, 0), (12, 4), (1, 0), (14, 5), (16, 5)],
 (2.0, 0.1, 0.9): [(12, 4), (16, 3), (3, 0), (14, 5), (1, 0)],
 (2.0, 0.1, 1.0): [(3, 0), (1, 0), (12, 4), (14, 5), (16, 2)],
 (2.0, 0.1, 1.1): [(3, 0), (12, 4), (14, 5), (1, 0), (16, 2)],
 (2.0, 0.2, 0.5): [(14, 0), (12, 2), (1, 0), (3, 0), (16, 0)],
 (2.0, 0.2, 0.6): [(14, 5), (12, 0), (1, 0), (3, 2), (16, 4)],
 (2.0, 0.2, 0.7): [(3, 0), (16, 9), (14, 5), (12, 7), (1, 0)],
 (2.0, 0.2, 0.8): [(14, 5), (3, 0), (16, 0), (12, 4), (1, 0)],
 (2.0, 0.2, 0.9): [(14, 5), (1, 0), (12, 4), (3, 0), (16, 0)],
 (2.0, 0.2, 1.0): [(3, 0), (1, 0), (12, 4), (14, 5), (16, 0)],
 (2.0, 0.2, 1.1): [(14, 5), (1, 0), (3, 0), (12, 4), (16, 0)],
 (2.0, 0.3, 0.5): [(14, 0), (12, 0), (1, 0), (3, 0), (16, 0)],
 (2.0, 0.3, 0.6): [(3, 0), (14, 5), (12, 0), (1, 0), (1

In [99]:
disease_cols = [('{}_genes'.format(disease), 
                 '{}_enriched_genes'.format(disease)) for disease in diseases]
disease_cols = list(chain(*disease_cols))
disease_cols = ['option', 'alpha', 'beta'] + disease_cols
disease_stats_tmp = [list(k) + list(chain(*v)) for k, v in disease_stats.items()]
disease_stats_df = pd.DataFrame(disease_stats_tmp, columns=disease_cols)

In [101]:
disease_stats_df.head()

Unnamed: 0,option,alpha,beta,adhd_genes,adhd_enriched_genes,autism_genes,autism_enriched_genes,bipolar_genes,bipolar_enriched_genes,mdd_genes,mdd_enriched_genes,schizophrenia_genes,schizophrenia_enriched_genes
0,2.0,0.1,0.5,14,5,12,0,1,0,3,0,16,0
1,2.0,0.1,0.6,14,5,3,0,1,0,12,7,16,7
2,2.0,0.1,0.7,12,6,1,0,14,5,3,0,16,9
3,2.0,0.1,0.8,3,0,12,4,1,0,14,5,16,5
4,2.0,0.1,0.9,12,4,16,3,3,0,14,5,1,0


## Merge the dataframes together 

In [104]:
merged_df = pd.merge(clixo_stats_df, disease_stats_df, on=['option', 'alpha', 'beta'])
fn = os.path.join(output_dir, 'ontology_stats.tsv')
merged_df.to_csv(fn, sep='\t', index=False)