In [75]:
import pandas as pd
from collections import Counter
import numpy as np
import mygene
mg = mygene.MyGeneInfo()
import os

In [76]:
# Patient ID -> subgroup label. Entries are laid out one subgroup per section;
# the insertion order of the mapping is unchanged.
group_dict = {
    # WNT
    'MB30': 'WNT', 'MB31': 'WNT', 'MB34': 'WNT',
    # SHH
    'MB55': 'SHH', 'MB05': 'SHH', 'MB49': 'SHH', 'MB40': 'SHH', 'MB04': 'SHH',
    'MB06': 'SHH', 'MB25': 'SHH', 'MB24': 'SHH', 'MB43': 'SHH', 'MB46': 'SHH',
    # G4
    'MB16': 'G4', 'MB17': 'G4', 'MB48': 'G4', 'MB15': 'G4', 'MB20': 'G4',
    'MB07': 'G4', 'MB08': 'G4', 'MB22': 'G4', 'MB13': 'G4', 'MB39': 'G4',
    # G3_G4
    'MB47': 'G3_G4', 'MB09': 'G3_G4', 'MB54': 'G3_G4',
    # G3
    'MB50': 'G3', 'MB19': 'G3', 'MB51': 'G3', 'MB52': 'G3', 'MB53': 'G3',
    'MB01': 'G3', 'MB03': 'G3', 'MB02': 'G3', 'MB14': 'G3',
    # Remaining G4 / WNT patients
    'MB10': 'G4', 'MB21': 'WNT', 'MB33': 'WNT',
}

# Analysis layer -> file name of the enrichment table for that layer.
files_dict = dict(
    diseases='Supplementary_Table_6.csv',
    drugs='Supplementary_Table_5.csv',
    metabolites='Supplementary_Table_8.csv',
    pathways='Supplementary_Table_7.csv',
    ppis='Supplementary_Table_4.csv',
)

In [77]:
# layer = layer to analyze (diseases,drugs,metabolites,pathways,ppis)
# percent = minimum fraction (0-1) of a subgroup's patients that must have the attribute enriched
# enrichment = type of enrichment (over/underrepresented)
# groups = max num of subgroups having the attribute enriched
# adjusted_p_cutoff = over/underenrichment adj p-value cutoff (neat default 0.05)

def find_group_attribute(layer, percent, enrichment, groups, adjusted_p_cutoff):
    """Find attributes of one layer that are enriched in specific patient subgroups.

    The enrichment table for ``layer`` is read from ``../Supplementary_Tables/``
    (tab-separated, seven columns). Its first column is treated as the patient
    ID and the second as the attribute.

    Parameters
    ----------
    layer : str
        Key of ``files_dict`` selecting the table to analyse
        (diseases, drugs, metabolites, pathways, ppis).
    percent : float
        Minimum fraction (0-1) of a subgroup's patients in which the attribute
        must be enriched for the subgroup to count.
    enrichment : str
        Value of the table's ``conclusion`` column to keep, e.g.
        ``'Overenrichment'``.
    groups : int
        Maximum number of subgroups an attribute may be enriched in.
    adjusted_p_cutoff : float
        Only rows with an adjusted p-value strictly below this are kept.

    Returns
    -------
    pandas.DataFrame
        One row per subgroup combination, with columns ``subgroup``
        (comma-joined subgroup labels) and ``attribute`` (list of attribute IDs).

    Raises
    ------
    KeyError
        If ``layer`` is not a key of ``files_dict``.
    """
    filename = files_dict[layer]
    table = pd.read_csv('../Supplementary_Tables/' + filename, sep='\t')
    table.columns = ["A", "B", "nab", "expected_nab", "pvalue", "adjusted_p", "conclusion"]

    # Honour the requested enrichment type instead of a hard-coded value, and
    # take an explicit copy so the new column is not written onto a slice.
    selected = (table['adjusted_p'] < adjusted_p_cutoff) & (table['conclusion'] == enrichment)
    table = table[selected].copy()
    table['group'] = table['A'].replace(group_dict)

    # Patients of each subgroup in which each attribute is enriched.
    per_group = (
        table.groupby(['B', 'group'])['A']
        .apply(list)
        .reset_index(name='patients')
    )
    per_group['count'] = per_group['patients'].map(len)
    per_group['tot'] = per_group['group'].replace(Counter(group_dict.values()))
    per_group['percent'] = per_group['count'] / per_group['tot']
    # Keep subgroups where the attribute is enriched in enough of the patients.
    per_group = per_group[per_group['percent'] >= percent]

    # Subgroups that pass the threshold, per attribute.
    per_attribute = (
        per_group.groupby('B')['group']
        .apply(list)
        .reset_index(name='subgroup')
    )
    per_attribute['count'] = per_attribute['subgroup'].map(len)
    # Keep attributes specific to at most `groups` subgroups.
    per_attribute = per_attribute[per_attribute['count'] <= groups].copy()
    per_attribute['subgroup'] = [
        ','.join(map(str, labels)) for labels in per_attribute['subgroup']
    ]

    return (
        per_attribute.groupby('subgroup')['B']
        .apply(list)
        .reset_index(name='attribute')
    )

In [78]:
# Diseases enriched (adjusted p < 0.01) in all patients of at most one subgroup.
a = find_group_attribute('diseases', 1.0, 'Overenrichment', 1, 0.01)

# Flatten the per-subgroup attribute lists: one row per (subgroup, attribute).
a = pd.DataFrame({
    'cluster': np.repeat(a['subgroup'].values, a['attribute'].str.len()),
    'enriched association': np.concatenate(a['attribute'].values),
    'type': 'diseases',
})

# Header-less, tab-separated lookup table: identifier -> definition.
mondo = pd.read_csv('diseases_definitions.txt', sep='\t', header=None)
mondo.columns = ['id', 'definition']
dict_mondo = mondo.set_index('id')['definition'].to_dict()

# Identifiers missing from the lookup keep their raw value as the definition.
a = a.assign(definition=a['enriched association']).replace({'definition': dict_mondo})
# The lookup is done on the '_' form; the reported identifier uses ':'.
a['enriched association'] = a['enriched association'].str.replace('_', ':')
# Default write mode with header: this cell (re)creates the output file.
a.to_csv('provenance_analysis.tsv', sep='\t', index=False)
a

Unnamed: 0,cluster,enriched association,type,definition
0,G3_G4,MONDO:0001187,diseases,urinary bladder cancer
1,G3_G4,MONDO:0007256,diseases,hepatocellular carcinoma
2,G3_G4,MONDO:0007564,diseases,pilomatrixoma
3,G3_G4,MONDO:0008093,diseases,nevus epidermal
4,G3_G4,MONDO:0008297,diseases,variegate porphyria
5,G3_G4,MONDO:0008903,diseases,lung cancer
6,G3_G4,MONDO:0011794,diseases,Dravet syndrome
7,G3_G4,MONDO:0013926,diseases,hypogonadotropic hypogonadism 14 with or witho...
8,G3_G4,MONDO:0014102,diseases,hypogonadotropic hypogonadism 17 with or witho...
9,G3_G4,MONDO:0014103,diseases,hypogonadotropic hypogonadism 18 with or witho...


In [None]:
# Metabolites enriched (adjusted p < 0.01) in all patients of at most one subgroup.
a = find_group_attribute('metabolites', 1.0, 'Overenrichment', 1, 0.01)

# Flatten the per-subgroup attribute lists: one row per (subgroup, attribute).
a = pd.DataFrame({
    'cluster': np.repeat(a['subgroup'].values, a['attribute'].str.len()),
    'enriched association': np.concatenate(a['attribute'].values),
    'type': 'metabolites',
})

# Metabolite names are downloaded at run time (needs network access).
url = 'http://bigg.ucsd.edu/static/namespace/bigg_models_metabolites.txt'
meabolites = pd.read_csv(url, sep='\t')
dict_meabolites = meabolites.set_index('universal_bigg_id')['name'].to_dict()

# Identifiers missing from the lookup keep their raw value as the definition.
a = a.assign(definition=a['enriched association']).replace({'definition': dict_meabolites})
# Appended without a header to the file started by the diseases cell.
a.to_csv('provenance_analysis.tsv', sep='\t', index=False, mode='a', header=False)
a

Unnamed: 0,cluster,enriched association,type,definition
0,G3_G4,25aics,metabolites,(S)-2-[5-Amino-1-(5-phospho-D-ribosyl)imidazol...
1,G3_G4,2obut,metabolites,2-Oxobutanoate
2,G3_G4,2oxoadp,metabolites,2 Oxoadipate C6H6O5
3,G3_G4,3mb2coa,metabolites,3-Methylbut-2-enoyl-CoA
4,G3_G4,4ppan,metabolites,D-4'-Phosphopantothenate
5,G3_G4,5dpmev,metabolites,R 5 Diphosphomevalonate C6H10O10P2
6,G3_G4,HC01434,metabolites,Oxalatosuccinate(3-)
7,G3_G4,M02637,metabolites,Octadecatrienoylcarnitine
8,G3_G4,acac,metabolites,Acetoacetate
9,G3_G4,cit,metabolites,Citrate


In [None]:
# Pathways enriched (adjusted p < 0.01) in all patients of at most one subgroup.
a = find_group_attribute('pathways', 1.0, 'Overenrichment', 1, 0.01)

# Flatten the per-subgroup attribute lists: one row per (subgroup, attribute).
a = pd.DataFrame({
    'cluster': np.repeat(a['subgroup'].values, a['attribute'].str.len()),
    'enriched association': np.concatenate(a['attribute'].values),
    'type': 'pathways',
})

# Pathway names are downloaded at run time (needs network access).
url = 'https://reactome.org/download/current/ReactomePathways.txt'
reactome = pd.read_csv(url, sep='\t', header=None)
reactome.columns = ['id', 'definition', 'taxa']

# Only human pathways are used for the identifier -> name lookup.
reactome = reactome[reactome['taxa'] == 'Homo sapiens']
dict_reactome = reactome.set_index('id')['definition'].to_dict()

# Identifiers missing from the lookup keep their raw value as the definition.
a = a.assign(definition=a['enriched association']).replace({'definition': dict_reactome})
# Appended without a header to the file started by the diseases cell.
a.to_csv('provenance_analysis.tsv', sep='\t', index=False, mode='a', header=False)
a

In [None]:
# Drugs enriched (adjusted p < 0.01) in all patients of at most one subgroup.
a = find_group_attribute('drugs', 1.0, 'Overenrichment', 1, 0.01)

# Flatten the per-subgroup attribute lists: one row per (subgroup, attribute).
a = pd.DataFrame({
    'cluster': np.repeat(a['subgroup'].values, a['attribute'].str.len()),
    'enriched association': np.concatenate(a['attribute'].values),
    'type': 'drugs',
})

# Header-less, tab-separated lookup table: identifier -> definition.
drugs = pd.read_csv('drugs_definitions.txt', sep='\t', header=None)
drugs.columns = ['id', 'definition']
dict_drugs = drugs.set_index('id')['definition'].to_dict()

# Identifiers missing from the lookup keep their raw value as the definition.
a = a.assign(definition=a['enriched association']).replace({'definition': dict_drugs})
# The lookup is done on the '_' form; the reported identifier uses ':'.
a['enriched association'] = a['enriched association'].str.replace('_', ':')
# Appended without a header to the file started by the diseases cell.
a.to_csv('provenance_analysis.tsv', sep='\t', index=False, mode='a', header=False)
a

In [None]:
# PPI-layer attributes enriched (adjusted p < 0.01) in all patients of at most one subgroup.
a = find_group_attribute('ppis', 1.0, 'Overenrichment', 1, 0.01)

# Flatten the per-subgroup attribute lists: one row per (subgroup, attribute).
a = pd.DataFrame({
    'cluster': np.repeat(a['subgroup'].values, a['attribute'].str.len()),
    'enriched association': np.concatenate(a['attribute'].values),
    'type': 'PPIs',
})

# Look the identifiers up as human Entrez gene IDs via MyGene (needs network access).
queried_ids = a['enriched association'].unique()
mg_df = mg.querymany(queried_ids, scopes='entrezgene', species='human', as_dataframe=True)
# Label of the form "<gene name> (<symbol>)", keyed by the result frame's index.
mg_df['tag'] = mg_df['name'] + ' (' + mg_df['symbol'] + ')'
dict_entrez = mg_df['tag'].to_dict()

# The identifier is cast to str before the lookup; unmatched ones keep that string.
a = a.assign(definition=a['enriched association'].astype(str)).replace({'definition': dict_entrez})
# Appended without a header to the file started by the diseases cell.
a.to_csv('provenance_analysis.tsv', sep='\t', index=False, mode='a', header=False)
a