In [1]:
import pandas as pd
import numpy as np
import json

# Find just data with no overlap

In [2]:
results = []
for dis_rank in range(10):
    for network in ["BioGRID","IMP"]:
        for mytask in ["GO","Monarch"]:
            # open the disease table
            fp = "../results/disease_evals/"
            dfdis = pd.read_csv(fp + f"DisMet__ce_dm_dr_hs_mm_sc__{network}.tsv",sep="\t")
            name = dfdis.iloc[dis_rank,:]["name"]
            log2p_score = dfdis.iloc[dis_rank,:]["log2p"]
            num_train = dfdis.iloc[dis_rank,:]["num_train_pos"] + dfdis.iloc[dis_rank,:]["num_test_pos"]
            ID = dfdis.iloc[dis_rank,:]["ID"].replace(":","")

            # get the positive genes for the disease
            fp_GSC = "../data/GSCs/GSCs/"
            with open(f"{fp_GSC}MainLabels__MethodNum-0__DisNet_hs_20_200_0.5_0.5_{network}.json","r") as handle:
                GSC = json.load(handle)
            dis_genes = GSC[dfdis.iloc[dis_rank,:]["ID"]]
            dis_pos = dis_genes['Train-Positive'] + dis_genes['Test-Positive']

            species_dict = {"ce":"worm","dm":"fly","dr":"fish","hs":"human","mm":"mouse","sc":"yeast"}

            for aspecies in ["ce","dm","dr","hs","mm","sc"]:
                if aspecies in ["mm","sc"]:
                    sp0 = "hs"
                    sp1 = aspecies
                else:
                    sp1 = "hs"
                    sp0 = aspecies

                # open up the GSC for the GOs
                fp_GSCd = "../data/GSCs/GSCDis/"
                with open(f"{fp_GSCd}DisGSCs__{mytask}__{aspecies}__{network}.json","r") as handle:
                    GSCGO = json.load(handle)

                # get sig GO terms
                dfGO = pd.read_csv(fp + f"{dis_rank}__{ID}__{mytask}__{aspecies}__{network}__ce_dm_dr_hs_mm_sc.tsv",sep="\t")
                sigGOs = dfGO[dfGO["FDR"]<1e-6]["ID"].tolist()
                FDRs = dfGO[dfGO["FDR"]<1e-6]["FDR"].tolist()
                Names = dfGO[dfGO["FDR"]<1e-6]["Name"].tolist()

                # Get ortholog info between model species and human
                if aspecies != "hs":
                    fp_ortho = "../data/edgelists/connections/"
                    sp0_orthos = {}
                    sp1_orthos = {}
                    with open(fp_ortho+f"{sp0}_{sp1}__{network}_raw__direct__AllOnes.edgelist","r") as f:
                        for line in f:
            #                 gene0_tmp = line.strip().split()[0]
            #                 gene1_tmp = line.strip().split()[1]
                            gene0_tmp = line.split()[0]
                            gene1_tmp = line.split()[1]
                            if gene0_tmp != sp0_orthos:
                                sp0_orthos[gene0_tmp] = [gene1_tmp]
                            else:
                                sp0_orthos[gene0_tmp] = sp0_orthos[gene0_tmp] + [gene1_tmp]
                            if gene1_tmp != sp1_orthos:
                                sp1_orthos[gene1_tmp] = [gene0_tmp]
                            else:
                                sp1_orthos[gene1_tmp] = sp1_orthos[gene1_tmp] + [gene0_tmp]

                # get disease gene overlap
                num_dis_orthos = 0
                for adisgene in dis_pos:
                    if aspecies in ["mm","sc"]:
                        myorthos = sp0_orthos
                    else:
                        myorthos = sp1_orthos
                    if adisgene in myorthos:
                        num_dis_orthos = num_dis_orthos + 1

                # iterate through and look for ortho matches
                for idx, anID in enumerate(sigGOs):
                    GO_genes = GSCGO[anID]["Genes"]
                    if aspecies != "hs":
                        overlapped = 0
                        num_hs_ortho = 0
                        for agene in GO_genes:
                            if aspecies in ["mm","sc"]:
                                myorthos = sp1_orthos
                            else:
                                myorthos = sp0_orthos
                            try:
                                num_inter = len(np.intersect1d(myorthos[agene],dis_pos))
                                num_hs_ortho = num_hs_ortho + 1
                            except KeyError:
                                num_inter = 0
                            if num_inter != 0:
                                overlapped = overlapped + 1
                    else:
                        overlapped = len(np.intersect1d(GO_genes,dis_pos))
                        num_hs_ortho = "NA"
                    if overlapped == 0:
                        results.append([network,species_dict[aspecies], dfdis.iloc[dis_rank,:]["ID"],name,len(dis_pos),
                                        num_dis_orthos, mytask,
                                        anID,Names[idx],f"{FDRs[idx]:.2E}",idx+1,len(GO_genes),
                                        num_hs_ortho, overlapped])
df_results = pd.DataFrame(results,columns=["Network","Species", "DOID","Disease Name", "Number of Disease Genes",
                                           "Number of Disease Genes With Any Orthologs in Model Species",
                                           "Task","Term ID", "Term Name", "FDR of Term in Model", "Rank",
                                           "Number of Term Genes",
                                           "Number of Term Genes With Any Orthologs in Human",
                                           "Disease-Term Gene Overlap"])
print(df_results.shape)
df_results = df_results.sort_values(by=["Number of Term Genes"],ascending=False)
df_results.to_csv("../figures/Supplamentary File 2.tsv",sep="\t",index=False,header=True)
df_results.head()

(3990, 14)


Unnamed: 0,Network,Species,DOID,Disease Name,Number of Disease Genes,Number of Disease Genes With Any Orthologs in Model Species,Task,Term ID,Term Name,FDR of Term in Model,Rank,Number of Term Genes,Number of Term Genes With Any Orthologs in Human,Disease-Term Gene Overlap
2270,IMP,mouse,DOID:0050557,congenital_muscular_dystrophy,33,32,Monarch,MP:0000438,abnormal cranium morphology,1.58e-11,219,200,184.0,0
3922,IMP,human,DOID:11831,cortical_blindness,33,18,GO,GO:0009887,animal organ morphogenesis,4.61e-07,35,199,,0
2764,IMP,mouse,DOID:420,hypertrichosis,71,70,GO,GO:1903827,regulation of cellular protein localization,6.23e-16,190,198,195.0,0
263,IMP,mouse,DOID:8469,influenza,53,49,GO,GO:0032102,negative regulation of response to external st...,9.18e-08,118,198,190.0,0
743,IMP,mouse,DOID:0060286,combined_oxidative_phosphorylation_deficiency,24,23,Monarch,MP:0001700,abnormal embryo turning,1.34e-08,21,198,191.0,0


# Find all data

In [4]:
# this is for all and not just zero
results = []
for dis_rank in range(10):
    for network in ["BioGRID","IMP"]:
        for mytask in ["GO","Monarch"]:
            # open the disease table
            fp = "../results/disease_evals/"
            dfdis = pd.read_csv(fp + f"DisMet__ce_dm_dr_hs_mm_sc__{network}.tsv",sep="\t")
            name = dfdis.iloc[dis_rank,:]["name"]
            log2p_score = dfdis.iloc[dis_rank,:]["log2p"]
            num_train = dfdis.iloc[dis_rank,:]["num_train_pos"] + dfdis.iloc[dis_rank,:]["num_test_pos"]
            ID = dfdis.iloc[dis_rank,:]["ID"].replace(":","")

            # get the positive genes for the disease
            fp_GSC = "../data/GSCs/GSCs/"
            with open(f"{fp_GSC}MainLabels__MethodNum-0__DisNet_hs_20_200_0.5_0.5_{network}.json","r") as handle:
                GSC = json.load(handle)
            dis_genes = GSC[dfdis.iloc[dis_rank,:]["ID"]]
            dis_pos = dis_genes['Train-Positive'] + dis_genes['Test-Positive']

            species_dict = {"ce":"worm","dm":"fly","dr":"fish","hs":"human","mm":"mouse","sc":"yeast"}

            for aspecies in ["ce","dm","dr","hs","mm","sc"]:
                if aspecies in ["mm","sc"]:
                    sp0 = "hs"
                    sp1 = aspecies
                else:
                    sp1 = "hs"
                    sp0 = aspecies

                # open up the GSC for the GOs
                fp_GSCd = "../data/GSCs/GSCDis/"
                with open(f"{fp_GSCd}DisGSCs__{mytask}__{aspecies}__{network}.json","r") as handle:
                    GSCGO = json.load(handle)

                # get sig GO terms
                dfGO = pd.read_csv(fp + f"{dis_rank}__{ID}__{mytask}__{aspecies}__{network}__ce_dm_dr_hs_mm_sc.tsv",sep="\t")
                sigGOs = dfGO[dfGO["FDR"]<1e-6]["ID"].tolist()
                FDRs = dfGO[dfGO["FDR"]<1e-6]["FDR"].tolist()
                Names = dfGO[dfGO["FDR"]<1e-6]["Name"].tolist()

                # Get ortholog info between model species and human
                if aspecies != "hs":
                    fp_ortho = "../data/edgelists/connections/"
                    sp0_orthos = {}
                    sp1_orthos = {}
                    with open(fp_ortho+f"{sp0}_{sp1}__{network}_raw__direct__AllOnes.edgelist","r") as f:
                        for line in f:
                            gene0_tmp = line.split()[0]
                            gene1_tmp = line.split()[1]
                            if gene0_tmp != sp0_orthos:
                                sp0_orthos[gene0_tmp] = [gene1_tmp]
                            else:
                                sp0_orthos[gene0_tmp] = sp0_orthos[gene0_tmp] + [gene1_tmp]
                            if gene1_tmp != sp1_orthos:
                                sp1_orthos[gene1_tmp] = [gene0_tmp]
                            else:
                                sp1_orthos[gene1_tmp] = sp1_orthos[gene1_tmp] + [gene0_tmp]

                # get disease gene overlap
                num_dis_orthos = 0
                for adisgene in dis_pos:
                    if aspecies in ["mm","sc"]:
                        myorthos = sp0_orthos
                    else:
                        myorthos = sp1_orthos
                    if adisgene in myorthos:
                        num_dis_orthos = num_dis_orthos + 1

                # iterate through and look for ortho matches
                for idx, anID in enumerate(sigGOs):
                    GO_genes = GSCGO[anID]["Genes"]
                    if aspecies != "hs":
                        overlapped = 0
                        num_hs_ortho = 0
                        for agene in GO_genes:
                            if aspecies in ["mm","sc"]:
                                myorthos = sp1_orthos
                            else:
                                myorthos = sp0_orthos
                            try:
                                num_inter = len(np.intersect1d(myorthos[agene],dis_pos))
                                num_hs_ortho = num_hs_ortho + 1
                            except KeyError:
                                num_inter = 0
                            if num_inter != 0:
                                overlapped = overlapped + 1
                    else:
                        overlapped = len(np.intersect1d(GO_genes,dis_pos))
                        num_hs_ortho = "NA"
                    results.append([network,species_dict[aspecies], dfdis.iloc[dis_rank,:]["ID"],name,len(dis_pos),
                                    num_dis_orthos, mytask,
                                    anID,Names[idx],f"{FDRs[idx]:.2E}",idx+1,len(GO_genes),
                                    num_hs_ortho, overlapped])
df_results = pd.DataFrame(results,columns=["Network","Species", "DOID","Disease Name", "Number of Disease Genes",
                                           "Number of Disease Genes With Any Orthologs in Model Species",
                                           "Task","Term ID", "Term Name", "FDR of Term in Model", "Rank",
                                           "Number of Term Genes",
                                           "Number of Term Genes With Any Orthologs in Human",
                                           "Disease-Term Gene Overlap"])
print(df_results.shape)
df_results = df_results.sort_values(by=["Number of Term Genes"],ascending=False)
df_results.to_csv("../figures/Supplamentary File 1.tsv",sep="\t",index=False,header=True)
df_results.head()

(14975, 14)


Unnamed: 0,Network,Species,DOID,Disease Name,Number of Disease Genes,Number of Disease Genes With Any Orthologs in Model Species,Task,Term ID,Term Name,FDR of Term in Model,Rank,Number of Term Genes,Number of Term Genes With Any Orthologs in Human,Disease-Term Gene Overlap
9535,IMP,mouse,DOID:0050557,congenital_muscular_dystrophy,33,32,Monarch,MP:0000438,abnormal cranium morphology,1.58e-11,219,200,184.0,0
8509,IMP,mouse,DOID:0050557,congenital_muscular_dystrophy,33,32,GO,GO:0030323,respiratory tube development,1.73e-13,79,200,183.0,2
382,IMP,human,DOID:8469,influenza,53,44,GO,GO:0043903,"regulation of symbiosis, encompassing mutualis...",2.57e-205,15,200,,15
10745,IMP,human,DOID:420,hypertrichosis,71,66,GO,GO:0043903,"regulation of symbiosis, encompassing mutualis...",1.54e-07,358,200,,3
12563,IMP,mouse,DOID:420,hypertrichosis,71,70,Monarch,MP:0000438,abnormal cranium morphology,6.02e-26,74,200,184.0,2
