In [1]:
import pybiopax

pathway_file = "R-MMU-HDR.owl"

model = pybiopax.model_from_owl_file("reactome/{}".format(pathway_file))

Processing OWL elements:   0%|          | 0.00/1.16k [00:00<?, ?it/s]

In [2]:
from collections import Counter
Counter([obj.__class__.__name__  for uid, obj in model.objects.items()])

Counter({'Pathway': 5,
         'BiochemicalReaction': 13,
         'SmallMolecule': 4,
         'CellularLocationVocabulary': 2,
         'UnificationXref': 375,
         'SmallMoleculeReference': 4,
         'Provenance': 1,
         'RelationshipXref': 18,
         'RelationshipTypeVocabulary': 4,
         'Protein': 73,
         'ProteinReference': 56,
         'BioSource': 1,
         'FragmentFeature': 69,
         'SequenceInterval': 69,
         'SequenceSite': 158,
         'ModificationFeature': 22,
         'SequenceModificationVocabulary': 4,
         'Catalysis': 4,
         'Evidence': 18,
         'PublicationXref': 49,
         'EvidenceCodeVocabulary': 1,
         'PathwayStep': 17,
         'Complex': 48,
         'Stoichiometry': 137,
         'PhysicalEntity': 9})

In [3]:
for reaction in model.get_objects_by_type(pybiopax.biopax.Pathway):
    print('%s %s' % (reaction.uid, reaction.name))

Pathway1 ['HDR through Homologous Recombination (HRR)']
Pathway2 ['Homologous DNA Pairing and Strand Exchange']
Pathway3 ['Presynaptic phase of homologous DNA pairing and strand exchange']
Pathway4 ['Resolution of D-Loop Structures']
Pathway5 ['Resolution of D-loop Structures through Holliday Junction Intermediates']


In [4]:
for reaction in model.get_objects_by_type(pybiopax.biopax.BiochemicalReaction):
    print('%s: %s -> %s' % (reaction.name, reaction.left, reaction.right))

['CHEK1 phosphorylates RAD51']: [SmallMolecule(ATP), Protein(Rad51)] -> [Protein(Q08297), SmallMolecule(ADP)]
['BCDX2 complex formation']: [Protein(RAD51D), Protein(RAD51B), Protein(Xrcc2), Protein(RAD51C)] -> [Complex(BCDX2 complex)]
['BCDX2 complex stabilizes RAD51 filament']: [Complex(BCDX2 complex), Complex(3' overhanging ssDNA-DSBs:p-MRN:p-S1981,Ac-K3016-ATM:KAT5:BRCA1-C complex:EXO1,DNA2:BLM,WRN:p-S990,Ac-K1249-BRIP1:p-T309-RAD51:p-T3387-BRCA2:SEM1)] -> [Complex(3' overhanging ssDNA-DSBs:p-MRN:p-S1981,Ac-K3016-ATM:KAT5:BRCA1-C complex:EXO1,DNA2:BLM,WRN:p-S990,Ac-K1249-BRIP1:p-T309-RAD51:p-T3387-BRCA2:SEM1:BCDX2 complex)]
['D-loop formation mediated by PALB2, BRCA2 and RAD51', 'Strand exchange/Branch migration mediated by PALB2, BRCA2 and RAD51']: [Complex(3' overhanging ssDNA-DSBs:p-MRN:p-S1981,Ac-K3016-ATM:KAT5:BRCA1-C complex:EXO1,DNA2:BLM,WRN:p-S990,Ac-K1249-BRIP1:p-T309-RAD51:p-T3387-BRCA2:SEM1:BCDX2 complex), PhysicalEntity(Sister Chromatid), Protein(Q8C551), Protein(Palb2)]

1. Go in order of biochemical reaction
2. Take left hand side
3. Number each gene by the stage of it's involvement

In [5]:
import numpy as np
import pandas as pd

all_entities = []
for reaction in model.get_objects_by_type(pybiopax.biopax.BiochemicalReaction):
    for l in reaction.left:
        all_entities.append(l.name)

flat_list = [item for sublist in all_entities for item in sublist]
all_entities = pd.Series(flat_list).str.capitalize().unique()
all_entities

def parse_reactome_physical_entities(entities, c="core"):
    results = []
    for e in entities:
        if ":" in e:
            s = e.split(":")
            results += parse_reactome_physical_entities(s, c)
        elif "," in e:
            s = e.split(",")
            results += parse_reactome_physical_entities(s, c="optional")
        elif "-" in e:
            s = e.split("-")
            results += parse_reactome_physical_entities(s, c)
        else:
            results.append((e, c))
    return results



all_entities = pd.DataFrame(parse_reactome_physical_entities(all_entities), columns=["Entity", "Class"]).drop_duplicates()
all_entities["Entity"] = all_entities["Entity"].str.capitalize()
all_entities = all_entities.loc[all_entities.sort_values("Class")["Entity"].drop_duplicates(keep="first").index]
all_entities = all_entities.set_index("Entity")
all_entities.head()


Unnamed: 0_level_0,Class
Entity,Unnamed: 1_level_1
Atp,core
D,core
Cx3 complex,core
Q9cxe6,core
Xrcc3,core


In [6]:
from src.config import get_experiment_artifacts

full_results_df = pd.read_pickle(get_experiment_artifacts() + "/augmented_with_GO_outlier_results.pkl")
full_results_df

Sample,Global,Global,Global,Global,Global,T1,T1,T1,T1,T2,T2,T2,T3,T3,T3,T3,Gene Sets,Gene Sets,Gene Sets,Global,Global
Measure,Max Score,Mean Consistency,Mean Score,Mean Std CLR Consistency,Mean Std Consistency,CLR Consistency,Consistency,distances,pvalues,CLR Consistency,...,pvalues,CLR Consistency,Consistency,distances,pvalues,Adamson,GeneSubset2,GeneSubsetSD30,isGODSBRepair,isGORepair
Gene,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
0610009B22Rik,-0.202316,-0.263600,-0.487511,-0.864817,-0.918179,-0.849354,-0.897694,0.169394,0.999409,-0.074988,...,0.733467,0.141639,-0.077164,6.205324,0.286750,False,False,False,False,False
0610010K14Rik,-0.506267,0.404967,-0.554160,0.147015,0.639911,0.689295,0.672534,1.420544,0.922053,-0.149701,...,0.731721,-0.117280,0.561789,2.871384,0.719807,False,False,False,False,False
0610030E20Rik,-0.096154,0.068685,-0.334093,-0.104114,-0.108152,0.366741,0.528662,8.092794,0.151195,-0.132531,...,0.684153,-0.118037,-0.279202,3.765686,0.583620,False,False,False,False,False
0610040J01Rik,0.585708,0.337484,0.246902,-0.339103,0.485153,0.380555,0.580776,12.609331,0.027328,-0.650751,...,0.233254,0.090293,0.499054,14.848909,0.011028,False,False,False,False,False
1110004F10Rik,0.243536,-0.194733,0.024213,-0.119202,-0.727883,-0.216158,-0.155832,9.343101,0.096137,0.086380,...,0.283318,0.283791,-0.373509,11.095744,0.049514,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Znhit1,0.521094,0.083298,0.292026,1.047208,-0.055802,,,,,0.612933,...,0.019401,0.474388,-0.497556,9.115033,0.104563,True,False,False,False,True
Zranb2,0.953777,0.065813,0.670830,0.916843,-0.120845,,,,,0.351070,...,0.033617,0.631090,0.308406,18.886149,0.002018,False,False,False,False,False
Zrsr1,0.251496,-0.220415,-0.017179,-0.678197,-0.742873,,,,,-0.582349,...,0.400979,0.157301,-0.485046,11.183051,0.047869,False,False,False,False,False
Zup1,0.072598,-0.140818,-0.145972,-0.724366,-0.572357,,,,,-0.206005,...,0.116112,-0.281189,-0.186564,4.425913,0.489857,False,False,False,False,False


In [7]:
ko_genes = full_results_df.index.to_series()

genes_of_interest = list(np.intersect1d(all_entities.index, ko_genes))

pathway_genes = full_results_df.loc[genes_of_interest, ("Global", "Mean Consistency")].sort_values().to_frame().droplevel(axis=1, level=0)

In [8]:
"Gen1" in ko_genes

True

In [9]:
def find_stage(gene):
    options = []
    for reaction in model.get_objects_by_type(pybiopax.biopax.BiochemicalReaction):
        if gene.lower() in (str(reaction.left)).lower():
            options.append(int(reaction.uid[19:]))
    if len(options):
        return np.min(options)
    return np.nan

pathway_genes["Stage"] = pathway_genes.index.to_series().apply(find_stage)
pathway_genes["Class"] = all_entities["Class"].loc[pathway_genes.index]
pathway_genes

Unnamed: 0_level_0,Mean Consistency,Stage,Class
Gene,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Brip1,-0.326551,3,optional
Spidr,-0.264972,11,core
Sem1,-0.141849,3,core
Kat5,-0.086527,3,core
Mus81,0.113813,8,core
Palb2,0.123724,4,core
Blm,0.189125,3,core
Rad51b,0.261538,2,core
Eme1,0.276661,8,optional
Rad51c,0.327459,2,core


In [38]:
import networkx as nx
from networkx.algorithms import bipartite

G = nx.DiGraph()

for reaction in model.get_objects_by_type(pybiopax.biopax.BiochemicalReaction):
    G.add_nodes_from(reaction.left, bipartite=0)
    G.add_nodes_from(reaction.right, bipartite=0)
    G.add_node(reaction.name[0], bipartite=1)
    for l in reaction.left:
        G.add_edge(l, reaction.name[0])
    for r in reaction.right:
        G.add_edge(reaction.name[0], r)
    c = bipartite.color(G)


In [48]:
bottom_nodes, top_nodes = bipartite.sets(G)

AmbiguousSolution: Disconnected graph: Ambiguous solution for bipartite sets.

In [47]:
pos = nx.bipartite_layout(G, G.nodes)