# Check availability data

In [1]:
# Import packages
import pandas as pd
import cobra

## Load data and metabolic model

In [2]:
# Folder that holds the model file and the experimental tables.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
data_dir = "C:/Users/user/polybox/MASTER/THESIS/1_availability_data/"

# Metabolic model, read from its SBML file.
model = cobra.io.sbml.create_cobra_model_from_sbml_file(
    data_dir + "ecoli_core_model.xml"
)
# Disabled alternative: read the model from a spreadsheet instead.
#model = pd.read_excel(data_dir+"ecoli_core_model.xls",index_col="METABOLITE")

# Experimental tables; each one is indexed by the name column used for matching.
fluxes = pd.read_excel(data_dir + "fluxes.xlsx", index_col="Flux")
metabolites = pd.read_excel(data_dir + "metabolites.xlsx", index_col="name_short")
proteins = pd.read_excel(data_dir + "proteins.xlsx", index_col="Promoter")

# Identifier mapping table; the file has no header row, so the columns are
# addressed by position (0, 1, 2, ...) in the cells below.
mapping = pd.read_table(data_dir + "ECOLI_83333_idmapping.dat", header=None)

# Last expression of the cell: display the model summary.
model

0,1
Name,Ecoli_core_model
Memory address,0x022be7e46b38
Number of metabolites,92
Number of reactions,95
Objective expression,-1.0*Biomass_Ecoli_core_w_GAM_reverse_1a29b + 1.0*Biomass_Ecoli_core_w_GAM
Compartments,"Cytoplasm, Extracellular"


## Check data availability for every reaction in model

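Define a function that takes the model and the datasets and reports which reactions have enough data, i.e. a measured flux, at least one measured protein, and measurements for all of their substrates: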

In [3]:
def enough_data(model,fluxes,proteins,metabolites,mapping=None):
    """Print every reaction of ``model`` that has flux, protein and substrate data.

    A reaction is reported when all three checks pass:

    * flux: the reaction id is contained in at least one label of
      ``fluxes.index``;
    * protein: for at least one gene of the reaction, the gene name resolved
      through ``mapping`` is contained in a label of ``proteins.index``;
    * substrates: for every reactant, its id without the last two characters
      (e.g. the ``_c`` suffix) is contained in a label of ``metabolites.index``.
      A reaction without reactants passes this check.

    NOTE(review): all three checks use ``in`` on the labels, i.e. substring
    containment when the labels are strings, so a short id such as ``h`` matches
    any label containing that letter. Confirm that this is intended.

    Parameters
    ----------
    model : object with a ``reactions`` sequence (a cobra model in this
        notebook); each reaction provides ``id``, ``name``, ``reaction``,
        ``reactants`` and ``genes``.
    fluxes, proteins, metabolites : pandas.DataFrame
        Only their index labels are used.
    mapping : pandas.DataFrame, optional
        Identifier table addressed by position: column 2 is matched against the
        gene id, column 0 is the key shared by the rows of one entry, and rows
        whose column 1 equals ``'Gene_Name'`` carry the gene name in column 2.
        When omitted, the module-level ``mapping`` variable is used, which is
        what the original implementation always did.

    Returns
    -------
    None
        Results are printed, one line per reaction with enough data.

    Raises
    ------
    NameError
        If ``mapping`` is not passed and no global ``mapping`` exists.
    """
    if mapping is None:
        # Backward compatibility: earlier callers relied on the notebook global.
        mapping = globals().get("mapping")
        if mapping is None:
            raise NameError("name 'mapping' is not defined")

    # Materialise the labels once instead of once per reaction/gene/substrate.
    flux_labels = list(fluxes.index.values)
    protein_labels = list(proteins.index.values)
    metabolite_labels = list(metabolites.index.values)

    for rxn in model.reactions:
        # Check fluxes
        flux_bool = any(rxn.id in label for label in flux_labels)

        # Check proteins: one gene with data is enough (and/or rules in
        # gene_reaction_rule are not considered).
        prot_bool = False
        for gene in rxn.genes:
            keys = mapping.loc[mapping[2] == gene.id, 0]
            if keys.empty:
                continue
            # Only the first matching mapping entry is followed.
            names = mapping.loc[
                (mapping[0] == keys.iloc[0]) & (mapping[1] == 'Gene_Name'), 2
            ]
            if names.empty:
                # No 'Gene_Name' row for this entry: skip the gene instead of
                # failing with IndexError and aborting the whole scan.
                continue
            gene_name = names.iloc[0]
            if any(gene_name in label for label in protein_labels):
                prot_bool = True
                break

        # Check substrates: every reactant must be found (fluxes/directionality
        # are not considered, so reactants are always taken as the substrates).
        met_bool = all(
            any(met.id[:-2] in label for label in metabolite_labels)
            for met in rxn.reactants
        )

        if flux_bool and met_bool and prot_bool:
            print("Enough data for %s (%s): %s" % (rxn.name, rxn.id, rxn.reaction))
    

Apply the function to our model and the Gerosa dataset. Be aware that metabolites, fluxes and proteins need to share the same names as in the model in order to be correctly interpreted by the function.

In [4]:
# Report the reactions for which flux, protein and substrate data are all available.
enough_data(model, fluxes, proteins, metabolites)

Enough data for phosphogluconate dehydrogenase (GND): 6pgc_c + nadp_c --> co2_c + nadph_c + ru5p_D_c
Enough data for malate dehydrogenase (MDH): mal_L_c + nad_c <=> h_c + nadh_c + oaa_c
Enough data for phosphofructokinase (PFK): atp_c + f6p_c --> adp_c + fdp_c + h_c
Enough data for glucose-6-phosphate isomerase (PGI): g6p_c <=> f6p_c
Enough data for pyruvate kinase (PYK): adp_c + h_c + pep_c --> atp_c + pyr_c
Enough data for ribose-5-phosphate isomerase (RPI): r5p_c <=> ru5p_D_c


Create a dataframe that shows, for each reaction, which type of data (flux, metabolites or proteins) is the limiting factor:

In [18]:
# Build one row per model reaction recording which kinds of data are available
# for it (flux, all substrates, at least one protein), then save the table.
Name = []
Reaction = []
is_flux = []
is_metab = []
is_prot = []

# Materialise the dataset labels once instead of once per reaction/gene/substrate.
flux_labels = list(fluxes.index.values)
protein_labels = list(proteins.index.values)
metabolite_labels = list(metabolites.index.values)

for rxn in model.reactions:
    # Check fluxes: the reaction id must be contained in some flux label.
    flux_bool = any(rxn.id in label for label in flux_labels)

    # Check proteins: one gene with data is enough (and/or rules in
    # gene_reaction_rule are not considered).
    prot_bool = False
    for gene in rxn.genes:
        keys = mapping.loc[mapping[2] == gene.id, 0]
        if keys.empty:
            continue
        # Only the first matching mapping entry is followed.
        names = mapping.loc[
            (mapping[0] == keys.iloc[0]) & (mapping[1] == 'Gene_Name'), 2
        ]
        if names.empty:
            # No 'Gene_Name' row for this entry: skip the gene instead of
            # failing with IndexError and losing the whole table.
            continue
        gene_name = names.iloc[0]
        if any(gene_name in label for label in protein_labels):
            prot_bool = True
            break

    # Check substrates: every reactant id, without its last two characters
    # (e.g. the "_c" suffix), must be contained in some metabolite label.
    # A reaction without reactants counts as available.
    met_bool = all(
        any(met.id[:-2] in label for label in metabolite_labels)
        for met in rxn.reactants
    )

    Name.append(rxn.name)
    Reaction.append(rxn.reaction)
    is_flux.append(flux_bool)
    is_metab.append(met_bool)
    is_prot.append(prot_bool)

availability = {'Name':Name, 'Reaction':Reaction, 'is_flux':is_flux, 'is_metab':is_metab, 'is_prot':is_prot}
availability = pd.DataFrame(availability)
availability.to_excel(data_dir+"availability.xlsx")