# Check data availability

In [5]:
# Import packages
import pandas as pd
import cobra
import alloregfit as arf

## Load data and metabolic model

In [9]:
# Folder that holds the metabolic model and the data spreadsheets.
data_dir = "C:/Users/user/polybox/MASTER/THESIS/3_Karl_data/"


def _read_named_sheet(file_name):
    """Read an Excel file located in data_dir, using its "name" column as the row index."""
    return pd.read_excel(data_dir + file_name, index_col="name")


# Genome-scale model, loaded from a MATLAB file.
model = cobra.io.load_matlab_model(data_dir + "iJO1366.mat")

# Data tables, all indexed by their "name" column.
fluxes = _read_named_sheet("fluxes_GS_nozeros.xlsx")
metabolites = _read_named_sheet("merged_metabolites.xlsx")
proteins = _read_named_sheet("proteome_noNaN.xlsx")
# presumably the same proteome indexed by gene b-numbers -- TODO confirm against the file
proteins2 = _read_named_sheet("proteome_bnames.xlsx")

# Identifier mapping table read without a header row, so its columns are labelled 0, 1, 2, ...
mapping = pd.read_table(data_dir + "ECOLI_83333_idmapping.dat", header=None)

# Display the model summary as the cell output.
model

0,1
Name,iJO1366
Memory address,0x01ceb6831780
Number of metabolites,1805
Number of reactions,2583
Objective expression,-1.0*BIOMASS_Ec_iJO1366_core_53p95M_reverse_5c8b1 + 1.0*BIOMASS_Ec_iJO1366_core_53p95M
Compartments,"c, e, p"


## Check data availability for every reaction in model

Define the functions that take the model and the datasets and report which reactions have enough data (flux, protein and substrate measurements):

In [12]:
def enough_data(model, fluxes, proteins, metabolites, id_mapping=None):
    """Print the id of every reaction that has flux, protein and substrate data.

    A reaction is printed when all three checks pass:

    * flux: the reaction id is contained in at least one label of ``fluxes.index``;
    * protein: for at least one gene of the reaction, the gene name found through
      ``id_mapping`` is contained in at least one label of ``proteins.index``;
    * substrates: every reactant other than ``h`` and ``h2o`` is contained in at
      least one label of ``metabolites.index``. The last two characters of each
      metabolite id are dropped before the comparison (presumably the
      compartment suffix such as ``_c`` -- TODO confirm).

    Parameters
    ----------
    model : object with a ``reactions`` sequence; each reaction must expose
        ``id``, ``reactants`` (objects with ``id``) and ``genes`` (objects with ``id``).
    fluxes, proteins, metabolites : pandas.DataFrame
        Only their index labels are used.
    id_mapping : pandas.DataFrame, optional
        Table with columns ``0``, ``1`` and ``2``. Rows whose column ``2`` equals a
        gene id give an accession in column ``0``; rows with that accession and
        ``'Gene_Name'`` in column ``1`` give the gene name in column ``2``.
        When omitted, the notebook-level ``mapping`` table is used.

    Returns
    -------
    None. Results are written to standard output, one reaction id per line.
    """
    if id_mapping is None:
        # Backward compatible default: fall back to the table loaded in the notebook.
        id_mapping = mapping

    # Reactants that never need a measurement. Compared by equality: a substring
    # test against 'h2o' would also skip 'h2' (molecular hydrogen).
    ignored_metabolites = ('h', 'h2o')

    # Index labels do not change while looping, so build the lists only once.
    flux_labels = list(fluxes.index.values)
    protein_labels = list(proteins.index.values)
    metabolite_labels = list(metabolites.index.values)

    # NOTE(review): all lookups below are substring matches ('ac' matches 'accoa',
    # 'PPK' matches 'PPK2'). Kept as in the original; confirm whether exact
    # matching was intended.
    for rxn in model.reactions:
        # Check fluxes
        flux_bool = any(rxn.id in label for label in flux_labels)

        # Check proteins: one gene with a matching protein entry is enough
        # (and/or logic of the gene-reaction rule is not considered).
        prot_bool = False
        for gene in rxn.genes:
            accessions = id_mapping.loc[id_mapping[2] == gene.id, 0]
            if accessions.empty:
                continue
            accession = accessions.iloc[0]
            is_gene_name = (id_mapping[0] == accession) & (id_mapping[1] == 'Gene_Name')
            gene_names = id_mapping.loc[is_gene_name, 2]
            if gene_names.empty:
                # No gene name registered for this accession: treat the gene as
                # unmatched instead of failing with an IndexError.
                continue
            gene_name = gene_names.iloc[0]
            if any(gene_name in label for label in protein_labels):
                prot_bool = True
                break

        # Check substrates: every non-ignored reactant must be found.
        met_bool = True
        for substrate in rxn.reactants:
            base_id = substrate.id[:-2]
            if base_id in ignored_metabolites:
                continue
            if not any(base_id in label for label in metabolite_labels):
                met_bool = False
                break

        if flux_bool and met_bool and prot_bool:
            print("%s" % (rxn.id))
    

In [7]:
def enough_data_bn(model, fluxes, proteins, metabolites):
    """Print every reaction that has flux, protein and substrate data (proteins keyed by gene id).

    Variant of the availability check in which the model gene ids are compared
    directly with the labels of ``proteins.index`` (no identifier mapping).

    A reaction is printed when all three checks pass:

    * flux: the reaction id is contained in at least one label of ``fluxes.index``;
    * protein: the id of at least one gene of the reaction is contained in at
      least one label of ``proteins.index``;
    * substrates: every reactant other than ``h`` and ``h2o`` is contained in at
      least one label of ``metabolites.index``. The last two characters of each
      metabolite id are dropped before the comparison (presumably the
      compartment suffix such as ``_c`` -- TODO confirm).

    Parameters
    ----------
    model : object with a ``reactions`` sequence; each reaction must expose
        ``id``, ``name``, ``reaction``, ``reactants`` and ``genes``.
    fluxes, proteins, metabolites : pandas.DataFrame
        Only their index labels are used.

    Returns
    -------
    None. One line per reaction is written to standard output.
    """
    # Reactants that never need a measurement. Compared by equality: a substring
    # test against 'h2o' would also skip 'h2' (molecular hydrogen).
    ignored_metabolites = ('h', 'h2o')

    # Index labels do not change while looping, so build the lists only once.
    flux_labels = list(fluxes.index.values)
    protein_labels = list(proteins.index.values)
    metabolite_labels = list(metabolites.index.values)

    # NOTE(review): all lookups below are substring matches ('ac' matches 'accoa',
    # 'PPK' matches 'PPK2'). Kept as in the original; confirm whether exact
    # matching was intended.
    for rxn in model.reactions:
        # Check fluxes
        flux_bool = any(rxn.id in label for label in flux_labels)

        # Check proteins: one matching gene is enough (and/or logic of the
        # gene-reaction rule is not considered).
        prot_bool = any(
            any(gene.id in label for label in protein_labels)
            for gene in rxn.genes
        )

        # Check substrates: every non-ignored reactant must be found.
        met_bool = True
        for substrate in rxn.reactants:
            base_id = substrate.id[:-2]
            if base_id in ignored_metabolites:
                continue
            if not any(base_id in label for label in metabolite_labels):
                met_bool = False
                break

        if flux_bool and met_bool and prot_bool:
            print("Enough data for %s (%s): %s" % (rxn.name, rxn.id, rxn.reaction))

Apply the function to our model and the Gerosa dataset. Note that metabolites, fluxes and proteins must use the same names as in the model for the function to match them correctly.

In [13]:
# Print the ids of the reactions with enough data, using the proteome table loaded as `proteins`.
enough_data(model=model, fluxes=fluxes, proteins=proteins, metabolites=metabolites)

A5PISO
ABUTtex
ACACT1r
ACACT2r
ACALD
ACALDtex
ACGK
ACGS
ACHBS
ACKr
ACLS
ACOAD1f
ACONTa
ACONTb
ACtex
ADK1
ADK3
ADNK1
ADPT
ADSL1r
ADSS
AGMHE
AGPR
AIRC3
AKGDH
AKGtex
ALAabcpp
ALAtex
ARGSS
ASAD
ASP1DC
ASPCT
ASPK
ASPO5
ASPTA
ATPPRT
ATPS4rpp
BUTtex
CITtex
CLtex
CPMPS
CTPS2
CYSDS
CYTK1
DADK
DDPA
DHAPT
DHBS
DHDPS
DHORTS
DHQS
DHQTi
DUTPDP
DXPS
E4PD
FADRx
FADRx2
FBA
FEENTERtex
FLVR
FLVRx
FUM
G3PD2
G6PDH2r
GALKr
GALURtex
GAPD
GF6PTA
GHMT2r
GK1
GLCptspp
GLU5K
GLUCYS
GLUDy
GLUPRT
GLUSy
GLUtex
GLYCAtex
GLYCL
GLYCLTtex
GLYCtpp
GLYK
GLYtex
GND
GTPCI
H2Otex
HISTP
Htex
IMPC
IMPD
IPPMIb
IPPS
KDOCT2
KDOPP
Ktex
LEUTAi
MALtex
MDH
METAT
NADTRHD
NDPK1
NDPK2
NDPK3
NDPK4
NDPK7
OCBT
PDH
PFK
PGCD
PGI
PGK
PGL
PHETA1
PItex
PMPK
PNTOtex
PPA
PPBNGS
PPK2
PPK
PPM
PPM2
PPS
PRAGSr
PRASCSi
PRFGS
PRPPS
PSCVT
PSERT
PSP_L
PTAr
PUNP1
PUNP2
PUNP5
PYK
PYNP2r
RBK
RHCCE
RIBabcpp
RIBtex
RPE
RPI
S7PI
SADT2
SERAT
SHKK
SHSL1
SO3tex
SO4tex
SUCCtex
SUCOAS
SULR
TALA
THD2pp
THDPS
THRD_L
THRS
TKT1
TKT2
TPI
TRPS3
TYRTA
UAGCVT
UDPG4E
UDPGAL

In [8]:
# Same check using the proteome table loaded as `proteins2`, matched directly on model gene ids.
enough_data_bn(model=model, fluxes=fluxes, proteins=proteins2, metabolites=metabolites)

Enough data for Arabinose-5-phosphate isomerase (A5PISO): ru5p__D_c <=> ara5p_c
Enough data for 4-aminobutyrate transport via diffusion (extracellular to periplasm) (ABUTtex): 4abut_e <=> 4abut_p
Enough data for Acetyl-CoA C-acetyltransferase (ACACT1r): 2.0 accoa_c <=> aacoa_c + coa_c
Enough data for Acetyl-CoA C-acyltransferase (butanoyl-CoA) (r) (ACACT2r): accoa_c + btcoa_c <=> 3ohcoa_c + coa_c
Enough data for Acetaldehyde dehydrogenase (acetylating) (ACALD): acald_c + coa_c + nad_c <=> accoa_c + h_c + nadh_c
Enough data for Acetaldehyde transport via diffusion (extracellular to periplasm) (ACALDtex): acald_e <=> acald_p
Enough data for Acetylglutamate kinase (ACGK): acglu_c + atp_c --> acg5p_c + adp_c
Enough data for N-acetylglutamate synthase (ACGS): accoa_c + glu__L_c --> acglu_c + coa_c + h_c
Enough data for 2-aceto-2-hydroxybutanoate synthase (ACHBS): 2obut_c + h_c + pyr_c --> 2ahbut_c + co2_c
Enough data for Acetate kinase (ACKr): ac_c + atp_c <=> actp_c + adp_c
Enough data for

Create a dataframe that records, for each reaction, which types of data (flux, metabolite, protein) are available, so that the limiting factor can be identified:

In [104]:
# Build a per-reaction table stating whether flux, metabolite and protein data
# are available, and save it to "availability.xlsx" inside data_dir.

# Reactants that never need a measurement. Compared by equality: a substring
# test against 'h2o' would also skip 'h2' (molecular hydrogen).
ignored_metabolites = ('h', 'h2o')

# Index labels do not change while looping, so build the lists only once.
flux_labels = list(fluxes.index.values)
protein_labels = list(proteins.index.values)
metabolite_labels = list(metabolites.index.values)

Name = []
Reaction = []
is_flux = []
is_metab = []
is_prot = []

# NOTE(review): all lookups below are substring matches ('ac' matches 'accoa',
# 'PPK' matches 'PPK2'). Kept as in the original; confirm whether exact
# matching was intended.
for rxn in model.reactions:
    # Check fluxes
    flux_bool = any(rxn.id in label for label in flux_labels)

    # Check proteins: one gene with a matching protein entry is enough
    # (and/or logic of the gene-reaction rule is not considered).
    prot_bool = False
    for gene in rxn.genes:
        # Rows whose column 2 equals the gene id give an accession in column 0.
        accessions = mapping.loc[mapping[2] == gene.id, 0]
        if accessions.empty:
            continue
        accession = accessions.iloc[0]
        # Rows for that accession tagged 'Gene_Name' hold the gene name in column 2.
        gene_names = mapping.loc[(mapping[0] == accession) & (mapping[1] == 'Gene_Name'), 2]
        if gene_names.empty:
            # No gene name registered for this accession: treat the gene as
            # unmatched instead of failing with an IndexError.
            continue
        gene_name = gene_names.iloc[0]
        if any(gene_name in label for label in protein_labels):
            prot_bool = True
            break

    # Check substrates: every non-ignored reactant must be found. The last two
    # characters of the id are dropped first (presumably the compartment
    # suffix such as "_c" -- TODO confirm).
    met_bool = True
    for substrate in rxn.reactants:
        base_id = substrate.id[:-2]
        if base_id in ignored_metabolites:
            continue
        if not any(base_id in label for label in metabolite_labels):
            met_bool = False
            break

    Name.append(rxn.name)
    Reaction.append(rxn.reaction)
    is_flux.append(flux_bool)
    is_metab.append(met_bool)
    is_prot.append(prot_bool)

# One row per model reaction, in model order.
availability = pd.DataFrame(
    {'Name': Name, 'Reaction': Reaction, 'is_flux': is_flux, 'is_metab': is_metab, 'is_prot': is_prot}
)
availability.to_excel(data_dir+"availability.xlsx")