#1 Imports

In [1]:
import os.path as path
import glob, os
from datetime import datetime
from importlib.metadata import version
import cobra
import thermo_flux
from thermo_flux.io import load_excel as ex
from thermo_flux.core.model import ThermoModel
from equilibrator_api import  Q_
import pandas as pd
from thermo_flux.io import helper_load as hl
import numpy as np
from thermo_flux.io import load_excel as ex
from scripts.logger import write_to_log
import gurobipy as gp
from gurobipy import GRB
from scripts.gen_model import gen_model
from scripts.reaction_utils import list_blocked_reactions

#2 Global variables and function definitions

In [2]:
INPUT_MODEL = "datafiles/model.xlsx"
INPUT_KEGGS = "datafiles/ecoli_kegg_id.csv"
INPUT_REED = "regression/reed.csv"
INPUT_INCHI = "regression/InChIs.csv"
INPUT_GAMS = "regression/model_Ecoli_from-gams.xlsx"
INPUT_EXP_DATA = "regression/allPhysioData_formatted_forGSM_20230831.csv"
INPUT_EXP_CONC = "regression/allConcRange_20230912.csv"
INPUT_METABOLOMICS = "regression/metabolomics-Kochanowski_20230925.csv"

MODEL_NAME = "ecoli"

OUTPUT_DIR = "output"
OUTPUT_NAME = f"test_output"
OUTPUT_LOG = f"{OUTPUT_DIR}{path.sep}{OUTPUT_NAME}_log.txt"

CONDITIONS_TO_REGRESS = ["WT-Glc_I", "WT-Gal_I", "WT-Fruc_I", "WT-Mann_I", "dptsG-Glc_I", 
                         "WT-Ace_I", "WT-Succ_I", "WT-Fum_I", "WT-Glyc_I", "WT-Pyr_I",
                         "WT-GlyCAA_II"]

INCLUDE_CO2 = True
INCLUDE_O2 = True
ALLOW_OTHER_EXCRETION = False
RELAX_EXP_FLUX_BOUNDS = 2.0

#3 Log model settings, time


In [3]:
time = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
write_to_log(OUTPUT_LOG, f"Started analysis at: {time}", "w")

# Write package versions:
modules = ["pandas", "numpy", "equilibrator_api", "cobra"]
write_to_log(OUTPUT_LOG, f"Package versions used:")
versions_packages = [f"  {m}: {version(m)}\n" for m in modules]
write_to_log(OUTPUT_LOG, "".join(versions_packages))

#4 Load model

In [4]:
tmodel = gen_model(MODEL_NAME, INPUT_MODEL, INPUT_KEGGS, INPUT_REED, INPUT_INCHI, INPUT_GAMS, OUTPUT_LOG, True, True)

Set parameter Username
Academic license - for non-commercial use only - expires 2026-11-02
['Parameters', 'Exchange reactions', 'Reactions', 'Biomass Composition', 'Transmembrane reactions', 'Metabolites', 'references', 'Transmembrane_reactions_reed', 'Transmembrane reactions_Orth', 'Transmembrane reactions old', 'Sheet3', 'log', 'subsystems']
*** Reading data from Reactions ***
unknown metabolite '2dhglcn[c]' created
unknown metabolite 'nadh[c]' created
unknown metabolite 'glcn[c]' created
unknown metabolite 'nad[c]' created
unknown metabolite 'nadph[c]' created
unknown metabolite 'nadp[c]' created
unknown metabolite '2dhguln[c]' created
unknown metabolite 'idon-L[c]' created
unknown metabolite '3hcinnm[c]' created
unknown metabolite 'o2[c]' created
unknown metabolite 'dhcinnm[c]' created
unknown metabolite 'h2o[c]' created
unknown metabolite '3hpppn[c]' created
unknown metabolite 'dhpppn[c]' created
unknown metabolite 'phthr[c]' created
unknown metabolite '4hthr[c]' created
unknown m

Downcasting behavior in `replace` is deprecated and will be removed in a future version. To retain the old behavior, explicitly call `result.infer_objects(copy=False)`. To opt-in to the future behavior, set `pd.set_option('future.no_silent_downcasting', True)`
Series.__getitem__ treating keys as positions is deprecated. In a future version, integer keys will always be treated as labels (consistent with DataFrame behavior). To access a value by position, use `ser.iloc[pos]`


*** Updating metabolite information ***
2dhglcn_c NOTHING DONE!
nadh_c NOTHING DONE!
glcn_c NOTHING DONE!
nad_c NOTHING DONE!
nadph_c NOTHING DONE!
nadp_c NOTHING DONE!
2dhguln_c NOTHING DONE!
idon-L_c NOTHING DONE!
3hcinnm_c NOTHING DONE!
o2_c NOTHING DONE!
dhcinnm_c NOTHING DONE!
h2o_c NOTHING DONE!
3hpppn_c NOTHING DONE!
dhpppn_c NOTHING DONE!
phthr_c NOTHING DONE!
4hthr_c NOTHING DONE!
pi_c NOTHING DONE!
5dglcn_c NOTHING DONE!
ru5p-D_c NOTHING DONE!
ara5p_c NOTHING DONE!
ACP_c NOTHING DONE!
atp_c NOTHING DONE!
ttdca_c NOTHING DONE!
amp_c NOTHING DONE!
myrsACP_c NOTHING DONE!
ppi_c NOTHING DONE!
ttdcea_c NOTHING DONE!
tdeACP_c NOTHING DONE!
hdca_c NOTHING DONE!
palmACP_c NOTHING DONE!
hdcea_c NOTHING DONE!
hdeACP_c NOTHING DONE!
ocdcea_c NOTHING DONE!
octeACP_c NOTHING DONE!
dtdp4aaddg_c NOTHING DONE!
unagamu_c NOTHING DONE!
dtdp_c NOTHING DONE!
unagamuf_c NOTHING DONE!
arbt6p_c NOTHING DONE!
g6p_c NOTHING DONE!
hqn_c NOTHING DONE!
4abut_c NOTHING DONE!
akg_c NOTHING DONE!
glu-L_c N

In [5]:
list_blocked_reactions(tmodel, "BASE", OUTPUT_LOG)

['4HTHRS',
 'AADDGT',
 'AB6PGH',
 'ACBIPGT',
 'ACGAMT',
 'ACMAMUT',
 'ACONMT',
 'ACPS1',
 'ADK4',
 'ADNCYC',
 'ADOCBIK',
 'ADOCBLS',
 'AHC',
 'AHCYSNS',
 'ALDD19x',
 'AMAOTr',
 'AMMQT82',
 'AMPMS',
 'AOXSr',
 'AP4AH',
 'AP5AH',
 'BTS2',
 'CBIAT',
 'CBLAT',
 'CDPMEK',
 'CINNDO',
 'CPPPGO',
 'CRNBTCT',
 'CRNCBCT',
 'CRNCDH',
 'CYANST',
 'DBTSr',
 'DHBD',
 'DHBSr',
 'DHCIND',
 'DHNAOT',
 'DHPTDC',
 'DMATT',
 'DMQMT',
 'DOGULNR',
 'DXPRIi',
 'DXPS',
 'DXYLK',
 'E4PD',
 'ECAPEC',
 'EDTXS3',
 'EDTXS4',
 'ENTCS',
 'FCLT',
 'FHL',
 'G1PTT',
 'G1SATi',
 'GDMANE',
 'glucys',
 'GLUTRR',
 'GLUTRS',
 'GMAND',
 'GOFUCR',
 'GP4GH',
 'GPDDA1',
 'GPDDA3',
 'GPDDA5',
 'GRTT',
 'GTHOr',
 'GTHS',
 'HBZOPT',
 'HEMEOS',
 'HETZK',
 'HMBS',
 'HMPK1',
 'ICHORSi',
 'ICHORT',
 'KG6PDC',
 'MAN1PT2',
 'MECDPDH',
 'MECDPS',
 'MEPCT',
 'MI1PP',
 'NNDMBRT',
 'NPHS',
 'OCTDPS',
 'OHPBAT',
 'OHPHM',
 'OMBZLM',
 'OMMBLHX',
 'OMPHHX',
 'OPHBDC',
 'OPHHX',
 'OXGDC2',
 'PACCOAL',
 'PDX5PO',
 'PDX5PS',
 'PEAMNO',
 'PERD',
 

## --- MODEL SETUP DONE ---

#8 Regression data

In [6]:
# Import experimental data:
reg_data = pd.read_csv(INPUT_EXP_DATA)
write_to_log(OUTPUT_LOG, f"Reading experimental flux data: {INPUT_EXP_DATA}")

reg_data.set_index(["cond", "rxn"], inplace=True) 
reg_data.head()

# Store gas fluxes:
reg_data_gas = reg_data.swaplevel().copy()
reg_data_gas = reg_data_gas.loc[["EX_co2", "EX_o2"]]
reg_data_gas = reg_data_gas.swaplevel()
reg_data_gas

if INCLUDE_CO2 is False:
    reg_data_no_gas = reg_data.swaplevel().copy()
    reg_data_no_gas = reg_data_no_gas.drop(["EX_co2"])
    reg_data_no_gas = reg_data_no_gas.swaplevel()
    reg_data = reg_data_no_gas
    write_to_log(OUTPUT_LOG, f" - ignoring CO2 data")
    
    
if INCLUDE_O2 is False:
    reg_data_no_gas = reg_data.swaplevel().copy()
    reg_data_no_gas = reg_data_no_gas.drop(["EX_o2"]) 
    reg_data_no_gas = reg_data_no_gas.swaplevel()
    reg_data = reg_data_no_gas
    write_to_log(OUTPUT_LOG, f" - ignoring O2 data") 

In [7]:
# Import experimental data:
conc_data = pd.read_csv(INPUT_EXP_CONC)
write_to_log(OUTPUT_LOG, f"Reading experimental extracellular concentration data: {INPUT_EXP_CONC}")

conc_data.set_index(["cond", "met"], inplace=True) 
conc_data.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,conc_M_min,conc_M_max
cond,met,Unnamed: 2_level_1,Unnamed: 3_level_1
WT-Ace_I,ac_e,0.0279622,0.038395
WT-Ace_I,orot_e,1e-07,7.7e-05
WT-Fruc_I,fru_e,0.02231905,0.029646
WT-Fruc_I,orot_e,1e-07,0.000154
WT-Fruc_I,ac_e,1e-07,0.002083


In [8]:
volume_data = pd.DataFrame({cond: {"c": 1.0} for cond in CONDITIONS_TO_REGRESS} ).T #specify the volume fractions for each condition
volume_data.head()

# Import experimental data:
write_to_log(OUTPUT_LOG, f"Reading intracellular metabolite concentration data: {INPUT_METABOLOMICS}")
met_data = pd.read_csv(INPUT_METABOLOMICS)
met_data.set_index(["cond", "met"], inplace=True) 
met_data.head()

conds_with_data = list(met_data.reset_index().cond.unique())
missing_conds = [cond for cond in CONDITIONS_TO_REGRESS if cond not in conds_with_data]

df_missing = pd.DataFrame({"cond": missing_conds, 
                           "met": "g6p",   # code doesn't deal well if we type a non-existince met here...
                           "mean": np.nan, 
                           "sd": np.nan, }).set_index(["cond", "met"])

met_data = pd.concat([met_data, df_missing])

#9 Setup regression

In [9]:
# Store the indices of all reactions:
map_rxn_id = {rxn.id: index for index, rxn in enumerate(tmodel.reactions)}

exchanges = [rxn.id for rxn in tmodel.exchanges]

exchanges_to_relax = ["EX_C", "EX_h", "EX_h2o", "EX_k", "EX_nh3", "EX_pi", "EX_so4"]

if INCLUDE_CO2 is False:
    exchanges_to_relax += ["EX_co2"]
    
if INCLUDE_O2 is False:
    exchanges_to_relax += ["EX_o2"]

if ALLOW_OTHER_EXCRETION is True:
    upper_bound_exchanges = 100
else:
    upper_bound_exchanges = 0

settings_tfba = {"error_type": "covariance",
                 "qnorm": 1,
                 "alpha": 0.95, 
                 "epsilon": 0.5,
                 "nullspace": None,
                 "gdiss_constraint": True,
                 "sigmac_limit": 130}


settings_regression = {"flux_data": reg_data,
                       "metabolite_data": met_data,
                       "volume_data": volume_data,
                       "conc_units": "mM",
                       "conc_fit": False,
                       "flux_fit": True,
                       "drG_fit": True, 
                       "resnorm": 1, 
                       "error_type": "covariance"}

write_to_log(OUTPUT_LOG, f"Setting up regressions:")
write_to_log(OUTPUT_LOG, f" - exchanges to be relaxed: {exchanges_to_relax}")
write_to_log(OUTPUT_LOG, f" - stdev of experimental fluxes increased by factor of: {RELAX_EXP_FLUX_BOUNDS}")
write_to_log(OUTPUT_LOG, f" - settings for tFBA: {settings_tfba}")
write_to_log(OUTPUT_LOG, f" - settings for regression: {settings_regression}")

In [None]:
write_to_log(OUTPUT_LOG, " ----- STARTING REGRESSION ----- ")

# Quick fix for df_conc now being defined in gen_model.py
# Load default concentration bounds from the GAMS model:
df_conc = hl.excel_to_df(INPUT_GAMS)["ConcLimits"]

# Rearrange data for easier use:
df_conc = df_conc.reset_index()
df_conc["met"] = df_conc["dim1"] + "_"+ df_conc["dim2"]
df_conc = df_conc.pivot_table(columns="dim3", values="Value", index="met")

for CONDITION in CONDITIONS_TO_REGRESS:
    write_to_log(OUTPUT_LOG, f"Setting up: {CONDITION}")
    
    # Reset all flux bounds to +- 100:
    for rxn in tmodel.reactions:
        tmodel.reactions.get_by_id(rxn.id).lower_bound = -100
        tmodel.reactions.get_by_id(rxn.id).upper_bound = 100
    
    # Add non-growth associate ATP maintenance cost:
    tmodel.reactions.ATPHYD.lower_bound = 3.15

    # Fix exchange reaction directions:
    for rxn in exchanges:
        tmodel.reactions.get_by_id(rxn).lower_bound = 0
        tmodel.reactions.get_by_id(rxn).upper_bound = upper_bound_exchanges

    # Allow for excretion of selected metabolites:
    #for rxn in EXCEPTIONS:
    #    tmodel.reactions.get_by_id(rxn).lower_bound = 0
    #    tmodel.reactions.get_by_id(rxn).upper_bound = +100
        
    # Relax essential exchanges:
    for rxn_rel in exchanges_to_relax:
        tmodel.reactions.get_by_id(rxn_rel).lower_bound = -100
        tmodel.reactions.get_by_id(rxn_rel).upper_bound = +100


    # Fix flux for the measured exchange reactions:
    for rxn, row in reg_data.loc[CONDITION].iterrows():
        tmodel.reactions.get_by_id(rxn).lower_bound = -100
        tmodel.reactions.get_by_id(rxn).upper_bound = 100
        tmodel.reactions.get_by_id(rxn).lower_bound = row["mean"] - RELAX_EXP_FLUX_BOUNDS * row["sd"]
        tmodel.reactions.get_by_id(rxn).upper_bound = row["mean"] + RELAX_EXP_FLUX_BOUNDS * row["sd"]
        write_to_log(OUTPUT_LOG, f" - {rxn}: ({tmodel.reactions.get_by_id(rxn).lower_bound :.3}, {tmodel.reactions.get_by_id(rxn).upper_bound :.3})")

    if CONDITION.startswith("dptsG-Glc"):
        tmodel.reactions.GLCpts.lower_bound = 0
        tmodel.reactions.GLCpts.upper_bound = 0
        write_to_log(OUTPUT_LOG, f" - blocked GLCpts")

        
    # Set metabolite concentrations to the values in the GAMS model:   
    for met, row in df_conc.iterrows():
        tmodel.metabolites.get_by_id(met).upper_bound = Q_(row["up"], "mM")
        tmodel.metabolites.get_by_id(met).lower_bound = Q_(row["lo"], "mM")
        
    # Fix concentration for the measured extracellular metabolites:
    for met, row in conc_data.loc[CONDITION].iterrows():
        tmodel.metabolites.get_by_id(met).lower_bound = Q_(1e-9, "M")
        tmodel.metabolites.get_by_id(met).upper_bound = Q_(100, "M")
        tmodel.metabolites.get_by_id(met).lower_bound = Q_(row["conc_M_min"], "M")
        tmodel.metabolites.get_by_id(met).upper_bound = Q_(row["conc_M_max"], "M")
        write_to_log(OUTPUT_LOG, f" - {met}: ({tmodel.metabolites.get_by_id(met).lower_bound :.3}, {tmodel.metabolites.get_by_id(met).upper_bound :.3})")
       
    
    # Initialize model:
    tmodel.m = None  # clear any previously build optimization models 
    tmodel.objective = tmodel.reactions.biomass_EX      # needed, otherwise add_TFBA_variables() gives an error due to lack of an obj. function
    # Add thermodynamics constraints
    tmodel.add_TFBA_variables(conds=[CONDITION], **settings_tfba) 

    # Setup regression to experimental data:
    tmodel.regression([CONDITION], **settings_regression)
    tmodel.m.update()

    
    # Display objective prior to adding RQ as an additional objective:
#     print("Before: ", tmodel.m.getObjective())
    write_to_log(OUTPUT_LOG, f" - Objective function before: {tmodel.m.getObjective()}")
    
    
    # Determine RQ for the given condition:
    data_gas = reg_data_gas.loc[CONDITION]
    vo2, vo2_err = data_gas.loc["EX_o2"]
    vco2, vco2_err = data_gas.loc["EX_co2"]
    rq = - vco2 / vo2
    rq_err = rq * np.sqrt( (vo2_err / vo2)**2 + (vco2_err / vco2)**2)
    write_to_log(OUTPUT_LOG, f" - RQ: {rq :.2} (vCO2 = {vco2 :.3} / vO2 = {vo2 :.3})")

    
    # Add new constraint - residual of RQ: 
    resrq = tmodel.m.addMVar(lb=0, ub=GRB.INFINITY, shape=(1,1), name="resRQ") 
    mrq = tmodel.m.addMVar(lb=0, ub=GRB.INFINITY, shape=(1,1), name="RQ") 
    tmodel.mvars["resRQ"] = resrq
    tmodel.mvars["rq"] = mrq
    
    tmodel.m.addConstr(tmodel.mvars["resRQ"][0, 0] >= ( (mrq - rq)/rq_err ), name = "resRQ_pos")
    tmodel.m.addConstr(tmodel.mvars["resRQ"][0, 0] >= (-(mrq - rq)/rq_err ), name = "resRQ_neg")
    
    
    # Impose RQ on the gas fluxes:
    idx_o2 = map_rxn_id["EX_o2"]
    idx_co2 = map_rxn_id["EX_co2"]   
    tmodel.m.addConstr(tmodel.mvars["v"][0, idx_co2] == (-mrq)*tmodel.mvars["v"][0, idx_o2], name="enforce_RQ")
    tmodel.m.update()
    
    b = list_blocked_reactions(tmodel, "BASE", OUTPUT_LOG)

    #Create a deep copy of the current model so we dont interfere with the base model when we remove
    #Reactions for this condition
    temp_model = tmodel.deepcopy()
    temp_model.remove_reactions(b, remove_orphans=True)

    # Update objective function to include the new RQ constraint:
    # (+ adjust weight of each term, giving equal weight to the RQ and each of the fluxes being fit)
    no_fluxes = len(list(reg_data.unstack()["mean"].T.index) )
    initial_weight = 1/no_fluxes
    new_weight = 1/(no_fluxes + 1)   # to account for RQ constraint
    
    # Equal weight to RQ and resflx:
    temp_model.m.setObjective(temp_model.m.getObjective() - initial_weight*temp_model.mvars["resflx"].sum() + new_weight*(resrq.sum() + temp_model.mvars["resflx"].sum()), GRB.MINIMIZE)
    
    write_to_log(OUTPUT_LOG, f" - Objective function after: {temp_model.m.getObjective()}")
    
    
    # Write model for optimization on the HPC cluster:
    model_filename = f"{OUTPUT_DIR}{path.sep}single_{CONDITION}_fit-co2-{INCLUDE_CO2}_fit-o2-{INCLUDE_O2}_allow-excretion-{ALLOW_OTHER_EXCRETION}.mps"
    temp_model.m.write(model_filename)
    write_to_log(OUTPUT_LOG, f"Saved model to: {model_filename}")
    
    temp_model.m.optimize()

    sol = temp_model.solution()
    sol.to_csv(f"{OUTPUT_DIR}{path.sep}{MODEL_NAME}_{CONDITION}_SOLUTION.csv")


NameError: name 'gams' is not defined

In [None]:
for model in glob.glob(f"{OUTPUT_DIR}{path.sep}*.mps"):
    write_to_log(OUTPUT_LOG, f"Starting optimization of {model}...")
    m = gp.read(model)
    m.Params.TimeLimit = 3600
    m.Params.Threads = 16
    m.optimize()

    





Read MPS format model from file output\single_dptsG-Glc_I_fit-co2-True_fit-o2-True_allow-excretion-False.mps
Reading time = 0.18 seconds
minlp: 10255 rows, 8809 columns, 287635 nonzeros
Set parameter TimeLimit to value 3600
Set parameter Threads to value 16
Gurobi Optimizer version 11.0.3 build v11.0.3rc0 (win64 - Windows 11+.0 (26100.2))

CPU model: AMD Ryzen 7 7800X3D 8-Core Processor, instruction set [SSE2|AVX|AVX2|AVX512]
Thread count: 8 physical cores, 16 logical processors, using up to 16 threads

Optimize a model with 10255 rows, 8809 columns and 287635 nonzeros
Model fingerprint: 0xfc232d32
Model has 389 quadratic objective terms
Model has 149 quadratic constraints
Model has 1 general constraint
Variable types: 7743 continuous, 1066 integer (1066 binary)
Coefficient statistics:
  Matrix range     [3e-06, 1e+08]
  QMatrix range    [1e+00, 1e+00]
  QLMatrix range   [1e+00, 1e+00]
  Objective range  [3e-02, 1e+00]
  QObjective range [2e+00, 2e+00]
  Bounds range     [1e-05, 1e+08]

AttributeError: 'NoneType' object has no attribute 'to_csv'

#9 Output

In [72]:
df_rxns = pd.DataFrame([{i: rxn.id for i, rxn in enumerate(tmodel.reactions)}]).T
df_rxns.rename(columns={0: "rxns"}, inplace=True)
df_mets = pd.DataFrame([{i: met.id for i, met in enumerate(tmodel.metabolites)}]).T
df_mets.rename(columns={0: "mets"}, inplace=True)
df = pd.concat([df_rxns, df_mets], axis=1)

out_name = f"{OUTPUT_DIR}{path.sep}{MODEL_NAME}_indices.csv"

df.to_csv(out_name)
write_to_log(OUTPUT_LOG, f"Saved model indices to: {out_name}")

In [None]:
# Write datetime:
time = datetime.now().strftime("%d/%m/%Y %H:%M:%S")
write_to_log(OUTPUT_LOG, f"Finished analysis at: {time}")