# Balancing Metabolic Models
Balancing is done in three major steps in this script: \
(1) mass balancing with the script that Frowin used for his own models \
(2) automatically overwriting metabolite and reaction information with BiGG information \
(3) manual balancing, mostly charge balancing, although a few reactions are also mass imbalanced after the BiGG overwrite

## Imports

In [29]:
import cobra
from cobra.io import read_sbml_model, write_sbml_model, load_json_model
from cobra import Reaction, Metabolite
from cobra.manipulation.validate import check_mass_balance
from collections import defaultdict, Counter
import os
import pandas as pd
import csv
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
#from macaw.main import run_all_tests
import ast
import matplotlib.pyplot as plt
from functions import *

# Mass Balance Metabolic Models
Thanks to Frowin for letting me use his code that he used for mass balancing!

## Paths

In [2]:
# Root directory of the project; the other paths are derived from it
project_dir = "/home/lisa/Dokumente/Programmierung/"

# Directory containing the model files (*.xml) that are going to be mass balanced
model_path = f"{project_dir}Models/06_carveme/"

# Directory for the results, i.e. the mass balanced models (currently disabled)
#save_path = project_dir + "Models/07_mass_balanced/"

## Functions to check reactions for imbalances

In [3]:
# Creates a list of the IDs of all internal reactions in the model
def internal_reactions(model):
    """Return the IDs of all reactions that are neither exchange, sink nor growth reactions.

    Boundary reactions are recognised by the *prefix* of their ID ('EX_', 'sink', 'Growth').
    A plain substring test would also discard internal reactions whose ID merely
    contains one of the markers (e.g. 'HEX1' contains 'EX'), so those reactions would
    never be checked for mass balance.

    Parameters
    ----------
    model : object with a ``reactions`` iterable whose items expose an ``id`` string

    Returns
    -------
    list of str
        Reaction IDs in the order in which the model lists them.
    """
    # str.startswith accepts a tuple, so one call covers all boundary prefixes
    boundary_prefixes = ('EX_', 'sink', 'Growth')

    return [
        reaction.id
        for reaction in model.reactions
        if not reaction.id.startswith(boundary_prefixes)
    ]

In [4]:
# Confirms if a single reaction is mass balanced
# This function checks if the sum of the elements in the reactants equals the sum of the elements in the products
# It returns True if the reaction is mass balanced, and False otherwise
def is_mass_balanced(reaction, model=None):
    """Check whether the reaction with the given ID is mass balanced.

    Parameters
    ----------
    reaction : str
        ID of the reaction; it is looked up with ``model.reactions.get_by_id``.
    model : optional
        Model that contains the reaction. If omitted, the module level variable
        ``model`` is used, which is what this function always did before the
        parameter existed. Passing the model explicitly avoids checking a
        different model than intended.

    Returns
    -------
    bool
        True if every element sums to zero (within 1e-10) over all metabolites
        of the reaction. False otherwise, or if any metabolite has no elements.

    Raises
    ------
    NameError
        If no model is passed and no global ``model`` exists.
    """
    if model is None:
        # Backward compatible fallback to the global model
        try:
            model = globals()['model']
        except KeyError:
            raise NameError("name 'model' is not defined") from None

    # Net amount of each element over the whole reaction
    balance = defaultdict(int)

    for metabolite, coefficient in model.reactions.get_by_id(reaction).metabolites.items():

        # Without a formula the reaction cannot be verified, so it counts as unbalanced
        if not metabolite.elements:
            return False

        # Reactants have negative coefficients, products positive ones
        for element, amount in metabolite.elements.items():
            balance[element] += coefficient * amount

    # Balanced if every element cancels out (in a margin of 1e-10)
    return all(abs(amount) < 1e-10 for amount in balance.values())

In [5]:
# Collects all internal reactions of the model that fail the mass balance check
def find_mass_unbalanced_reactions(model):
    """Return the IDs of all internal reactions that are not mass balanced.

    The candidate reactions come from ``internal_reactions(model)``; each ID is then
    tested with ``is_mass_balanced``. Only the reaction ID is handed over to
    ``is_mass_balanced``.
    """
    unbalanced = []

    for rxn in internal_reactions(model):
        # Balanced reactions are of no interest here
        if is_mass_balanced(rxn):
            continue
        unbalanced.append(rxn)

    return unbalanced

In [6]:
# This function combines the lists of imbalanced reactions of all models
# and creates a DataFrame showing how often each imbalanced reaction occurs throughout the models
def reaction_abundance(imbalanced_models):
    """Summarise in which models each imbalanced reaction occurs.

    Parameters
    ----------
    imbalanced_models : dict
        Maps a model identifier to the list of its imbalanced reaction IDs.

    Returns
    -------
    pandas.DataFrame
        One row per reaction (in order of first appearance) with the columns
        "Reaction", "Occurences" (number of models containing the reaction) and
        "Model IDs" (list of those models, in the order of the input dict).
    """
    # reaction -> models containing it; built in a single pass instead of
    # re-scanning every model's list for every reaction
    occurences_reactions = {}
    for model_id, reaction_list in imbalanced_models.items():
        for reaction in reaction_list:
            models_with_reaction = occurences_reactions.setdefault(reaction, [])
            # Models are processed one after another, so a repeated reaction within
            # the same model can only collide with the last entry
            if not models_with_reaction or models_with_reaction[-1] != model_id:
                models_with_reaction.append(model_id)

    # Create a table with all results
    return pd.DataFrame({
        "Reaction": list(occurences_reactions),
        "Occurences": [len(model_ids) for model_ids in occurences_reactions.values()],
        "Model IDs": list(occurences_reactions.values()),
    })

## Functions to fix imbalanced reactions

In [7]:
def fix_cmcbtf(model):
    """Adjust fcmcbtt metabolites and the proton count of the CMCBTF* reactions.

    The model is modified in place and returned. ``fcmcbtt_c`` must exist; the
    periplasmic/extracellular forms and the reactions are only touched if present.
    """
    # Formula and charge of the cytosolic metabolite
    cytosolic = model.metabolites.get_by_id('fcmcbtt_c')
    cytosolic.formula = 'C33FeH48N5O13'
    cytosolic.charge = 3

    # Charge of the other compartments, if the model has them
    for met_id in ('fcmcbtt_p', 'fcmcbtt_e'):
        if met_id in model.metabolites:
            model.metabolites.get_by_id(met_id).charge = 3

    # Number of h_c that is subtracted from each reaction, if the reaction exists
    proton_corrections = {'CMCBTFR1': 2, 'CMCBTFR2': 2, 'CMCBTFU': 1}
    for reaction_id, n_protons in proton_corrections.items():
        if reaction_id in model.reactions:
            target = model.reactions.get_by_id(reaction_id)
            target.subtract_metabolites({model.metabolites.get_by_id('h_c'): n_protons})

    return model

In [8]:
def fix_dhbsz3feabcpp(model):
    """Adjust fe3dhbzs3/feenter metabolites and the proton count of related reactions.

    The model is modified in place and returned. ``fe3dhbzs3_c`` must exist. Every
    other metabolite and every reaction is only touched if it is present; in
    particular ``fe3dhbzs3_p`` and ``fe3dhbzs3_e`` are checked independently, so a
    model that has only one of them no longer raises a KeyError.
    """
    # Change the formula and charge of fe3dhbzs3_c
    cytosolic = model.metabolites.get_by_id('fe3dhbzs3_c')
    cytosolic.charge = 3
    cytosolic.formula = 'C30FeH28N3O16'

    # Change the charge of fe3dhbzs3_p, fe3dhbzs3_e and feenter_c where available
    for met_id in ('fe3dhbzs3_p', 'fe3dhbzs3_e', 'feenter_c'):
        if met_id in model.metabolites:
            model.metabolites.get_by_id(met_id).charge = 3

    # Number of h_c subtracted per reaction; the negative value for FEENTERES
    # means that one h_c is added there
    proton_corrections = {
        'FE3DHBZS3R': 2,
        'FEDHBZS3R1': 2,
        'FEDHBZS3R2': 2,
        'FEDHBZS3R3': 2,
        'FEENTERES': -1,
    }
    for reaction_id, n_protons in proton_corrections.items():
        if reaction_id in model.reactions:
            model.reactions.get_by_id(reaction_id).subtract_metabolites(
                {model.metabolites.get_by_id('h_c'): n_protons})

    # Return the model
    return model

In [9]:
def fix_man6gpts(model):
    """Replace MAN6Gpts and EX_man6pglyc_e by a manglyc based uptake route.

    The two reactions are removed (together with resulting orphan metabolites),
    the metabolites manglyc_p / manglyc_e and the reactions MANGLYCptspp,
    MANGLYCtex and EX_manglyc_e are added, and the charges of man6pglyc_c and
    man6p_c are set. The model is modified in place and returned.
    """
    # Deleting objects
    # The 'True' argument also removes all resulting orphan metabolites. In this case only 'man6pglyc_e'
    model.remove_reactions(['MAN6Gpts', 'EX_man6pglyc_e'], True)

    # New metabolites: the same compound in periplasm and extracellular space
    manglyc_p = Metabolite(
        'manglyc_p',
        formula='C9H15O9',
        name='2(alpha-D-Mannosyl)-D-glycerate',
        compartment='p',
        charge=-1,
    )
    manglyc_e = Metabolite(
        'manglyc_e',
        formula='C9H15O9',
        name='2(alpha-D-Mannosyl)-D-glycerate',
        compartment='e',
        charge=-1,
    )

    # PTS uptake from the periplasm (irreversible)
    pts_uptake = Reaction(
        'MANGLYCptspp',
        name='2-O-alpha-mannosyl-D-glycerate transport via PEP:Pyr PTS (periplasm)',
        subsystem='',
        lower_bound=0.0,
        upper_bound=1000.0,
    )
    pts_uptake.add_metabolites({
        manglyc_p: -1,
        model.metabolites.get_by_id('pep_c'): -1,
        model.metabolites.get_by_id('man6pglyc_c'): 1,
        model.metabolites.get_by_id('pyr_c'): 1,
    })

    # Diffusion between extracellular space and periplasm (reversible)
    diffusion = Reaction(
        'MANGLYCtex',
        name='2-O-alpha-mannosyl-D-glycerate transport via diffusion (extracellular to periplasm)',
        subsystem='',
        lower_bound=-1000.0,
        upper_bound=1000.0,
    )
    diffusion.add_metabolites({manglyc_e: -1, manglyc_p: 1})

    # Exchange reaction for the extracellular metabolite
    exchange = Reaction(
        'EX_manglyc_e',
        name='2(alpha-D-Mannosyl)-D-glycerate exchange',
        subsystem='',
        lower_bound=0.0,
        upper_bound=1000.0,
    )
    exchange.add_metabolites({manglyc_e: -1})

    # Add reactions to the model
    model.add_reactions([pts_uptake, diffusion, exchange])

    # Adjust the charges of the phosphorylated metabolites
    model.metabolites.get_by_id('man6pglyc_c').charge = -3
    model.metabolites.get_by_id('man6p_c').charge = -2

    # Return the modified model
    return model

In [10]:
def fix_mcbtfabcpp(model):
    """Adjust fe3mcbtt metabolites and the proton count of MCBTFR1.

    The model is modified in place and returned. ``fe3mcbtt_c`` must exist. The
    periplasmic/extracellular forms and the reaction MCBTFR1 are only touched if
    the model contains them, in line with the other fix functions; previously a
    model lacking one of them raised a KeyError.
    """
    # Change the formula and charge of fe3mcbtt_c
    cytosolic = model.metabolites.get_by_id('fe3mcbtt_c')
    cytosolic.charge = 3
    cytosolic.formula = 'C47FeH77N5O10'

    # Change the charge of fe3mcbtt_p and _e if they are present in the model
    for met_id in ('fe3mcbtt_p', 'fe3mcbtt_e'):
        if met_id in model.metabolites:
            model.metabolites.get_by_id(met_id).charge = 3

    # Subtract two h_c from reaction MCBTFR1 if it is present in the model
    if 'MCBTFR1' in model.reactions:
        model.reactions.get_by_id('MCBTFR1').subtract_metabolites({model.metabolites.get_by_id('h_c'): 2})

    return model

In [11]:
def fix_salchs4feabcpp(model):
    """Adjust salchs4fe metabolites and the proton count of the SALCHS4FER* reactions.

    The model is modified in place and returned. ``salchs4fe_c`` must exist.
    ``salchs4fe_p`` and ``salchs4fe_e`` are checked independently, so a model that
    has only one of them no longer raises a KeyError. The reactions are only
    touched if present.
    """
    # Change the formula and charge of salchs4fe_c
    cytosolic = model.metabolites.get_by_id('salchs4fe_c')
    cytosolic.charge = 3
    cytosolic.formula = 'C42FeH46N3O25'

    # Change the charge of salchs4fe_p and _e if they are present in the model
    for met_id in ('salchs4fe_p', 'salchs4fe_e'):
        if met_id in model.metabolites:
            model.metabolites.get_by_id(met_id).charge = 3

    # Subtract two h_c from reaction SALCHS4FER1, SALCHS4FER2 and SALCHS4FER3
    for reaction_id in ('SALCHS4FER1', 'SALCHS4FER2', 'SALCHS4FER3'):
        if reaction_id in model.reactions:
            model.reactions.get_by_id(reaction_id).subtract_metabolites(
                {model.metabolites.get_by_id('h_c'): 2})

    return model

In [12]:
def fix_uaccpts(model):
    """Set charge and formula of uaccg_e and the charge of uamr_c.

    Both metabolites must exist in the model. The model is modified in place and returned.
    """
    # Extracellular metabolite: charge and formula
    uaccg_e = model.metabolites.get_by_id('uaccg_e')
    uaccg_e.charge = 0
    uaccg_e.formula = 'C20H29N3O19P2'

    # Cytosolic metabolite: charge only
    uamr_c = model.metabolites.get_by_id('uamr_c')
    uamr_c.charge = -3

    return model

In [13]:
def fix_tagabc(model):
    """Set the formula of tagur_e and the charge of tagur_c.

    Both metabolites must exist in the model. The model is modified in place and returned.
    """
    # Extracellular metabolite: only the formula is changed
    tagur_e = model.metabolites.get_by_id('tagur_e')
    tagur_e.formula = 'C6H10O7'

    # Cytosolic metabolite: only the charge is changed
    tagur_c = model.metabolites.get_by_id('tagur_c')
    tagur_c.charge = -1

    return model

## Main

If a reaction is imbalanced in a model, the corresponding fix function is applied to that model.
Afterwards the model is saved as "modified_{model_id}", and a dictionary is created that maps each modified model to the reactions that are still imbalanced (if any).

In [14]:
# Maps the file name of each model to the list of its mass imbalanced reactions;
# models without imbalanced reactions get no entry
imbalanced_models = {}

for m in os.listdir(model_path):
    if m.endswith(".xml"):
        print(m)
        # `model` has to stay a global: is_mass_balanced looks the reactions up in it
        model = read_sbml_model(os.path.join(model_path, m))
        # Run the (expensive) scan only once per model and reuse the result
        unbalanced = find_mass_unbalanced_reactions(model)
        if unbalanced:
            imbalanced_models[m] = unbalanced

AA5.xml
Restricted license - for non-production use only - expires 2026-11-23
AA4.xml
AA6.xml
AA1.xml
AA3.xml
AA7.xml
AA2.xml


In [15]:
# Table with one row per imbalanced reaction and the models in which it occurs
imbalanced_reactions = reaction_abundance(imbalanced_models)

In [18]:
# Save the results
# imbalanced_reactions.to_csv(save_path+"/Imbalanced_reactions.csv", sep="\t")

In [16]:
# Display the table of imbalanced reactions
imbalanced_reactions

Unnamed: 0,Reaction,Occurences,Model IDs
0,CMCBTFL,7,"[AA5.xml, AA4.xml, AA6.xml, AA1.xml, AA3.xml, ..."
1,SALCHS4FEabcpp,7,"[AA5.xml, AA4.xml, AA6.xml, AA1.xml, AA3.xml, ..."
2,CMCBTFabcpp,2,"[AA4.xml, AA2.xml]"
3,DHBSZ3FEabcpp,3,"[AA4.xml, AA3.xml, AA2.xml]"


In [17]:
# Maps each imbalanced reaction ID to the list of models (file names) that contain it
reaction_dict = {}

# Iterate through all rows of the imbalanced_reactions DataFrame
# (columns are addressed by label; integer keys on a label-indexed row are deprecated in pandas)
for index, row in imbalanced_reactions.iterrows():
    reaction_dict[row["Reaction"]] = row["Model IDs"]

# Combine CMCBTFabcpp and CMCBTFL into one entry since the function for these is the same.
# .get() keeps this working when one of the two reactions is not imbalanced in any model;
# dict.fromkeys removes duplicates while keeping a deterministic order.
reaction_dict['CMCBTF'] = list(dict.fromkeys(
    [*reaction_dict.get('CMCBTFabcpp', []), *reaction_dict.get('CMCBTFL', [])]
))

In [18]:
# Maps model file name -> list of reactions that are still mass imbalanced
# after the fix functions have been applied; stays empty if every model could be fixed
imbalanced_models_aftermod = {}

In [19]:
# Fix functions in the order in which they are applied, keyed by the entry of
# reaction_dict that lists the models needing the fix
fixes = [
    ("SALCHS4FEabcpp", fix_salchs4feabcpp),
    ("MCBTFabcpp", fix_mcbtfabcpp),
    ("CMCBTF", fix_cmcbtf),
    ("DHBSZ3FEabcpp", fix_dhbsz3feabcpp),
    ("MAN6Gpts", fix_man6gpts),
    ("UACCpts", fix_uaccpts),
    ("TAGabc", fix_tagabc),
]

for m in os.listdir(model_path):
    if m.endswith(".xml"):

        print("checking", m)

        # if f'{m[:-4]}_balanced.xml' not in os.listdir(save_path):

        model = read_sbml_model(model_path + m)

        for reaction_key, fix_function in fixes:
            # A reaction that is not imbalanced in any model has no entry in reaction_dict
            if m not in reaction_dict.get(reaction_key, ()):
                continue
            try:
                model = fix_function(model)
            except KeyError as error:
                # A metabolite/reaction expected by the fix is missing in this model.
                # Report it instead of silently skipping; the check below will still
                # list the reactions that remain imbalanced.
                print(f"warning: {fix_function.__name__} could not be fully applied to {m}: missing {error}")

        # -------------------
        # Write the model
        #write_sbml_model(model, save_path+f'{m[:-4]}_mass_balanced.xml')
        # -------------------

        # Test if the fixes worked
        imbalanced_reactions_aftermod = find_mass_unbalanced_reactions(model)
        # print(imbalanced_reactions_aftermod)
        if len(imbalanced_reactions_aftermod) != 0:
            imbalanced_models_aftermod[m] = imbalanced_reactions_aftermod
        else:
            print("all fixed")

# NOTE: imbalanced_reactions_aftermod only holds the list of the last model; to store the
# results of all models dump imbalanced_models_aftermod instead (requires `import json`)
#with open('imbalanced_reactions_roots_modified2.json', 'w') as file:
    #json.dump(imbalanced_reactions_aftermod, file)

checking AA5.xml
all fixed
checking AA4.xml
all fixed
checking AA6.xml
all fixed
checking AA1.xml
all fixed
checking AA3.xml
all fixed
checking AA7.xml
all fixed
checking AA2.xml
all fixed


In [46]:
# Shows the models that still contain imbalanced reactions after the fixes.
# If this prints only {}, all models are mass balanced.
print(imbalanced_models_aftermod)

{}


# Charge Balance Metabolic Models with BIGG
With Frowin's scripts I was able to obtain stoichiometrically consistent models; the next step is to balance the charges.
I use the BiGG database to compare its information with the model information and automatically replace the charge and formula of metabolites with the BiGG values if they seem more reasonable.
After this automated step, the remaining unbalanced reactions need to be curated manually.
For a few reactions it is also possible to compare them with the same reaction in other, very well curated models (here I used the E. coli iML1515 model) to check how it was balanced there.

## Paths & Load models

In [20]:
# Path to the directory with the xml files of the mass balanced models (relative to the working directory)
models_path = "../Models/07_mass_balanced/"

In [21]:
# Load every mass balanced model (*.xml) from models_path
models = {}
for model_name in filter(lambda f: f.endswith(".xml"), os.listdir(models_path)):
    model = read_sbml_model(f"{models_path}/{model_name}")
    model.solver = "cplex"
    # The first three letters of the xml filename serve as the model name
    models[model_name[:3]] = model

# os.listdir gives no guarantee about the order, so sort the dictionary by model name (AA1...AA7)
models = dict(sorted(models.items()))

# Specific for the 7 models used here: additionally bind each model to its own variable.
# All models remain accessible through the "models" dict, e.g. models["AA1"]
AA1, AA2, AA3, AA4, AA5, AA6, AA7 = (models[f"AA{i}"] for i in range(1, 8))

## Functions

In [22]:
def get_objective_value(model):
    """Print the objective value of the model; nothing is returned."""
    # str(model) is evaluated before the optimization, as in a single f-string
    model_label = f"{model}"
    objective = model.optimize().objective_value
    print(f"value of objective for {model_label} is {objective}")

In [23]:
# runs the mass and charge balance check for every reaction in a model
def check_balance(model, print_results=True):
    """Return the result of check_mass_balance(model) and optionally print its size.

    If print_results is True, the number of reported reactions is printed together
    with the model.
    """
    unbalanced_reactions = check_mass_balance(model)
    if not print_results:
        return unbalanced_reactions

    n_unbalanced = len(unbalanced_reactions)
    print("There are {0} unbalanced reactions in {1}".format(n_unbalanced, model) )
    return unbalanced_reactions

In [24]:
# builds a pandas dataframe with one row per metabolite of a cobra model: bigg_id, model_id, model_formula and model_charge
# NOTE: bigg_id is only derived from the model_id by cutting off the last "_<suffix>" (the compartment),
# so it is not guaranteed to be an ID that really exists in BiGG
def extract_met_info_model(model):
    """Collect ID, formula and charge of all metabolites of the model in a DataFrame."""
    rows = [
        {
            # strip the compartment so that the ID has the form used by BiGG (e.g., glc__D_c to glc__D)
            "bigg_id": met.id.rsplit("_", 1)[0],
            "model_id": met.id,
            "model_formula": met.formula,
            "model_charge": met.charge,
        }
        for met in model.metabolites
    ]
    return pd.DataFrame(rows)

In [25]:
# merges the metabolite info of a model with the BiGG metabolite info and flags whether formula and charge agree
def compare_bigg_modelMets(model_mets, list_unbalanced_mets):
    """Left-merge model_mets with the global df_bigg_met and add comparison columns.

    Added columns:
    - charge_match: True if model_charge is contained in the list found in "charges"
    - formula_match: True if model_formula is contained in the list/set found in "formulas"
    - unbalanced: True if model_id is contained in list_unbalanced_mets

    Rows whose "charges"/"formulas" entry is not a list (or list/set, respectively)
    are marked as mismatch.
    NOTE(review): "charges" and "formulas" presumably come from df_bigg_met, which is
    defined elsewhere - confirm.
    """
    # Merge on BiGG ID; metabolites without BiGG entry are kept
    merged = model_mets.merge(df_bigg_met, on="bigg_id", how="left")

    def charge_matches(row):
        known_charges = row["charges"]
        if not isinstance(known_charges, list):
            return False
        return row["model_charge"] in known_charges

    def formula_matches(row):
        known_formulas = row["formulas"]
        if not isinstance(known_formulas, (list, set)):
            return False
        return row["model_formula"] in known_formulas

    merged["charge_match"] = merged.apply(charge_matches, axis=1)
    merged["formula_match"] = merged.apply(formula_matches, axis=1)

    # False = metabolite is not part of an unbalanced reaction, True = part of at least one
    # (append .astype(int) to get 1/0 instead of True/False)
    merged['unbalanced'] = merged['model_id'].isin(list_unbalanced_mets)

    return merged

In [26]:
# reduces the merged df to the rows (aka metabolites) where model info and bigg info do NOT match
def get_mismatches_after_merge(df_merge):
    """Return the rows with a formula and/or charge mismatch, limited to the report columns."""
    report_columns = [
        "model_id", "bigg_id",
        "model_charge", "charges",
        "model_formula", "formulas",
        "charge_match", "formula_match",
        "unbalanced",
    ]

    # A row is a mismatch as soon as one of the two comparisons is False
    formula_differs = df_merge['formula_match'].eq(False)
    charge_differs = df_merge['charge_match'].eq(False)

    return df_merge.loc[formula_differs | charge_differs, report_columns]

In [27]:
# prints a confusion matrix showing how much mismatching info about charge state and/or formula there is
# this function either takes one df from one model as an input or the merged dict where all models are saved
def get_confmat_charge_formula(df_merge):
    """Print how many metabolites fall into each charge_match/formula_match combination.

    Parameters
    ----------
    df_merge : pandas.DataFrame or dict of pandas.DataFrame
        Either a single DataFrame with the boolean columns "charge_match" and
        "formula_match", or a dict mapping a model name to such a DataFrame.

    For a single DataFrame only the combinations that occur are printed (column
    "count"). For a dict all four combinations are printed with one count column
    per model, named after the dict key; combinations that do not occur in a
    model are reported as 0. Counts are determined per combination, so a model
    that lacks a combination cannot shift the remaining counts into the wrong row.

    Nothing is returned; any other input type produces no output.
    """
    if isinstance(df_merge, pd.DataFrame):
        conf_matrix = df_merge.groupby(["charge_match", "formula_match"]).size().reset_index(name='count')
        print(conf_matrix)
    elif isinstance(df_merge, dict):
        # Fixed row order of the table: (charge_match, formula_match)
        combinations = [(False, False), (False, True), (True, False), (True, True)]
        conf_matrix = pd.DataFrame(combinations, columns=["charge_match", "formula_match"])
        for name, item in df_merge.items():
            conf_matrix[str(name)] = [
                int(((item["charge_match"] == charge) & (item["formula_match"] == formula)).sum())
                for charge, formula in combinations
            ]
        print(conf_matrix)

## Evaluate current state of models regarding charge balance

In [61]:
# Check the flux through the objective of every model. The values are not feasible in vivo,
# which shows that the models are not working correctly yet (probably due to unconstrained EX reactions).
# Note that the loop leaves the global variable `model` bound to the last model.
for model in models.values():
    get_objective_value(model)

value of objective for AA1 is 34.02474449543432
value of objective for AA2 is 58.547243943051555
value of objective for AA3 is 43.720627010239504
value of objective for AA4 is 85.48331931079218
value of objective for AA5 is 48.127368212200146
value of objective for AA6 is 69.23396649941685
value of objective for AA7 is 55.322197071522076


In [31]:
# dictionary to store the result of check_balance for every model (model name -> unbalanced reactions)
unbalanced_reactions_dict = {}

# models is the dict where all models are stored that were loaded with read_sbml_model()
for name, model in models.items():
    # check_balance also prints the number of unbalanced reactions of the model
    unbalanced_reactions = check_balance(model)
    unbalanced_reactions_dict[name] = unbalanced_reactions

# these numbers are in accordance with the numbers for "charge balance" reactions in the memote report
# when i do memote with the mass balanced models; directly after carveme we have more charge unbalanced reactions but the mass balancing already fixed some

There are 439 unbalanced reactions in AA1
There are 480 unbalanced reactions in AA2
There are 367 unbalanced reactions in AA3
There are 413 unbalanced reactions in AA4
There are 358 unbalanced reactions in AA5
There are 425 unbalanced reactions in AA6
There are 448 unbalanced reactions in AA7


In [38]:
# We know how many unbalanced reactions each model has on its own, but what is the overlap?
unique_reactions = set()

# Loop through all models and collect reaction IDs
for model_name, unbalanced_reactions in unbalanced_reactions_dict.items():
    # Add the reaction IDs to the set (sets behave like mathematical sets, i.e. they only hold unique elements)
    unique_reactions.update(reaction.id for reaction in unbalanced_reactions.keys())

# this is a list of all the reaction IDs that are reported as unbalanced in at least one model
# NOTE(review): the entries come from check_balance; whether they are exclusively *charge*
# unbalanced (as the message below says) depends on what check_mass_balance reports - confirm
unique_reaction_ids = list(unique_reactions)

print("There are {0} charge unbalanced reactions throughout all models.".format(len(unique_reaction_ids)))
# print(unique_reaction_ids)

There are 796 charge unbalanced reactions throughout all models.


In [39]:
# we know the unbalanced reactions, but which metabolites are part of these?
# go through all unbalanced (unique) reactions and count all participating metabolites;
# every reaction is counted only once, taken from the first model that contains it
metabolite_counter_compartment = Counter()
metabolite_counter_name = Counter()
seen_reactions = set()  # Track reactions that were already counted

for model in models.values():
    for rxn_id in unique_reaction_ids:
        if rxn_id in model.reactions and rxn_id not in seen_reactions:
            reaction = model.reactions.get_by_id(rxn_id)
            for metabolite in reaction.metabolites:
                metabolite_counter_compartment[metabolite.id] += 1  # keyed by ID, i.e. compartment specific: h2o_c and h2o_p are different metabolites
                metabolite_counter_name[metabolite.name] += 1  # keyed by name, so metabolites sharing a name are counted together
            seen_reactions.add(rxn_id)  # Mark this reaction as counted

In [20]:
# Show the counter as a table: column 0 = metabolite ID, column 1 = number of unbalanced reactions it is part of
pd.DataFrame(metabolite_counter_compartment.items())

Unnamed: 0,0,1
0,ficytc_c,7
1,h_c,410
2,q8h2_c,1
3,focytc_c,7
4,h_p,45
...,...,...
970,prealginate__M_p,1
971,peamn_p,1
972,nh4_p,1
973,pacald_p,1


In [40]:
# these are the numbers of unique metabolites that are part of unbalanced reactions

# counted by metabolite ID (compartment specific), e.g. h2o_c and h2o_p are counted separately
print(len(metabolite_counter_compartment))
# counted by metabolite name
print(len(metabolite_counter_name))

975
863


In [41]:
# number of metabolites that are part of exactly one unbalanced reaction ...
filtered = {m: v for m, v in metabolite_counter_compartment.items() if v == 1}
print(len(filtered))

# ... and the five metabolites that are part of the most unbalanced reactions
print(metabolite_counter_compartment.most_common(5))


422
[('h_c', 410), ('h2o_c', 259), ('atp_c', 138), ('coa_c', 122), ('ppi_c', 103)]


In [72]:
# IPython magic (only valid inside a notebook)
%matplotlib notebook

# Count how many metabolites share the same count (i.e. histogram of the counter values)
count_distribution = Counter(metabolite_counter_compartment.values())

# Bar plot: x = number of unbalanced reactions a metabolite is part of, y = number of such metabolites
plt.bar(count_distribution.keys(), count_distribution.values())
plt.xlabel('Amount of Reactions a Metabolite is Part of')
plt.ylabel('Number of Metabolites with that count')
plt.title('Distribution of Metabolite occurrences in unbalanced reactions')
plt.show()


<IPython.core.display.Javascript object>

In [73]:
# the numbers show how often a metabolite is part of an unbalanced reaction
metabolite_counter_compartment

Counter({'h_c': 410,
         'h2o_c': 259,
         'atp_c': 138,
         'coa_c': 122,
         'ppi_c': 103,
         'pi_c': 87,
         'amp_c': 84,
         'adp_c': 62,
         'h2o_p': 52,
         'co2_c': 49,
         'h_p': 45,
         'nadh_c': 41,
         'pyr_c': 39,
         'nad_c': 39,
         'nadph_c': 37,
         'nadp_c': 34,
         'fad_c': 33,
         'fadh2_c': 33,
         'ACP_c': 32,
         'cmp_c': 26,
         'o2_c': 26,
         'g3p_c': 21,
         'glyc3p_c': 19,
         'glu__L_c': 19,
         'fe2_c': 19,
         'pep_c': 17,
         'nh4_c': 16,
         'pi_p': 14,
         'f6p_c': 12,
         'gly_c': 12,
         'r5p_c': 12,
         'accoa_c': 11,
         'ctp_c': 10,
         'ser__L_c': 10,
         'fmn_c': 10,
         'akg_c': 10,
         '2dr1p_c': 10,
         'dhap_c': 9,
         'gtp_c': 8,
         'thmpp_c': 8,
         'asp__L_c': 8,
         'pppi_c': 8,
         'uacgam_c': 8,
         '23dhbzs_c': 8,
        

In [42]:
# IDs (compartment specific) of all metabolites that take part in unbalanced reactions
# this list is very important later on for some of the functions
unbalanced_mets = list(metabolite_counter_compartment.keys())

## scrape BIGG database - reactions
only needs to be executed once to get the csv file with the info, afterwards just read the csv file

In [14]:
# Get list of all universal reactions
base_url = "http://bigg.ucsd.edu/api/v2/"
list_url = base_url + "universal/reactions"
# Without a timeout requests waits indefinitely if the server does not answer
response = requests.get(list_url, timeout=30)

# check if request is going through
if response.status_code != 200:
    raise Exception("Failed to fetch reaction list")

# The "results" entry of the answer holds one dict per reaction
reactions = response.json()["results"]
print(f"Found {len(reactions)} reactions. Fetching details...")

# Fetches the detail record of one reaction from the BiGG API
def fetch_reaction_details(reaction):
    """Return bigg_id, quoted name and metabolites (as string) of one reaction.

    `reaction` is a dict with the optional keys "bigg_id" and "name". The
    metabolites are requested from {base_url}universal/reactions/{bigg_id} and
    stored as the string representation of the returned list. If the request
    fails (exception or status code other than 200), "metabolites" is "[]".
    """
    bigg_id = reaction.get("bigg_id", "")
    url = f"{base_url}universal/reactions/{bigg_id}"

    # Safe quoting: inner double quotes become single quotes, then the name is wrapped in double quotes
    safe_name = str(reaction.get("name", "")).replace('"', "'")

    # Start with the fallback record; only the metabolites change on success
    details = {
        "bigg_id": bigg_id,
        "name": f'"{safe_name}"',
        "metabolites": "[]"
    }

    try:
        r = requests.get(url, timeout=15)
        if r.status_code == 200:
            details["metabolites"] = str(r.json().get("metabolites", []))
    except Exception as e:
        print(f"Error with {bigg_id}: {e}")

    return details

# Use ThreadPoolExecutor to parallelize requests (up to 35 requests at the same time)
results = []
with ThreadPoolExecutor(max_workers=35) as executor:
    futures = [executor.submit(fetch_reaction_details, rxn) for rxn in reactions]
    # as_completed yields the futures in the order in which they finish,
    # so `results` is not in the order of `reactions`
    for i, future in enumerate(as_completed(futures)):
        results.append(future.result())
        # progress message every 500 finished requests
        if i % 500 == 0:
            print(f"{i}/{len(reactions)} done...")

Found 28302 reactions. Fetching details...
0/28302 done...
500/28302 done...
1000/28302 done...
1500/28302 done...
2000/28302 done...
2500/28302 done...
3000/28302 done...
3500/28302 done...
4000/28302 done...
4500/28302 done...
5000/28302 done...
5500/28302 done...
6000/28302 done...
6500/28302 done...
7000/28302 done...
7500/28302 done...
8000/28302 done...
8500/28302 done...
9000/28302 done...
9500/28302 done...
10000/28302 done...
10500/28302 done...
11000/28302 done...
11500/28302 done...
12000/28302 done...
12500/28302 done...
13000/28302 done...
13500/28302 done...
14000/28302 done...
14500/28302 done...
15000/28302 done...
15500/28302 done...
16000/28302 done...
16500/28302 done...
17000/28302 done...
17500/28302 done...
18000/28302 done...
18500/28302 done...
19000/28302 done...
19500/28302 done...
20000/28302 done...
20500/28302 done...
21000/28302 done...
21500/28302 done...
22000/28302 done...
22500/28302 done...
23000/28302 done...
23500/28302 done...
24000/28302 done...
2

In [None]:
df_bigg_rea = pd.DataFrame(results, columns=["bigg_id", "name", "metabolites"])

# fetch_reaction_details stores the metabolites as the string representation of a list.
# Iterating over that string would yield single characters, so it is parsed back into
# a list of dicts first (ast.literal_eval only evaluates Python literals).
df_bigg_rea["metabolites"] = df_bigg_rea["metabolites"].apply(
    lambda mets: ast.literal_eval(mets) if isinstance(mets, str) else mets
)

# add an equation column to the bigg reaction df that holds the reaction equation as
# {"<bigg_id>_<compartment>": stoichiometry}, i.e. in a format that we can directly use
# to overwrite the model equation with
df_bigg_rea["equation"] = df_bigg_rea["metabolites"].apply(
    lambda mets: {
        met["bigg_id"] + "_" + met["compartment_bigg_id"]: met["stoichiometry"]
        for met in mets
    }
)

# Save to CSV
# df_bigg_rea.to_csv("../bigg_reactions_complete.csv", index=False, quoting=csv.QUOTE_MINIMAL)

In [32]:
# Read the CSV with the BiGG reaction info
df_bigg_rea = pd.read_csv("../Datasets/BIGG/bigg_reactions_complete.csv", quotechar='"', usecols=['bigg_id', 'name', 'metabolites', 'equation'])
# "metabolites" and "equation" are stored as text in the CSV; convert them back into Python objects
# (ast.literal_eval only evaluates literals, it does not execute code)
df_bigg_rea["metabolites"] = df_bigg_rea["metabolites"].apply(ast.literal_eval)
df_bigg_rea["equation"] = df_bigg_rea["equation"].apply(ast.literal_eval)

In [77]:
# Display the BiGG reaction table
df_bigg_rea

Unnamed: 0,bigg_id,name,metabolites,equation
0,10FTHF7GLUtm,"""7-glutamyl-10FTHF transport, mitochondrial""","[{'bigg_id': '10fthf7glu', 'name': '10-formylt...","{'10fthf7glu_c': 1.0, '10fthf7glu_m': -1.0}"
1,11DOCRTSLtm,"""11-deoxycortisol intracellular transport""","[{'bigg_id': '11docrtsl', 'name': '11docrtsl c...","{'11docrtsl_c': -1.0, '11docrtsl_m': 1.0}"
2,11DOCRTSTRNtr,"""11-deoxycorticosterone intracellular transport""","[{'bigg_id': '11docrtstrn', 'name': '11-Deoxyc...","{'11docrtstrn_c': -1.0, '11docrtstrn_r': 1.0}"
3,10FTHF5GLUtm,"""5-glutamyl-10FTHF transport, mitochondrial""","[{'bigg_id': '10fthf5glu', 'name': '10-formylt...","{'10fthf5glu_c': 1.0, '10fthf5glu_m': -1.0}"
4,10FTHF7GLUtl,"""7-glutamyl-10FTHF transport, lysosomal""","[{'bigg_id': '10fthf7glu', 'name': '10-formylt...","{'10fthf7glu_c': -1.0, '10fthf7glu_l': 1.0}"
...,...,...,...,...
28297,ZYMSTESTH_SC,"""Zymosterol ester hydrolase yeast specific""","[{'bigg_id': 'h', 'name': 'H+', 'compartment_b...","{'h_c': 1.0, 'h2o_c': -1.0, 'hdca_c': 0.02, 'h..."
28298,ZYMSTt,"""Zymosterol reversible transport""","[{'bigg_id': 'zymst', 'name': 'Zymosterol C27H...","{'zymst_c': 1.0, 'zymst_e': -1.0}"
28299,ZYMSTR,"""Zymosterol reductase""","[{'bigg_id': 'h', 'name': 'H+', 'compartment_b...","{'h_c': -1.0, 'nadp_c': 1.0, 'nadph_c': -1.0, ..."
28300,ZN2tpp,"""Zinc transport in via permease (no H+)""","[{'bigg_id': 'zn2', 'name': 'Zinc', 'compartme...","{'zn2_c': 1.0, 'zn2_p': -1.0}"


## scrape BIGG database - metabolites
only needs to be executed once to get the csv file with the info, afterwards just read the csv file

In [1]:
# this downloads metabolite information for all BIGG metabolites with their bigg ID, name, formulae and charge and saves it to a csv file
# there are 9088 metabolites


# Get list of all universal metabolites
base_url = "http://bigg.ucsd.edu/api/v2/"
list_url = base_url + "universal/metabolites"
# Without a timeout requests waits indefinitely if the server does not answer
response = requests.get(list_url, timeout=30)

# check if request is going through
if response.status_code != 200:
    raise Exception("Failed to fetch metabolite list")

# The "results" entry of the answer holds one dict per metabolite
metabolites = response.json()["results"]
print(f"Found {len(metabolites)} metabolites. Fetching details...")

# fetches the detail record of one metabolite from the BiGG API, i.e. BIGG ID, name, formulas and charges
def fetch_metabolite_details(met):
    """Return bigg_id, quoted name, formulas and charges (both as string) of one metabolite.

    `met` is a dict with the optional keys "bigg_id" and "name". The details are
    requested from {base_url}universal/metabolites/{bigg_id}; the entries
    "formulae" and "charges" of the answer are stored as string representations.
    If the request fails (exception or status code other than 200), both are "[]".
    """
    bigg_id = met.get("bigg_id", "")
    url = f"{base_url}universal/metabolites/{bigg_id}"

    # Safe quoting for CSV: inner double quotes become single quotes, then the name is wrapped in double quotes
    safe_name = str(met.get("name", "")).replace('"', "'")

    # Start with the fallback record; formulas and charges are only filled on success
    details = {
        "bigg_id": bigg_id,
        "name": f'"{safe_name}"',
        "formulas": "[]",
        "charges": "[]"
    }

    try:
        r = requests.get(url, timeout=10)
        if r.status_code == 200:
            data = r.json()  # JSON response as a dictionary
            # read both values before storing them, so the record is never half filled
            formulas = str(data.get("formulae", []))  # empty list if not available
            charges = str(data.get("charges", []))
            details["formulas"] = formulas
            details["charges"] = charges
    except Exception as e:
        print(f"Error with {bigg_id}: {e}")

    return details

# Use ThreadPoolExecutor to parallelise requests
# Every metabolite is submitted as its own task; results are collected with
# as_completed(), i.e. in the order the requests finish, so the rows of the
# resulting table are NOT in the order of `metabolites`.
results = []
with ThreadPoolExecutor(max_workers=25) as executor:
    futures = [executor.submit(fetch_metabolite_details, met) for met in metabolites]
    for i, future in enumerate(as_completed(futures)):
        results.append(future.result())
        # progress message every 500 finished requests
        if i % 500 == 0:
            print(f"{i}/{len(metabolites)} done...")

# Save to CSV
# NOTE(review): the file is written to "../bigg_metabolites_complete.csv" but the
# reading cell loads "../Datasets/BIGG/bigg_metabolites_complete.csv" - presumably
# the file is moved by hand in between; confirm or align the two paths.
df = pd.DataFrame(results)
df.to_csv("../bigg_metabolites_complete.csv", index=False, quoting=csv.QUOTE_MINIMAL)
print("Saved to bigg_metabolites_complete.csv")

Found 9088 metabolites. Fetching details...
0/9088 done...
500/9088 done...
1000/9088 done...
1500/9088 done...
2000/9088 done...
2500/9088 done...
3000/9088 done...
3500/9088 done...
4000/9088 done...
4500/9088 done...
5000/9088 done...
5500/9088 done...
6000/9088 done...
6500/9088 done...
7000/9088 done...
7500/9088 done...
8000/9088 done...
8500/9088 done...
9000/9088 done...
Saved to bigg_metabolites_complete.csv


In [33]:
# Read previously created CSV with all BIGG metabolites
# NOTE(review): this path ("../Datasets/BIGG/") differs from the path the scraping
# cell writes to ("../bigg_metabolites_complete.csv") - confirm the file location.
df_bigg_met = pd.read_csv("../Datasets/BIGG/bigg_metabolites_complete.csv", quotechar='"')

# Convert stringified lists back to real lists
# (the scraper stored the lists via str(); ast.literal_eval only parses Python literals)
df_bigg_met["formulas"] = df_bigg_met["formulas"].apply(ast.literal_eval)
df_bigg_met["charges"] = df_bigg_met["charges"].apply(ast.literal_eval)


In [22]:
df_bigg_met

Unnamed: 0,bigg_id,name,formulas,charges
0,10fthf6glu,"""10-formyltetrahydrofolate-[Glu](6)""",[C45H51N12O22],[-7]
1,10fthf,"""10-Formyltetrahydrofolate""",[C20H21N7O7],[-2]
2,10fthfglu__L,"""10-Formyltetrahydrofolyl L-glutamate""",[C25H28N8O10],[]
3,10fthf5glu,"""10-formyltetrahydrofolate-[Glu](5)""",[C40H45N11O19],[-6]
4,10m3ouACP,"""10-methyl-3-oxo-undecanoyl-ACP""",[C23H41N2O9PRS],[0]
...,...,...,...,...
9083,zymstest_SC,"""Zymosterol ester yeast specific C1694H2993O101""",[C1694H2993O101],[0]
9084,xylu__L,"""L-Xylulose""",[C5H10O5],[0]
9085,zymst,"""Zymosterol C27H44O""",[C27H44O],[0]
9086,zymstnl,"""5alpha-cholest-8-en-3beta-ol""",[C27H46O],[0]


## Extract Metabolite info out of models
We need information about the metabolites from our models so that we can compare it to the BIGG information that we just downloaded.

In [34]:
# get metabolite info for all 7 models (i.e. formula and charge state) and save the 7 df's in a dict
# keys are "AA1_mets" ... "AA7_mets"; `models` and extract_met_info_model are defined outside this cell
# (extract_met_info_model presumably comes from the `functions` module - verify)
model_mets = {f"AA{i}_mets": extract_met_info_model(models[f"AA{i}"]) for i in range(1, 8)}

In [79]:
model_mets["AA1_mets"]

Unnamed: 0,bigg_id,model_id,model_formula,model_charge
0,10fthf,10fthf_c,C20H21N7O7,-2
1,12dgr120,12dgr120_c,C27H52O5,0
2,12dgr120,12dgr120_p,C27H52O5,0
3,12dgr140,12dgr140_c,C31H60O5,0
4,12dgr140,12dgr140_p,C31H60O5,0
...,...,...,...,...
1509,xylb,xylb_e,C10H18O9,0
1510,xylu__D,xylu__D_c,C5H10O5,0
1511,zn2,zn2_c,Zn,2
1512,zn2,zn2_e,Zn,2


In [43]:
# merge the metabolite info from the models with the BIGG info; the result has two columns that show whether charge/formula match between model and BIGG
# all 7 df's are saved in the dict model_merged; for easier access the individual df's are also bound to their own names (e.g. AA1_merged), which refer to the same objects as the dict entries
# NOTE(review): the same `unbalanced_mets` object is passed for every model - presumably it covers all seven models; verify against compare_bigg_modelMets
model_merged = {f"AA{i}_merged": compare_bigg_modelMets(model_mets[f"AA{i}_mets"], unbalanced_mets) for i in range(1, 8)}
AA1_merged, AA2_merged, AA3_merged, AA4_merged, AA5_merged, AA6_merged, AA7_merged = [model_merged[f"AA{i}_merged"] for i in range(1, 8)]

In [81]:
AA1_merged

Unnamed: 0,bigg_id,model_id,model_formula,model_charge,name,formulas,charges,charge_match,formula_match,unbalanced
0,10fthf,10fthf_c,C20H21N7O7,-2,"""10-Formyltetrahydrofolate""",[C20H21N7O7],[-2],True,True,True
1,12dgr120,12dgr120_c,C27H52O5,0,"""1,2-Diacyl-sn-glycerol (didodecanoyl, n-C12:0)""",[C27H52O5],[0],True,True,True
2,12dgr120,12dgr120_p,C27H52O5,0,"""1,2-Diacyl-sn-glycerol (didodecanoyl, n-C12:0)""",[C27H52O5],[0],True,True,True
3,12dgr140,12dgr140_c,C31H60O5,0,"""1,2-Diacyl-sn-glycerol (ditetradecanoyl, n-C1...",[C31H60O5],[0],True,True,True
4,12dgr140,12dgr140_p,C31H60O5,0,"""1,2-Diacyl-sn-glycerol (ditetradecanoyl, n-C1...",[C31H60O5],[0],True,True,True
...,...,...,...,...,...,...,...,...,...,...
1509,xylb,xylb_e,C10H18O9,0,"""Xylobiose""",[C10H18O9],[],False,True,False
1510,xylu__D,xylu__D_c,C5H10O5,0,"""D-Xylulose""",[C5H10O5],[0],True,True,False
1511,zn2,zn2_c,Zn,2,"""Zinc""",[Zn],[2],True,True,False
1512,zn2,zn2_e,Zn,2,"""Zinc""",[Zn],[2],True,True,False


In [44]:
# extracts just the rows where charge and/or formula don't match the BIGG info and saves them in a dict with the seven df's
# keys are "AA1_mismatch" ... "AA7_mismatch"
model_mismatch = {f"AA{i}_mismatch": get_mismatches_after_merge(model_merged[f"AA{i}_merged"]) for i in range(1, 8)}

In [83]:
model_mismatch["AA1_mismatch"]

Unnamed: 0,model_id,bigg_id,model_charge,charges,model_formula,formulas,charge_match,formula_match,unbalanced
23,1btol_c,1btol,0,[],C4H10O,[C4H10O],False,True,False
27,1p2cbxl_c,1p2cbxl,0,[-1],C5H6NO2,[C5H6NO2],False,True,True
30,23ddhb_c,23ddhb,0,[-1],C7H7O4,[C7H7O4],False,True,True
41,2agpe160_c,2agpe160,0,[0],C21H44NO7P,[C21H44NO7P1],True,False,True
42,2agpe160_p,2agpe160,0,[0],C21H44NO7P,[C21H44NO7P1],True,False,False
...,...,...,...,...,...,...,...,...,...
1489,val__D_p,val__D,0,[0],C5H9NO2,[C5H11NO2],True,False,False
1504,xyl3_c,xyl3,0,[],C15H26O13,[C15H26O13],False,True,False
1505,xyl3_e,xyl3,0,[],C15H26O13,[C15H26O13],False,True,False
1508,xylb_c,xylb,0,[],C10H18O9,[C10H18O9],False,True,False


In [84]:
# confusion matrix to show for how many metabolites there are differences in the infos between current model and bigg
get_confmat_charge_formula(AA1_merged)

   charge_match  formula_match  count
0         False          False    178
1         False           True    190
2          True          False    102
3          True           True   1044


In [85]:
# function also takes dict with all the 7 df's and creates one big confusion matrix
get_confmat_charge_formula(model_merged)

  charge_match forumla_match   AA1   AA2   AA3   AA4   AA5   AA6   AA7
0        False         False   178   217   150   102   134   201   183
1        False          True   190   232   228   173   175   221   227
2         True         False   102   101    97   114    95    94    73
3         True          True  1044  1246  1028  1374  1044  1175  1175


In [86]:
# for all metabolites in AA1, there are 814 metabolites (53.8%) that are not part of unbalanced reactions and 700 metabolites (46.2%) that are in unbalanced reactions (numbers taken from the recorded output of this cell)
# NOTE(review): the original comment continued with an unfinished sentence ("metabolites are only part") - complete it or drop it
AA1_merged['unbalanced'].value_counts()

unbalanced
False    814
True     700
Name: count, dtype: int64

In [87]:
# if we only look at metabolites where BIGG infos and model infos do NOT match, 312 of these 470 metabolites (66.4%) are in unbalanced reactions and only 158 (33.6%) are not flagged as unbalanced (numbers taken from the recorded output of this cell)
model_mismatch["AA1_mismatch"]['unbalanced'].value_counts()

unbalanced
True     312
False    158
Name: count, dtype: int64

In [88]:
# count how many AA1 metabolites fall into each combination of unbalanced / charge_match / formula_match
combo_counts = AA1_merged.groupby(['unbalanced', 'charge_match', 'formula_match']).size().reset_index(name='count')
print(combo_counts)
# "False True True" is the optimal case, i.e. the metabolite's info matches BIGG and it is not in any unbalanced reaction

   unbalanced  charge_match  formula_match  count
0       False         False          False     61
1       False         False           True     54
2       False          True          False     43
3       False          True           True    656
4        True         False          False    117
5        True         False           True    136
6        True          True          False     59
7        True          True           True    388


In [91]:
AA6_merged['unbalanced'].value_counts()

unbalanced
0    1026
1     776
Name: count, dtype: int64

In [92]:
model_mismatch["AA6_mismatch"]['unbalanced'].value_counts()

unbalanced
1    343
0    193
Name: count, dtype: int64

In [101]:
# same breakdown as for AA1: number of AA6 metabolites per combination of unbalanced / charge_match / formula_match
combo_counts = AA6_merged.groupby(['unbalanced', 'charge_match', 'formula_match']).size().reset_index(name='count')
print(combo_counts)

   unbalanced  charge_match  formula_match  count
0       False         False          False     83
1       False         False           True     63
2       False          True          False     47
3       False          True           True    833
4        True         False          False    112
5        True         False           True    175
6        True          True          False     56
7        True          True           True    433


## Overwrite model with BIGG information
We want to test whether the information in BIGG can help our models with the large number of charge-unbalanced reactions.
That means that for reactions, or rather metabolites, where the BIGG information differs from our model information, we can try overwriting it with the BIGG information.

At the moment I only overwrite model info with BIGG info if BIGG gives exactly one charge/formula. If there are multiple possible charge states/formulas, I should curate manually. I also only overwrite it when the metabolite is part of an unbalanced reaction.

### Create copies of our models
With these I can introduce changes and then compare them to the original (OG) model.

In [45]:
# read the xml files a second time to create "copies" of our models that we can try to curate and compare to the original model
# creating an actual copy (new object with deepcopy()) takes a very long time, which is why read_sbml_model is used again

models_path = "/home/lisa/Dokumente/Programmierung/Models/07_mass_balanced/"

# load every file in models_path whose name ends with "balanced.xml"
models_curation = {}
for model_name in (f for f in os.listdir(models_path) if f.endswith("balanced.xml")):
    # models_path already ends with "/", so the resulting path contains "//"
    model = read_sbml_model(f"{models_path}/{model_name}")
    model.solver = "cplex"
    # the dict key is built from the first three characters of the file name
    # (assumes the files are named "AA1...", "AA2...", ... - TODO confirm)
    name = str(model_name[:3]+"_curate")
    models_curation[name] = model

models_curation = {key: models_curation[key] for key in sorted(models_curation.keys())}  # sorts the dictionary alphabetically
# bind each model to its own name as well; these names refer to the same objects as the dict entries
AA1_curate, AA2_curate, AA3_curate, AA4_curate, AA5_curate, AA6_curate, AA7_curate = [models_curation[f"AA{i}_curate"] for i in range(1, 8)]

### Use BIGG metabolite info to overwrite model metabolite infos
--> only overwrite when the BIGG info is unambiguous

In [46]:
# Overwrite model metabolite info with BIGG info if the BIGG info is unambiguous, i.e. only one charge state/formula
def overwrite_with_BIGG_metabolites(model, merged_df):
    """Overwrite metabolite charges/formulas of ``model`` in place with BIGG data.

    Parameters
    ----------
    model : cobra model
        Model whose metabolites are modified in place.
    merged_df : pandas.DataFrame
        Table with the columns ``model_id`` (metabolite id in the model),
        ``charges`` and ``formulas`` (lists of the BIGG charge states/formulas).

    Rules applied per row:

    * exactly one BIGG charge: the model charge is replaced by it;
      otherwise the existing charge is only cast to ``int``;
    * exactly one BIGG formula and at least one BIGG charge: the model formula
      is replaced, unless the BIGG formula contains the letter "X" or "R".

    The number of unbalanced reactions before and after is printed; nothing is
    returned.

    NOTE(review): the notebook text says metabolites are only overwritten when
    they are part of an unbalanced reaction, but no such filter is applied
    here - confirm which behaviour is intended.
    """
    n_unbalanced = len(check_balance(model, print_results=False))

    # Iterate by position instead of by index label, so the function also works
    # when merged_df does not carry a default 0..n-1 index (e.g. after filtering).
    rows = zip(merged_df["model_id"], merged_df["charges"], merged_df["formulas"])
    for model_id, bigg_charges, bigg_formulas in rows:
        met = model.metabolites.get_by_id(model_id)

        # check if there is only one charge state
        if len(bigg_charges) == 1:
            met.charge = int(bigg_charges[0])
        elif met.charge is not None:
            # only needed to get the right datatype (int) for the charge so the
            # model can be saved later on; a missing charge is left untouched
            met.charge = int(met.charge)

        # check if there is only one formula (and at least one charge state because otherwise the model formula could be right)
        if len(bigg_formulas) == 1 and len(bigg_charges) != 0:
            bigg_formula = bigg_formulas[0]
            # plain substring test: presumably meant to skip generic formulas with
            # X/R placeholders; it also skips any element symbol containing X or R
            if "X" not in bigg_formula and "R" not in bigg_formula:
                met.formula = bigg_formula

    n_unbalanced_update = len(check_balance(model, print_results=False))
    print(f'{model.id}: There were {n_unbalanced} unbalanced reactions before and now there are {n_unbalanced_update} after overwriting metabolite info with BIGG data.')

In [33]:
# Apply the BIGG metabolite overwrite to each curated model, paired with its own merged table
for curated_model, merged_table in (
    (AA1_curate, AA1_merged),
    (AA2_curate, AA2_merged),
    (AA3_curate, AA3_merged),
    (AA4_curate, AA4_merged),
    (AA5_curate, AA5_merged),
    (AA6_curate, AA6_merged),
    (AA7_curate, AA7_merged),
):
    overwrite_with_BIGG_metabolites(curated_model, merged_table)

AA1: There were 439 unbalanced reactions before and now there are 191 after overwriting metabolite info with BIGG data.
AA2: There were 480 unbalanced reactions before and now there are 204 after overwriting metabolite info with BIGG data.
AA3: There were 367 unbalanced reactions before and now there are 157 after overwriting metabolite info with BIGG data.
AA4: There were 413 unbalanced reactions before and now there are 166 after overwriting metabolite info with BIGG data.
AA5: There were 358 unbalanced reactions before and now there are 171 after overwriting metabolite info with BIGG data.
AA6: There were 425 unbalanced reactions before and now there are 180 after overwriting metabolite info with BIGG data.
AA7: There were 448 unbalanced reactions before and now there are 174 after overwriting metabolite info with BIGG data.


### Use BIGG reactions to overwrite model reactions info

In [48]:
def overwrite_with_BIGG_reactions(model):
    """Replace the stoichiometry of every unbalanced reaction with the BIGG equation.

    For each reaction reported by ``check_balance`` the BIGG equation is looked
    up in the global table ``df_bigg_rea`` (columns ``bigg_id`` and
    ``equation``; ``equation`` must be a mapping of metabolite id to
    coefficient). The reaction's metabolites are then removed and replaced by
    the BIGG ones. ``model`` is modified in place; nothing is returned.

    Reactions that have no entry in ``df_bigg_rea`` or whose BIGG equation uses
    a metabolite that is not in the model are left untouched and reported,
    instead of aborting the whole run with an IndexError/KeyError.
    """
    unbalanced_rxns = [r.id for r in check_balance(model, print_results=False)]

    skipped = []
    for rxn_id in unbalanced_rxns:
        bigg_equations = df_bigg_rea.loc[df_bigg_rea["bigg_id"] == rxn_id, "equation"]
        if bigg_equations.empty:
            # reaction id unknown to BIGG: nothing to overwrite with
            skipped.append(rxn_id)
            continue
        new_react = bigg_equations.iloc[0]

        # resolve all metabolites BEFORE touching the reaction, so a missing
        # metabolite cannot leave the reaction half-edited
        try:
            new_mets_dict = {model.metabolites.get_by_id(met_id): coeff for met_id, coeff in new_react.items()}
        except KeyError:
            skipped.append(rxn_id)
            continue

        reaction = model.reactions.get_by_id(rxn_id)
        reaction.subtract_metabolites(reaction.metabolites)
        reaction.add_metabolites(new_mets_dict)

    unbalanced_rxns_after = [r.id for r in check_balance(model, print_results=False)]

    if skipped:
        print(f'{model.id}: {len(skipped)} unbalanced reaction(s) could not be overwritten with BIGG data and were left unchanged: {skipped}')
    print(f'{model.id}: There were {len(unbalanced_rxns)} unbalanced reactions before and now there are {len(unbalanced_rxns_after)} after overwriting reaction info with BIGG data.')

In [49]:
# Apply the BIGG reaction overwrite to each curated model in turn
for curated_model in (
    AA1_curate,
    AA2_curate,
    AA3_curate,
    AA4_curate,
    AA5_curate,
    AA6_curate,
    AA7_curate,
):
    overwrite_with_BIGG_reactions(curated_model)

AA1: There were 191 unbalanced reactions before and now there are 186 after overwriting reaction info with BIGG data.
AA2: There were 204 unbalanced reactions before and now there are 200 after overwriting reaction info with BIGG data.
AA3: There were 157 unbalanced reactions before and now there are 152 after overwriting reaction info with BIGG data.
AA4: There were 166 unbalanced reactions before and now there are 162 after overwriting reaction info with BIGG data.
AA5: There were 171 unbalanced reactions before and now there are 165 after overwriting reaction info with BIGG data.
AA6: There were 180 unbalanced reactions before and now there are 176 after overwriting reaction info with BIGG data.
AA7: There were 174 unbalanced reactions before and now there are 170 after overwriting reaction info with BIGG data.


In [58]:
# Collect the ids of all reactions that are still unbalanced in at least one curated model
total_unique = set()
for model in models_curation.values():
    # check_balance is called without print_results=False, so it also prints the count per model
    model_unbalanced = check_balance(model)
    total_unique.update(rxn.id for rxn in model_unbalanced)
# the set only contains unbalanced reactions, so the message says so
print(f"There are in total {len(total_unique)} unique unbalanced reactions in the models")

There are 186 unbalanced reactions in AA1
There are 200 unbalanced reactions in AA2
There are 152 unbalanced reactions in AA3
There are 162 unbalanced reactions in AA4
There are 165 unbalanced reactions in AA5
There are 176 unbalanced reactions in AA6
There are 170 unbalanced reactions in AA7
There are in total 356 unique reactions in the models


In [59]:
check_balance(AA3_curate)

There are 152 unbalanced reactions in AA3


{<Reaction 23CTI1 at 0x7d4e6f1bea70>: {'H': -1.0},
 <Reaction 2AACLPGT180 at 0x7d4e6f1bf6d0>: {'charge': 2.0},
 <Reaction AACPS3 at 0x7d4e6f1c7580>: {'charge': -1.0},
 <Reaction AACPS6 at 0x7d4e6f1c7e80>: {'charge': -2.0},
 <Reaction AACPS9 at 0x7d4e6f1d1090>: {'charge': -2.0},
 <Reaction ACOAD1f at 0x7d4e6f1e1a80>: {'charge': -2.0},
 <Reaction ACOAD23f at 0x7d4e6f1e2800>: {'charge': -2.0},
 <Reaction ACOAD24f at 0x7d4e6f1e29e0>: {'charge': -2.0},
 <Reaction ACOAD26f at 0x7d4e6f1e0640>: {'charge': -2.0},
 <Reaction ACOAD27f at 0x7d4e6f1e2ce0>: {'charge': -2.0},
 <Reaction ACOAD28f at 0x7d4e6f1e2e00>: {'charge': -2.0},
 <Reaction ACOAD2f at 0x7d4e6f1e2f50>: {'charge': -2.0},
 <Reaction ACOAD34f at 0x7d4e6f1e3250>: {'charge': -2.0},
 <Reaction ACOAD3f at 0x7d4e6f1e34f0>: {'charge': -2.0},
 <Reaction ACOAD4f at 0x7d4e6f1e1c90>: {'charge': -2.0},
 <Reaction ACOAD5f at 0x7d4e6f1e3760>: {'charge': -2.0},
 <Reaction ACOAD6f at 0x7d4e6f1e3b80>: {'charge': -2.0},
 <Reaction ACOAD7f at 0x7d4e6f1

# Manual changes
The following changes were made in a manual curation process, i.e. by checking all imbalanced reactions by hand and changing mostly the charge and formula of metabolites according to databases like BIGG, SEED and MetaCyc.
Some metabolites were not overwritten with BIGG info in the previous step because BIGG lists multiple charge states/formulas for them, so now we need to decide which one fits the model best.
The first priority is to get rid of mass imbalances and the second is charge imbalances, because we definitely want a stoichiometrically consistent model.

## Functions to overwrite metabolites and reactions

In [39]:
def overwrite_charge(model, rxn_id, new_charge):
    """Set the charge of one metabolite, if the model contains it.

    Despite its name, ``rxn_id`` is looked up in ``model.metabolites``, i.e.
    it is a metabolite id. Nothing happens when the id is not in the model.
    """
    if rxn_id not in model.metabolites:
        return
    metabolite = model.metabolites.get_by_id(rxn_id)
    metabolite.charge = new_charge

In [40]:
def overwrite_formula(model, rxn_id, new_formula):
    """Set the formula of one metabolite, if the model contains it.

    Despite its name, ``rxn_id`` is looked up in ``model.metabolites``, i.e.
    it is a metabolite id. Nothing happens when the id is not in the model.
    """
    model_metabolites = model.metabolites
    if rxn_id in model_metabolites:
        target = model_metabolites.get_by_id(rxn_id)
        target.formula = new_formula

In [41]:
def overwrite_reaction(model, rxn_id, new_rxn_dict):
    """Replace the complete stoichiometry of one reaction, if the model contains it.

    Parameters
    ----------
    model : cobra model
        Model that is modified in place.
    rxn_id : str
        Id of the reaction to overwrite; nothing happens when it is not in the model.
    new_rxn_dict : dict
        New stoichiometry, mapping metabolite (id or object) to coefficient.

    Raises
    ------
    KeyError
        If ``new_rxn_dict`` names a metabolite id that is not in the model.
        The check runs before the reaction is emptied, so a failing call
        leaves the reaction unchanged instead of stripped of all metabolites.
    """
    if rxn_id not in model.reactions:
        return

    # Only string ids can be validated up front; metabolite objects are passed
    # through to add_metabolites untouched.
    missing = [met for met in new_rxn_dict if isinstance(met, str) and met not in model.metabolites]
    if missing:
        raise KeyError(f'reaction {rxn_id} in {model.id} was not overwritten because metabolite(s) {missing} are not in the model')

    rxn = model.reactions.get_by_id(rxn_id)
    rxn.subtract_metabolites(rxn.metabolites)
    rxn.add_metabolites(new_rxn_dict)

In [42]:
def delete_metabolite(model, met_id):
    """Remove a metabolite from the model, but only if no reaction uses it.

    Nothing happens when ``met_id`` is not in the model. When the metabolite
    still takes part in at least one reaction it is kept and a message listing
    those reactions is printed.
    """
    if met_id not in model.metabolites:
        return

    met = model.metabolites.get_by_id(met_id)
    if len(met.reactions) == 0:
        # Go through the model API instead of removing the entry from the
        # metabolite list directly, so the model can do its own clean-up.
        model.remove_metabolites([met])
    else:
        print(f'metabolite {met_id} cannot be deleted from {model.id} because of reaction(s): {met.reactions}')


In [43]:
def delete_reaction(model, rxn_id):
    """Remove the reaction with id ``rxn_id`` from the model; do nothing if it is absent."""
    if rxn_id not in model.reactions:
        return
    doomed = model.reactions.get_by_id(rxn_id)
    model.remove_reactions([doomed])

In [44]:
def delete_duplicate_reaction(model, rxn_old, rxn_new, gpr=False):
    """Delete ``rxn_old`` as a duplicate of ``rxn_new``, optionally merging their GPRs.

    Parameters
    ----------
    model : cobra model
        Model that is modified in place.
    rxn_old : str
        Id of the reaction to delete.
    rxn_new : str
        Id of the reaction to keep.
    gpr : bool, optional
        If true, the gene reaction rules are reconciled first: when one rule
        string is contained in the other (an empty rule is contained in every
        rule), the kept reaction gets the longer of the two, with ties keeping
        its own rule. When neither rule contains the other, nothing is changed
        and ``rxn_old`` is NOT deleted.

    Nothing happens unless both reactions are in the model.

    NOTE(review): the overlap test is a plain substring comparison of the rule
    strings, so e.g. a rule "G1" counts as contained in "G10 and G2" - confirm
    this is acceptable for the gene ids used here.
    """
    if rxn_old not in model.reactions or rxn_new not in model.reactions:
        return

    rxn_o = model.reactions.get_by_id(rxn_old)
    rxn_n = model.reactions.get_by_id(rxn_new)

    if gpr:
        rule_old = rxn_o.gene_reaction_rule
        rule_new = rxn_n.gene_reaction_rule

        if rule_new not in rule_old and rule_old not in rule_new:
            return  # if gpr's dont overlap we return and dont delete the reaction

        # keep the more complete rule on the surviving reaction
        if len(rule_old) > len(rule_new):
            rxn_n.gene_reaction_rule = rule_old

    model.remove_reactions([rxn_o])

In [45]:
def overwrite_manual(model):
    # first all changes to metabolites, i.e. charges and formulas
    # afterwards changes for reactions, i.e. changing stoichiometry, replacing/deleting metabolites (especially H)
    # last deletions (mostly reactions if duplicate but also metabolites)
    # every category is alphabetically sorted

    # first: metabolites
    overwrite_formula(model, "2ameph_p", "C2H7NO3P") # og = C2H8NO3P, charge was changed from 0 to -1 automatically with bigg and formula now also needed to be changed
    overwrite_formula(model, "2ameph_e", "C2H7NO3P")
    overwrite_formula(model, "2ameph_c", "C2H7NO3P")
    overwrite_charge(model, "2dhphaccoa_c", -4) #og = 0; according to seed https://modelseed.org/biochem/compounds/cpd16740
    overwrite_charge(model, "2mpdhl_c", -1)
    overwrite_charge(model, "23dhbzs3_c", -1) # og = 0, -1 alterative in bigg
    overwrite_formula(model, "3hsa_c", "C19H24O3")
    overwrite_charge(model, "3sala_c", -1) # og = 0
    overwrite_formula(model, "3sala_c", "C3H6NO4S")
    overwrite_formula(model, "34dhsa_c", "C19H24O4")
    overwrite_charge(model, "4cml_c", -2)
    overwrite_formula(model, "4cml_c", "C7H4O6")
    overwrite_charge(model, "4hoxpac_c", -1) # og = 0, -1 alterative in bigg
    overwrite_charge(model, "4hoxpac_e", -1)
    overwrite_charge(model, "4hoxpac_p", -1)
    overwrite_formula(model, "49dsha_c", "C19H23O6")
    overwrite_charge(model, "49dsha_c", -1)
    overwrite_charge(model, "5aizc_c", -3)
    overwrite_formula(model, "5mthf_c", "C20H24N7O6") # C20H24N7O6, https://www.genome.jp/entry/C00440
    overwrite_formula(model, "5ohhipcoa_c", "C34H50N7O19P3S")
    overwrite_charge(model, "5ohhipcoa_c", -4)
    overwrite_charge(model, "5ohhip_c", -2)
    overwrite_charge(model, "6pgg_c", -2) # og = 0, -2 alterative in bigg and in accordance with ecoli
    overwrite_formula(model, "9ohadd_c", "C19H24O3")

    overwrite_charge(model, "aad_c", -2)
    overwrite_formula(model, "abg4_c", "C12H12N2O5") # C12H11N2O5; https://biocyc.org/compound?orgid=META&id=CPD0-889
    overwrite_charge(model, "abg4_c", -2) # og = 0
    overwrite_formula(model, "abg4_e", "C12H12N2O5")
    overwrite_charge(model, "abg4_e", -2)
    overwrite_formula(model, "acadl_c", "C12H14N5O8P") # og = C12H14N5O8P; https://pubchem.ncbi.nlm.nih.gov/compound/440867 with h16 charge = 0, we have -1 charge
    overwrite_charge(model, "acadl_c", -2) # https://pubchem.ncbi.nlm.nih.gov/compound/440867 with h16 is charge = 0 and we have H14, so we need -2
    overwrite_charge(model, "ACP_c", 0)
    overwrite_charge(model, "actACP_c", -1)
    overwrite_charge(model, "acysbmn_e", -1) # og = 0, -1 according to metacyc with same formula https://metacyc.org/compound?orgid=META&id=CPD1G-185
    overwrite_charge(model, "acysbmn_c", -1) # og = 0, -1
    overwrite_charge(model, "ah6p__D_c", -2) # og = 0, -2 must be because f6p is also -2 and they can directly converted into each other
    overwrite_charge(model, "air_c", -2)
    overwrite_charge(model, "amacald_c", 1)
    overwrite_formula(model, "amacald_c", "C2H6NO")
    overwrite_formula(model, "andrs14dn317dn_c", "C19H24O2")
    overwrite_formula(model, "apoACP_c", "C373H582N94O136S2") # og = C373H583N94O136S2; charge was changed from 1 to 0 and now the amount of H also reflects that
    overwrite_charge(model, "aso3_c", -1)
    overwrite_charge(model, "aso3_e", -1)
    overwrite_charge(model, "aso3_p", -1)
    overwrite_formula(model, "aso3_c", "H2O3As")
    overwrite_formula(model, "aso3_e", "H2O3As")
    overwrite_formula(model, "aso3_p", "H2O3As")
    overwrite_formula(model, "aso4_c", "HO4As")
    overwrite_formula(model, "aso4_e", "HO4As")
    overwrite_formula(model, "aso4_p", "HO4As")

    overwrite_charge(model, "bmn_c", 2) # og = 0, to balance BMNMSHS (bmn is just imported for this reaction)
    overwrite_charge(model, "bmn_e", 2)
    overwrite_charge(model, "but2eACP_c", -1)

    overwrite_charge(model, "CCbuttc_c", -3) # og = -3; to balance reaction with 4cml_c
    overwrite_formula(model, "CCbuttc_c", "C7H3O6") # C7H3O6; to reflect charge change
    overwrite_formula(model, "cchol_c", "C27H42O3")
    overwrite_charge(model, "cdigmp_c", -2)
    overwrite_formula(model, "cholc3coa_c", "C43H66N7O18P3S")
    overwrite_formula(model, "cholc5coa_c", "C45H70N7O18P3S")
    overwrite_formula(model, "cholc8coa_c", "C48H76N7O18P3S")
    overwrite_formula(model, "cholenec3coa_c", "C43H64N7O18P3S")
    overwrite_formula(model, "cholenec5coa_c", "C45H68N7O18P3S")
    overwrite_formula(model, "cholenec8coa_c", "C48H74N7O18P3S")

    overwrite_formula(model, "decoa_c", "C31H48N7O17P3S")
    overwrite_charge(model, "dgal6p_c", -2)
    overwrite_charge(model, "dmlgnc_c", -1) # og = 0; https://modelseed.org/biochem/compounds/cpd15951
    overwrite_charge(model, "dtbt_c", -1)

    overwrite_charge(model, "fad_c", -2)
    overwrite_charge(model, "fad_e", -2)
    overwrite_charge(model, "fad_p", -2)
    overwrite_charge(model, "fe3dhbzs3_c", 3) # og=0, alternative in bigg
    overwrite_formula(model, "fe3dhbzs3_c", "C30FeH29N3O16") # og = C30FeH28N3O16, with H29 is in bigg and ecoli
    overwrite_formula(model, "fe3dhbzs3_e", "C30FeH29N3O16")
    overwrite_formula(model, "fe3dhbzs3_p", "C30FeH29N3O16")
    overwrite_formula(model, "feoxam_c", "C25H46FeN6O8") # formula change according to bigg and ecoli
    overwrite_formula(model, "feoxam_e", "C25H46FeN6O8")
    overwrite_formula(model, "feoxam_p", "C25H46FeN6O8")
    overwrite_charge(model, "ficytc_c", 1)
    overwrite_charge(model, "fmcbtt_c", 2) # og = 0 but it has fe2 in it
    overwrite_charge(model, "fmn_c", -2)
    overwrite_formula(model, "fmn_c", "C17H19N4O9P")
    overwrite_charge(model, "fmn_e", -2)
    overwrite_formula(model, "fmn_e", "C17H19N4O9P")
    overwrite_charge(model, "fmn_p", -2)
    overwrite_formula(model, "fmn_p", "C17H19N4O9P")
    overwrite_charge(model, "focytc_c", 1)
    overwrite_charge(model, "fpram_c", -1)
    overwrite_formula(model, "fpram_c", "C8H15N3O8P")

    overwrite_charge(model, "g3p_c", -2)
    overwrite_charge(model, "g6p_A_c", -2)
    overwrite_formula(model, "galam6p_c", "C6H13NO8P") # https://biocyc.org/compound?orgid=META&id=D-GALACTOSAMINE-6-PHOSPHATE
    overwrite_charge(model, "galam6p_c", -1)
    overwrite_charge(model, "galam_p", +1) #og = 0 https://metacyc.org/compound?orgid=META&id=GALACTOSAMINE
    overwrite_formula(model, "galam_p", "C6H14NO5")
    overwrite_charge(model, "galam_e", +1)
    overwrite_formula(model, "galam_e", "C6H14NO5")
    overwrite_charge(model, "gcvHL_ADPr_c", -1)
    overwrite_formula(model, "gcvHL_ADPr_c", "C23H36N6O21P4S2")
    overwrite_charge(model, "gcvHL_nhLA_c", 0)
    overwrite_formula(model, "gcvHL_nhLA_c", "C8H16NO8P2S2")
    overwrite_charge(model, "gdptp_c", -7)
    overwrite_charge(model, "glutrna_c", -3)
    overwrite_formula(model, "glycogen_c", "C6H10O5")
    overwrite_charge(model, "gly_pro__L_c", 1)
    overwrite_formula(model, "gly_pro__L_c", "C7H13N2O3")
    overwrite_charge(model, "gly_pro__L_e", 1)
    overwrite_formula(model, "gly_pro__L_e", "C7H13N2O3")
    overwrite_formula(model, "gly_tyr_c", "C11H14N2O4")
    overwrite_formula(model, "gly_phe_c", "C11H14N2O3")
    overwrite_formula(model, "gly_leu_c", "C8H16N2O3")
    overwrite_formula(model, "gly_cys_c", "C5H10N2O3S")

    overwrite_formula(model, "hchol_c", "C27H44O2")
    overwrite_formula(model, "hcholc8coa_c", "C48H76N7O19P3S")
    overwrite_formula(model, "hcholc5coa_c", "C45H70N7O19P3S")
    overwrite_formula(model, "hcholc3coa_c", "C43H66N7O19P3S")
    overwrite_charge(model, "hethmpp_c", -2)
    overwrite_charge(model, "hemeO_c", -2)
    overwrite_formula(model, "hia_c", "C11H16O4")
    overwrite_formula(model, "hia_e", "C11H16O4")
    overwrite_formula(model, "hip_c", "C13H17O4")
    overwrite_charge(model, "hip_c", -1)
    overwrite_formula(model, "hipcoa_c", "C34H48N7O19P3S")
    overwrite_charge(model, "hipcoa_c", -4)
    overwrite_formula(model, "hipecoa_c", "C34H48N7O19P3S")
    overwrite_charge(model, "hipecoa_c", -4)
    overwrite_formula(model, "hipohcoa_c", "C34H50N7O20P3S")
    overwrite_charge(model, "hipohcoa_c", -4)
    overwrite_formula(model, "hipocoa_c", "C34H48N7O20P3S")
    overwrite_charge(model, "hipocoa_c", -4)
    overwrite_charge(model, "hmbpp_c", -4) # pubchem C5H12O8P2 with charge 0; model=C5H8O8P2, so charge must be -4

    overwrite_charge(model, "istfrnA_e", -2)
    overwrite_formula(model, "istfrnA_e", "C17FeH19N2O14")
    overwrite_charge(model, "istfrnB_e", +1)
    overwrite_formula(model, "istfrnB_e", "C16FeH22N2O11")

    overwrite_charge(model, "lysglugly_c", 0)
    overwrite_charge(model, "lysglugly_e", 0)

    overwrite_charge(model, "man6pglyc_c", -3) # og = 0; alternative in bigg and in accordance with ecoli
    overwrite_charge(model, "mbhn_c", -1) # og = 0; https://modelseed.org/biochem/compounds/cpd15971
    overwrite_formula(model, "mcbtt_c", "C47H77N5O10") # was wrongly overwritten by a false bigg formula = [C43H71N5O10], metacyc also has the original one that was in the model
    overwrite_charge(model, "mcbtt_c", 0)
    overwrite_charge(model, "met_L_ala__L_c", -1)
    overwrite_charge(model, "met_L_ala__L_e", -1)
    overwrite_formula(model, "met_L_ala__L_c", "C8H15N2O3S")
    overwrite_formula(model, "met_L_ala__L_e", "C8H15N2O3S")
    overwrite_charge(model, "mhpglu_c", -4)
    overwrite_charge(model, "mi3p__D_c", -2) # og = 0, -2 according to bigg

    overwrite_formula(model, "Nforglu_c", "C6H7NO5")

    overwrite_charge(model, "ocACP_c", 0) # og = -1, 0 alterative in bigg and is in accordance with charge = 0 of ACP
    overwrite_formula(model, "ochol_c", "C27H42O2")
    overwrite_formula(model, "ocholc8coa_c", "C48H74N7O19P3S")
    overwrite_formula(model, "ocholc5coa_c", "C45H68N7O19P3S")
    overwrite_charge(model, "ocdcaACP_c", 0) # og = -1, 0 alterative in bigg and is in accordance with charge = 0 of ACP

    overwrite_charge(model, "peng_p", -1) # og = 0, https://modelseed.org/biochem/compounds/cpd03292
    overwrite_formula(model, "peng_p", "C16H17N2O4S") # og = C16H18N2O4S,https://modelseed.org/biochem/compounds/cpd03292
    overwrite_charge(model, "peng_e", -1)
    overwrite_formula(model, "peng_e", "C16H17N2O4S")
    overwrite_charge(model, "phdcacoa_c", -4) # og=0 but its coa
    overwrite_charge(model, "phdca_c", -1) # og = 0 but https://modelseed.org/biochem/compounds/cpd16013
    overwrite_charge(model, "phdca_e", -1)
    overwrite_charge(model, "ppad_c", -2)
    overwrite_charge(model, "ptd1ino160_c", -1)
    overwrite_charge(model, "pqqh2_c", -3)
    overwrite_charge(model, "pqqh2_p", -3)
    overwrite_charge(model, "ppgpp_c", -6)
    overwrite_charge(model, "prepphth_c", -1) # og = 0; https://modelseed.org/biochem/compounds/cpd16028
    overwrite_charge(model, "prohisglu_c", -1) # og = -2; tripeptid pro-his-glu, only glu has -1 charge and other two are neutral
    overwrite_charge(model, "prohisglu_e", -1)

    overwrite_formula(model, "ribflv_c", "C17H20N4O6")
    overwrite_formula(model, "ribflv_e", "C17H20N4O6")

    # Salmochelin fixes (there are first fixes by Frowin in the apply mass balance function notebook)
    overwrite_formula(model, "salchsx_c", "C16H20NO11") # og C16H21NO11; https://pubchem.ncbi.nlm.nih.gov/compound/135397946
    overwrite_formula(model, "salchsx_e", "C16H20NO11")
    overwrite_formula(model, "salchsx_p", "C16H20NO11")
    overwrite_charge(model, "salchs2fe_c", 3) # to match salchs4fe
    overwrite_charge(model, "salchs2fe_p", 3)
    overwrite_charge(model, "salchs2fe_e", 3)
    #----- more S
    overwrite_charge(model, "salc_e", -1) # og = 0, -1 alterative in bigg
    overwrite_charge(model, "salc_c", -1)
    overwrite_charge(model, "scl_c", -7) # og = 0, -7 according to bigg
    overwrite_charge(model, "scys__L_c", -1)
    overwrite_charge(model, "ssaltpp_c", -3) # og = 0; 0 is not in bigg only -3 or -2
    overwrite_charge(model, "stfrnA_e", -5)
    overwrite_formula(model, "stfrnA_e", "C17H19N2O14")
    overwrite_charge(model, "stfrnA_c", -5)
    overwrite_formula(model, "stfrnA_c", "C17H19N2O14")
    overwrite_charge(model, "stfrnB_e", -2)
    overwrite_formula(model, "stfrnB_e", "C16H22N2O11")
    overwrite_charge(model, "stfrnB_c", -2)
    overwrite_formula(model, "stfrnB_c", "C16H22N2O11")

    overwrite_charge(model, "tag6p__D_c", -2)
    overwrite_charge(model, "tagdp__D_c", -4)
    overwrite_charge(model, "tamocta_c", -1) # og = 0; https://modelseed.org/biochem/compounds/cpd16038
    overwrite_charge(model, "tmhexc_c", -1) # og = 0; https://modelseed.org/biochem/compounds/cpd16050

    overwrite_charge(model, "udpacgal_c", -2) # og = 0, -2 alterative in bigg
    overwrite_charge(model, "udpacgal_p", -2) # og = 0; -2 alternative in bigg and in accordance with ecoli
    overwrite_charge(model, "udpacgal_e", -2)

    overwrite_charge(model, "vacc_c", -1)
    overwrite_charge(model, "vacc_p", -1)
    overwrite_charge(model, "vacc_e", -1)

    overwrite_charge(model, "xylan4_c", -1) # og = 0, no charge given in Bigg, but -1 fits equations
    overwrite_charge(model, "xylan4_e", -1)


    # second: reactions
    overwrite_reaction(model, "3HPAOX", # H was removed from this reaction
                       {"3hoxpac_c": -1.0,
                        "nadh_c": -1.0,
                        "o2_c": -1.0,
                        "34dhpha_c": 1.0,
                        "h2o_c": 1.0,
                        "nad_c": 1.0})

    overwrite_reaction(model, "3SALATAi", # this and ASPA2 are duplicate reactions (only differencs is an H), reaction was curated according to metacyc; https://biocyc.org/reaction?orgid=META&id=3-SULFINOALANINE-AMINOTRANSFERASE-RXN
                       {"3sala_c": -1.0,
                        "akg_c": -1.0,
                        "3snpyr_c": 1.0,
                        "glu__L_c": 1.0})

    overwrite_reaction(model, "ACOAM", # H was removed
                       {"ac_c": -1.0,
                        "atp_c": -1.0,
                        "acadl_c": 1.0,
                        "ppi_c": 1.0})

    overwrite_reaction(model, "ACPS1",
                       {"apoACP_c": -1.0,
                        "coa_c": -1.0,
                        "ACP_c": 1.0,
                        "pap_c": 1.0})

    overwrite_reaction(model, "ACPpds",
                       {"ACP_c": -1.0,
                        "h2o_c": -1.0,
                        "apoACP_c": 1.0,
                        "h_c": 2.0,
                        "pan4p_c": 1.0})

    overwrite_reaction(model, "ALDD31_1",
                       {"gly_c": 1.0,
                        "h_c": 2.0,
                        "h2o_c": -1.0,
                        "nad_c": -1.0,
                        "nadh_c": 1.0,
                        "amacald_c": -1})

    overwrite_reaction(model, "ASR",
                       {"aso4_c": -1.0,
                        "gthrd_c": -2.0,
                        "h_c": -1.0,
                        "aso3_c": 1.0,
                        "gthox_c": 1.0,
                        "h2o_c": 1.0})

    # https://biocyc.org/reaction?orgid=META&id=RXN-10737
    overwrite_reaction(model, "ASR2",
                       {"aso4_c": -1.0,
                        "trdrd_c": -1.0,
                        "h_c": -1.0,
                        "aso3_c": 1.0,
                        "h2o_c": 1.0,
                        "trdox_c": 1.0})

    # was overwritten by BIGG, but before that the H was in the reaction and equals also this reaction BKDC that is e.g. in AA1
    overwrite_reaction(model, "AT_MBD2",
                       {"dhlam_c": -1.0,
                        "ibcoa_c": -1.0,
                        "2mpdhl_c": 1.0,
                        "coa_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "BEF",
                       {"betald_c": -1.0,
                        "fad_c": -1.0,
                        "h2o_c": -1.0,
                        "fadh2_c": 1.0,
                        "glyb_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "CMLDC", # https://modelseed.org/biochem/reactions/rxn02483
                       {"4cml_c": -1.0,
                        "h_c": -1.0, # h changed from product to educt site
                        "5odhf2a_c": 1.0,
                        "co2_c": 1.0})
    if "4CMLCL_kt" not in model.reactions and "CMLDC" in model.reactions:
        rxn = model.reactions.get_by_id("CMLDC")
        rxn.id = "4CMLCL_kt"

    overwrite_reaction(model, "DACL", # https://biocyc.org/reaction?orgid=META&id=RXN0-5040 H was removed
                       {"abg4_c": -1.0,
                        "h2o_c": -1.0,
                        "4abz_c": 1.0,
                        "glu__D_c": 1.0})

    overwrite_reaction(model, "DHBZS2H",
                       {"23dhbzs2_c": -1.0,
                        "h2o_c": -1.0,
                        "h_c": 2.0, # new because of logic
                        "23dhbzs_c": 2.0})

     # https://biocyc.org/reaction?orgid=META&id=RXN-14477
    overwrite_reaction(model, "ENTERH",
                       {"enter_c": -1.0,
                        "h2o_c": -1.0,
                        "23dhbzs3_c": 1.0,
                        "h_c": 1.0}) # h was added

    overwrite_reaction(model, "FADD3",
                       {"atp_c": -1.0,
                        "coa_c": -1.0,
                        "hip_c": -1.0,
                        "hipcoa_c": 1.0,
                        "ppi_c": 1.0,
                        "amp_c": 1.0})

    overwrite_reaction(model, "FE3DHBZS3R",
                       {"fe3dhbzs3_c": -2.0,
                        "nadph_c": -1.0,
                        "23dhbzs3_c": 2.0,
                        "fe2_c": 2.0,
                        "h_c": 3.0,
                        "nadp_c": 1.0})

    overwrite_reaction(model, "FEDHBZS3R1",
                       {"fe3dhbzs3_c": -2.0,
                        "fadh2_c": -1.0,
                        "23dhbzs3_c": 2.0,
                        "fe2_c": 2.0,
                        "h_c": 4.0,
                        "fad_c": 1.0})

    overwrite_reaction(model, "FEDHBZS3R2",
                       {"fe3dhbzs3_c": -2.0,
                        "fmnh2_c": -1.0,
                        "23dhbzs3_c": 2.0,
                        "fe2_c": 2.0,
                        "h_c": 4.0,
                        "fmn_c": 1.0})

    overwrite_reaction(model, "FEDHBZS3R3",
                       {"fe3dhbzs3_c": -2.0,
                        "rbflvrd_c": -1.0,
                        "23dhbzs3_c": 2.0,
                        "fe2_c": 2.0,
                        "h_c": 4.0,
                        "ribflv_c": 1.0})

    overwrite_reaction(model, "FNOR",
                       {"fdxrd_c": -2.0,
                        "h_c": -1.0,
                        "nadp_c": -1.0,
                        "fdxox_c": 2.0, # replaces fdxo_2_2_c
                        "nadph_c": 1.0})

    overwrite_reaction(model, "FORGLUIH2",
                       {"forglu_c": -1.0,
                        "h2o_c": -1.0,
                        "Nforglu_c": 1.0,
                        "nh4_c": 1.0})

    # https://biocyc.org/reaction?orgid=META&id=1.18.1.2-RXN change of stoichiometry
    overwrite_reaction(model, "FPRA",
                       {"fdxrd_c": -2.0,
                        "h_c": -1.0,
                        "nadp_c": -1.0,
                        "fdxox_c": 2.0,
                        "nadph_c": 1.0})

    # https://modelseed.org/biochem/reactions/rxn28276 (immer noch charge imbalance, aber mass stimmt)
    overwrite_reaction(model, "GCDH",
                       {"glutcoa_c": -1.0,
                        "b2coa_c": 1.0,
                        "h_c": 1.0,
                        "co2_c": 1.0})

    # https://metacyc.org/reaction?orgid=META&id=GLUTAMATE-SYNTHASE-FERREDOXIN-RXN#
    overwrite_reaction(model, "GLMS_syn",
                       {"fdxrd_c": -2.0,
                        "akg_c": -1.0,
                        "gln__L_c": -1.0,
                        "h_c": -2.0,
                        "glu__L_c": 2.0,
                        "fdxox_c": 2.0}) # replaces fdxo_2_2_c because we need +2 charge

    overwrite_reaction(model, "GLUTRS_3",
                       {"atp_c": -1.0,
                        "glu__L_c": -1.0,
                        "trnaglu_c": -1.0,
                        "amp_c": 1.0,
                        "glutrna_c": 1.0,
                        "ppi_c": 1.0})

    overwrite_reaction(model, "GLYCS_I",
                       {"gthrd_c": -1.0,
                        "mthgxl_c": -1.0,
                        "lgt__S_c": 1.0}) # og = lgt_s_c; they are duplicates

    overwrite_reaction(model, "GLYCS_II",
                       {"h2o_c": -1.0,
                        "lgt__S_c": -1.0, # og = lgt_s_c; they are duplicates
                        "gthrd_c": 1.0,
                        "h_c": 1.0,
                        "lac__L_c": 1.0})

    overwrite_reaction(model, "GLYTYRabc",
                       {"atp_c": -1.0,
                        "gly_tyr_e": -1.0,
                        "h2o_c": -1.0,
                        "adp_c": 1.0,
                        "gly_tyr_c": 1.0,
                        "pi_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "GLYLEUtr",
                       {"atp_c": -1.0,
                        "gly_leu_e": -1.0,
                        "h2o_c": -1.0,
                        "adp_c": 1.0,
                        "gly_leu_c": 1.0,
                        "pi_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "GLYPHEtr",
                       {"atp_c": -1.0,
                        "gly_phe_e": -1.0,
                        "h2o_c": -1.0,
                        "adp_c": 1.0,
                        "gly_phe_c": 1.0,
                        "pi_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "GLYCYSabc",
                       {"atp_c": -1.0,
                        "gly_cys_e": -1.0,
                        "h2o_c": -1.0,
                        "adp_c": 1.0,
                        "gly_cys_c": 1.0,
                        "pi_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "GTPDPK_1",
                       {"atp_c": -1.0,
                        "gtp_c": -1.0,
                        "amp_c": 1.0,
                        "gdptp_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "HIPA",
                       {"hip_c": -1.0,
                        "nadh_c": -1.0,
                        "5ohhip_c": 1.0,
                        "nad_c": 1.0})

    overwrite_reaction(model, "HSAC",
                       {"34dhsa_c": -1.0,
                        "o2_c": -1.0,
                        "49dsha_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "IPDAB",
                       {"5ohhip_c": -1.0,
                        "atp_c": -1.0,
                        "coa_c": -1.0,
                        "h_c": -1.0,
                        "5ohhipcoa_c": 1.0,
                        "amp_c": 1.0,
                        "ppi_c": 1.0})

    overwrite_reaction(model, "MECDPDH3_syn",
                       {"2mecdp_c": -1.0,
                        "fdxrd_c": -2.0,
                        "h_c": -1.0,
                        "fdxox_c": 2.0, # replaces fdxo_2_2_c
                        "h2mb4p_c": 1.0,
                        "h2o_c": 1.0})
    overwrite_reaction(model, "MECDPDH4E", # AA3 only has this reaction but not MECDPDH3_syn which in other models are duplicates; to be able to better compare between models, i am going to change the name of the reaction
                       {"2mecdp_c": -1.0,
                        "fdxrd_c": -2.0,
                        "h_c": -1.0,
                        "fdxox_c": 2.0, # replaces fdxo_2_2_c
                        "h2mb4p_c": 1.0,
                        "h2o_c": 1.0})
    if "MECDPDH3_syn" not in model.reactions and "MECDPDH4E" in model.reactions:
        rxn = model.reactions.get_by_id("MECDPDH4E")
        rxn.id = "MECDPDH3_syn"


    overwrite_reaction(model, "MRF", # https://www.genome.jp/entry/R01217, iron complexes were doubled in stoichiometry according to kegg; H was changed in accordance to mass/charge balance and out formulas/charges mlthf/5mthf are diff from kegg
                       {"fdxrd_c": -2.0,
                        "h_c": -3.0,
                        "mlthf_c": -1.0,
                        "5mthf_c": 1.0,
                        "fdxox_c": 2.0})

    overwrite_reaction(model, "MS_1",
                       {"hcys__L_c": -1.0,
                        "mhpglu_c": -1.0,
                        "hpglu_c": 1.0,
                        "met__L_c": 1.0})

    # there is still charge imbalance with his reaction but the fix gets rid of the mass inbalance
    overwrite_reaction(model, "NMO",
               {"etha_c": -1.0,
                "fmnh2_c": -1.0,
                "o2_c": -1.0,
                "acald_c": 1.0,
                "fmn_c": 1.0,
                "no2_c": 1.0,
                "h_c": 6.0  # this was og reaction but was overwritten with bigg info (H was lost)
                })

    overwrite_reaction(model, "OOR3r", # https://biocyc.org/reaction?orgid=META&id=2-OXOGLUTARATE-SYNTHASE-RXN; https://www.genome.jp/dbget-bin/www_bget?ec:1.2.7.3 EC number was given on BIGG page for that reaction but bigg was a bit off
                       {"akg_c": -1.0,
                        "coa_c": -1.0,
                        "fdxox_c": -2.0,
                        "succoa_c": 1.0,
                        "co2_c": 1.0,
                        "h_c": 1.0,
                        "fdxrd_c": 2.0})

    overwrite_reaction(model, "PACPT_1",
                       {"amp_c": 1.0,
                        "coa_c": -1.0,
                        "ppcoa_c": 1.0,
                        "ppad_c": -1.0})

    # https://modelseed.org/biochem/reactions/rxn13395 out scl is only -7, in seed it is -8, so we need only one H
    overwrite_reaction(model, "PC2DHG",
                       {"dscl_c": -1.0,
                        "nadp_c": -1.0,
                        "nadph_c": 1.0,
                        "scl_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "PCADYOX", # https://modelseed.org/biochem/reactions/rxn01192
                       {"34dhbz_c": -1.0,
                        "o2_c": -1.0,
                        "CCbuttc_c": 1.0,
                        "h_c": 2.0}) # was added

    overwrite_reaction(model, "POR_syn",
                       {"fdxox_c": -2.0, # replaces fdxo_2_2_c
                        "coa_c": -1.0,
                        "pyr_c": -1.0,
                        "accoa_c": 1.0,
                        "co2_c": 1.0,
                        "h_c": 1.0,
                        "fdxrd_c": 2.0})

    overwrite_reaction(model, "PRAIS",
                       {"atp_c": -1.0,
                        "fpram_c": -1.0,
                        "adp_c": 1.0,
                        "air_c": 1.0,
                        "pi_c": 1.0,
                        "h_c": 2.0})

    overwrite_reaction(model, "QSDH",
                       {"pqq_c": -1.0,
                        "skm_c": -1.0,
                        "3dhsk_c": 1.0,
                        "pqqh2_c": 1.0})

    # Salmochelin fixes (there are first fixes by Frowin in the apply mass balance function notebook)
    overwrite_reaction(model, "SALCHS1H",
                       {"h2o_c": -1.0,
                        "salchs1_c": -1.0,
                        "23dhbzs_c": 1.0,
                        "salchsx_c": 1.0,
                        "h_c": 2.0})
    overwrite_reaction(model, "SALCHS2H",
                       {"h2o_c": -1.0,
                        "salchs2_c": -1.0,
                        "salchs1_c": 1.0,
                        "salchsx_c": 1.0,
                        "h_c": 1.0})

    overwrite_reaction(model, "SMIA1",
                       {"fe3_e": -1.0,
                        "stfrnA_e": -1.0,
                        "istfrnA_e": 1.0})

    overwrite_reaction(model, "SMIA1abc",
                       {"atp_c": -1.0,
                        "h2o_c": -1.0,
                        "istfrnB_e": -1.0,
                        "adp_c": 1.0,
                        "fe3_c": 1.0,
                        "h_c": 1.0,
                        "pi_c": 1.0,
                        "stfrnB_c": 1.0})

    overwrite_reaction(model, "SMIA2abc",
                       {"atp_c": -1.0,
                        "h2o_c": -1.0,
                        "istfrnA_e": -1.0,
                        "adp_c": 1.0,
                        "fe3_c": 1.0,
                        "h_c": 1.0,
                        "pi_c": 1.0,
                        "stfrnA_c": 1.0})

    overwrite_reaction(model, "SMIB1",
                       {"fe3_e": -1.0,
                        "stfrnB_e": -1.0,
                        "istfrnB_e": 1.0})

    overwrite_reaction(model, "STAS",
                       {"atp_c": -2.0,
                        "cit_c": -2.0,
                        "orn_c": -1.0,
                        "amp_c": 2.0,
                        "ppi_c": 2.0,
                        "stfrnA_c": 1.0,
                        "h_c": 2.0}) # H changed sides, similar to here https://biocyc.org/reaction?orgid=META&id=RXN-19521 but reaction is still different

    overwrite_reaction(model, "T6PK",
                       {"atp_c": -1.0,
                        "tag6p__D_c": -1.0,
                        "adp_c": 1.0,
                        "tagdp__D_c": 1.0,
                        "h_c": 1.0})

    # https://modelseed.org/biochem/reactions/rxn10816
    overwrite_reaction(model, "THZSN_1",
                       {"cys__L_c": -1.0,
                        "dxyl_c": -1.0,
                        "fdxox_c": -1.0, # instead of fdx_2_2_c
                        "tyr__L_c": -1.0,
                        "4hba_c": 1.0,
                        "4mhetz_c": 1.0,
                        "co2_c": 1.0,
                        "fdxrd_c": 1.0,
                        "h2o_c": 1.0,
                        "h_c": 2.0, # 2 instead of 1;
                        "nh4_c": 1.0,
                        "pyr_c": 1.0})

    # deletions
    # delete_duplicate_reaction(model, "ACPS1_1", "ACPS1", gpr=True) # cannot delete because gprs dont overlap
    # delete_duplicate_reaction(model, "ASPA2", "3SALATAi", gpr=True) # cannot delete because gprs dont overlap
    delete_duplicate_reaction(model, "CMLDC", "4CMLCL_kt", gpr=True)
    delete_duplicate_reaction(model, "CO2FO", "COCO2", gpr=True)
    delete_duplicate_reaction(model, "FMNAT_1", "FMNAT") # same gpr
    delete_duplicate_reaction(model, "GAPD_1", "GAPD", gpr=True) # GAPD_1 gpr is in GAPD
    delete_duplicate_reaction(model, "GTPDPK_1", "GTPDPK", gpr=True) # depending on model, gpr's match
    delete_duplicate_reaction(model, "HMEDS", "MECDPDH3_syn") # both have no gpr
    delete_duplicate_reaction(model, "MECDPDH4E", "MECDPDH3_syn") # same gpr
    delete_duplicate_reaction(model, "PRFGCL", "PRAIS", gpr=True) # same gpr
    delete_duplicate_reaction(model, "RBFK_1", "RBFK") # same gpr

    if "PRFGS" in model.reactions and "PRFGS_1" in model.reactions: # gprs overlap but both have something unique thats why special command for that
        model.reactions.get_by_id("PRFGS").gene_reaction_rule = model.reactions.get_by_id("PRFGS").gene_reaction_rule + " or (WP_079211777_1 and WP_079211778_1)"
        delete_reaction(model, "PRFGS_1")

    # all three metabolites are only part of this reaction
    # PENAM: h2o_p + peng_p <=> 6apa_p + pac_p
    if "peng_p" in model.metabolites and "6apa_p" in model.metabolites and "pac_p" in model.metabolites:
        if len(model.metabolites.get_by_id("peng_p").reactions) == 1 and len(model.metabolites.get_by_id("6apa_p").reactions) == 1 and len(model.metabolites.get_by_id("pac_p").reactions) == 1:
            delete_reaction(model, "PENAM")
            delete_metabolite(model, "peng_p")
            delete_metabolite(model, "6apa_p")
            delete_metabolite(model, "pac_p")

    delete_metabolite(model, "fdxo_2_2_c")
    delete_metabolite(model, "lgt_s_c")

## Overwrite

In [46]:
# Apply the manual metabolite/reaction curation to each strain model, AA1 through AA7, in order.
for _curated_model in (
    AA1_curate,
    AA2_curate,
    AA3_curate,
    AA4_curate,
    AA5_curate,
    AA6_curate,
    AA7_curate,
):
    overwrite_manual(_curated_model)

metabolite peng_p cannot be deleted from AA4 because of reaction(s): frozenset({<Reaction PENGt1 at 0x7afdbcfe6200>})
metabolite 6apa_p cannot be deleted from AA4 because of reaction(s): frozenset({<Reaction 6APAt1 at 0x7afdb2c57850>})
metabolite pac_p cannot be deleted from AA4 because of reaction(s): frozenset({<Reaction PACt1 at 0x7afdbd0b6110>})


In [103]:
check_balance(AA1_curate)

There are 2 unbalanced reactions in AA1


{<Reaction AMMQT8 at 0x7239232ee4d0>: {'charge': -2.0},
 <Reaction NMO at 0x72392e0151b0>: {'charge': 4.0}}

In [104]:
check_balance(AA2_curate)

There are 4 unbalanced reactions in AA2


{<Reaction AMMQT8 at 0x72391d4b9300>: {'charge': -2.0},
 <Reaction DHNAOT at 0x72391af343a0>: {'charge': 2.0},
 <Reaction GCDH at 0x72391b727dc0>: {'charge': 2.0},
 <Reaction THZSN_1 at 0x723919ae6260>: {'charge': 1.0}}

In [110]:
check_balance(AA3_curate)
# AGPATr_BS and G3POA_BS are not actually imbalanced; because these reactions contain very large metabolites, the tiny residuals reported for them are floating-point rounding errors

There are 10 unbalanced reactions in AA3


{<Reaction AGPATr_BS at 0x723922631de0>: {'charge': 1.6653345369377348e-16,
  'C': -2.220446049250313e-15,
  'H': -5.329070518200751e-15,
  'N': -1.7486012637846216e-15,
  'O': -1.5543122344752192e-15,
  'P': 1.942890293094024e-16,
  'S': -4.163336342344337e-17},
 <Reaction DHNAOT at 0x723922de7430>: {'charge': 2.0},
 <Reaction G3POA_BS at 0x72392205b220>: {'charge': 1.6653345369377348e-16,
  'C': -2.220446049250313e-15,
  'H': -5.329070518200751e-15,
  'N': -1.7486012637846216e-15,
  'O': 2.220446049250313e-16,
  'P': -2.498001805406602e-16,
  'S': -4.163336342344337e-17},
 <Reaction HDECH at 0x723922fe5a20>: {'charge': 2.0},
 <Reaction LIPO1S24_BS at 0x723922482710>: {'charge': 2160.0},
 <Reaction LIPO2S24_BS at 0x723922482980>: {'charge': 2160.0},
 <Reaction LIPO3S24_BS at 0x723922481de0>: {'charge': 2160.0},
 <Reaction LIPO4S24_BS at 0x723922482c80>: {'charge': 2160.0},
 <Reaction MECDPDH at 0x72392302d900>: {'charge': -2.0},
 <Reaction NMO at 0x7239231be320>: {'charge': 4.0}}

In [106]:
check_balance(AA4_curate)

There are 1 unbalanced reactions in AA4


{<Reaction GCDH at 0x72392ca13e20>: {'charge': 2.0}}

In [107]:
check_balance(AA5_curate)

There are 2 unbalanced reactions in AA5


{<Reaction HDECH at 0x72391fd5ddb0>: {'charge': 2.0},
 <Reaction NMO at 0x7239205abe20>: {'charge': 4.0}}

In [108]:
check_balance(AA6_curate)

There are 4 unbalanced reactions in AA6


{<Reaction AMMQT8 at 0x7239186e19c0>: {'charge': -2.0},
 <Reaction DHNAOT at 0x723917f8f820>: {'charge': 2.0},
 <Reaction GCDH at 0x723918076710>: {'charge': 2.0},
 <Reaction NMO at 0x7239187d0e50>: {'charge': 4.0}}

In [48]:
check_balance(AA7_curate)

There are 2 unbalanced reactions in AA7


{<Reaction DHNAOT at 0x7afdac171330>: {'charge': 2.0},
 <Reaction NMO at 0x7afdaafd8a00>: {'charge': 4.0}}

In [49]:
# Write every curated model to an SBML file so that a memote report can be run on it.
for model_name in models_curation:
    model = models_curation[model_name]
    # Only the first three characters of the dictionary key go into the file name.
    path = f"/home/lisa/Dokumente/Programmierung/Models/08_charge_balanced/{model_name[:3]}_curated.xml"
    write_sbml_model(model, path)

## Scale Down Teichoic Acids

In [113]:
# AA3 has some problems with teichoic acids reactions
# i.e. LIPO1S24_BS, LIPO2S24_BS, LIPO3S24_BS, LIPO4S24_BS
check_mass_balance(AA3_curate)

{<Reaction AGPATr_BS at 0x723922631de0>: {'charge': 1.6653345369377348e-16,
  'C': -2.220446049250313e-15,
  'H': -5.329070518200751e-15,
  'N': -1.7486012637846216e-15,
  'O': -1.5543122344752192e-15,
  'P': 1.942890293094024e-16,
  'S': -4.163336342344337e-17},
 <Reaction DHNAOT at 0x723922de7430>: {'charge': 2.0},
 <Reaction G3POA_BS at 0x72392205b220>: {'charge': 1.6653345369377348e-16,
  'C': -2.220446049250313e-15,
  'H': -5.329070518200751e-15,
  'N': -1.7486012637846216e-15,
  'O': 2.220446049250313e-16,
  'P': -2.498001805406602e-16,
  'S': -4.163336342344337e-17},
 <Reaction HDECH at 0x723922fe5a20>: {'charge': 2.0},
 <Reaction LIPO1S24_BS at 0x723922482710>: {'charge': 2160.0},
 <Reaction LIPO2S24_BS at 0x723922482980>: {'charge': 2160.0},
 <Reaction LIPO3S24_BS at 0x723922481de0>: {'charge': 2160.0},
 <Reaction LIPO4S24_BS at 0x723922482c80>: {'charge': 2160.0},
 <Reaction MECDPDH at 0x72392302d900>: {'charge': -2.0},
 <Reaction NMO at 0x7239231be320>: {'charge': 4.0}}

In [51]:
def scale_down_lipo(model):
    """Replace the stoichiometry of the four LIPO*S24_BS reactions in ``model``.

    Each of LIPO1S24_BS, LIPO2S24_BS, LIPO3S24_BS and LIPO4S24_BS is handed to
    ``overwrite_reaction`` with a new metabolite -> coefficient mapping in
    which ``d12dg_BS_c`` (consumed) and the respective ``lipoN_24_BS_c``
    (produced) carry a coefficient of 1/2400, while every other metabolite
    keeps a coefficient of 1 (or 2 for ``h_c``).

    Args:
        model: The model whose reactions are rewritten; passed through
            unchanged to ``overwrite_reaction``.

    NOTE(review): the balance report recorded further down in this notebook
    still lists LIPO4S24_BS with ``'H': 1.0`` -- the ``h_c`` coefficient of
    that reaction may need to be 1.0 instead of 2.0; confirm before changing.
    """
    # Coefficient used for the diacylglycerol substrate and the lipo*_24 product.
    fraction = 1/2400

    # (reaction id, new stoichiometry), applied in this order.
    replacements = (
        ("LIPO1S24_BS", {
            "cdpglyc_c": -1.0,
            "d12dg_BS_c": -fraction,
            "udpg_c": -1.0,
            "cmp_c": 1.0,
            "h_c": 2.0,
            "lipo1_24_BS_c": fraction,
            "udp_c": 1.0,
        }),
        ("LIPO2S24_BS", {
            "cdpglyc_c": -1.0,
            "d12dg_BS_c": -fraction,
            "uacgam_c": -1.0,
            "cmp_c": 1.0,
            "h_c": 2.0,
            "lipo2_24_BS_c": fraction,
            "udp_c": 1.0,
        }),
        ("LIPO3S24_BS", {
            "ala__D_c": -1.0,
            "atp_c": -1.0,
            "cdpglyc_c": -1.0,
            "d12dg_BS_c": -fraction,
            "h2o_c": -1.0,
            "amp_c": 1.0,
            "cmp_c": 1.0,
            "h_c": 2.0,
            "lipo3_24_BS_c": fraction,
            "ppi_c": 1.0,
        }),
        ("LIPO4S24_BS", {
            "cdpglyc_c": -1.0,
            "d12dg_BS_c": -fraction,
            "cmp_c": 1.0,
            "h_c": 2.0,
            "lipo4_24_BS_c": fraction,
        }),
    )

    for reaction_id, stoichiometry in replacements:
        overwrite_reaction(model, reaction_id, stoichiometry)

In [52]:
scale_down_lipo(AA3_curate)

In [53]:
write_sbml_model(AA3_curate, "/home/lisa/Dokumente/Programmierung/Models/08_charge_balanced/AA3_curated.xml")

## Go through unbalanced reactions

In [117]:
# Gather the imbalance report of every curated model into one dict keyed by
# reaction id. A reaction reported by several models keeps the entry of the
# model visited last. AGPATr_BS and G3POA_BS are left out on purpose.
inbalanced_rxns = {}
for model in models_curation.values():
    current_imbalances = check_mass_balance(model)
    for rxn, charge in current_imbalances.items():
        if rxn.id not in ("AGPATr_BS", "G3POA_BS"):
            inbalanced_rxns[rxn.id] = charge

In [118]:
print(len(inbalanced_rxns))
inbalanced_rxns

11


{'AMMQT8': {'charge': -2.0},
 'NMO': {'charge': 4.0},
 'DHNAOT': {'charge': 2.0},
 'GCDH': {'charge': 2.0},
 'THZSN_1': {'charge': 1.0},
 'HDECH': {'charge': 2.0},
 'LIPO1S24_BS': {'charge': 0.8999999999999999,
  'C': 1.7763568394002505e-15,
  'H': 3.552713678800501e-15},
 'LIPO2S24_BS': {'charge': 0.8999999999999999,
  'C': 1.7763568394002505e-15,
  'H': 3.552713678800501e-15},
 'LIPO3S24_BS': {'charge': 0.8999999999999999,
  'C': -8.881784197001252e-16,
  'H': -3.552713678800501e-15,
  'O': 8.881784197001252e-16},
 'LIPO4S24_BS': {'charge': 1.9, 'H': 1.0},
 'MECDPDH': {'charge': -2.0}}

In [119]:
# For each still-imbalanced reaction id, print every curated model that contains it.
for rxn in inbalanced_rxns:
    for model in models_curation.values():
        if rxn not in model.reactions:
            continue
        print(rxn, "is in", model.id)

AMMQT8 is in AA1
AMMQT8 is in AA2
AMMQT8 is in AA6
NMO is in AA1
NMO is in AA3
NMO is in AA5
NMO is in AA6
NMO is in AA7
DHNAOT is in AA2
DHNAOT is in AA3
DHNAOT is in AA6
DHNAOT is in AA7
GCDH is in AA2
GCDH is in AA4
GCDH is in AA6
THZSN_1 is in AA2
HDECH is in AA3
HDECH is in AA5
LIPO1S24_BS is in AA3
LIPO2S24_BS is in AA3
LIPO3S24_BS is in AA3
LIPO4S24_BS is in AA3
MECDPDH is in AA3


In [326]:
# During manual curation one metabolite and one reaction were each changed
# twice, into different things. List the models that contain them so the
# correct change can be worked out.
for model in models_curation.values():
    # Metabolite check, currently disabled:
    # if "3sala_c" in model.metabolites:
    #     print("3sala: ", model.id)
    if "PCADYOX" not in model.reactions:
        continue
    print("PCADYOX: ", model.id)

PCADYOX:  AA1
PCADYOX:  AA2
PCADYOX:  AA4
PCADYOX:  AA5
PCADYOX:  AA6
PCADYOX:  AA7


# Compare with already curated models

Check against well-curated reference models, e.g. the E. coli model iML1515 (Gram-negative): look up reactions, metabolites, and pathways there and see whether our models contain them, so that the two can be compared.

In [2]:
# Load every SBML (.xml) model found in the duplicate-removal directory.
models_path = "/home/lisa/Dokumente/Programmierung/Models/10_duplicate_removal/"
models_curation = {}
for model_name in filter(lambda f: f.endswith(".xml"), os.listdir(models_path)):
    model = read_sbml_model(f"{models_path}/{model_name}")
    model.solver = "cplex"
    # Key: first three characters of the file name plus "_curate".
    name = f"{model_name[:3]}_curate"
    models_curation[name] = model

# os.listdir gives no ordering guarantee, so re-key the dictionary alphabetically.
models_curation = dict(sorted(models_curation.items()))
(
    AA1_curate,
    AA2_curate,
    AA3_curate,
    AA4_curate,
    AA5_curate,
    AA6_curate,
    AA7_curate,
) = (models_curation[f"AA{i}_curate"] for i in range(1, 8))

Restricted license - for non-production use only - expires 2026-11-23


In [3]:
# Reference model iML1515 (E. coli), read from SBML; CPLEX is selected as its solver.
ecoli = read_sbml_model("../Models/iML1515.xml")
ecoli.solver = 'cplex'

Restricted license - for non-production use only - expires 2026-11-23


In [5]:
# Reference model iYH543, stored as JSON (hence load_json_model); CPLEX is selected as its solver.
# NOTE(review): presumably a Streptococcus model, judging by the variable name -- confirm.
strepo = load_json_model("../Models/iYH543.json")
strepo.solver = 'cplex'

In [6]:
# Second reference model, iHM1533, read from SBML; CPLEX is selected as its solver.
# NOTE(review): presumably another E. coli reconstruction, judging by the variable name -- confirm.
ecoli2 = read_sbml_model("../Models/iHM1533.xml")
ecoli2.solver = 'cplex'

In [7]:
# Reference ("Vergleich" = comparison) models, keyed by a short label.
# Insertion order matters wherever the first model containing a reaction is picked.
# Note that the key "strepto" maps to the variable spelled `strepo`.
vergleichsmodelle = {
    "ecoli": ecoli,
    "ecoli2": ecoli2,
    "strepto": strepo
}

In [139]:
# Check which still-unbalanced reactions of one of our models also exist in the E. coli model;
# for those, the balanced E. coli version can be used as a template for balancing ours.
def compare_with_ecoli(model):
    """Print which unbalanced reactions of ``model`` exist in the E. coli model.

    The unbalanced reactions are obtained from
    ``check_balance(model, print_results=False)``. Each reaction id is looked
    up in the global ``ecoli`` model, and two lists (ids not found / ids found)
    are printed together with their lengths. Nothing is returned.

    Args:
        model: The model whose unbalanced reactions are compared.
    """
    unfinished_business = check_balance(model, print_results=False)

    # False -> ids missing from the E. coli model, True -> ids present in it.
    by_presence = {False: [], True: []}
    for rxn in unfinished_business:
        by_presence[bool(rxn.id in ecoli.reactions)].append(rxn.id)

    not_ecoli = by_presence[False]
    in_ecoli = by_presence[True]

    print(f"The following {len(not_ecoli)} reactions are NOT in E. coli model:\n{not_ecoli}\n"
          f"The following {len(in_ecoli)} reactions are IN E. coli model:\n{in_ecoli}")

In [140]:
compare_with_ecoli(AA7_curate)

The following 6 reactions are NOT in E. coli model:
['DHNAOT', 'NMO', 'OOR3r', 'POR_syn', 'STAS', 'SUCD2']
The following 0 reactions are IN E. coli model:
[]


In [16]:
ecoli.metabolites.query("coa")

[<Metabolite 3ohdcoa_c at 0x778b9cfea380>,
 <Metabolite 3hodcoa_c at 0x778b9cfebbb0>,
 <Metabolite hdcoa_c at 0x778b9cfeaf80>,
 <Metabolite tdecoa_c at 0x778b9d048850>,
 <Metabolite oxalcoa_c at 0x778b9d048f70>,
 <Metabolite phaccoa_c at 0x778b9d049930>,
 <Metabolite odecoa_c at 0x778b9d049cc0>,
 <Metabolite accoa_c at 0x778b9d049f00>,
 <Metabolite occoa_c at 0x778b9d04a020>,
 <Metabolite 3hbcoa_c at 0x778b9d04a620>,
 <Metabolite ddcacoa_c at 0x778b9d04ad40>,
 <Metabolite malcoame_c at 0x778b9d04b400>,
 <Metabolite 3hddcoa_c at 0x778b9d04b910>,
 <Metabolite sbzcoa_c at 0x778b9d048130>,
 <Metabolite crnDcoa_c at 0x778b9cf387c0>,
 <Metabolite 3hbzcoa_c at 0x778b9cf38df0>,
 <Metabolite 3oocoa_c at 0x778b9cf39060>,
 <Metabolite 2tpr3dpcoa_c at 0x778b9cf394e0>,
 <Metabolite btcoa_c at 0x778b9cf3a020>,
 <Metabolite tdcoa_c at 0x778b9cf3a0b0>,
 <Metabolite 3otdcoa_c at 0x778b9cf3a890>,
 <Metabolite 3hhdcoa_c at 0x778b9cf3a8f0>,
 <Metabolite hx2coa_c at 0x778b9cf3a9b0>,
 <Metabolite 3ohcoa_c a

## Compare Directionality

I want to check the directionality of my reactions. Getting that information from BiGG is very tedious, so I will use well-curated models as a reference instead. First, though, I have to check how many of my reactions are actually present in these models.

In [77]:
all_models = models_curation | vergleichsmodelle

In [8]:
# Union of the reaction ids of all AA models (each id appears once).
aa_rxns = set()
for model in models_curation.values():
    aa_rxns |= {r.id for r in model.reactions}

In [9]:
# Union of the reaction ids of all reference models.
vergleich_rxns = set()
for model in vergleichsmodelle.values():
    vergleich_rxns |= {r.id for r in model.reactions}

In [65]:
# Compare the AA reaction ids against the E. coli model alone.
ecoli_rxns = {r.id for r in ecoli.reactions}

only_in_aa = aa_rxns.difference(ecoli_rxns)
only_in_ecoli = ecoli_rxns.difference(aa_rxns)
common_rxns = aa_rxns.intersection(ecoli_rxns)

print(
    f"Only in AA models: {len(only_in_aa)}",
    f"Only in E. coli: {len(only_in_ecoli)}",
    f"Common: {len(common_rxns)}",
    sep="\n",
)


Only in AA models: 2469
Only in E. coli: 586
Common: 2126


In [10]:
# Compare the AA reaction ids against all reference models together.
# `only_in_ecoli` keeps its name but here holds the ids found only in the reference models.
only_in_aa = aa_rxns.difference(vergleich_rxns)
only_in_ecoli = vergleich_rxns.difference(aa_rxns)
common_rxns = aa_rxns.intersection(vergleich_rxns)

print(
    f"Only in AA models: {len(only_in_aa)}",
    f"Only in well curated models: {len(only_in_ecoli)}",
    f"Common: {len(common_rxns)}",
    sep="\n",
)


Only in AA models: 2084
Only in well curated models: 1396
Common: 2463


In [11]:
def get_vergleichsmodell(vergleichsmodelle, rxn):
    """Return the first comparison model that contains ``rxn``.

    Args:
        vergleichsmodelle: Mapping whose values are models exposing a
            ``reactions`` container that supports ``in``.
        rxn: Reaction to look for (membership is tested with
            ``rxn in model.reactions``).

    Returns:
        The first model, in the mapping's iteration order, whose reactions
        contain ``rxn``, or ``None`` if no model contains it.
    """
    matching_models = (
        candidate
        for candidate in vergleichsmodelle.values()
        if rxn in candidate.reactions
    )
    return next(matching_models, None)

In [14]:
def compare_bounds(common_rxns):
    """Compare reaction bounds of the curation models with the comparison models.

    For every model in the module-level ``models_curation`` and every reaction
    ID in ``common_rxns`` that the model contains, the lower and upper bound
    are compared with those of the same reaction in the first comparison model
    (from the module-level ``vergleichsmodelle``) that contains it, as returned
    by ``get_vergleichsmodell``.

    The result sets are filled across all curation models. A reaction ID can
    therefore land in more than one category when different curation models
    disagree, so the printed counts need not add up to ``len(common_rxns)``.

    Reaction IDs that are found in none of the comparison models are skipped
    (and reported) instead of raising an ``AttributeError``.

    Args:
        common_rxns: Iterable of reaction IDs to compare.

    Returns:
        Tuple ``(both_bounds, upper_bound, lower_bound)`` of sets of reaction
        IDs: those where both bounds differ, where only the upper bound
        differs, and where only the lower bound differs.
    """
    both_bounds = set()
    upper_bound = set()
    lower_bound = set()
    all_good = set()
    no_reference = set()

    # The reference bounds depend only on the reaction ID, so they are looked
    # up once and reused for every curation model.
    reference_bounds = {}

    for model in models_curation.values():
        for rxn in common_rxns:
            if rxn not in model.reactions:
                continue

            if rxn not in reference_bounds:
                vergleichsmodell = get_vergleichsmodell(vergleichsmodelle, rxn)
                if vergleichsmodell is None:
                    reference_bounds[rxn] = None
                else:
                    reference_rxn = vergleichsmodell.reactions.get_by_id(rxn)
                    reference_bounds[rxn] = (
                        reference_rxn.lower_bound,
                        reference_rxn.upper_bound,
                    )

            reference = reference_bounds[rxn]
            if reference is None:
                # No comparison model contains this reaction; nothing to compare
                no_reference.add(rxn)
                continue
            lb_vergleich, ub_vergleich = reference

            model_rxn = model.reactions.get_by_id(rxn)
            ub_differs = ub_vergleich != model_rxn.upper_bound
            lb_differs = lb_vergleich != model_rxn.lower_bound

            if ub_differs and lb_differs:
                both_bounds.add(rxn)
            elif ub_differs:
                upper_bound.add(rxn)
            elif lb_differs:
                lower_bound.add(rxn)
            else:
                all_good.add(rxn)

    print(f"There are {len(all_good)} reactions where bounds fit.\n"
          f"There are {len(both_bounds)} reactions where both bounds do not match.\n"
          f"There are {len(upper_bound)} reactions where upper bounds do not match.\n"
          f"There are {len(lower_bound)} reactions where lower bounds do not match.\n")

    if no_reference:
        print(f"Skipped {len(no_reference)} reactions that are in none of the comparison models.")

    return both_bounds, upper_bound, lower_bound

In [15]:
# Compare the bounds of all shared reactions and keep the three mismatch sets
bound_mismatches = compare_bounds(common_rxns)
both_bounds, upper_bounds, lower_bounds = bound_mismatches

There are 2036 reactions where bounds fit.
There are 3 reactions where both bounds do not match.
There are 9 reactions where upper bounds do not match.
There are 416 reactions where lower bounds do not match.



In [101]:
# Reaction IDs where both the lower and the upper bound differ from the comparison model
both_bounds

{'AH6PI', 'CLt3_2pp', 'INDOLEt2pp'}

In [74]:
# Reaction IDs where only the upper bound differs from the comparison model
upper_bounds

{'EX_12dgr180_e',
 'EX_fol_e',
 'EX_hco3_e',
 'EX_ribflv_e',
 'EX_sheme_e',
 'FORt',
 'HMGCOAS',
 'URCN'}

In [75]:
# Reaction IDs where only the lower bound differs from the comparison model
lower_bounds

{'4ABUTD',
 '4ABZGLUtr',
 '4HALDD',
 '4HTHRtrpp',
 '5DGLCNt2rpp',
 '6PHBG',
 '6PHBG2',
 'ACACt2',
 'ACACt2pp',
 'ACt2rpp',
 'ADEt2rpp',
 'ADNt2',
 'ADNt2pp',
 'AKGt2rpp',
 'ALAt2r',
 'ALDD31',
 'ALLTNt2rpp',
 'AMPEP1',
 'AMPEP13',
 'AMPEP14',
 'AMPEP5',
 'AMPEP8',
 'AOXSr',
 'ARBt2rpp',
 'ASNt2rpp',
 'ATPM',
 'BGLA',
 'BUTt2rpp',
 'CRNDt2rpp',
 'CRNt2rpp',
 'CYTDt2pp',
 'DARBt2rpp',
 'DCMPDA',
 'DDGLCNt2rpp',
 'D_LACt2pp',
 'EX_12ppd__R_e',
 'EX_12ppd__S_e',
 'EX_14glucan_e',
 'EX_15dap_e',
 'EX_23camp_e',
 'EX_23ccmp_e',
 'EX_23cgmp_e',
 'EX_23cump_e',
 'EX_26dap__M_e',
 'EX_2ddglcn_e',
 'EX_2pg_e',
 'EX_34dhpac_e',
 'EX_3amp_e',
 'EX_3cmp_e',
 'EX_3gmp_e',
 'EX_3hcinnm_e',
 'EX_3hoxpac_e',
 'EX_3hpppn_e',
 'EX_3ump_e',
 'EX_4abut_e',
 'EX_4abz_e',
 'EX_4abzglu_e',
 'EX_4hoxpac_e',
 'EX_4hoxpacd_e',
 'EX_4hphac_e',
 'EX_4hthr_e',
 'EX_5aptn_e',
 'EX_5dglcn_e',
 'EX_6apa_e',
 'EX_LalaDgluMdapDala_e',
 'EX_LalaDgluMdap_e',
 'EX_LalaDglu_e',
 'EX_LalaLglu_e',
 'EX_abt__D_e',
 'EX_ac_e',


## Check origin

In [3]:
# Load the table that lists, per BiGG reaction, the BiGG models containing it
rxn_model_origin = pd.read_csv(
    "../Datasets/BIGG/reactions_bigg_model_origin.csv",
    quotechar='"',
)
# The model lists are stored as Python literals in text form; parse them back
rxn_model_origin["bigg_models"] = rxn_model_origin["bigg_models"].map(ast.literal_eval)

In [17]:
# Spot check: model list stored for the reaction 10FTHFtm (first matching row)
is_10fthftm = rxn_model_origin["bigg_id"] == "10FTHFtm"
value = rxn_model_origin.loc[is_10fthftm, "bigg_models"].iloc[0]

In [43]:
# Build the reaction-ID -> model-list lookup once instead of scanning the whole
# bigg_id column twice for every reaction. Keeping the first row per ID mirrors
# the former `.iloc[0]` selection.
first_rows = rxn_model_origin.drop_duplicates(subset="bigg_id", keep="first")
models_by_rxn = dict(zip(first_rows["bigg_id"], first_rows["bigg_models"]))

# Collect, for every reaction with mismatching bounds that is known to BiGG,
# the list of BiGG models it occurs in (one sub-list per reaction).
bigg_models = []
for rxn in both_bounds | lower_bounds | upper_bounds:
    if rxn in models_by_rxn:
        bigg_models.append(models_by_rxn[rxn])

In [44]:
# Flatten the per-reaction model lists into one flat list of model IDs.
# NOTE: running this on an already flattened list would iterate over the ID
# strings themselves and split them into single characters.
flattened_models = []
for models_of_rxn in bigg_models:
    flattened_models.extend(models_of_rxn)
bigg_models = flattened_models
bigg_models

['iJN1463',
 'iSSON_1240',
 'iG2583_1286',
 'iYS1720',
 'iSBO_1134',
 'iECIAI1_1343',
 'iEC1356_Bl21DE3',
 'iB21_1397',
 'iEKO11_1354',
 'iSbBS512_1146',
 'iEcE24377_1341',
 'iBWG_1329',
 'iECNA114_1301',
 'iEK1008',
 'iECOK1_1307',
 'iECS88_1305',
 'iAF1260b',
 'iETEC_1333',
 'ic_1306',
 'iEcDH1_1363',
 'iSFxv_1172',
 'iSDY_1059',
 'iEC1344_C',
 'iSF_1195',
 'iECED1_1282',
 'iECIAI39_1322',
 'iECUMN_1333',
 'iJR904',
 'Recon3D',
 'iRC1080',
 'iCHOv1',
 'iCHOv1_DG44',
 'iY75_1357',
 'iECDH10B_1368',
 'iZ_1308',
 'iJO1366',
 'iWFL_1372',
 'iECO103_1326',
 'iE2348C_1286',
 'iEC1349_Crooks',
 'iECH74115_1262',
 'iLB1027_lipid',
 'iML1515',
 'iUMN146_1321',
 'iAPECO1_1312',
 'iECD_1391',
 'iAF987',
 'iCN900',
 'iEC042_1314',
 'iS_1188',
 'STM_v1_0',
 'iSB619',
 'iECSF_1327',
 'iNJ661',
 'iECO111_1330',
 'iECs_1301',
 'iIT341',
 'iUTI89_1310',
 'iJN1463',
 'iAF1260',
 'iHN637',
 'iPC815',
 'iJN746',
 'iNRG857_1313',
 'iYS854',
 'iEcSMS35_1347',
 'iEC1368_DH5a',
 'iECSE_1348',
 'iECB_1328',


In [45]:
# Tally how often each BiGG model appears in the flattened list
counts = Counter()
counts.update(bigg_models)
counts

Counter({'iWFL_1372': 353,
         'iECW_1372': 353,
         'iNRG857_1313': 352,
         'iEcolC_1368': 352,
         'iUMNK88_1353': 352,
         'iLF82_1304': 352,
         'iECIAI1_1343': 351,
         'iEKO11_1354': 351,
         'iEcE24377_1341': 351,
         'iECNA114_1301': 351,
         'ic_1306': 351,
         'iECO103_1326': 351,
         'iE2348C_1286': 351,
         'iEC55989_1330': 351,
         'iECP_1309': 351,
         'iG2583_1286': 350,
         'iB21_1397': 350,
         'iBWG_1329': 350,
         'iECOK1_1307': 350,
         'iECS88_1305': 350,
         'iETEC_1333': 350,
         'iECUMN_1333': 350,
         'iY75_1357': 350,
         'iECDH10B_1368': 350,
         'iZ_1308': 350,
         'iUMN146_1321': 350,
         'iAPECO1_1312': 350,
         'iECD_1391': 350,
         'iECSF_1327': 350,
         'iECO111_1330': 350,
         'iECs_1301': 350,
         'iUTI89_1310': 350,
         'iEcSMS35_1347': 350,
         'iECSE_1348': 350,
         'iECB_1328': 3