# Manual Duplicate Fixing

## Imports

In [None]:
import re
import ast
import os
from cobra.io import read_sbml_model, write_sbml_model
import pandas as pd

## Functions

In [None]:
def parse_gpr(gpr_str):
    """Split a gene-reaction rule into its top-level OR alternatives.

    Only ``or`` operators outside of any parentheses act as separators, so a
    term such as ``C and (A or B)`` stays in one piece. Inside each
    alternative, runs of whitespace are collapsed to a single space.

    Parameters
    ----------
    gpr_str : str
        Rule text using lowercase ``or`` surrounded by whitespace.

    Returns
    -------
    set of str
        The distinct alternatives; empty for an empty or blank rule.
    """
    alternatives = []
    depth = 0   # current parenthesis nesting level
    start = 0   # index where the current alternative begins

    # Walk over parentheses and whitespace-delimited 'or' tokens in order.
    for token in re.finditer(r'[()]|\s+or\s+', gpr_str):
        text = token.group()
        if text == '(':
            depth += 1
        elif text == ')':
            # Clamp at zero so a stray ')' does not hide later separators.
            depth = max(depth - 1, 0)
        elif depth == 0:
            # A top-level ' or ' closes the current alternative.
            alternatives.append(gpr_str[start:token.start()])
            start = token.end()
    alternatives.append(gpr_str[start:])

    # Strip and unify whitespace, dropping empty pieces.
    return {' '.join(part.split()) for part in alternatives if part.strip()}


def merge_gpr(del_rxn_gpr, keep_rxn_gpr):
    """Combine two gene-reaction rules into a single OR-joined rule.

    When one rule is the empty string, the other rule is returned unchanged
    (two empty rules therefore give ""). Otherwise both rules are split with
    parse_gpr, the union of their alternatives is sorted, and the result is
    joined with ' or '.
    """
    # An empty rule contributes nothing: hand back the other one as it is.
    if del_rxn_gpr == "":
        return keep_rxn_gpr
    if keep_rxn_gpr == "":
        return del_rxn_gpr

    # Union of both sets of alternatives; sorting keeps the output stable.
    alternatives = parse_gpr(del_rxn_gpr) | parse_gpr(keep_rxn_gpr)
    return ' or '.join(sorted(alternatives))

In [None]:
def _replace_metabolite_in_reaction(model, rxn, met_delete, met_keep):
    """Swap met_delete for met_keep in one reaction, keeping its coefficient.

    If met_keep is not in the model yet, a copy of met_delete carrying the id
    met_keep is added first. met_delete itself is not renamed, so the caller
    can still look it up by its old id and remove it afterwards.
    """
    coeff = rxn.get_coefficient(met_delete)  # stoichiometry of met_delete

    if met_keep not in model.metabolites:
        replacement = model.metabolites.get_by_id(met_delete).copy()
        replacement.id = met_keep
        model.add_metabolites([replacement])

    rxn.add_metabolites({met_keep: coeff})     # add met_keep with that coefficient
    rxn.add_metabolites({met_delete: -coeff})  # negated coefficient drops met_delete


def fix_metabolite_duplicates(model, met_delete, met_keep, rxn_dup_pairs):
    """Merge the duplicate metabolite met_delete into met_keep, in place.

    Parameters
    ----------
    model : cobra model
        Model to modify; nothing happens if met_delete is not in it.
    met_delete : str
        Id of the obsolete metabolite.
    met_keep : str
        Id of the metabolite that replaces it.
    rxn_dup_pairs : list of (del_rxn, keep_rxn) tuples
        Reaction ids; del_rxn is the duplicate belonging to met_delete and
        keep_rxn is the reaction that should survive.

    Returns
    -------
    None
        A message is printed when rxn_dup_pairs is malformed or when
        met_delete still has reactions at the end and cannot be removed.
    """
    # (0) every entry of rxn_dup_pairs must be a 2-tuple (an empty list is fine)
    if not all(isinstance(pair, tuple) and len(pair) == 2 for pair in rxn_dup_pairs):
        print(f"check rxn_dup_pairs for correctness for {rxn_dup_pairs}")
        return

    if met_delete not in model.metabolites:
        return

    # del_rxn ids present before any change; only used for the filter in (2)
    initial_del_ids = {pair[0] for pair in rxn_dup_pairs if pair[0] in model.reactions}

    # (1) handle every duplicate pair
    for del_rxn, keep_rxn in rxn_dup_pairs:
        # Membership is checked against the current model state: earlier pairs
        # may already have removed or renamed reactions, so a snapshot taken
        # before the loop would be stale when an id occurs in several pairs.
        if del_rxn not in model.reactions:
            # only keep_rxn, or neither reaction, is in the model: nothing to do
            continue

        del_rxn_model = model.reactions.get_by_id(del_rxn)

        if keep_rxn in model.reactions:
            # (1.1) both present: merge the GPR onto keep_rxn, then drop del_rxn
            keep_rxn_model = model.reactions.get_by_id(keep_rxn)
            keep_rxn_model.gene_reaction_rule = merge_gpr(
                del_rxn_model.gene_reaction_rule,
                keep_rxn_model.gene_reaction_rule,
            )
            model.remove_reactions([del_rxn_model])
        else:
            # (1.3) only del_rxn present: turn it into keep_rxn
            met_delete_model = model.metabolites.get_by_id(met_delete)
            if met_delete_model in del_rxn_model.metabolites:  # should always be the case
                _replace_metabolite_in_reaction(model, del_rxn_model, met_delete, met_keep)
                del_rxn_model.id = keep_rxn

    # (2) reactions still using met_delete have no duplicate and are kept;
    #     move them over to met_keep. Ids that were listed as del_rxn are
    #     skipped here, as before; they have been dealt with in (1).
    met = model.metabolites.get_by_id(met_delete)
    left_rxns = [rxn.id for rxn in met.reactions if rxn.id not in initial_del_ids]
    for rxn_id in left_rxns:
        _replace_metabolite_in_reaction(
            model, model.reactions.get_by_id(rxn_id), met_delete, met_keep
        )

    # (3) delete met_delete once no reaction references it any more
    if not met.reactions:
        # Use the model-level API rather than removing from the metabolite
        # list directly, so the model can clean up its own bookkeeping.
        model.remove_metabolites([met])
    else:
        print(f'metabolite {met_delete} cannot be deleted from {model.id} because of reaction(s): {model.metabolites.get_by_id(met_delete).reactions}')


In [None]:
# Turn a pair-list cell from the CSV file (loaded below) into a Python list
def fix_and_parse(s):
    """Parse a CSV cell such as "[(BTS, ALCD4)]" into a Python object.

    Bare words (e.g. BTS, ALCD4, 34DHPACDO) are wrapped in single quotes so
    that the text becomes a literal ast.literal_eval can read. Non-string
    input and the text "[]" give an empty list. If evaluation fails, the
    problem is printed and an empty list is returned.
    """
    if not isinstance(s, str):
        return []
    if s.strip() == "[]":
        return []

    # Quote every word that is not already next to a quote character.
    quoted = re.sub(r'(?<![\'"])\b([\w\d]+)\b(?![\'"])', r"'\1'", s)

    try:
        parsed = ast.literal_eval(quoted)
    except Exception as err:
        print(f"Error parsing: {s}\nFixed to: {quoted}\nError: {err}")
        return []
    return parsed

## Load CSV with Duplicate Metabolites

In [None]:
# Duplicate-metabolite table.
# The file was created outside of these notebooks: metabolites and their
# reactions were checked manually to decide which metabolites to merge.
# It has 3 columns:
#   (1) the obsolete metabolite (will be removed),
#   (2) the "right" metabolite that is kept,
#   (3) a list of tuples of two reactions each: first the duplicated reaction
#       that comes from the obsolete metabolite, second the reaction to keep.
col = "rxn_dup_pairs[del_rxn, keep_rxn]"
metabolite_dups = pd.read_csv("../Datasets/Metabolite_Duplicates.csv", sep=";")

# The last column is read as text; convert every cell into a list
parsed_pairs = metabolite_dups[col].apply(fix_and_parse)
metabolite_dups[col] = parsed_pairs

## MAIN
Paths müssen angepasst werden, wo die Modelle sind und auch wo sie hingespeichert werden müssen; ebenso wie der Name der xml Datei; meine Modelle werden nach dem Einlesen alle in ein models_curation dict gespeichert, über welches ich iteriere, das müsste bei dir wahrscheinlich auch angepasst werden

In [None]:
# Read every .xml file in the models directory into the models_curation dict
models_path = "/home/lisa/Dokumente/Programmierung/Models/09_macaw_fixes/"
xml_files = [entry for entry in os.listdir(models_path) if entry.endswith(".xml")]

models_curation = {}
for model_name in xml_files:
    model = read_sbml_model(f"{models_path}/{model_name}")
    model.solver = "cplex"
    # key: first three characters of the file name followed by "_curate"
    name = model_name[:3] + "_curate"
    models_curation[name] = model

# Rebuild the dict so that its keys are in alphabetical order
models_curation = dict(sorted(models_curation.items(), key=lambda item: item[0]))

# Also expose the models AA1 ... AA7 as individual variables
(
    AA1_curate,
    AA2_curate,
    AA3_curate,
    AA4_curate,
    AA5_curate,
    AA6_curate,
    AA7_curate,
) = (models_curation[f"AA{i}_curate"] for i in range(1, 8))

In [None]:
# MAIN: apply every row of the duplicate table to every model
pairs_column = "rxn_dup_pairs[del_rxn, keep_rxn]"

for model in models_curation.values():
    # walk the three columns in lockstep, one duplicate metabolite per step
    duplicate_rows = zip(
        metabolite_dups["met_delete"],
        metabolite_dups["met_keep"],
        metabolite_dups[pairs_column],
    )
    for met_delete, met_keep, rxn_pairs in duplicate_rows:
        # Fix the current metabolite in the current model
        fix_metabolite_duplicates(model, met_delete, met_keep, rxn_pairs)

In [None]:
# Save every curated model as an SBML file
output_dir = "../Models/10_duplicate_removal"
# Create the target folder first so that writing does not fail when it is missing
os.makedirs(output_dir, exist_ok=True)

for model_name, model in models_curation.items():
    # file name: first three characters of the dict key + "_deleted_duplicates.xml"
    path = f"{output_dir}/{model_name[:3]}_deleted_duplicates.xml"
    write_sbml_model(model, path)