In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np # lower than numpy 2.0
import pandas as pd

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.manifold import TSNE
import umap.umap_ as umap # from umap-learn

# Functions

In [None]:
def results_dic_to_df(results):
    """
    Converts the results dictionary into a data frame with one row per tested medium.
    The columns are the medium components (unless they are fixed or sodium), followed by
    growth rate, medium cost, production rate and if the medium composition is Pareto optimal.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt(); the keys read here are
      "medium list", "medium component bounds", "growth rate tensors", "cost tensors",
      "production tensors" and "is pareto"

    RETURNS
    * results_df - dataframe - remaining medium components followed by the columns
      "growth rate", "medium cost", "production rate" and "is pareto" (in this order)

    """
    # Convert the medium list to a dataframe - columns are media components, rows are iterations
    results_df = pd.DataFrame.from_dict(results["medium list"])

    # Components whose lower and upper bound are the same were never varied
    fixed_components = {
        component
        for component, bound in results["medium component bounds"].items()
        if bound[0] == bound[1]
    }

    # Drop fixed components and sodium. Only columns that are actually present are dropped,
    # so a medium without sodium (or with sodium already fixed) does not raise a KeyError.
    columns_to_drop = [
        column for column in results_df.columns
        if column in fixed_components or column == "EX_na1_e"
    ]
    results_df = results_df.drop(columns=columns_to_drop)

    # Append the outcome columns; later plotting code relies on them being the last four columns
    results_df["growth rate"] = results["growth rate tensors"]
    results_df["medium cost"] = results["cost tensors"]
    results_df["production rate"] = results["production tensors"]
    results_df["is pareto"] = results["is pareto"]

    return results_df

In [None]:
def classify_results(results, results_df, MetModel, initial_medium):
    """
    Classifies all medium compositions based on how they perform compared to the baseline medium
    with respect to cost, growth rate, and production rate.

    The baseline is obtained by setting initial_medium on MetModel and optimising the model.
    Note that this overwrites MetModel.medium and MetModel.objective, and that the
    "performance" column is added to the results_df that was passed in.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * results_df - dataframe - output of results_dic_to_df()
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * initial_medium - dictionary - Medium for initial simulation

    RETURNS
    * results_df - dataframe - original dataframe with new column containing performance class
    * category_counts - series - number of medium compositions per performance class

    RAISES
    * ValueError - if results["optimisation objective"] is not "growth-production-cost"

    """
    # Only this objective provides growth, production and cost; fail early with a clear message
    # instead of reaching the return statement with nothing to return.
    if results["optimisation objective"] != "growth-production-cost":
        raise ValueError(
            "classify_results() only supports the optimisation objective "
            f"'growth-production-cost', got {results['optimisation objective']!r}"
        )

    # Set the initial medium as medium in the model
    MetModel.medium = initial_medium
    MetModel.objective = results["model objective"]

    # Run optimization to obtain the baseline cost and production rate
    solution = MetModel.optimize()
    initial_cost = calc_cost_tot(results["medium component costs"], initial_medium)[0].cpu().numpy()
    initial_production_rate = solution.fluxes[results["production objective"]]

    # Upper bound of biomass flux is the max. growth rate
    max_growth = MetModel.reactions.get_by_id(results["biomass objective"]).bounds[1]

    # Define performance categories
    factors = ["cheaper optimal growth with super-opt production", # 0
               "costlier optimal growth with super-opt production", # 1
               "cheaper optimal growth with sub-opt production", # 2
               "costlier optimal growth with sub-opt production", # 3
               "sub-opt non-zero growth with super-opt production", # 4
               "production without growth", # 5
               "no growth and no production" # 6 "other"
               ]

    # Categorize every medium based on growth rate, production rate and cost
    performance_labels = []
    for growth_rate, production_rate, cost in zip(results_df["growth rate"],
                                                  results_df["production rate"],
                                                  results_df["medium cost"]):
        if growth_rate == 0 and production_rate != 0:
            performance = factors[5] # "production without growth"
        elif growth_rate >= max_growth and production_rate <= initial_production_rate and cost < initial_cost:
            performance = factors[2] # "cheaper optimal growth with sub-opt production"
        elif growth_rate >= max_growth and production_rate <= initial_production_rate and cost >= initial_cost:
            performance = factors[3] # "costlier optimal growth with sub-opt production"
        elif growth_rate >= max_growth and production_rate > initial_production_rate and cost < initial_cost:
            performance = factors[0] # "cheaper optimal growth with super-opt production"
        elif growth_rate >= max_growth and production_rate > initial_production_rate and cost >= initial_cost:
            performance = factors[1] # "costlier optimal growth with super-opt production"
        elif growth_rate < max_growth and production_rate > initial_production_rate:
            performance = factors[4] # "sub-opt non-zero growth with super-opt production"
        else:
            performance = factors[6] # "no growth and no production" = "other"
        performance_labels.append(performance)

    # Store the labels as an ordered categorical so that every class is known, even if empty
    results_df["performance"] = pd.Categorical(performance_labels,
                                               categories = factors, ordered = True)

    # Count the occurrences of each performance category
    category_counts = results_df["performance"].value_counts()

    return results_df, category_counts

In [None]:
def classify_results_2(results, results_df, MetModel, initial_medium,
                       growth_threshold = 0.05, production_threshold = 0.0005):
    """
    Classifies all medium compositions based on how they perform compared to the baseline medium
    with respect to cost, growth rate, and production rate.
    In contrast to classify_results(), media with (almost) no growth and (almost) no production
    are identified first via thresholds, and all remaining media with sub-optimal growth and
    sub-optimal production form a class of their own.

    The baseline is obtained by setting initial_medium on MetModel and optimising the model.
    Note that this overwrites MetModel.medium and MetModel.objective, and that the
    "performance" column is added to the results_df that was passed in.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * results_df - dataframe - output of results_dic_to_df()
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * initial_medium - dictionary - Medium for initial simulation
    * growth_threshold - float - growth rates up to this value count as (almost) no growth
    * production_threshold - float - production rates up to this value count as (almost) no production

    RETURNS
    * results_df - dataframe - original dataframe with new column containing performance class
    * category_counts - series - number of medium compositions per performance class

    RAISES
    * ValueError - if results["optimisation objective"] is not "growth-production-cost"

    """
    # Only this objective provides growth, production and cost; fail early with a clear message
    # instead of reaching the return statement with nothing to return.
    if results["optimisation objective"] != "growth-production-cost":
        raise ValueError(
            "classify_results_2() only supports the optimisation objective "
            f"'growth-production-cost', got {results['optimisation objective']!r}"
        )

    # Set the initial medium as medium in the model
    MetModel.medium = initial_medium
    MetModel.objective = results["model objective"]

    # Run optimization to obtain the baseline cost and production rate
    solution = MetModel.optimize()
    initial_cost = calc_cost_tot(results["medium component costs"], initial_medium)[0].cpu().numpy()
    initial_production_rate = solution.fluxes[results["production objective"]]

    # Upper bound of biomass flux is the max. growth rate
    max_growth = MetModel.reactions.get_by_id(results["biomass objective"]).bounds[1]

    # Define performance categories
    factors = ["cheaper optimal growth with super-opt production", # 0
               "costlier optimal growth with super-opt production", # 1
               "cheaper optimal growth with sub-opt production", # 2
               "costlier optimal growth with sub-opt production", # 3
               "sub-opt growth with super-opt production", # 4
               "sub-opt growth with sub-opt production", # 5
               "(almost)no growth and (almost) no production" # 6 "other"
               ]

    # Categorize every medium based on growth rate, production rate and cost
    performance_labels = []
    for growth_rate, production_rate, cost in zip(results_df["growth rate"],
                                                  results_df["production rate"],
                                                  results_df["medium cost"]):
        if growth_rate <= growth_threshold and production_rate <= production_threshold:
            performance = factors[6] # "(almost)no growth and (almost) no production"
        elif growth_rate >= max_growth and production_rate <= initial_production_rate and cost < initial_cost:
            performance = factors[2] # "cheaper optimal growth with sub-opt production"
        elif growth_rate >= max_growth and production_rate <= initial_production_rate and cost >= initial_cost:
            performance = factors[3] # "costlier optimal growth with sub-opt production"
        elif growth_rate >= max_growth and production_rate > initial_production_rate and cost < initial_cost:
            performance = factors[0] # "cheaper optimal growth with super-opt production"
        elif growth_rate >= max_growth and production_rate > initial_production_rate and cost >= initial_cost:
            performance = factors[1] # "costlier optimal growth with super-opt production"
        elif growth_rate < max_growth and production_rate > initial_production_rate:
            performance = factors[4] # "sub-opt growth with super-opt production"
        else:
            performance = factors[5] # "sub-opt growth with sub-opt production"
        performance_labels.append(performance)

    # Store the labels as an ordered categorical so that every class is known, even if empty
    results_df["performance"] = pd.Categorical(performance_labels,
                                               categories = factors, ordered = True)

    # Count the occurrences of each performance category
    category_counts = results_df["performance"].value_counts()

    return results_df, category_counts

## Dimension Reduction

### PCA

In [None]:
def PCA_plot(results, initial_medium, MetModel, figname):
    """
    Plots all tested medium compositions in the plane of their first two principal components.

    The media are classified with classify_results() by how they performed compared to
    initial_medium. PCA is performed on the standardised medium components only
    (columns whose name contains "EX_"). Every data point is colour-coded by its
    performance class and the legend lists the count of each class.
    The final plot is saved as <figname>.png and shown.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (without ".png")

    RETURNS
    -

    """
    # Tabulate the results and attach a performance class to every medium
    frame, class_counts = classify_results(
        results, results_dic_to_df(results), MetModel, initial_medium
    )
    count_by_class = class_counts.to_dict()

    # Standardise the medium components before projecting them
    component_values = frame.filter(like="EX_", axis=1)
    scaled_components = StandardScaler().fit_transform(component_values)

    # Two-component PCA; keep the model for the explained variance in the axis labels
    pca_model = PCA(n_components=2)
    projection = pca_model.fit_transform(scaled_components)
    frame["PCA1"], frame["PCA2"] = projection[:, 0], projection[:, 1]

    # Legend entries read "<class> (<count>)"
    def label_with_count(category):
        return f"{category} ({count_by_class.get(category, 0)})"

    frame["performance_label"] = frame["performance"].apply(label_with_count)

    variance_percent = pca_model.explained_variance_ratio_ * 100

    # Draw the scatter plot
    # NOTE(review): `s` is passed together with a size mapping of (50, 50) - confirm which
    # marker size is intended
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        data=frame,
        x="PCA1", y="PCA2",
        hue="performance_label",
        size="performance_label",
        sizes=(50, 50),
        palette="tab10",
        s=100, edgecolor="k", alpha=0.8,
    )
    plt.title("PCA of Medium Components Colored by Performance", fontsize=14)
    plt.xlabel(f"PC1 ({variance_percent[0]:.2f}% variance)", fontsize=12)
    plt.ylabel(f"PC2 ({variance_percent[1]:.2f}% variance)", fontsize=12)
    plt.grid(True)
    plt.legend(title="Performance (count)")
    plt.tight_layout()

    # Save before showing
    plt.savefig(figname + ".png", dpi=300)
    plt.show()

In [None]:
def PCA_plot_2(results, initial_medium, MetModel, figname):
    """
    Plots all tested medium compositions in the plane of their first two principal components.

    Same as PCA_plot(), but the media are classified with classify_results_2().
    PCA is performed on the standardised medium components only (columns whose name
    contains "EX_"). Every data point is colour-coded by its performance class and the
    legend lists the count of each class.
    The final plot is saved as <figname>.png and shown.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (without ".png")

    RETURNS
    -

    """
    # Tabulate the results and attach a performance class to every medium
    frame, class_counts = classify_results_2(
        results, results_dic_to_df(results), MetModel, initial_medium
    )
    count_by_class = class_counts.to_dict()

    # Standardise the medium components before projecting them
    component_values = frame.filter(like="EX_", axis=1)
    scaled_components = StandardScaler().fit_transform(component_values)

    # Two-component PCA; keep the model for the explained variance in the axis labels
    pca_model = PCA(n_components=2)
    projection = pca_model.fit_transform(scaled_components)
    frame["PCA1"], frame["PCA2"] = projection[:, 0], projection[:, 1]

    # Legend entries read "<class> (<count>)"
    def label_with_count(category):
        return f"{category} ({count_by_class.get(category, 0)})"

    frame["performance_label"] = frame["performance"].apply(label_with_count)

    variance_percent = pca_model.explained_variance_ratio_ * 100

    # Draw the scatter plot
    # NOTE(review): `s` is passed together with a size mapping of (50, 50) - confirm which
    # marker size is intended
    plt.figure(figsize=(10, 8))
    sns.scatterplot(
        data=frame,
        x="PCA1", y="PCA2",
        hue="performance_label",
        size="performance_label",
        sizes=(50, 50),
        palette="tab10",
        s=100, edgecolor="k", alpha=0.8,
    )
    plt.title("PCA of Medium Components Colored by Performance", fontsize=14)
    plt.xlabel(f"PC1 ({variance_percent[0]:.2f}% variance)", fontsize=12)
    plt.ylabel(f"PC2 ({variance_percent[1]:.2f}% variance)", fontsize=12)
    plt.grid(True)
    plt.legend(title="Performance (count)")
    plt.tight_layout()

    # Save before showing
    plt.savefig(figname + ".png", dpi=300)
    plt.show()

### t-SNE

In [None]:
def t_SNE_plot(results, initial_medium, MetModel, figname, perplexity = 20):
    """
    Plots a two-dimensional t-SNE embedding of all tested medium compositions.

    The media are classified with classify_results() by how they performed compared to
    initial_medium. t-SNE is performed on the medium components only (columns whose name
    contains "EX_"). Every data point is colour-coded by its performance class and the
    legend lists the count of each class.
    The final plot is saved as <figname>.png and shown.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (without ".png")
    * perplexity - integer - parameter for t-SNE

    RETURNS
    -
    """
    # Tabulate the results and attach a performance class to every medium
    frame, class_counts = classify_results(
        results, results_dic_to_df(results), MetModel, initial_medium
    )
    count_by_class = class_counts.to_dict()

    # Medium components only
    # NOTE(review): unlike PCA_plot() and UMAP_plot(), the components are not standardised
    # here - confirm that this is intended
    component_values = frame.filter(like="EX_", axis=1)

    # Fixed random_state so that repeated calls give the same embedding
    tsne_settings = dict(
        n_components=2,
        init="random",
        random_state=42,
        perplexity=perplexity,
        learning_rate=200,
        max_iter=300,
    )
    embedding = TSNE(**tsne_settings).fit_transform(component_values)
    frame['tsne-2d-one'] = embedding[:, 0]
    frame['tsne-2d-two'] = embedding[:, 1]

    # Legend entries read "<class> (<count>)"
    def label_with_count(category):
        return f"{category} ({count_by_class.get(category, 0)})"

    frame["performance_label"] = frame["performance"].apply(label_with_count)

    # Draw the scatter plot
    plt.figure(figsize=(16, 10))
    sns.scatterplot(
        data=frame,
        x="tsne-2d-one", y="tsne-2d-two",
        hue="performance_label",
        palette=sns.color_palette("hls", 10),
        legend="full",
        alpha=0.9,
    )
    plt.title("t-SNE of Medium Components Colored by Performance", fontsize=14)
    plt.legend(title="Performance (count)")
    plt.tight_layout()

    # Save before showing
    plt.savefig(figname + ".png", dpi=300)
    plt.show()

### UMAP

In [None]:
def UMAP_plot(results, initial_medium, MetModel, figname, n_neighbors = 5, min_dist = 0.3):
    """
    Calls classify_results to classify all medium compositions by how they performed compared to initial_medium.
    Performs UMAP on all medium compositions using only the standardised components as variables
    (columns whose name contains "EX_").
    Plots the result colour-coding the data points by performance class.
    Lists the count of each category in the legend.
    The final plot is saved as png file.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (without ".png")
    * n_neighbors - integer - parameter for UMAP
    * min_dist - float - parameter for UMAP

    RETURNS
    -

    """
    # Convert results dictionary to a dataframe
    results_df = results_dic_to_df(results)

    # Classify according to result quality
    results_df, category_counts = classify_results(results, results_df, MetModel, initial_medium)
    # Create a dictionary mapping performance categories to their counts
    category_counts_dict = category_counts.to_dict()

    # Select only the medium components (column names containing "EX_")
    medium_components = results_df.filter(like = "EX_", axis=1)

    # Standardize the medium components
    scaler = StandardScaler()
    medium_components_scaled = scaler.fit_transform(medium_components)

    # Perform UMAP; fixed random_state so that repeated calls give the same embedding
    umap_obj = umap.UMAP(n_neighbors = n_neighbors, min_dist = min_dist, random_state = 42)
    embedding = umap_obj.fit_transform(medium_components_scaled)

    # Add UMAP embedding to the dataframe
    results_df['umap-2d-one'] = embedding[:,0]
    results_df['umap-2d-two'] = embedding[:,1]
    # Converts performance values to new labels that include category counts
    results_df["performance_label"] = results_df["performance"].apply(
        lambda x: f"{x} ({category_counts_dict.get(x, 0)})"
    )

    # Plot UMAP results; sizes = (50, 50) gives every class the same marker size
    plt.figure(figsize=(16,10))
    sns.scatterplot(
        x = "umap-2d-one", y = "umap-2d-two",
        hue = "performance_label",
        size = "performance_label",
        sizes = (50, 50),
        palette = sns.color_palette(palette = "tab10"),
        data = results_df,
        legend = "full",
        alpha = 0.9
    )

    plt.title("UMAP of Medium Components Colored by Performance", fontsize = 14)
    plt.legend(title = "Performance (count)")
    plt.tight_layout()
    # Save before showing
    figname = figname + ".png"
    plt.savefig(figname, dpi=300)
    plt.show()

In [None]:
def UMAP_plot_2(results, initial_medium, MetModel, figname, n_neighbors = 5, min_dist = 0.3):
    """
    Calls classify_results_2 to classify all medium compositions by how they performed compared to initial_medium.
    Performs UMAP on all medium compositions using only the standardised components as variables
    (columns whose name contains "EX_").
    Plots the result colour-coding the data points by performance class.
    Lists the count of each category in the legend.
    The final plot is saved as png file.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (without ".png")
    * n_neighbors - integer - parameter for UMAP
    * min_dist - float - parameter for UMAP

    RETURNS
    -

    """
    # Convert results dictionary to a dataframe
    results_df = results_dic_to_df(results)

    # Classify according to result quality
    results_df, category_counts = classify_results_2(results, results_df, MetModel, initial_medium)
    # Create a dictionary mapping performance categories to their counts
    category_counts_dict = category_counts.to_dict()

    # Select only the medium components (column names containing "EX_")
    medium_components = results_df.filter(like = "EX_", axis=1)

    # Standardize the medium components
    scaler = StandardScaler()
    medium_components_scaled = scaler.fit_transform(medium_components)

    # Perform UMAP; fixed random_state so that repeated calls give the same embedding
    umap_obj = umap.UMAP(n_neighbors = n_neighbors, min_dist = min_dist, random_state = 42)
    embedding = umap_obj.fit_transform(medium_components_scaled)

    # Add UMAP embedding to the dataframe
    results_df['umap-2d-one'] = embedding[:,0]
    results_df['umap-2d-two'] = embedding[:,1]
    # Converts performance values to new labels that include category counts
    results_df["performance_label"] = results_df["performance"].apply(
        lambda x: f"{x} ({category_counts_dict.get(x, 0)})"
    )

    # Plot UMAP results; sizes = (50, 50) gives every class the same marker size
    plt.figure(figsize=(16,10))
    sns.scatterplot(
        x = "umap-2d-one", y = "umap-2d-two",
        hue = "performance_label",
        size = "performance_label",
        sizes = (50, 50),
        palette = sns.color_palette(palette = "tab10"),
        data = results_df,
        legend = "full",
        alpha = 0.9
    )

    plt.title("UMAP of Medium Components Colored by Performance", fontsize = 14)
    plt.legend(title = "Performance (count)")
    plt.tight_layout()
    # Save before showing
    figname = figname + ".png"
    plt.savefig(figname, dpi=300)
    plt.show()

## Pairplots

In [None]:
def pairplot_best(results, top_percentage = 20, figname = "pairplot_best"):
    """
    Subsets the results to the top x percent media compositions with respect to growth per cost.
    Produces three different pairplots of all columns returned by results_dic_to_df()
    for those top performing media.
    The first one is not colour-coded; the distribution of each feature is shown on the diagonal.
    In the second one, the dots are colour-coded by growth rate and in the third one by medium cost.
    Saves the figures as <figname>.png, <figname>hue-growth.png and <figname>hue-cost.png.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * top_percentage - integer - top x percent (growth/cost) to be plotted
    * figname - string - name under which to save the figures (without ".png")

    RETURNS
    -
    """
    # convert result dictionary to a dataframe - columns are media components, growth, costs, ..., rows are iterations
    results_df = results_dic_to_df(results)
    # rank the media by growth per cost (best first)
    results_df["growth per cost"] = results_df["growth rate"]/results_df["medium cost"]
    results_df = results_df.sort_values("growth per cost", ascending = False)
    # subset to top x percent
    cutoff = int(len(results_df)*(top_percentage/100))
    # drop() returns a new dataframe, so the helper column is removed without
    # modifying a slice of results_df in place
    results_df_top = results_df.iloc[:cutoff].drop(columns = "growth per cost")

    # One pairplot per colour-coding; hue = None is the plain version.
    # diag_kind = "kde" - curve instead of barplots (takes longer)
    plot_variants = (
        (None, ".png"),
        ("growth rate", "hue-growth.png"),
        ("medium cost", "hue-cost.png"),
    )
    for hue, suffix in plot_variants:
        sns.pairplot(results_df_top, diag_kind = "kde", corner = True, hue = hue)
        # pairplot creates its own figure, which is the current figure at this point
        plt.savefig(figname + suffix)
    

In [None]:
def pairplot_pareto(results, figname = "pairplot_pareto"):
    """
    Subsets the results to the media compositions that are Pareto optimal.
    Produces three different pairplots of all columns returned by results_dic_to_df()
    (except "is pareto") for those Pareto optimal media.
    The first one is not colour-coded and carries a figure title; the distribution of each
    feature is shown on the diagonal.
    In the second one, the dots are colour-coded by growth rate and in the third one by medium cost.
    Saves the figures as <figname>.png, <figname>hue-growth.png and <figname>hue-cost.png.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * figname - string - name under which to save the figures (without ".png")

    RETURNS
    -
    """
    # convert result dictionary to a dataframe - columns are media components, growth, costs, ..., rows are iterations
    results_df = results_dic_to_df(results)

    # Subset to data points on the Pareto front; the flag itself is constant afterwards, so drop it
    results_pareto_df = results_df[results_df["is pareto"] == True].drop(columns = "is pareto")

    # Plain pairplot. The title is set on the figure (suptitle); plt.title would only
    # label whichever single subplot happens to be the current axes.
    sns.pairplot(results_pareto_df, diag_kind="kde", corner=True)
    plt.suptitle("Pair Plot for all tested media compositions that are Pareto optimal")
    plt.savefig(figname + ".png")

    # Colour-coded versions
    sns.pairplot(results_pareto_df, diag_kind="kde", corner=True, hue="growth rate")
    plt.savefig(figname + "hue-growth.png")
    sns.pairplot(results_pareto_df, diag_kind="kde", corner=True, hue="medium cost")
    plt.savefig(figname + "hue-cost.png")

## Stripplots

In [None]:
def stripplot_best(results, top_percentage = 20, figname = "stripplot_best"):
    """
    Subsets the results to the top x percent media compositions with respect to growth per cost.
    Produces a strip plot of all medium components for those top performing media.
    Saves the figure as <figname>.png and shows it.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * top_percentage - integer - top x percent (growth/cost) to be plotted
    * figname - string - name under which to save the figure (without ".png")

    RETURNS
    -
    """
    ###TODO: plot twice and colour according to cost and growth

    # convert result dictionary to a dataframe - columns are media components, growth, costs, ..., rows are iterations
    results_df = results_dic_to_df(results)
    # rank the media by growth per cost (best first)
    results_df["growth per cost"] = results_df["growth rate"]/results_df["medium cost"]
    results_df = results_df.sort_values("growth per cost", ascending = False)
    # subset to top x percent
    cutoff = int(len(results_df)*(top_percentage/100))
    # drop() returns a new dataframe, so the helper column is removed without
    # modifying a slice of results_df in place
    results_df_top = results_df.iloc[:cutoff].drop(columns = "growth per cost")

    # Exclude the last four columns (growth rate, medium cost, production rate, is pareto)
    # as they distort the scale
    sns.stripplot(data = results_df_top.iloc[:, 0:-4], jitter = False)
    plt.xticks(rotation = 90)
    plt.title(f"Strip Plot for the {top_percentage}% of media compositions with the best growth/cost ratio")
    plt.xlabel("Component")
    plt.ylabel("Upper Bound For Uptake Flux")

    # Save first: once a figure has been shown it may be closed, and saving afterwards
    # would then write an empty image
    plt.savefig(figname + ".png")
    plt.show()

In [None]:
def stripplot_best_long_medium_cost(results, top_percentage = 20, 
                                    figname = "stripplot_long_best_hue-cost"):
    """
    Produces a stripplot of all medium components for the top performing media compositions
    (top x percent with respect to growth per cost).
    The data points are colour-coded by medium cost; for each medium component the
    strips of the different cost values are drawn next to each other (dodge).
    Saves the figure as <figname>.png and shows it.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * top_percentage - integer - top x percent (growth/cost) to be plotted
    * figname - string - name under which to save the figure (without ".png")

    RETURNS
    -
    """
    # convert result dictionary to a dataframe - columns are media components, growth, costs, ..., rows are iterations
    results_df = results_dic_to_df(results)
    # rank the media by growth per cost (best first)
    results_df["growth per cost"] = results_df["growth rate"] / results_df["medium cost"]
    results_df = results_df.sort_values("growth per cost", ascending=False)
    # subset to top x percent
    cutoff = int(len(results_df)*(top_percentage/100))
    # drop() returns a new dataframe, so the helper column is removed without
    # modifying a slice of results_df in place
    results_df_top = results_df.iloc[:cutoff].drop(columns = "growth per cost")

    # Keep the medium components only: the last four columns are growth rate, medium cost,
    # production rate and is pareto (same selection as in stripplot_best)
    components_df = results_df_top.iloc[:, 0:-4]

    # Convert DataFrame from wide to long format - one row per (component, medium) pair
    results_df_long = pd.melt(components_df, var_name="variable", value_name="value")

    # melt stacks the component columns one after another, so the medium costs
    # have to be repeated once per melted column
    results_df_long["medium cost"] = np.tile(results_df_top["medium cost"].values, components_df.shape[1])

    # Create a larger figure
    plt.figure(figsize=(12, 6))  # Set the width and height in inches
    # Title & axes
    plt.title("Strip Plot Colored by Medium Cost")
    plt.xlabel("Component")
    plt.ylabel("Upper Bound For Uptake Flux")
    # Plot the stripplot with hue based on medium cost
    sns.stripplot(data = results_df_long, x = "variable", y = "value", hue = "medium cost", 
                  dodge = True, jitter = False)
    # Rotate x-axis labels by 90 degrees
    plt.xticks(rotation=90)

    # Save first: once a figure has been shown it may be closed, and saving afterwards
    # would then write an empty image
    plt.savefig(figname + ".png")
    plt.show()

In [None]:
def stripplot_best_long_growth_rate(results, top_percentage = 20, 
                                    figname = "stripplot_long_best_hue-growth"):
    """
    Produces a stripplot of all medium components for the top performing media compositions,
    where performance is ranked by growth rate per medium cost.
    The points are colour-coded by the growth rate of the medium composition they belong to
    (the growth rate is passed to seaborn as hue; no explicit binning is done here).
    Saves the figure (as png file) and then shows it.
    
    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * top_percentage - number - top x percent (growth/cost) to be plotted
    * figname - string - name under which to save the figure (".png" is appended)

    RETURNS
    -

    RAISES
    * ValueError - if top_percentage selects no medium composition at all
    """
    # Columns that results_dic_to_df() appends after the medium components
    non_component_columns = ["growth rate", "medium cost", "production rate", "is pareto"]

    # Convert result dictionary to a dataframe - columns are media components, growth, costs, ...; rows are iterations
    results_df = results_dic_to_df(results)

    # Number of rows that make up the top x percent
    cutoff = int(len(results_df)*(top_percentage/100))
    if cutoff == 0:
        # the original code failed here with an opaque ZeroDivisionError
        raise ValueError(
            f"top_percentage = {top_percentage} selects 0 of {len(results_df)} medium compositions; nothing to plot"
        )

    # Rank by growth per cost and keep the best rows.
    # Built as one chain so no column is dropped in place on a slice (avoids SettingWithCopyWarning).
    results_df_top = (
        results_df
        .assign(**{"growth per cost": results_df["growth rate"] / results_df["medium cost"]})
        .sort_values("growth per cost", ascending=False)
        .iloc[:cutoff]
        .drop(columns="growth per cost")
    )

    # Select the medium components by name rather than by position
    component_columns = [
        column for column in results_df_top.columns if column not in non_component_columns
    ]

    # Convert from wide to long format; keeping "growth rate" as id column carries
    # the growth rate of each medium composition along to every melted row
    results_df_long = pd.melt(
        results_df_top[component_columns + ["growth rate"]],
        id_vars=["growth rate"],
        var_name="variable",
        value_name="value"
    )

    # Create a larger figure
    fig, ax = plt.subplots(figsize=(12, 6))  # Set the width and height in inches
    # Plot the stripplot with hue based on growth rate
    sns.stripplot(data=results_df_long, x="variable", y="value", hue="growth rate",
                  dodge=True, jitter=False, ax=ax)
    # Title & axes - set after plotting so that seaborn's own axis labelling cannot replace them
    ax.set_title("Strip Plot Colored by Growth Rate")
    ax.set_xlabel("Component")
    ax.set_ylabel("Upper Bound For Uptake Flux")
    # Rotate x-axis labels by 90 degrees
    ax.tick_params(axis="x", labelrotation=90)

    # Save the plot BEFORE showing it: once plt.show() has returned, the figure may
    # already be closed and plt.savefig() would write an empty image
    fig.savefig(figname + ".png")
    # Show the plot
    plt.show()
    plt.close(fig)

In [None]:
def stripplot_by_performance(results, initial_medium, MetModel, figname = "stripplot"):
    """
    Calls classify_results to classify all medium compositions by how they performed compared to initial_medium.
    For each category a stripplot is created containing the strips for each medium component.
    Each plot is saved as "<figname>_<category>.png" and then shown.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - prefix of the file names under which to save the figures

    RETURNS
    -
    
    """
    # Convert results dictionary to a dataframe
    results_df = results_dic_to_df(results)
    
    '''Classify according to result quality'''
    # classify_results also returns the category counts; they are not needed for these plots
    results_df, _ = classify_results(results, results_df, MetModel, initial_medium)
    
    # Performance categories that actually occur, in order of first appearance
    performance_categories = results_df["performance"].unique()

    '''Plot Stripplot for each performance category'''
    for factor in performance_categories:        
        # Subset to the rows where performance equals the current factor
        subset_df = results_df[results_df["performance"] == factor]
        # Select only the columns containing "EX_"; i.e., medium components
        medium_components = subset_df.filter(like="EX_", axis=1)
        
        # Create a new figure for each category
        fig, ax = plt.subplots(figsize=(6, 6))
        sns.stripplot(data=medium_components, jitter=False, ax=ax)
        ax.tick_params(axis="x", labelrotation=90)
        ax.set_title(f"Stripplot for mediums of category: {factor}", fontsize=14)
        ax.set_xlabel("Component")
        ax.set_ylabel("Upper Bound For Uptake Flux")
        fig.tight_layout()
        
        # Save the figure to a file
        file_name = f"{figname}_{factor}.png"
        fig.savefig(file_name, dpi=300)
        print(f"Saved plot as {file_name}")
        
        # Show the figure, then release it so figures do not pile up over the loop
        plt.show()
        plt.close(fig)

In [None]:
def stripplot_long_by_performance(
    results,
    initial_medium,
    MetModel,
    figname = "stripplot_long_by-performance"
    ):
    """
    Calls classify_results to classify all medium compositions by how they performed compared to initial_medium.
    A long stripplot is created containing the strips for each medium component for each class of medium compositions,
    grouped by medium components and colour-coded by performance class.
    The legend shows each performance class together with the number of medium compositions in it.
    The final plot containing all stripplots is saved as png file and then shown.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (".png" is appended)

    RETURNS
    -
    
    """
    # Convert results dictionary to a dataframe
    results_df = results_dic_to_df(results)
    
    '''Classify according to result quality'''
    results_df, category_counts = classify_results(results, results_df, MetModel, initial_medium)
    # Create a dictionary mapping performance categories to their counts
    category_counts_dict = category_counts.to_dict()

    # Medium components only; explicit copy so that adding the label column below
    # neither warns (SettingWithCopyWarning) nor touches results_df
    medium_components = results_df.filter(like = "EX_", axis = 1).copy()
    # Legend label: "<category> (<count>)"
    medium_components["performance_label"] = results_df["performance"].apply(
        lambda x: f"{x} ({category_counts_dict.get(x, 0)})"
    )
    
    # Convert DataFrame from wide to long format
    results_df_long = pd.melt(
        medium_components,
        id_vars=["performance_label"],  # Retain the performance column during the melt
        var_name="variable",
        value_name="value"
    )

    '''Plot'''
    fig, ax = plt.subplots(figsize=(12, 6))  # Set the width and height in inches
    # Plot the stripplot with hue based on performance
    sns.stripplot(
        data = results_df_long,
        x = "variable",
        y = "value",
        hue = "performance_label",
        dodge = True,
        jitter = False,
        ax = ax
    )
    # Title & axes - set after plotting so that seaborn's own axis labelling cannot replace them
    ax.set_title("Strip Plot Colored by Performance Category")
    ax.set_xlabel("Component")
    ax.set_ylabel("Upper Bound For Uptake Flux")
    ax.legend(title = "Performance (count)")
    # Rotate x-axis labels by 90 degrees
    ax.tick_params(axis = "x", labelrotation = 90)

    # Save the plot BEFORE showing it: once plt.show() has returned, the figure may
    # already be closed and plt.savefig() would write an empty image
    fig.savefig(figname + ".png")
    # Show the plot
    plt.show()
    plt.close(fig)

In [None]:
def stripplot_long_by_performance_2(
    results,
    initial_medium,
    MetModel,
    figname = "stripplot_long_by-performance"
    ):
    """
    Calls classify_results_2 to classify all medium compositions by how they performed compared to initial_medium.
    A long stripplot is created containing the strips for each medium component for each class of medium compositions,
    grouped by medium components and colour-coded by performance class.
    The legend shows each performance class together with the number of medium compositions in it.
    The final plot containing all stripplots is saved as png file and then shown.

    NOTE: the default figname is the same as for stripplot_long_by_performance; pass a different
    figname if both functions are used, otherwise the later call overwrites the earlier file.

    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (".png" is appended)

    RETURNS
    -
    
    """
    # Convert results dictionary to a dataframe
    results_df = results_dic_to_df(results)
    
    '''Classify according to result quality'''
    results_df, category_counts = classify_results_2(results, results_df, MetModel, initial_medium)
    # Create a dictionary mapping performance categories to their counts
    category_counts_dict = category_counts.to_dict()

    # Medium components only; explicit copy so that adding the label column below
    # neither warns (SettingWithCopyWarning) nor touches results_df
    medium_components = results_df.filter(like = "EX_", axis = 1).copy()
    # Legend label: "<category> (<count>)"
    medium_components["performance_label"] = results_df["performance"].apply(
        lambda x: f"{x} ({category_counts_dict.get(x, 0)})"
    )
    
    # Convert DataFrame from wide to long format
    results_df_long = pd.melt(
        medium_components,
        id_vars=["performance_label"],  # Retain the performance column during the melt
        var_name="variable",
        value_name="value"
    )

    '''Plot'''
    fig, ax = plt.subplots(figsize=(12, 6))  # Set the width and height in inches
    # Plot the stripplot with hue based on performance
    sns.stripplot(
        data = results_df_long,
        x = "variable",
        y = "value",
        hue = "performance_label",
        dodge = True,
        jitter = False,
        ax = ax
    )
    # Title & axes - set after plotting so that seaborn's own axis labelling cannot replace them
    ax.set_title("Strip Plot Colored by Performance Category")
    ax.set_xlabel("Component")
    ax.set_ylabel("Upper Bound For Uptake Flux")
    ax.legend(title = "Performance (count)")
    # Rotate x-axis labels by 90 degrees
    ax.tick_params(axis = "x", labelrotation = 90)

    # Save the plot BEFORE showing it: once plt.show() has returned, the figure may
    # already be closed and plt.savefig() would write an empty image
    fig.savefig(figname + ".png")
    # Show the plot
    plt.show()
    plt.close(fig)

# Heatmap

In [None]:
def medium_clustermap(results, initial_medium, MetModel, figname = "clustermap"):
    """
    Draws a hierarchically clustered heatmap of all medium compositions.

    The medium compositions are first classified with classify_results (performance relative to
    initial_medium). The medium components (columns containing "EX_") are passed to seaborn's
    clustermap with column-wise standard scaling, and a colour strip next to the rows shows the
    performance class of each medium composition, with a matching legend to the right of the heatmap.
    The figure is written to "<figname>.png" and then shown.
    
    PARAMETERS
    * results - dictionary - output of media_BayesOpt()
    * initial_medium - dictionary - Medium for initial simulation
    * MetModel - cobra model - Metabolic model for simulation & used for optimisation
    * figname - string - name under which to save the figure (".png" is appended)

    RETURNS
    -
    
    """
    '''Classify according to result quality'''
    # the category counts returned alongside the dataframe are not needed here
    classified_df, _ = classify_results(
        results, results_dic_to_df(results), MetModel, initial_medium
    )

    '''Heatmap'''
    # One "Set2" colour per performance class, in order of first appearance
    performance = classified_df["performance"]
    categories = performance.unique()
    colour_by_category = {
        category: colour
        for category, colour in zip(categories, sns.color_palette("Set2", len(categories)))
    }
    # Plain array, so the colours are matched to the heatmap rows by position rather than by index
    side_colours = performance.map(colour_by_category).to_numpy()

    grid = sns.clustermap(
        classified_df.filter(like="EX_", axis=1),  # medium components only
        standard_scale = 1,                        # scale along columns
        row_colors = side_colours,
        cmap = "coolwarm",
        figsize = (13, 10)
    )

    # Legend entries for the performance classes: one filled circle per class
    handles = []
    for category, colour in colour_by_category.items():
        handles.append(
            plt.Line2D([0], [0], marker = 'o', color = 'w', markersize = 10,
                       markerfacecolor = colour, label = category)
        )

    # Shrink the plot area to leave room for the legend, then anchor the legend outside the heatmap
    plt.subplots_adjust(right = 0.7)
    grid.ax_heatmap.legend(
        handles = handles,
        title = "Performance",
        loc = "upper left",
        bbox_to_anchor = (1.1, 1),
        borderaxespad = 0
    )

    # Save first, then show
    plt.savefig(figname + ".png")
    plt.show()