In [None]:
import os,sys
import pandas as pd
from sklearn import manifold, datasets
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD, PCA
import numpy as np
import joblib as jl
import rdkit
from rdkit.Chem import (DataStructs)
import seaborn as sns
import matplotlib
import matplotlib as mpl
from matplotlib import colors
from matplotlib import ticker
import matplotlib.pyplot as plt
from matplotlib.ticker import PercentFormatter
import matplotlib.lines as mlines
import mpl_toolkits.mplot3d

# Extend the module search path so that the project-local imports that follow
# this block can be resolved.
# NOTE(review): these are absolute, machine-specific paths; they have to be
# adjusted when the notebook is run on another system.
ROOT_DIR = os.path.dirname(os.path.abspath('/scratch/work/sandsth2/SCRIPTS/MolAnalysis'))
sys.path.extend([
    '/scratch/work/sandsth2/Projects/atm_datasets_manuscript/',
    ROOT_DIR,  # parent directory of the MolAnalysis path given above
    '/scratch/work/sandsth2/Projects/atm_datasets_manuscript/src/fingerprint_comparison',
    '/scratch/work/sandsth2/Projects/atm_datasets_manuscript/data/output/',
])

from src.visualization import plot_fun_groups as pfg
from MolAnalysis.descriptors import *
from MolAnalysis.loaddata import *
from MolAnalysis.compileanalysis import *

In [None]:
import matplotlib.lines as mlines
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

def plot_dataset_distr(datasets, keys, bin_edges, i, axs, colors, title):
    """
    Draws filled, density-normalised histograms of pre-binned data on one panel
    of a 2x2 axes grid, together with dashed reference lines at 0.1 and 0.4.

    Parameters:
    - datasets: dict, maps every entry of `keys` to a CSV path that is read with
      read_dataset_with_recheck; the file content is used as histogram weights
      paired with `bin_edges[:-1]`
    - keys: sequence of str, entries to draw (in order); also the legend labels
    - bin_edges: array-like, histogram bin edges
    - i: int, panel index; 0 and 1 select axs[0, i], larger values axs[1, i - 2]
    - axs: 2-D array of matplotlib axes
    - colors: ignored, the fixed palette defined below is always used (kept so
      existing calls keep working)
    - title: ignored (kept so existing calls keep working)

    Side effects: sets the global matplotlib font size to 8 and the global
    seaborn palette.
    """
    plt.rc('font', size=8)

    # Set the palette
    colorblind_palette = ['#D55E00', '#E69F00', '#009E73', '#56B4E9', '#0072B2', '#CC79A7', '#F0E442', '#000000']
    sns.set_palette(colorblind_palette)
    colors = colorblind_palette
    kwargs = dict(histtype='stepfilled', alpha=0.65, density=True, stacked=True, ec="k")

    # Nothing to draw: leave the axes untouched.
    if len(keys) == 0:
        return

    # The panel does not depend on the dataset, so it is resolved once.
    ax = axs[0, i] if i < 2 else axs[1, i - 2]

    histogram_handles = []
    for j, key in enumerate(keys):
        total = read_dataset_with_recheck(datasets[key])
        _, _, hist_patches = ax.hist(bin_edges[:-1], bins=bin_edges, weights=total,
                                     color=colors[j], **kwargs)
        # For a single 'stepfilled' dataset, hist returns a list with one polygon.
        histogram_handles.append(hist_patches[0])

    # Explicit handles keep every label attached to its histogram; the dashed
    # reference lines drawn below can never end up in the legend.
    ax.legend(histogram_handles, keys, fancybox=True, framealpha=0)

    # Add vertical dashed lines (once per panel, not once per dataset)
    ax.axvline(x=0.1, color='k', linestyle='--')
    ax.axvline(x=0.4, color='k', linestyle='--')

    # Set the spines to have a black color
    set_spines_to_black(ax)


def plot_dataset_distr_outline(datasets, keys, bin_edges, i, axs, colors, title, legend):
    """
    Draws outline ('step'), density-normalised histograms of pre-binned data on
    one panel of a 2x2 axes grid, together with dashed reference lines at 0.1
    and 0.4 and a legend made of coloured lines.

    Parameters:
    - datasets: dict, maps every entry of `keys` to a CSV path that is read with
      read_dataset_with_recheck; the file content is used as histogram weights
      paired with `bin_edges[:-1]`
    - keys: iterable of str, entries to draw, in order
    - bin_edges: array-like, histogram bin edges
    - i: int, panel index; 0 and 1 select axs[0, i], larger values axs[1, i - 2]
    - axs: 2-D array of matplotlib axes
    - colors: sequence of colours, colors[j] is used for the j-th key
    - title: ignored (kept so existing calls keep working)
    - legend: str, prefix of every legend entry ("<legend>-<key>")

    Side effects: sets the global matplotlib font size to 8.
    """
    plt.rc('font', size=8)

    # Accept any iterable (e.g. dict.keys()) without touching the caller's object.
    keys = list(keys)
    if not keys:
        # Nothing to draw: leave the axes untouched.
        return

    # Legend text uses 'MassBank Eu' in place of the key 'MassBank, Eu'.
    legend_names = ['MassBank Eu' if key == 'MassBank, Eu' else key for key in keys]

    # The panel does not depend on the dataset, so it is resolved once.
    ax = axs[0, i] if i < 2 else axs[1, i - 2]

    for j, key in enumerate(keys):
        total = read_dataset_with_recheck(datasets[key])
        ax.hist(bin_edges[:-1], bins=bin_edges, weights=total,
                alpha=1, density=True, histtype='step', linewidth=1, edgecolor=colors[j])

    # Create custom legend with colored lines (built once, after all histograms)
    legend_lines = [mlines.Line2D([], [], color=color, label=legend + '-' + name)
                    for name, color in zip(legend_names, colors)]
    ax.legend(handles=legend_lines, fancybox=True, framealpha=0)

    # Add vertical dashed lines (once per panel, not once per dataset)
    ax.axvline(x=0.1, color='k', linestyle='--', linewidth=1)
    ax.axvline(x=0.4, color='k', linestyle='--', linewidth=1)

    # Set the spines to have a black color
    set_spines_to_black(ax)


def read_dataset_with_recheck(file_path, min_rows=200):
    """
    Reads a dataset from a CSV file. If the initial read (first line taken as
    header) yields fewer than `min_rows` rows, the file is read again with
    header=None so that the first line is kept as data.

    Parameters:
    - file_path: path of the CSV file, passed to pandas.read_csv
    - min_rows: int, minimum number of rows the first read must produce to be
      accepted; defaults to 200, the threshold that was previously hard-coded

    Returns:
    - pd.DataFrame with the file content

    Raises:
    - any exception raised while reading; it is printed and then re-raised
    """
    try:
        # Try reading the file normally
        data = pd.read_csv(file_path)
        if len(data) < min_rows:
            # Too few rows: re-read treating the first line as data
            data = pd.read_csv(file_path, header=None)
    except Exception as e:
        print(f"Error reading file {file_path}: {e}")
        raise
    return data


def set_spines_to_black(ax):
    """
    Gives all four spines of a matplotlib axis a black color and a linewidth of 1.
    """
    for side in ('bottom', 'top', 'right', 'left'):
        spine = ax.spines[side]
        spine.set_color('black')
        spine.set_linewidth(1)


# Sum the max similarities (both row and column, separately)

In [None]:
def load_datasets():
    """
    Returns the mapping from dataset name to the name of its .dump file.

    TODO (open question carried over): replace with a file list?
    """
    name_to_dump = (
        ('QM9', "qm9_data.dump"),
        ('Quinones', "tabor_nosulf.dump"),
        ('Wang', 'wang_data.dump'),
        ('Gecko', 'gecko_full.dump'),
        ('ExpMoNA', "allmona_cleaned.dump"),
        ('nablaDFT', "nablaDFT.dump"),
        ('mb_eu', "mb_eu_cleaned.dump"),
    )
    return dict(name_to_dump)

In [None]:
# For every dataset name returned by load_datasets(), run two external scripts
# on each of its four comparison directories (reference set Wang or Gecko,
# fingerprint MACCS or topological): first sum_max_sim_bins.py on all four,
# then bin_sum_max_sim_row.py on all four.
# NOTE(review): the scripts are not part of this notebook; what they read and
# write inside these directories should be confirmed in
# ../src/fingerprint_comparison/.
datasets = load_datasets()
for key in datasets.keys():
    print(key)
    # Comparison directories of this dataset against the Wang reference set
    dir_maccs_Wang = '../data/output-allas/'+key+'/comp_Wang_MACCS/'
    dir_top_Wang = '../data/output-allas/'+key+'/comp_Wang_topological/'
    
    # Comparison directories of this dataset against the Gecko reference set
    dir_maccs_Gecko = '../data/output-allas/'+key+'/comp_Gecko_MACCS/'
    dir_top_Gecko = '../data/output-allas/'+key+'/comp_Gecko_topological/'
    
    # Input (-i) and output (-o) are the same directory in every call
    %run ../src/fingerprint_comparison/sum_max_sim_bins.py -i {dir_maccs_Wang} -o {dir_maccs_Wang} 
    %run ../src/fingerprint_comparison/sum_max_sim_bins.py  -i {dir_top_Wang} -o {dir_top_Wang}  



    %run ../src/fingerprint_comparison/sum_max_sim_bins.py -i {dir_maccs_Gecko} -o {dir_maccs_Gecko} 
    %run ../src/fingerprint_comparison/sum_max_sim_bins.py  -i {dir_top_Gecko} -o {dir_top_Gecko}  

    # -nb 200: presumably the number of bins, matching nbins = 200 used by the
    # plotting cells of this notebook -- TODO confirm against the script
    %run ../src/fingerprint_comparison/bin_sum_max_sim_row.py -d {dir_maccs_Wang} -nb 200 
    %run ../src/fingerprint_comparison/bin_sum_max_sim_row.py  -d {dir_top_Wang} -nb 200 



    %run ../src/fingerprint_comparison/bin_sum_max_sim_row.py -d {dir_maccs_Gecko} -nb 200
    %run ../src/fingerprint_comparison/bin_sum_max_sim_row.py  -d {dir_top_Gecko} -nb 200 


# Plot the max similarity of datasets with compounds in reference dataset

In [None]:
# Factor used to express figure sizes in centimetres (figsize=(18.0*cm, ...)).
cm = 1 / 2.54

# Input files of the "max-col" figures and tables. Each dictionary maps a
# legend label to a file of binned values; the name encodes the fingerprint
# (maccs / topo), the dataset group (atm / natm) and the reference set
# (wang / gecko).
# NOTE(review): the entry of the reference set itself points to
# sum_total_row-col.csv.gz, and 'Wang' compared against Gecko uses
# global_row_max_bins.csv.gz rather than sum_total_col.csv -- presumably
# intentional, confirm against the comparison scripts.

# --- atm group, reference Wang ---
maccs_atm_datasets_wang = {
    "Wang": "../data/output-allas/Wang/comp_Wang_MACCS/sum_total_row-col.csv.gz",
    "Gecko": "../data/output-allas/Gecko/comp_Wang_MACCS/sum_total_col.csv",
    "Quinones": "../data/output-allas/Quinones/comp_Wang_MACCS/sum_total_col.csv",
}

topo_atm_datasets_wang = {
    "Wang": "../data/output-allas/Wang/comp_Wang_topological/sum_total_row-col.csv.gz",
    "Gecko": "../data/output-allas/Gecko/comp_Wang_topological/sum_total_col.csv",
    "Quinones": "../data/output-allas/Quinones/comp_Wang_topological/sum_total_col.csv",
}

# --- atm group, reference Gecko ---
maccs_atm_datasets_gecko = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_MACCS/sum_total_row-col.csv.gz",
    "Wang": "../data/output-allas/Wang/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
    "Quinones": "../data/output-allas/Quinones/comp_Gecko_MACCS/sum_total_col.csv",
}

topo_atm_datasets_gecko = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_topological/sum_total_row-col.csv.gz",
    "Wang": "../data/output-allas/Wang/comp_Gecko_topological/global_row_max_bins.csv.gz",
    "Quinones": "../data/output-allas/Quinones/comp_Gecko_topological/sum_total_col.csv",
}

# --- natm group, reference Wang ---
maccs_natm_datasets_wang = {
    "Wang": "../data/output-allas/Wang/comp_Wang_MACCS/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Wang_MACCS/sum_total_col.csv",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Wang_MACCS/sum_total_col.csv",
    "QM9": "../data/output-allas/QM9/comp_Wang_MACCS/sum_total_col.csv",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Wang_MACCS/sum_total_col.csv",
}

topo_natm_datasets_wang = {
    "Wang": "../data/output-allas/Wang/comp_Wang_topological/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Wang_topological/sum_total_col.csv",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Wang_topological/sum_total_col.csv",
    "QM9": "../data/output-allas/QM9/comp_Wang_topological/sum_total_col.csv",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Wang_topological/sum_total_col.csv",
}

# --- natm group, reference Gecko ---
maccs_natm_datasets_gecko = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_MACCS/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Gecko_MACCS/sum_total_col.csv",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Gecko_MACCS/sum_total_col.csv",
    "QM9": "../data/output-allas/QM9/comp_Gecko_MACCS/sum_total_col.csv",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Gecko_MACCS/sum_total_col.csv",
}

topo_natm_datasets_gecko = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_topological/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Gecko_topological/sum_total_col.csv",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Gecko_topological/sum_total_col.csv",
    "QM9": "../data/output-allas/QM9/comp_Gecko_topological/sum_total_col.csv",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Gecko_topological/sum_total_col.csv",
}

In [None]:
# 2x2 figure for the topological fingerprint ("max-col" input files):
# left column = Wang as reference, right column = Gecko as reference,
# top row = atm dictionaries, bottom row = natm dictionaries.
nbins = 200
bin_edges = np.linspace(0, 1, nbins + 1)
total = np.zeros(nbins)  # not read in this cell; kept in case other cells rely on it

# Define the color palette
colorblind_palette = ['#D55E00', '#E69F00', '#009E73', '#56B4E9', '#0072B2', '#CC79A7', '#F0E442']

# Set the palette
sns.set_palette(colorblind_palette)
colors = colorblind_palette

# The first colour of each list belongs to the first key, i.e. the reference set
color_atm = [colors[6], colors[4], colors[5]]
color_natm = [colors[6],   colors[2],  colors[3], colors[0], colors[1]]

# rc settings only affect artists created after they are set, so the tick label
# size is fixed before the figure exists (otherwise the result depends on
# whether the cell has already been run in this kernel).
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

legend='Gecko'
fig, axs = plt.subplots(2,2, sharey=True, sharex=True, figsize=(18.0*cm,7.0*cm), dpi=300, gridspec_kw = {'wspace':0, 'hspace':0})
# The path-like seventh argument is the `title` parameter, which
# plot_dataset_distr_outline does not use.
plot_dataset_distr_outline(topo_atm_datasets_gecko, list(topo_atm_datasets_gecko.keys()), bin_edges,
                   1, axs, color_atm, '../reports/figures/atm_maccs_150124', legend)
plot_dataset_distr_outline(topo_natm_datasets_gecko, list(topo_natm_datasets_gecko.keys()),bin_edges,
                   3, axs, color_natm, '../reports/figures/natm_maccs_150124', legend)
legend='Wang'
plot_dataset_distr_outline(topo_atm_datasets_wang, list(topo_atm_datasets_wang.keys()), bin_edges,
                   0, axs, color_atm, '../reports/figures/atm_topo_150124', legend)
plot_dataset_distr_outline(topo_natm_datasets_wang, list(topo_natm_datasets_wang.keys()), bin_edges,
                   2, axs, color_natm, '../reports/figures/natm_topo_wang_150124', legend)
fig.subplots_adjust(wspace=0, hspace=0)
# Shared axis labels for the whole figure
fig.text(0.5, 0.01, 'Tanimoto similarity', ha='center')
fig.text(0.0, 0.5, r"$\rho$", va='center', rotation='vertical')

plt.tight_layout()
plt.savefig('../reports/figures/all_tanimoto_top_outline_max-col_201124'+'.png',  dpi=300, transparent=True)

In [None]:
# 2x2 figure for the MACCS fingerprint ("max-col" input files):
# left column = Wang as reference, right column = Gecko as reference,
# top row = atm dictionaries, bottom row = natm dictionaries.
nbins = 200
bin_edges = np.linspace(0, 1, nbins + 1)
total = np.zeros(nbins)  # not read in this cell; kept in case other cells rely on it

# Define the color palette
colorblind_palette = ['#D55E00', '#E69F00', '#009E73', '#56B4E9', '#0072B2', '#CC79A7', '#F0E442']

# Set the palette
sns.set_palette(colorblind_palette)
colors = colorblind_palette

# The first colour of each list belongs to the first key, i.e. the reference set
color_atm = [colors[6], colors[4], colors[5]]
color_natm = [colors[6],   colors[2],  colors[3], colors[0], colors[1]]

# rc settings only affect artists created after they are set, so the tick label
# size is fixed before the figure exists (otherwise the result depends on
# whether the cell has already been run in this kernel).
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

legend='Gecko'
fig, axs = plt.subplots(2,2, sharey=True, sharex=True, figsize=(18.0*cm,7.0*cm), dpi=300, gridspec_kw = {'wspace':0, 'hspace':0})
# The path-like seventh argument is the `title` parameter, which
# plot_dataset_distr_outline does not use.
plot_dataset_distr_outline(maccs_atm_datasets_gecko, list(maccs_atm_datasets_gecko.keys()), bin_edges,
                   1, axs, color_atm, '../reports/figures/atm_maccs_150124', legend)
plot_dataset_distr_outline(maccs_natm_datasets_gecko, list(maccs_natm_datasets_gecko.keys()),bin_edges,
                   3, axs, color_natm, '../reports/figures/natm_maccs_150124', legend)
legend='Wang'
plot_dataset_distr_outline(maccs_atm_datasets_wang, list(maccs_atm_datasets_wang.keys()), bin_edges,
                   0, axs, color_atm, '../reports/figures/atm_topo_150124', legend)
plot_dataset_distr_outline(maccs_natm_datasets_wang, list(maccs_natm_datasets_wang.keys()), bin_edges,
                   2, axs, color_natm, '../reports/figures/natm_topo_wang_150124', legend)
fig.subplots_adjust(wspace=0, hspace=0)
# Shared axis labels for the whole figure
fig.text(0.5, 0.01, 'Tanimoto similarity', ha='center')
fig.text(0.0, 0.5, r"$\rho$", va='center', rotation='vertical')

plt.tight_layout()
plt.savefig('../reports/figures/all_tanimoto_maccs_outline_max-col_201124'+'.png',  dpi=300, transparent=True)

# Plot highest similarity for each ref molecule with dataset

In [None]:
# Factor used to express figure sizes in centimetres (figsize=(18.0*cm, ...)).
cm = 1 / 2.54

# Input files of the "max-row" figures and tables. Each dictionary maps a
# legend label to a file of binned values; the name encodes the fingerprint
# (maccs / topo), the dataset group (atm / natm) and the reference set
# (wang / gecko).
# NOTE(review): the entry of the reference set itself points to
# sum_total_row-col.csv.gz, and 'Wang' compared against Gecko uses
# sum_total_col.csv rather than global_row_max_bins.csv.gz -- presumably
# intentional, confirm against the comparison scripts.

# --- atm group, reference Wang ---
maccs_atm_datasets_wang_ref = {
    "Wang": "../data/output-allas/Wang/comp_Wang_MACCS/sum_total_row-col.csv.gz",
    "Gecko": "../data/output-allas/Gecko/comp_Wang_MACCS/global_row_max_bins.csv.gz",
    "Quinones": "../data/output-allas/Quinones/comp_Wang_MACCS/global_row_max_bins.csv.gz",
}

topo_atm_datasets_wang_ref = {
    "Wang": "../data/output-allas/Wang/comp_Wang_topological/sum_total_row-col.csv.gz",
    "Gecko": "../data/output-allas/Gecko/comp_Wang_topological/global_row_max_bins.csv.gz",
    "Quinones": "../data/output-allas/Quinones/comp_Wang_topological/global_row_max_bins.csv.gz",
}

# --- atm group, reference Gecko ---
maccs_atm_datasets_gecko_ref = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_MACCS/sum_total_row-col.csv.gz",
    "Wang": "../data/output-allas/Wang/comp_Gecko_MACCS/sum_total_col.csv",
    "Quinones": "../data/output-allas/Quinones/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
}

topo_atm_datasets_gecko_ref = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_topological/sum_total_row-col.csv.gz",
    "Wang": "../data/output-allas/Wang/comp_Gecko_topological/sum_total_col.csv",
    "Quinones": "../data/output-allas/Quinones/comp_Gecko_topological/global_row_max_bins.csv.gz",
}

# --- natm group, reference Wang ---
maccs_natm_datasets_wang_ref = {
    "Wang": "../data/output-allas/Wang/comp_Wang_MACCS/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Wang_MACCS/global_row_max_bins.csv.gz",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Wang_MACCS/global_row_max_bins.csv.gz",
    "QM9": "../data/output-allas/QM9/comp_Wang_MACCS/global_row_max_bins.csv.gz",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Wang_MACCS/global_row_max_bins.csv.gz",
}

topo_natm_datasets_wang_ref = {
    "Wang": "../data/output-allas/Wang/comp_Wang_topological/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Wang_topological/global_row_max_bins.csv.gz",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Wang_topological/global_row_max_bins.csv.gz",
    "QM9": "../data/output-allas/QM9/comp_Wang_topological/global_row_max_bins.csv.gz",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Wang_topological/global_row_max_bins.csv.gz",
}

# --- natm group, reference Gecko ---
maccs_natm_datasets_gecko_ref = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_MACCS/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
    "QM9": "../data/output-allas/QM9/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Gecko_MACCS/global_row_max_bins.csv.gz",
}

topo_natm_datasets_gecko_ref = {
    "Gecko": "../data/output-allas/Gecko/comp_Gecko_topological/sum_total_row-col.csv.gz",
    "MONA": "../data/output-allas/ExpMoNA/comp_Gecko_topological/global_row_max_bins.csv.gz",
    "MassBank, Eu": "../data/output-allas/mb_eu/comp_Gecko_topological/global_row_max_bins.csv.gz",
    "QM9": "../data/output-allas/QM9/comp_Gecko_topological/global_row_max_bins.csv.gz",
    "nablaDFT": "../data/output-allas/nablaDFT/comp_Gecko_topological/global_row_max_bins.csv.gz",
}

In [None]:
# 2x2 figure for the topological fingerprint ("max-row" input files):
# left column = Wang as reference, right column = Gecko as reference,
# top row = atm dictionaries, bottom row = natm dictionaries.
nbins = 200
bin_edges = np.linspace(0, 1, nbins + 1)
total = np.zeros(nbins)  # not read in this cell; kept in case other cells rely on it

# Define the color palette
colorblind_palette = ['#D55E00', '#E69F00', '#009E73', '#56B4E9', '#0072B2', '#CC79A7', '#F0E442']

# Set the palette
sns.set_palette(colorblind_palette)
colors = colorblind_palette

# The first colour of each list belongs to the first key, i.e. the reference set
color_atm = [colors[6], colors[4], colors[5]]
color_natm = [colors[6],   colors[2],  colors[3], colors[0], colors[1]]

# rc settings only affect artists created after they are set, so the tick label
# size is fixed before the figure exists (otherwise the result depends on
# whether the cell has already been run in this kernel).
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

legend='Gecko'
fig, axs = plt.subplots(2,2, sharey=True, sharex=True, figsize=(18.0*cm,7.0*cm), dpi=300, gridspec_kw = {'wspace':0, 'hspace':0})
# Keys are taken from the very dictionary that is plotted, so this cell does
# not depend on the separate "max-col" dictionaries having identical keys.
# The path-like seventh argument is the `title` parameter, which
# plot_dataset_distr_outline does not use.
plot_dataset_distr_outline(topo_atm_datasets_gecko_ref, list(topo_atm_datasets_gecko_ref.keys()), bin_edges,
                   1, axs, color_atm, '../reports/figures/atm_maccs_150124', legend)
plot_dataset_distr_outline(topo_natm_datasets_gecko_ref, list(topo_natm_datasets_gecko_ref.keys()),bin_edges,
                   3, axs, color_natm, '../reports/figures/natm_maccs_150124', legend)
legend='Wang'
plot_dataset_distr_outline(topo_atm_datasets_wang_ref, list(topo_atm_datasets_wang_ref.keys()), bin_edges,
                   0, axs, color_atm, '../reports/figures/atm_topo_150124', legend)
plot_dataset_distr_outline(topo_natm_datasets_wang_ref, list(topo_natm_datasets_wang_ref.keys()), bin_edges,
                   2, axs, color_natm, '../reports/figures/natm_topo_wang_150124', legend)
fig.subplots_adjust(wspace=0, hspace=0)
# Shared axis labels for the whole figure
fig.text(0.5, 0.01, 'Tanimoto similarity', ha='center')
fig.text(0.0, 0.5, r"$\rho$", va='center', rotation='vertical')

plt.tight_layout()
plt.savefig('../reports/figures/all_tanimoto_top_outline_max-row_201124'+'.png',  dpi=300, transparent=True)

In [None]:
# 2x2 figure for the MACCS fingerprint ("max-row" input files):
# left column = Wang as reference, right column = Gecko as reference,
# top row = atm dictionaries, bottom row = natm dictionaries.
nbins = 200
bin_edges = np.linspace(0, 1, nbins + 1)
total = np.zeros(nbins)  # not read in this cell; kept in case other cells rely on it

# Define the color palette
colorblind_palette = ['#D55E00', '#E69F00', '#009E73', '#56B4E9', '#0072B2', '#CC79A7', '#F0E442']

# Set the palette
sns.set_palette(colorblind_palette)
colors = colorblind_palette

# The first colour of each list belongs to the first key, i.e. the reference set
color_atm = [colors[6], colors[4], colors[5]]
color_natm = [colors[6],   colors[2],  colors[3], colors[0], colors[1]]

# rc settings only affect artists created after they are set, so the tick label
# size is fixed before the figure exists (otherwise the result depends on
# whether the cell has already been run in this kernel).
plt.rc('xtick', labelsize=8)
plt.rc('ytick', labelsize=8)

legend='Gecko'
fig, axs = plt.subplots(2,2, sharey=True, sharex=True, figsize=(18.0*cm,7.0*cm), dpi=300, gridspec_kw = {'wspace':0, 'hspace':0})
# Keys are taken from the very dictionary that is plotted, so this cell does
# not depend on the separate "max-col" dictionaries having identical keys.
# The path-like seventh argument is the `title` parameter, which
# plot_dataset_distr_outline does not use.
plot_dataset_distr_outline(maccs_atm_datasets_gecko_ref, list(maccs_atm_datasets_gecko_ref.keys()), bin_edges,
                   1, axs, color_atm, '../reports/figures/atm_maccs_150124', legend)
plot_dataset_distr_outline(maccs_natm_datasets_gecko_ref, list(maccs_natm_datasets_gecko_ref.keys()),bin_edges,
                   3, axs, color_natm, '../reports/figures/natm_maccs_150124', legend)
legend='Wang'
plot_dataset_distr_outline(maccs_atm_datasets_wang_ref, list(maccs_atm_datasets_wang_ref.keys()), bin_edges,
                   0, axs, color_atm, '../reports/figures/atm_topo_150124', legend)
plot_dataset_distr_outline(maccs_natm_datasets_wang_ref, list(maccs_natm_datasets_wang_ref.keys()), bin_edges,
                   2, axs, color_natm, '../reports/figures/natm_topo_wang_150124', legend)
fig.subplots_adjust(wspace=0, hspace=0)
# Shared axis labels for the whole figure
fig.text(0.5, 0.01, 'Tanimoto similarity', ha='center')
fig.text(0.0, 0.5, r"$\rho$", va='center', rotation='vertical')

plt.tight_layout()
plt.savefig('../reports/figures/all_tanimoto_maccs_outline_max-row_201124'+'.png',  dpi=300, transparent=True)

In [None]:
import numpy as np
import pandas as pd

def calculate_region_percentages(file_paths):
    """
    Calculates the percentage of counts in low (≤0.1), intermediate (0.1 to <0.4), 
    and high (≥0.4) regions for each dataset in the provided file paths dictionary.

    Every file is read without a header and flattened into one array of
    per-bin counts. The bins are assumed to span [0, 1] uniformly, and a bin is
    assigned to a region by its LEFT edge, so the bin that starts at 0.1 is
    counted as "low".

    Parameters:
    - file_paths: dict, dictionary where keys are dataset names and values are paths to CSV files

    Returns:
    - pd.DataFrame with one row per dataset (in dictionary order) and the
      columns 'Dataset', 'Low (≤0.1) %', 'Intermediate (0.1 to <0.4) %' and
      'High (≥0.4) %'; percentages are rounded to 1 decimal place. An empty
      dictionary gives an empty frame with these columns.
    """
    columns = ['Dataset', 'Low (≤0.1) %', 'Intermediate (0.1 to <0.4) %', 'High (≥0.4) %']

    # Rows are collected in a list and turned into a frame once at the end:
    # DataFrame.append no longer exists in pandas >= 2.0, and growing a frame
    # row by row copies it on every iteration.
    records = []

    for dataset_name, file_path in file_paths.items():
        total_counts = pd.read_csv(file_path, header=None).values.flatten()
        nbins = len(total_counts)
        left_edges = np.linspace(0, 1, nbins + 1)[:-1]
        total_sum = np.sum(total_counts)

        # Calculate percentage of counts in each region
        low_percentage = np.sum(total_counts[left_edges <= 0.1]) / total_sum * 100
        intermediate_percentage = np.sum(total_counts[(left_edges > 0.1) & (left_edges < 0.4)]) / total_sum * 100
        high_percentage = np.sum(total_counts[left_edges >= 0.4]) / total_sum * 100

        records.append({
            'Dataset': dataset_name,
            'Low (≤0.1) %': round(low_percentage, 1),
            'Intermediate (0.1 to <0.4) %': round(intermediate_percentage, 1),
            'High (≥0.4) %': round(high_percentage, 1)
        })

    return pd.DataFrame(records, columns=columns)



# Region percentages for the "max-col" path dictionaries: for every
# fingerprint / reference combination the atm and natm tables are stacked
# into one table.
maccs_wang_results = pd.concat([
    calculate_region_percentages(maccs_atm_datasets_wang),
    calculate_region_percentages(maccs_natm_datasets_wang)
], axis=0, ignore_index=True)

maccs_gecko_results = pd.concat([
    calculate_region_percentages(maccs_atm_datasets_gecko),
    calculate_region_percentages(maccs_natm_datasets_gecko)
], axis=0, ignore_index=True)

topo_gecko_results = pd.concat([
    calculate_region_percentages(topo_atm_datasets_gecko),
    calculate_region_percentages(topo_natm_datasets_gecko)
], axis=0, ignore_index=True)

topo_wang_results = pd.concat([
    calculate_region_percentages(topo_atm_datasets_wang),
    calculate_region_percentages(topo_natm_datasets_wang)
], axis=0, ignore_index=True)

# Export to Excel, one sheet per fingerprint / reference combination.
# The file name is held in one variable so that the message printed below
# always names the file that was actually written.
results_xlsx_col = "results_tables_max.xlsx"
with pd.ExcelWriter(results_xlsx_col) as writer:
    maccs_wang_results.to_excel(writer, sheet_name="MACCS_Wang_col", index=False)
    maccs_gecko_results.to_excel(writer, sheet_name="MACCS_Gecko_col", index=False)
    topo_gecko_results.to_excel(writer, sheet_name="Topological_Gecko_col", index=False)
    topo_wang_results.to_excel(writer, sheet_name="Topological_Wang_col", index=False)

print(f"Results saved to '{results_xlsx_col}'")


In [None]:
# Region percentages for the "max-row" (*_ref) path dictionaries: for every
# fingerprint / reference combination the atm and natm tables are stacked
# into one table.
maccs_wang_results_ref = pd.concat(
    [calculate_region_percentages(paths)
     for paths in (maccs_atm_datasets_wang_ref, maccs_natm_datasets_wang_ref)],
    axis=0, ignore_index=True,
)

maccs_gecko_results_ref = pd.concat(
    [calculate_region_percentages(paths)
     for paths in (maccs_atm_datasets_gecko_ref, maccs_natm_datasets_gecko_ref)],
    axis=0, ignore_index=True,
)

topo_gecko_results_ref = pd.concat(
    [calculate_region_percentages(paths)
     for paths in (topo_atm_datasets_gecko_ref, topo_natm_datasets_gecko_ref)],
    axis=0, ignore_index=True,
)

topo_wang_results_ref = pd.concat(
    [calculate_region_percentages(paths)
     for paths in (topo_atm_datasets_wang_ref, topo_natm_datasets_wang_ref)],
    axis=0, ignore_index=True,
)

# Write one sheet per table into a single workbook, in this order.
with pd.ExcelWriter("results_tables_max_row.xlsx") as writer:
    for _sheet_name, _table in (
        ("MACCS_Wang_row", maccs_wang_results_ref),
        ("MACCS_Gecko_row", maccs_gecko_results_ref),
        ("Topological_Gecko_row", topo_gecko_results_ref),
        ("Topological_Wang_row", topo_wang_results_ref),
    ):
        _table.to_excel(writer, sheet_name=_sheet_name, index=False)

print("Results saved to 'results_tables_max_row.xlsx'")