In [None]:
import os
import pandas as pd
import numpy as np


def plot_microbiome_export_data(
    accession,
    group_info,
    taxonomy='phylum',
    work_dir=None,
    meta_file=None,
    host=None,
    order_by='Bacteroidota'
):
    """Build a long-format relative-abundance table for one accession.

    The grouping of samples is read from ``meta_file`` (rows whose
    ``accession`` and ``info`` columns match the arguments; ``value`` is the
    group label and ``sample`` the sample name). Abundances and per-sample
    metadata are read from the ``taxonomy`` and ``metaData`` sheets of
    ``<work_dir>/<host>/<accession>_summary.xlsx``.

    Parameters
    ----------
    accession : str
        Accession used both to filter ``meta_file`` and to locate the
        summary workbook.
    group_info : str
        Value of the ``info`` column selecting which grouping to use.
    taxonomy : str
        Sheet name of the abundance table. Its first letter is used to strip
        the ``"<letter>__"`` prefix from taxon names.
    work_dir, host : str
        Path components of the directory holding the summary workbook.
    meta_file : str
        CSV file with ``accession``, ``info``, ``value`` and ``sample``
        columns.
    order_by : str
        Taxon whose relative abundance is used to sort the samples. If it is
        absent a message is printed and the samples stay unsorted.

    Returns
    -------
    pandas.DataFrame or None
        Columns ``id, sample, microbial_level, metaclass, accession,
        number``; one row per (sample, taxon) for every sample that belongs
        to a group. ``None`` when the summary workbook is missing or when the
        number of abundance columns differs from the number of metadata rows.

    Raises
    ------
    KeyError
        If the metadata sheet provides no ``'Sample Name'`` (neither as a
        column nor as its index).
    """
    # Read metadata and keep only the rows describing this accession/grouping
    pie_meta = pd.read_csv(meta_file, dtype=str)
    sub = pie_meta[(pie_meta['accession'] == accession)
                   & (pie_meta['info'] == group_info)]
    group_list = sub.groupby('value')['sample'].apply(list).to_dict()

    # Reverse lookup sample -> group. setdefault keeps the first group that
    # lists a sample, which matches a first-match scan over group_list.
    sample_to_group = {}
    for grp, samples in group_list.items():
        for sample in samples:
            if pd.isna(sample):
                continue  # a missing sample id cannot identify a sample
            sample_to_group.setdefault(sample, grp)

    # Check if the summary file exists
    summary_path = os.path.join(work_dir, host, f"{accession}_summary.xlsx")
    if not os.path.exists(summary_path):
        print(f"Skip:{accession} - {group_info} not have a corresponding summary file")
        return None

    # Read abundance table and metaData
    abundance = pd.read_excel(summary_path, sheet_name=taxonomy, index_col=0)
    meta_data = pd.read_excel(summary_path, sheet_name='metaData', index_col=0)

    # Consistency check of sample size
    if abundance.shape[1] != meta_data.shape[0]:
        print(f"skip: {accession} - {group_info} Incomplete download data")
        return None

    # Normalize abundances (column sum normalization)
    abundance_norm = abundance / abundance.sum(axis=0)
    # Simplify the row name by removing the prefix from "X__Y" and retaining Y
    abundance_norm.index = abundance_norm.index.str.split(
        f"{taxonomy[0]}__", n=1).str[-1]

    # Check if order_by exists
    can_sort = order_by in abundance_norm.index
    if not can_sort:
        print(f"{accession} - {group_info} No {order_by} detected in the middle, unable to sort.")

    # Build long-format data
    # Convert abundance_norm to a table of samples x microorganisms.
    mat = abundance_norm.T
    # Merge with meta_data.
    # NOTE(review): rows are paired by position, not by label, so this relies
    # on the metaData rows being in the same order as the abundance columns.
    meta_flat = meta_data.reset_index(drop=False)
    long_df = pd.concat([meta_flat, mat.reset_index(drop=True)], axis=1)
    if can_sort:
        long_df = long_df.sort_values(by=order_by)
    long_df = long_df.set_index(long_df.index.astype(str))
    long_df['SampleID'] = long_df.index

    # melt
    # Every metadata column, including the former index, is carried as an id
    # column. melt emits one full block of samples per taxon, so letting it
    # replicate 'Sample Name' keeps each name attached to its own values.
    id_cols = ['SampleID'] + [c for c in meta_flat.columns if c != 'SampleID']
    plot_data = long_df.melt(
        id_vars=id_cols,
        value_vars=list(mat.columns),
        var_name='Phylum',
        value_name='Abundance'
    )

    # Convert column type
    plot_data['Abundance'] = plot_data['Abundance'].astype(float)
    if 'Sample Name' not in plot_data.columns:
        raise KeyError(
            "'Sample Name' not found in the metaData sheet (column or index)")

    # Add group label GSMgroup; samples without a group are dropped
    plot_data['GSMgroup'] = plot_data['Sample Name'].map(sample_to_group)
    plot_data = plot_data.dropna(subset=['GSMgroup'])

    # Construct output results
    export_data = plot_data[['Sample Name',
                             'Phylum', 'GSMgroup', 'Abundance']].copy()
    export_data.columns = ['sample', 'microbial_level', 'metaclass', 'number']
    export_data['accession'] = accession
    export_data['id'] = np.arange(len(export_data))
    export_data = export_data[[
        'id', 'sample', 'microbial_level', 'metaclass', 'accession', 'number']]

    return export_data

In [None]:
# Where the per-accession summary workbooks and the grouping metadata live.
work_dir = "D:/Project/gutDBase/bracken_summary_GSE_filtered"
host = "human"
meta_file = "D:/Project/gutDBase/metadata/hum_pie.csv"

# One work item per distinct accession in the metadata, each paired with the
# fixed grouping key 'metaclass'.
pie_meta = pd.read_csv(meta_file, dtype=str)
accessions = pie_meta['accession'].unique()
combo_table = pd.DataFrame({'accession': accessions}).assign(info='metaclass')

In [None]:
# Run the export for every (accession, info) pair and write one combined CSV.
all_data = []
taxonomy = "phylum"
output_dir = os.path.join("D:/Project/gutDBase/test", host)
os.makedirs(output_dir, exist_ok=True)

for acc, info in zip(combo_table['accession'], combo_table['info']):
    print(f"processing: {acc} - {info}")
    try:
        res = plot_microbiome_export_data(
            accession=acc,
            group_info=info,
            taxonomy=taxonomy,
            work_dir=work_dir,
            meta_file=meta_file,
            host=host,
            order_by="Bacteroidota",
        )
    except Exception as e:
        # Best effort: report the failure and move on to the next accession.
        print(f"⚠️ exception: {e}")
    else:
        # None means the accession was skipped inside the function.
        if res is not None:
            all_data.append(res)

if not all_data:
    print("⚠️ No available data to export.")
else:
    # NOTE(review): each per-accession frame numbers its 'id' column from 0,
    # so 'id' is not unique in the combined file — confirm this is intended.
    all_combined = pd.concat(all_data, ignore_index=True)
    out_path = os.path.join(output_dir, f"combined_microbiome_data_{taxonomy}.csv")
    all_combined.to_csv(out_path, index=False)
    print(f"✅ All data has been successfully saved to {out_path}")