In [None]:
import os
import pandas as pd
from skbio.diversity import alpha_diversity
from scipy.stats import mannwhitneyu


def plot_alpha_diversity_export_data(
    accession,
    group_info,
    taxonomy='species',
    work_dir=None,
    meta_file=None,
    host=None
):
    """
    Compute per-sample alpha diversity (Chao1, Shannon, Simpson) for one
    accession and attach the grouping value found in the metadata table.

    Despite its name, this function draws nothing; it only builds and
    returns the data table.

    Args:
        accession (str): Accession number, such as GSE123456
        group_info (str): Value of the metadata ``info`` column that selects
            the grouping rows (such as metaclass)
        taxonomy (str): Sheet of the summary workbook to read (microbial
            hierarchy level)
        work_dir (str): Working directory that contains the ``host``
            sub-directory
        meta_file (str): Metadata CSV path; the columns ``accession``,
            ``info``, ``sample`` and ``value`` are read from it
        host (str): Subdirectory name (e.g. human)

    Returns:
        pd.DataFrame or None: One row per matched sample with the columns
        ``id``, ``sample``, ``chao1``, ``shannon``, ``simpson``,
        ``metaclass``, ``accession`` and ``p_value``. ``p_value`` holds the
        three two-sided Mann-Whitney U p-values joined as
        ``"chao1;shannon;simpson"`` when exactly two non-missing groups are
        present, otherwise the string ``'NA'``. ``None`` is returned when the
        summary workbook does not exist or no sample of the workbook appears
        in the selected metadata rows.
    """
    metrics = ('chao1', 'shannon', 'simpson')

    # Cheap existence check first, so accessions without a summary workbook
    # are skipped before the metadata CSV is parsed.
    summary_path = os.path.join(work_dir, host, f"{accession}_summary.xlsx")
    if not os.path.exists(summary_path):
        print(f"skip: {accession} no summary file")
        return None

    # Read metadata and keep only the rows of this accession / grouping key
    pie_meta = pd.read_csv(meta_file, dtype=str)
    selected = ((pie_meta['accession'] == accession)
                & (pie_meta['info'] == group_info))
    sub = pie_meta.loc[selected]
    sample_to_meta = dict(zip(sub['sample'], sub['value']))

    # Read the abundance table (original count)
    abundance = pd.read_excel(summary_path, sheet_name=taxonomy, index_col=0)
    abundance = abundance.T  # Each row is a sample.

    # Match samples in the metadata
    matched_samples = [s for s in abundance.index if s in sample_to_meta]
    if not matched_samples:
        print(f"skip: {accession} - {group_info} No matching samples")
        return None

    abundance = abundance.loc[matched_samples]

    # Calculate alpha diversity; the count matrix is extracted only once
    counts = abundance.values
    diversity = {
        metric: alpha_diversity(metric, counts, ids=abundance.index).values
        for metric in metrics
    }

    result = pd.DataFrame({
        'sample': abundance.index,
        **diversity,
        'metaclass': [sample_to_meta[sample] for sample in abundance.index],
        'accession': accession
    })

    result.insert(0, 'id', range(len(result)))

    # Statistical testing.
    # An empty metadata cell is read as NaN even with dtype=str. NaN is not a
    # group: counting it would either start a test against an empty sample
    # (NaN == NaN is False, so no row matches) or hide two valid groups behind
    # a "three groups" count. Rows with a missing label stay in the table.
    groups = result['metaclass'].dropna().unique()
    if len(groups) == 2:
        g1, g2 = groups
        in_g1 = result['metaclass'] == g1
        in_g2 = result['metaclass'] == g2

        pvals = []
        for metric in metrics:
            _, pval = mannwhitneyu(
                result.loc[in_g1, metric],
                result.loc[in_g2, metric],
                alternative='two-sided'
            )
            pvals.append(pval)

        result['p_value'] = ';'.join(f"{p}" for p in pvals)
    else:
        result['p_value'] = 'NA'
        print("Warning: There are not two groups, unable to perform Mann–Whitney U test.")

    return result

In [None]:
# Input locations: root of the per-host summary workbooks, the host
# sub-directory to process, and the sample metadata table.
work_dir = "D:/Project/gutDBase/bracken_summary_GSE_filtered"
host = "mouse"
meta_file = "D:/Project/gutDBase/metadata/mus_pie.csv"

# Build the work list: every distinct accession of the metadata table paired
# with the grouping key 'metaclass'.
pie_meta = pd.read_csv(meta_file, dtype=str)
accessions = pie_meta['accession'].unique()
combo_table = pd.DataFrame({
    'accession': accessions,
    'info': ['metaclass' for _ in range(len(accessions))],
})

In [None]:
# Per-accession result tables are gathered here and concatenated at the end.
all_data = []
taxonomy = 'species'

# Output folder is created per host; an already existing folder is fine.
output_dir = os.path.join(
    "D:/Project/gutDBase/Alpha_diversity_metaclass_csv", host)
os.makedirs(output_dir, exist_ok=True)

# Walk the (accession, info) pairs. A failure in one accession is reported
# and does not stop the remaining ones.
for acc, info in zip(combo_table['accession'], combo_table['info']):
    print(f"processing {acc} - {info}")
    try:
        res = plot_alpha_diversity_export_data(
            accession=acc,
            group_info=info,
            taxonomy=taxonomy,
            work_dir=work_dir,
            meta_file=meta_file,
            host=host
        )
    except Exception as e:
        print(f"⚠️ exception occurred: {e}")
    else:
        # None means the accession was skipped inside the function.
        if res is not None:
            all_data.append(res)

# Write a single CSV with every collected table, or report that none exists.
if not all_data:
    print("⚠️ No available data to export.")
else:
    out_path = os.path.join(output_dir, "alpha_diversity_data.csv")
    all_combined = pd.concat(all_data, ignore_index=True)
    all_combined.to_csv(out_path, index=False)
    print(f"✅ All data has been successfully saved to {out_path}")