In [None]:
import os
import pandas as pd
from skbio.diversity import beta_diversity
from skbio.stats.ordination import pcoa


def plot_beta_diversity_pcoa_export_data(
    accession,
    group_info,
    taxonomy='species',
    work_dir=None,
    meta_file=None,
    host=None
):
    """
    Run a Bray-Curtis PCoA for one accession and return the first two axes per sample.

    Despite the name, nothing is plotted: the function only builds the table
    that a plot would be drawn from.

    Args:
        accession: Value matched against the ``accession`` column of the
            metadata CSV; also used to build the summary file name
            ``<work_dir>/<host>/<accession>_summary.xlsx``.
        group_info: Value matched against the ``info`` column of the metadata
            CSV to select which grouping variable to use.
        taxonomy: Sheet name read from the summary workbook.
        work_dir: Directory that contains one sub-directory per host. Required.
        meta_file: Path of the metadata CSV; it must provide the columns
            ``accession``, ``info``, ``sample`` and ``value``. Required.
        host: Sub-directory of ``work_dir`` holding the summary workbooks.
            Required.

    Returns:
        pd.DataFrame | None: One row per matched sample with the columns
        ``id``, ``sample``, ``umapx``, ``umapy``, ``metaclass``, ``accession``
        and ``contribution``. ``umapx``/``umapy`` hold the first two PCoA
        coordinates (the column names are kept for compatibility with existing
        consumers). ``contribution`` is the same string on every row:
        the percentage explained by the first and second axis, formatted as
        ``"xx.xx%;yy.yy%"``. ``None`` is returned (after printing a "skip"
        message) when the summary workbook does not exist or fewer than two
        samples of the workbook appear in the metadata.

    Raises:
        ValueError: If ``work_dir``, ``meta_file`` or ``host`` is ``None``.
    """
    # The three path arguments default to None only for keyword convenience;
    # fail early with a clear message instead of an opaque error further down.
    missing_args = [
        arg_name
        for arg_name, arg_value in (
            ('work_dir', work_dir), ('meta_file', meta_file), ('host', host))
        if arg_value is None
    ]
    if missing_args:
        raise ValueError(
            "plot_beta_diversity_pcoa_export_data requires: "
            + ", ".join(missing_args))

    # Read the metadata and keep the rows for this accession / grouping variable.
    pie_meta = pd.read_csv(meta_file, dtype=str)
    sub = pie_meta[(pie_meta['accession'] == accession)
                   & (pie_meta['info'] == group_info)]
    # If a sample occurs more than once, the last row wins.
    sample_to_meta = dict(zip(sub['sample'], sub['value']))

    summary_path = os.path.join(work_dir, host, f"{accession}_summary.xlsx")
    if not os.path.exists(summary_path):
        print(f"skip: {accession} no summary file")
        return None

    # Read abundance data; the sheet is transposed so that rows are samples.
    abundance = pd.read_excel(summary_path, sheet_name=taxonomy, index_col=0).T
    matched_samples = [s for s in abundance.index if s in sample_to_meta]
    if len(matched_samples) < 2:
        print(f"skip: {accession} - {group_info} Insufficient samples to calculate PCoA.")
        return None

    abundance = abundance.loc[matched_samples]

    # Normalization: divide every row (sample) by its own total so each sample
    # becomes relative abundance. A sample whose total is 0 produces NaN (0/0)
    # and is reset to an all-zero row by fillna.
    abundance_norm = abundance.div(abundance.sum(axis=1), axis=0).fillna(0)

    # Calculate Bray-Curtis distance
    dist_matrix = beta_diversity(
        'braycurtis', abundance_norm.values, ids=abundance_norm.index)

    # Perform PCoA analysis and keep the first two axes. The slice is copied so
    # that renaming its columns can never touch the ordination result itself.
    pcoa_result = pcoa(dist_matrix)
    pcoa_points = pcoa_result.samples.iloc[:, :2].copy()
    pcoa_points.columns = ['umapx', 'umapy']

    # Calculate contribution rate. proportion_explained is indexed by axis
    # label, so the first two entries must be taken by position with .iloc;
    # plain integer keys on a label index are deprecated in pandas.
    contributions = pcoa_result.proportion_explained.iloc[:2] * 100
    contribution_label = (
        f"{contributions.iloc[0]:.2f}%;{contributions.iloc[1]:.2f}%")

    # Construct the result table. rename_axis names the sample index before it
    # becomes a column, so this works whether or not the index already had a name.
    result = pcoa_points.rename_axis('sample').reset_index()
    result['metaclass'] = result['sample'].map(sample_to_meta)
    result['accession'] = accession
    result['contribution'] = contribution_label
    result.insert(0, 'id', range(len(result)))

    return result

In [None]:
# Input locations: per-host Bracken summary workbooks and the sample metadata table.
host = "mouse"
work_dir = "D:/Project/gutDBase/bracken_summary_GSE_filtered"
meta_file = "D:/Project/gutDBase/metadata/mus_pie.csv"

# Build one (accession, info) work item per distinct accession in the metadata;
# every item uses the 'metaclass' grouping variable.
pie_meta = pd.read_csv(meta_file, dtype=str)
accessions = pie_meta['accession'].unique()
combo_table = pd.DataFrame({'accession': accessions})
combo_table['info'] = 'metaclass'

In [None]:
# Per-accession PCoA tables are collected here and written out as a single CSV.
all_data = []
taxonomy = 'species'
output_dir = os.path.join(
    "D:/Project/gutDBase/Beta_diversity_metaclass_csv", host)
os.makedirs(output_dir, exist_ok=True)

# Best-effort batch run: a failure for one accession is reported and the loop
# moves on to the next one.
for acc, info in zip(combo_table['accession'], combo_table['info']):
    print(f"processing {acc} - {info}")
    try:
        res = plot_beta_diversity_pcoa_export_data(
            accession=acc,
            group_info=info,
            taxonomy=taxonomy,
            work_dir=work_dir,
            meta_file=meta_file,
            host=host
        )
    except Exception as e:
        print(f"⚠️ exception occurred: {e}")
    else:
        # None means the accession was skipped (no file / too few samples).
        if res is not None:
            all_data.append(res)

if not all_data:
    print("⚠️ No available data to export.")
else:
    all_combined = pd.concat(all_data, ignore_index=True)
    out_path = os.path.join(output_dir, "combined_beta_diversity_data.csv")
    all_combined.to_csv(out_path, index=False)
    print(f"✅ All data has been successfully saved to {out_path}")