In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
def preprocess_LEfSe(abundance_df, group_series, accession):
    """
    Build a LEfSe-style input table from an abundance table and sample groups.

    The result has one column per sample. Its first row, labelled 'Group',
    holds each sample's group; the remaining rows are the columns of
    `abundance_df`. The index is named 'Subject'. Nothing is written to disk
    here; saving is left to the caller.

    args:
        abundance_df (pd.DataFrame): Abundance table with samples as rows and
            features as columns (it is transposed internally).
        group_series (pd.Series): Sample grouping information (index is the
            sample name, value is the group).
        accession (str): Project identifier. Not used by this function; kept
            so existing callers keep working.

    returns:
        pd.DataFrame: 'Group' row followed by one row per column of
            `abundance_df`, limited to samples present in both inputs and
            ordered as in `group_series`.
    """

    # Keep only samples that have both a group label and abundance data.
    # Selecting a sample that is absent from the abundance table would raise
    # a KeyError, so labels without abundance data are dropped up front.
    has_abundance = group_series.index.isin(abundance_df.index)
    shared_groups = group_series[has_abundance]

    # One-row frame holding the group label of every retained sample.
    group_series_df = pd.DataFrame(shared_groups).T
    group_series_df.index = ['Group']

    # Features as rows, samples as columns, in the same column order as the
    # group row so the two frames stack cleanly.
    df_microbe = abundance_df.T[group_series_df.columns]
    df_out = pd.concat([group_series_df, df_microbe], axis=0)
    df_out.index.name = 'Subject'

    return df_out

In [None]:
def hock(
    accession,
    taxonomy,
    work_dir,
    meta_file,
    host,
    group_info,
    group_filter=None,
    output_path=None,
):
    """
    Build the LEfSe input table for one accession and optionally save it.

    Sample groups come from `meta_file` (rows whose 'accession' and 'info'
    columns equal `accession` and `group_info`). Abundances come from sheet
    `taxonomy` of `<work_dir>/<host>/<accession>_summary.xlsx`. Only samples
    present in both sources are used.

    args:
        accession (str): Project identifier; selects metadata rows and names
            the input workbook and the output file.
        taxonomy (str): Sheet name to read from the summary workbook.
        work_dir (str): Directory that contains one sub-directory per host.
        meta_file (str): CSV with at least the columns 'accession', 'info',
            'sample' and 'value'.
        host (str): Sub-directory of `work_dir` holding the workbook.
        group_info (str): Value of the 'info' column to select.
        group_filter (dict | None): Optional {column: allowed values}; a row
            is kept only if each listed column holds one of its allowed values.
        output_path (str | None): Directory that receives
            `<accession>_lefse_input.tsv` (created if missing). When None,
            nothing is written.

    returns:
        pd.DataFrame | None: The table produced by `preprocess_LEfSe`, or None
            when no metadata rows match, the workbook does not exist, or fewer
            than two samples have abundance data.
    """

    # 1. Obtain sample grouping
    pie_meta = pd.read_csv(meta_file, dtype=str)
    sub = pie_meta[(pie_meta['accession'] == accession)
                   & (pie_meta['info'] == group_info)]
    if group_filter:
        for key, val in group_filter.items():
            sub = sub[sub[key].isin(val)]

    if sub.empty:
        print(f"⚠️ {accession}-{group_info} No qualifying samples.")
        return None

    sample_list = sub['sample'].tolist()
    sample_to_meta = dict(zip(sub['sample'], sub['value']))

    # 2. Read the microbial abundance table.
    summary_path = os.path.join(work_dir, host, f"{accession}_summary.xlsx")
    if not os.path.exists(summary_path):
        print(f"❌ The file does not exist {summary_path}")
        return None

    # Transposed after reading so that rows can be matched against sample names.
    abundance = pd.read_excel(summary_path, sheet_name=taxonomy, index_col=0).T
    abundance = abundance.loc[abundance.index.intersection(sample_list)]
    if abundance.shape[0] < 2:
        print(f"⚠️ Insufficient sample size, skipping {accession}")
        return None

    # 3. Keep group labels only for samples that survived the intersection
    #    above. Metadata samples with no abundance data would otherwise make
    #    the column selection in preprocess_LEfSe raise a KeyError.
    group_series = pd.Series(sample_to_meta)
    group_series = group_series[group_series.index.isin(abundance.index)]

    preprocess_data = preprocess_LEfSe(abundance, group_series, accession)

    # 4. Save only when a destination was given; os.path.join(None, ...) would
    #    raise a TypeError with the default output_path.
    if output_path is not None:
        os.makedirs(output_path, exist_ok=True)
        preprocess_data.to_csv(os.path.join(output_path, f"{accession}_lefse_input.tsv"), sep='\t')

    return preprocess_data

In [None]:
# Input locations and the sheet to read from each summary workbook.
work_dir = "D:/Project/gutDBase/bracken_summary_GSE_filtered"
host = "human"
meta_file = "D:/Project/gutDBase/metadata/hum_pie.csv"
taxonomy = 'species'

# One (accession, info) row per distinct accession found in the metadata;
# every row uses the 'metaclass' grouping.
pie_meta = pd.read_csv(meta_file, dtype=str)
accessions = pie_meta['accession'].unique()
combo_table = pd.DataFrame({
    'accession': accessions,
    'info': ['metaclass' for _ in range(len(accessions))],
})

# Output directory for this host, created up front.
output_dir = os.path.join("D:/Project/gutDBase/LEfSe_preprocess", host)
os.makedirs(output_dir, exist_ok=True)


In [None]:
# Build the LEfSe input for every (accession, info) pair. A failure on one
# accession is reported and the loop continues with the next one.
for _, row in combo_table.iterrows():
    acc, info = row['accession'], row['info']
    print(f"processing: {acc} - {info}")
    try:
        hock(
            accession=acc,
            taxonomy=taxonomy,
            work_dir=work_dir,
            meta_file=meta_file,
            host=host,
            group_info=info,
            output_path=output_dir,
        )
    except Exception as e:
        print(f"⚠️ exception occurred: {e}")