In [None]:
import pandas as pd
import os
import numpy as np

In [None]:
# ---------------------------------------------------------------------------
# Run configuration: everything a reader might want to tune lives in this cell.
# ---------------------------------------------------------------------------
host = "human"
taxonomy = 'phylum'

# Input locations.
work_dir = "D:/Project/gutDBase/bracken_summary_GSE_filtered"
meta_file = "D:/Project/gutDBase/metadata/hum_pie.csv"

# One row per unique accession found in the metadata, each tagged with the
# 'metaclass' grouping; the driver loop iterates over this table.
pie_meta = pd.read_csv(meta_file, dtype=str)
accessions = pie_meta['accession'].unique()
combo_table = pd.DataFrame({
    'accession': accessions,
    'info': ['metaclass' for _ in accessions],
})

# Accumulators for the per-accession result frames.
all_data_association, all_data_LDA = [], []

# Host-specific folders. `exp_foler` (sic) keeps its original spelling because
# the driver cell refers to it by that name.
output_dir = os.path.join(
    "D:/Project/gutDBase/Microbiome_host_association_metaclass_csv", host)
exp_foler = os.path.join(
    "D:/Project/gutDBase/exp", host)

# Make sure the export folder exists before anything is written to it.
os.makedirs(output_dir, exist_ok=True)

In [None]:
import os
import numpy as np
import pandas as pd
from scipy.stats import pearsonr


def plot_microbiome_host_association_export_data(
    accession,
    taxonomy,
    work_dir,
    meta_file,
    host,
    expression_file,
    group_info,
    group_filter=None,
    cor_p_filter=0.05,
    lefse_dir=R"D:\Project\gutDBase\LEfSe_result",
    lda_threshold=2,
):
    """
    Correlate LEfSe-selected taxa with host gene expression for one accession.

    Despite its name this function draws nothing: it builds the two tables
    that are exported by the caller.

    Args:
        accession: Accession id; selects rows of the metadata table and is
            used to build the abundance and LEfSe file names.
        taxonomy: Sheet name read from the abundance workbook; also written
            to the ``level`` column of the LDA table.
        work_dir: Folder containing ``<host>/<accession>_summary.xlsx``.
        meta_file: CSV with at least ``accession``, ``info`` and ``sample``
            columns.
        host: Host sub-folder name used for the abundance and LEfSe paths.
        expression_file: Tab-separated expression table whose first column is
            the row index and whose remaining columns are samples.
        group_info: Value of the metadata ``info`` column to keep.
        group_filter: Optional ``{column: allowed_values}`` mapping applied to
            the metadata rows with ``isin``.
        cor_p_filter: Correlations whose p-value is not below this threshold
            are reported as 0.
        lefse_dir: Root folder holding ``<host>/<accession>/lefse.LDA.xls``.
            Defaults to the project's original LEfSe result location.
        lda_threshold: Only LEfSe rows with an LDA score strictly greater
            than this value are kept.

    Returns:
        ``None`` when the accession has to be skipped (no matching metadata
        rows, missing abundance or LEfSe file, fewer than two usable samples,
        or no selected taxon present in the abundance table); the reason is
        printed. Callers must check for ``None`` before unpacking.

        Otherwise a tuple ``(df_final, df_LDA)``:
            df_final: columns ``id, species, gene, correlation, accession``.
            df_LDA: columns ``id, species, group, ldascore, full_taxonomy,
            level, taxa, accession``.
    """

    # 1. Obtain sample grouping
    pie_meta = pd.read_csv(meta_file, dtype=str)
    sub = pie_meta[(pie_meta['accession'] == accession)
                   & (pie_meta['info'] == group_info)]
    if group_filter:
        for key, val in group_filter.items():
            sub = sub[sub[key].isin(val)]

    if sub.empty:
        print(f"⚠️ {accession}-{group_info} No qualifying samples.")
        return None

    sample_list = sub['sample'].tolist()

    # 2. Read the microbial abundance table (transposed to samples x taxa).
    summary_path = os.path.join(work_dir, host, f"{accession}_summary.xlsx")
    if not os.path.exists(summary_path):
        print(f"❌ file not found: {summary_path}")
        return None

    abundance = pd.read_excel(summary_path, sheet_name=taxonomy, index_col=0).T
    abundance = abundance.loc[abundance.index.intersection(sample_list)]
    if abundance.shape[0] < 2:
        print(f"⚠️ sample number less than 2: {accession}")
        return None

    # 3. Select taxa from the LEfSe result.
    # The path is built once; the previous inline form nested backslashes
    # inside an f-string expression, which is a SyntaxError before Python 3.12.
    lefse_path = os.path.join(lefse_dir, host, accession, "lefse.LDA.xls")
    if not os.path.exists(lefse_path):
        print(f"❌ file not found: {lefse_path}")
        return None

    lefse_result = pd.read_csv(lefse_path, sep='\t', header=None)
    lefse_result.columns = ['Taxonomy', 'mean abundance(log10)',
                            'the class with the highest mean', 'LDA score', 'p value']
    # Coerce so that blank or placeholder entries become NaN instead of
    # breaking the numeric comparison below.
    lefse_result['LDA score'] = pd.to_numeric(
        lefse_result['LDA score'], errors='coerce')
    selected_lefse_result = lefse_result[
        lefse_result['LDA score'] > lda_threshold].copy()
    # LEfSe joins ranks with '.', the abundance table uses '|'. Only dots that
    # are directly followed by a rank prefix (k__, p__, ...) are rewritten.
    selected_lefse_result['Taxonomy'] = selected_lefse_result['Taxonomy'].str.replace(
        r'\.(?=[kpcofgs]__)', '|', regex=True)
    selected_species = [tax for tax in selected_lefse_result['Taxonomy']
                        if tax in abundance.columns]

    if not selected_species:
        print(f"⚠️ no selected species: {accession}")
        return None

    # 4. Read gene expression data and align it with the abundance samples.
    # pearsonr pairs values by position, so both tables must contain exactly
    # the same samples in exactly the same order.
    gene_exp = pd.read_csv(expression_file, sep="\t", index_col=0)
    expression_samples = set(gene_exp.columns)
    shared_samples = [s for s in abundance.index if s in expression_samples]
    if len(shared_samples) < 2:
        print(f"⚠️ sample number less than 2: {accession}")
        return None
    gene_exp = gene_exp.loc[:, shared_samples]

    # 5. Normalization: every sample (column) is scaled to sum to 1. For the
    # taxa this sum runs over the selected taxa only.
    micro_abu = abundance.loc[shared_samples, selected_species].T
    micro_abu_norm = micro_abu.div(micro_abu.sum(axis=0), axis=1).fillna(0)
    gene_exp_norm = gene_exp.div(gene_exp.sum(axis=0), axis=1).fillna(0)

    # 6. Pearson correlation; non-significant (or undefined) pairs are set to 0.
    micro_values = micro_abu_norm.to_numpy(dtype=float)
    gene_values = gene_exp_norm.to_numpy(dtype=float)
    cor_result = []
    for sp, sp_row in zip(micro_abu_norm.index, micro_values):
        sp_label = sp.split('|')[-1]
        for gene, gene_row in zip(gene_exp_norm.index, gene_values):
            corr, p = pearsonr(sp_row, gene_row)
            cor_result.append(
                [sp_label, gene, corr if p < cor_p_filter else 0, p])

    df_cor = pd.DataFrame(cor_result, columns=[
                          'species', 'gene', 'correlation', 'p_value'])
    df_cor['accession'] = accession

    # 7. Retain genes and taxa that take part in at least one non-zero
    # correlation.
    nonzero = df_cor['correlation'].fillna(0) != 0
    species_keep = df_cor.loc[nonzero, 'species'].unique()
    gene_keep = df_cor.loc[nonzero, 'gene'].unique()

    df_final = df_cor.loc[
        df_cor['species'].isin(species_keep) & df_cor['gene'].isin(gene_keep),
        ['species', 'gene', 'correlation', 'accession'],
    ].copy()
    df_final.insert(0, 'id', range(len(df_final)))

    # 8. Build LDA table
    # NOTE(review): this keeps every LEfSe row above the threshold, including
    # rows that are absent from the abundance sheet, and labels all of them
    # with `taxonomy` as their level - confirm this is intended.
    df_LDA = selected_lefse_result[
        ['Taxonomy', 'the class with the highest mean', 'LDA score']].copy()
    df_LDA['full_taxonomy'] = df_LDA['Taxonomy']
    df_LDA['Taxonomy'] = df_LDA['Taxonomy'].apply(lambda tax: tax.split('|')[-1])
    df_LDA['level'] = taxonomy
    df_LDA['taxa'] = df_LDA['Taxonomy'].apply(lambda tax: tax.split('__', 1)[-1])
    df_LDA['accession'] = accession
    df_LDA = df_LDA.rename(columns={
        'Taxonomy': 'species',
        'the class with the highest mean': 'group',
        'LDA score': 'ldascore',
    })
    df_LDA.insert(0, 'id', range(len(df_LDA)))

    return df_final, df_LDA

In [None]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every accession's rows to the exported tables.
all_data_association = []
all_data_LDA = []

for _, row in combo_table.iterrows():
    acc = row['accession']
    info = row['info']
    print(f"processing {acc} - {info}")
    try:
        result = plot_microbiome_host_association_export_data(
            accession=acc,
            group_info=info,
            taxonomy=taxonomy,
            work_dir=work_dir,
            meta_file=meta_file,
            host=host,
            expression_file=os.path.join(
                exp_foler, f"{acc}_bulk_GeneExpression.txt"))
    except Exception as e:
        # Best-effort batch run: report the failure and move on to the next
        # accession instead of aborting the whole export.
        print(f"⚠️ exception occurred: {e}")
        continue

    # The function returns None (after printing the reason) when an accession
    # has to be skipped; unpacking that directly would raise a TypeError and
    # bury the real reason under a misleading "exception occurred" message.
    if result is None:
        continue

    association, LDA_score = result
    if association is not None:
        all_data_association.append(association)
    if LDA_score is not None:
        all_data_LDA.append(LDA_score)

# Export the species/gene association table.
if all_data_association:
    all_combined = pd.concat(all_data_association, ignore_index=True)
    out_path = os.path.join(
        output_dir, f"microbiome_host_association_data_{taxonomy}.csv")
    # The species and gene columns are exported as 'cell_type1'/'cell_type2'
    # (presumably the schema expected downstream - TODO confirm).
    all_combined.columns = ['id', 'cell_type1', 'cell_type2',
                            'correlation', 'accession']
    all_combined.to_csv(out_path, index=False)
    print(f"✅ All data has been successfully saved to {out_path}")
else:
    print("⚠️ No available data to export.")

# Export the LDA score table.
if all_data_LDA:
    all_LDA = pd.concat(all_data_LDA, ignore_index=True)
    out_path = os.path.join(
        output_dir, f"LDA_score_data_{taxonomy}.csv")
    all_LDA.to_csv(out_path, index=False)
    print(f"✅ All data has been successfully saved to {out_path}")
else:
    print("⚠️ No available data to export.")

In [None]:
# Scratch inspection of a single LEfSe result: a tab-separated file without a
# header row, so columns get integer labels until they are named.
# NOTE(review): this path has no host sub-folder, unlike the one built inside
# plot_microbiome_host_association_export_data - confirm which layout is current.
LDA_df = pd.read_csv(
    R"D:\Project\gutDBase\LEfSe_result\GSE46513\lefse.LDA.xls",
    sep='\t',
    header=None,
)

In [None]:
# Preview the first five raw rows.
LDA_df.head(n=5)

In [None]:
# Name the five positional columns of the LEfSe table.
LDA_df.columns = [
    'Taxonomy',
    'mean abundance(log10)',
    'the class with the highest mean',
    'LDA score',
    'p value',
]

In [None]:
# Keep rows whose LDA score is strictly above 2; rows without a score compare
# as False and are dropped. The copy keeps later column edits off LDA_df.
selected_LDA_df = LDA_df.loc[LDA_df['LDA score'].gt(2)].copy()
selected_LDA_df.head()

In [None]:
# Turn the '.' between ranks into '|'. The look-ahead limits the replacement
# to dots that are immediately followed by a rank prefix such as 'p__', so
# dots inside a taxon name are left alone.
rank_boundary = r'\.(?=[kpcofgs]__)'
selected_LDA_df['Taxonomy'] = selected_LDA_df['Taxonomy'].str.replace(
    rank_boundary, '|', regex=True)

In [None]:
# Preview the first five selected rows after the separator rewrite.
selected_LDA_df.head(n=5)

In [None]:
# Taxonomy strings of all rows with an LDA score above 2. The columns were
# given names in an earlier cell, so the integer labels 3 and 0 no longer
# exist and would raise a KeyError on a top-to-bottom run.
LDA_df.loc[LDA_df['LDA score'] > 2, 'Taxonomy'].tolist()