In [None]:
import pandas as pd
import os
import glob


# Run configuration.
# `host` names the per-host folders, `host_short` the metadata file and the
# species filter, `taxonomy` the workbook sheet that holds the abundances.
host, host_short = 'mouse', 'mus'
taxonomy = 'species'
# Study type used further down to choose which series are kept.
target_seq_type = 'Expression profiling by high throughput sequencing'

# Per-host metadata table; every column is read as a string.
metadata_csv = Rf"D:\Project\gutDBase\metadata\{host_short}_pie.csv"
metadata = pd.read_csv(metadata_csv, dtype=str)

Expression profiling by high throughput sequencing

Expression profiling by array

In [None]:
# Accessions that own at least one 'metaclass' metadata record.
is_metaclass = metadata['info'] == 'metaclass'
all_gse_list = metadata.loc[is_metaclass, 'accession'].unique()

In [None]:
# Distinct 'value' entries found among the 'metaclass' records.
disease_type_list = metadata.loc[
    metadata['info'] == 'metaclass', 'value'].unique()
disease_type_list

In [None]:
# Paths of the *.xlsx files present in this host's summary folder.
# NOTE(review): this list is not referenced by the other cells shown here.
exist_gse_list = glob.glob(
    Rf"D:\Project\gutDBase\bracken_summary_GSE_filtered\{host}\*.xlsx")

In [None]:
# Group accessions by disease:
#   {disease label: {accession: number of 'metaclass' records}}
disease_gse_dict = {}
# The 'metaclass' filter does not depend on the accession, so apply it once.
metaclass_records = metadata[metadata['info'] == 'metaclass']
for gse in all_gse_list:
    record = metaclass_records[metaclass_records['accession'] == gse]
    # Every label other than the 'normal' control is treated as a disease label.
    # Filtering (instead of list.remove) also copes with series that have no
    # 'normal' record at all.
    disease_type = [value for value in record['value'].unique().tolist()
                    if value != 'normal']
    if not disease_type:
        # Only controls recorded: there is no disease to file this series under.
        print(f"{gse} has no non-normal class, skipping.")
        continue
    if len(disease_type) > 1:
        # Make the otherwise silent choice of the first label visible.
        print(f"{gse} has several non-normal classes {disease_type}; "
              f"using {disease_type[0]}")
    # The first non-normal label (in order of appearance) keys the series.
    disease_gse_dict.setdefault(disease_type[0], {})[gse] = record.shape[0]

In [None]:
# Display the disease -> {accession: record count} mapping.
disease_gse_dict

In [None]:
# Compare each accession's metadata record count with the number of rows in the
# 'metaData' sheet of its summary workbook. Accessions without a workbook are
# passed over without any message.
for disease, dict_list in disease_gse_dict.items():
    for gse, ref_count in dict_list.items():
        # The workbook is named after the part of the accession before the first '_'.
        summary_file = Rf"D:\Project\gutDBase\bracken_summary_GSE_filtered\{host}\{gse.split('_')[0]}_summary.xlsx"
        if not os.path.exists(summary_file):
            continue
        true_count = pd.read_excel(
            summary_file, index_col=0, sheet_name='metaData').shape[0]
        print(
            f"{gse} is correct" if true_count == ref_count
            else f"{gse} has {ref_count} references, but {true_count} in the database")

In [None]:
# Load the sequencing-type table; the next cell filters it on its
# 'Study type' and 'Species' columns and collects 'Accession'.
seq_type = pd.read_csv(R"D:\Project\gutDBase\metadata\seq_type.csv")

In [None]:
# Unique accessions whose study type is the target one and whose species
# matches this host (order of the resulting list is not defined).
target_rows = (seq_type['Study type'].isin([target_seq_type])
               & (seq_type['Species'] == host_short))
gse_cRNA = list(set(seq_type.loc[target_rows, 'Accession'].tolist()))

In [None]:
# Display the accessions that passed the study-type / species filter.
gse_cRNA

In [None]:
# For every disease keep the accessions that (1) have a summary workbook,
# (2) have as many abundance columns as metadata rows and (3) are of the
# target study type. Every disease gets an entry, possibly an empty list.
disease_select_gse_dict = {}
for disease, dict_list in disease_gse_dict.items():
    select_gse = []
    for gse in dict_list:
        summary_file = Rf"D:\Project\gutDBase\bracken_summary_GSE_filtered\{host}\{gse.split('_')[0]}_summary.xlsx"
        if not os.path.exists(summary_file):
            # No workbook: passed over without any message.
            continue

        abundance = pd.read_excel(
            summary_file, index_col=0, sheet_name=taxonomy)
        meta_data = pd.read_excel(
            summary_file, index_col=0, sheet_name='metaData')
        if abundance.shape[1] != meta_data.shape[0]:
            print(f"{gse} abundance and meta data not match")
            continue

        # NOTE(review): membership is tested with the full accession string,
        # while the workbook name uses only the part before the first '_' —
        # confirm that suffixed accessions are meant to be compared as-is.
        if gse not in gse_cRNA:
            print(f"{gse} not selected due to not target seq type")
            continue

        select_gse.append(gse)
        print(f"{gse} selected")
    disease_select_gse_dict[disease] = select_gse

In [None]:
# Display the disease -> selected accessions mapping.
disease_select_gse_dict

In [None]:
def abundance_feature_union_set(selected_gse_list: list[str]):
    """
    Load the transposed abundance table of every selected series and give all
    of them the same columns.

    For each accession, the sheet named by the global ``taxonomy`` is read from
    the series' summary workbook (first column as index) and transposed. Every
    table is then reindexed to the union of all column labels, sorted in
    descending order; columns a series did not have are filled with 0.

    input:
        selected_gse_list: accessions to load
    output:
        dict mapping accession -> reindexed, transposed abundance DataFrame
    """
    transposed: dict[str, pd.DataFrame] = {}
    for gse in selected_gse_list:
        workbook = Rf"D:\Project\gutDBase\bracken_summary_GSE_filtered\{host}\{gse.split('_')[0]}_summary.xlsx"
        transposed[gse] = pd.read_excel(
            workbook, index_col=0, sheet_name=taxonomy).T

    # Union of the column labels seen in any of the loaded tables.
    feature_union = set()
    for table in transposed.values():
        feature_union.update(table.columns)
    ordered_features = sorted(feature_union, reverse=True)

    return {
        gse: transposed[gse].reindex(columns=ordered_features, fill_value=0)
        for gse in selected_gse_list
    }

In [None]:
def extract_label(selected_gse_list: list[str]):
    """
    Build a binary label Series for each selected series.

    The 'metaclass' rows of the global ``metadata`` table that belong to the
    accession are indexed by their 'sample' column; a 'value' of 'normal'
    becomes 0 and every other value becomes 1.

    input:
        selected_gse_list: accessions to extract labels for
    output:
        dict mapping accession -> Series of 0/1 labels indexed by sample
    """
    def to_binary(value):
        return 0 if value == 'normal' else 1

    is_metaclass = metadata['info'] == 'metaclass'
    label_dict: dict[str, pd.Series] = {}
    for gse in selected_gse_list:
        records = metadata[is_metaclass & (metadata['accession'] == gse)]
        label_dict[gse] = records.set_index('sample')['value'].map(to_binary)
    return label_dict

In [None]:
def composite_gse_x_y_dict(abundance_t_dict: dict[str, pd.DataFrame], label_dict: dict[str, pd.Series], selected_gse_list: list[str]):
    """
    Pair each series' feature table with its labels on their shared samples.

    Repeated sample names are dropped (first occurrence kept) on BOTH the
    feature table and the label Series before the two are aligned, so the
    returned ``x`` and ``y`` always have the same index, one row per sample.

    input:
        abundance_t_dict: accession -> feature DataFrame indexed by sample
        label_dict: accession -> label Series indexed by sample
        selected_gse_list: accessions to process; each must be a key of both dicts
    output:
        dict mapping accession -> {"x": DataFrame, "y": Series}
    raises:
        KeyError: if an accession is missing from either input dict
    """
    gse2x_y_dict = {}
    for gse in selected_gse_list:
        x = abundance_t_dict[gse]
        y = label_dict[gse]
        # De-duplicate before aligning: a repeated label in either index would
        # otherwise make .loc return extra rows on that side only, leaving x
        # and y with different lengths.
        x = x[~x.index.duplicated(keep="first")]
        y = y[~y.index.duplicated(keep="first")]
        shared_index = x.index.intersection(y.index)
        gse2x_y_dict[gse] = {"x": x.loc[shared_index],
                             "y": y.loc[shared_index]}
    return gse2x_y_dict

In [None]:
# Assemble, per disease, the aligned features and labels of every selected
# series. Diseases with fewer than two selected series are skipped.
gse2_x_y_dict: dict[str, dict[str, pd.DataFrame | pd.Series]] = {}
for disease, gse_list in disease_select_gse_dict.items():
    if len(gse_list) < 2:
        print(f"{disease} insufficient quantity of fewer than 2, skipping.")
        continue

    print(f"{disease}: {len(gse_list)}")
    gse2_x_y_dict[disease] = composite_gse_x_y_dict(
        abundance_feature_union_set(gse_list),
        extract_label(gse_list),
        gse_list)

In [None]:
# One tab-separated line per series: disease, accession, x shape, y shape.
for disease, gse_x_y in gse2_x_y_dict.items():
    for gse, x_y in gse_x_y.items():
        x_shape, y_shape = x_y['x'].shape, x_y['y'].shape
        print(f"{disease}\t{gse}\t{x_shape}\t{y_shape}")

In [None]:
import numpy as np
import pandas as pd
from debiasm import DebiasMClassifier


def merge_and_add_batch_labels(gse_x_y_dict: dict[str, dict[str, pd.DataFrame | pd.Series]]):
    """
    Merge multiple GSE datasets under a disease, with batch numbers in the first column.
    Return X_all, y_all, batch_1abels
    """
    X_list, y_list, batch_list = [], [], []
    feature_names: list[str] = []
    sample_names: list[str] = []
    for batch_id, (gse, x_y) in enumerate(gse_x_y_dict.items()):
        feature_names = x_y["x"].columns.tolist()
        sample_names.extend(x_y["x"].index.tolist())
        x = x_y["x"].values
        y = x_y["y"].values

        # Add batch number column before X
        batch_col = np.full((x.shape[0], 1), batch_id)
        x_with_batch = np.hstack((batch_col, x))

        X_list.append(x_with_batch)
        y_list.append(y)
        batch_list.extend([gse]*x.shape[0])

    X_all = np.vstack(X_list)
    y_all = np.concatenate(y_list)

    return X_all, y_all, batch_list, feature_names, sample_names

In [None]:
def split_training_validation(X_all: np.ndarray, y_all: np.ndarray):
    """
    Hold out the batch with the most rows as the validation set and use all
    remaining rows for training.

    input:
        X_all: numpy array, shape (n_samples, n_features+1), first column is batch_id
        y_all: numpy array, shape (n_samples,); anything else is converted with .to_numpy()
    output:
        X_train, X_val, y_train, y_val
    """
    # Labels that are not already an ndarray are converted via .to_numpy()
    y_all = y_all if isinstance(y_all, np.ndarray) else y_all.to_numpy()

    # Features and labels must describe the same number of samples
    assert X_all.shape[0] == y_all.shape[0], f"Sample size mismatch: X={X_all.shape[0]}, y={y_all.shape[0]}"

    # The validation batch is the one that occurs most often; on a tie the
    # lowest batch_id wins (np.unique sorts, argmax takes the first maximum)
    batch_ids = X_all[:, 0].astype(int)
    unique, counts = np.unique(batch_ids, return_counts=True)
    is_val = batch_ids == unique[np.argmax(counts)]
    is_train = ~is_val

    X_train, y_train = X_all[is_train], y_all[is_train]
    X_val, y_val = X_all[is_val], y_all[is_val]

    print(f"total: {len(y_all)} | training set: {len(y_train)} | validation set: {len(y_val)} | unique batch_ids: {len(unique)}")

    # Report whether any of the four outputs contains missing values
    print("X_train has NaN:", np.isnan(X_train).any())
    print("X_val has NaN:", np.isnan(X_val).any())
    print("y_train has NaN:", pd.isnull(y_train).any())
    print("y_val has NaN:", pd.isnull(y_val).any())

    return X_train, X_val, y_train, y_val

In [None]:
def debias_batch_analysis(X_train, X_val, y_train, y_val, X_all):
    """
    Fit a DebiasMClassifier and return its transform of the full matrix.

    The classifier is constructed with ``x_val=X_val``, fitted on
    ``(X_train, y_train)`` and then applied to ``X_all``.
    ``y_val`` is accepted but not used.
    """
    classifier = DebiasMClassifier(x_val=X_val)
    classifier.fit(X_train, y_train)
    return classifier.transform(X_all)

In [None]:
# Batch-correct every disease and collect, per disease:
#   "X_raw"          - the stacked input array (batch number still in column 0)
#   "X_debiassed"    - the transformed matrix as a DataFrame (samples x features)
#   "sample_y_batch" - label and source accession of every sample
disease2_debiased = {}

for disease, gse_x_y in gse2_x_y_dict.items():
    print(f"Processing {disease}, with a total of {len(gse_x_y)} datasets")
    X_all, y_all, batch_labels, feature_names, sample_names = merge_and_add_batch_labels(gse_x_y)

    # X_all comes out of np.vstack, so it is a NumPy array; compute the NaN
    # mask once and report both whether and how many values are missing.
    nan_mask = np.isnan(X_all)
    print("has NaN:", nan_mask.any())
    print("NaN number:", nan_mask.sum())

    X_train, X_val, y_train, y_val = split_training_validation(X_all, y_all)
    X_debiassed = debias_batch_analysis(X_train, X_val, y_train, y_val, X_all)
    # NOTE(review): labelling the result with feature_names assumes the
    # transform returns one column per feature (no batch column) — confirm
    # against the DebiasM documentation.
    X_debiassed = pd.DataFrame(
        X_debiassed, columns=feature_names, index=sample_names)

    sample_y_batch = pd.DataFrame(
        {'y': y_all, 'batch': batch_labels}, index=sample_names)
    sample_y_batch.index.name = 'Sample Name'

    disease2_debiased[disease] = {
        "X_raw": X_all,
        "X_debiassed": X_debiassed,
        "sample_y_batch": sample_y_batch
    }

---

In [None]:
def get_lefse_preprocess(disease: str)->pd.DataFrame:
    """
    lefse preprocess

    Takes the debiased matrix stored for ``disease`` in the global
    ``disease2_debiased``, puts a 'Group' column in front of it (label 1 is
    replaced by the disease name, label 0 by 'normal') and returns the
    transposed table, whose index is named 'Subject'.
    """
    entry = disease2_debiased[disease]

    # Turn the 0/1 labels into group names without touching the stored frame.
    groups = entry['sample_y_batch'].replace(
        {'y': {1: disease, 0: 'normal'}})

    mat = entry['X_debiassed'].copy()
    mat.insert(0, 'Group', groups.loc[mat.index]['y'])

    # After transposing, rows are 'Group' followed by the features and
    # columns are the samples.
    mat = mat.T
    mat.index.name = 'Subject'
    return mat

In [None]:
# Write one tab-separated table per disease into this host's debias folder.
for disease in disease2_debiased:
    lefse_table = get_lefse_preprocess(disease)
    lefse_table.to_csv(
        Rf"D:\Project\gutDBase\debias\{host}\{disease}.tsv", sep='\t')