In [21]:
import os
import pandas as pd
import numpy as np
import statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif, SelectFpr, mutual_info_classif
from functools import partial

In [23]:
def write_line(log_file, text=""):
    """Append one line to a text file (auto-creates)."""
    with open(log_file, "a", encoding="utf-8") as f:
        f.write(text + "\n")

def run_experiment(base_dir, group_comparison, random_seed, log_file, selector_name="none", selector=None):

    """
    selector_name: "none" or "kbest"
    selector: sklearn selector object (fit on train, transform both)
    k: number of features (for logging only)
    """
    results = []

    for fold in range(1, 6):
        train_file = os.path.join(base_dir, f"{fold}fold/multiplex.train.tsv")
        test_file  = os.path.join(base_dir, f"{fold}fold/multiplex.test.tsv")

        # --- Load ---
        train_df = pd.read_csv(train_file, sep="\t", index_col=0)
        test_df  = pd.read_csv(test_file,  sep="\t", index_col=0)

        # group filters
        if group_comparison == "cVSneg":
            train_selected_cols = train_df.columns[(train_df.loc["acpa"] == 2) | (train_df.loc["acpa"] == 0)]
            test_selected_cols  = test_df.columns[(test_df.loc["acpa"]  == 2) | (test_df.loc["acpa"]  == 0)]
        if group_comparison == "cVSpos":
            train_selected_cols = train_df.columns[(train_df.loc["acpa"] == 1) | (train_df.loc["acpa"] == 0)]
            test_selected_cols  = test_df.columns[(test_df.loc["acpa"]  == 1) | (test_df.loc["acpa"]  == 0)]
        if group_comparison == "cVSra":
            train_selected_cols = train_df.columns[(train_df.loc["acpa"].isin([0,1,2]))]
            test_selected_cols  = test_df.columns[(test_df.loc["acpa"].isin([0,1,2]))]

        # Convert 2 -> 1 for binary
        train_df.loc["acpa"] = train_df.loc["acpa"].replace(2, 1)
        test_df.loc["acpa"]  = test_df.loc["acpa"].replace(2, 1)

        # apply column selection
        train_df = train_df[train_selected_cols]
        test_df  = test_df[test_selected_cols]

        # Drop label helper rows
        for lab in ["acpa_neg", "acpa_pos", "control"]:
            if lab in train_df.index:
                train_df = train_df.drop(lab)
            if lab in test_df.index:
                test_df = test_df.drop(lab)

        # y is first row; ensure integer 0/1
        y_train = train_df.iloc[0].astype(int)
        y_test  = test_df.iloc[0].astype(int)

        # X are remaining rows (samples x features)
        X_train_df = train_df.iloc[1:].T.copy()
        X_test_df  = test_df.iloc[1:].T.copy()

        feature_names = np.array(X_train_df.columns)

        # # --- Feature selection (fit on train, apply to both) ---
        # if selector is not None:
        #     sel = selector
        #     sel.fit(X_train_df, y_train)
        #     X_train = sel.transform(X_train_df)
        #     X_test  = sel.transform(X_test_df)
        #     selected_mask = sel.get_support()
        #     selected_features = feature_names[selected_mask]  #because feature selection, get subset
        # else:
        #     X_train = X_train_df.values
        #     X_test  = X_test_df.values
        #     selected_features = feature_names #because no feature selection, just proceed

                        # --- feature selection ---
        if selector_name == "mutual_info_classif":
            mi_func = partial(mutual_info_classif, random_state=random_seed)
            sel = SelectKBest(score_func=mi_func)   # fresh selector per fold
        elif selector is not None:
            # clone() is safest if you passed a fitted sklearn object
            from sklearn.base import clone
            sel = clone(selector)
        else:
            sel = None

        if sel is not None:
            X_train = sel.fit_transform(X_train_df, y_train)
            X_test  = sel.transform(X_test_df)
            selected_features = X_train_df.columns[sel.get_support()]
        else:
            X_train, X_test = X_train_df.values, X_test_df.values
            selected_features = X_train_df.columns

        # Save selected train matrix for tracking (optional)
        selected_features_df = pd.DataFrame(X_train, columns=selected_features)
        out_dir = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative"
        os.makedirs(out_dir, exist_ok=True)
        selected_features_df.to_csv(
            os.path.join(out_dir, f"{fold}fold_selected_features_{selector_name}.tsv"),
            index=False, sep="\t"
        )

        # --- Model ---
        clf = RandomForestClassifier(random_state=random_seed)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # --- Metrics ---
        acc = accuracy_score(y_test, y_pred)

        # AUC: needs at least one pos and one neg in y_test
        if len(np.unique(y_test)) == 2:
            # RandomForest supports predict_proba
            y_prob = clf.predict_proba(X_test)[:, 1]
            auc = roc_auc_score(y_test, y_prob)
        else:
            auc = np.nan  # undefined if only one class present in this fold's test

        # Importances mapped to selected features
        importances = pd.Series(clf.feature_importances_, index=selected_features).sort_values(ascending=False)

        results.append({
            "fold": fold,
            "accuracy": acc,
            "auc": auc,
            "report": classification_report(y_test, y_pred, zero_division=0),
            "importances": importances
        })

    # --- Summary ---
    accuracies = [r["accuracy"] for r in results]
    aucs = [r["auc"] for r in results]  # may include NaNs

    avg_acc = statistics.mean(accuracies)
    std_acc = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

    # Use nan-robust stats for AUC
    auc_arr = np.array(aucs, dtype=float)
    avg_auc = float(np.nanmean(auc_arr)) if np.any(~np.isnan(auc_arr)) else np.nan
    std_auc = float(np.nanstd(auc_arr)) if np.any(~np.isnan(auc_arr)) else np.nan

    print(f"\n=== {selector_name.upper()} ===")
    for r in results:
        auc_str = "nan" if np.isnan(r["auc"]) else f"{r['auc']:.4f}"
        
    print(f"Average accuracy: {avg_acc:.6f} ± {std_acc:.6f}")
    print(f"Average AUC     : {avg_auc if np.isnan(avg_auc) else f'{avg_auc:.6f}'} ± {std_auc if np.isnan(std_auc) else f'{std_auc:.6f}'}")

    # also append summary to the same log (still appending)
    if log_file:
        write_line(log_file, f"{group_comparison}\t{selector_name}\t{avg_auc:.6f}\t{std_auc:.6f}\t{avg_acc:.6f}\t{std_acc:.6f}")
        
    return results

In [24]:
log_file = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative/results.multiomics.log.tsv"

In [25]:
#multi-omics
base_dir = "//Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet"
random_seed = 225

print ("ACPA-negative")

# -------- Run A) No feature selection --------
_ = run_experiment(base_dir, "cVSneg", random_seed, log_file, selector_name="none", selector=None)

# -------- Run B) SelectKBest (ANOVA) --------
# k = 10  # <- change this as you like (e.g., 10, 30, 50, 90)
seed = 225
_ = run_experiment(base_dir, "cVSneg", random_seed, log_file,
    selector_name="kbest_anova",
    selector=SelectKBest(score_func=f_classif)
)

# -------- Run B) SelectKBest (ANOVA) --------
_ = run_experiment(base_dir, "cVSneg", random_seed, log_file,
    selector_name="fpr_anova_q0.05",
    selector=SelectFpr(score_func=f_classif, alpha=0.05)
)

# # -------- Run B) SelectKBest (ANOVA) --------
# _ = run_experiment(base_dir, "cVSneg", random_seed, log_file,
#     selector_name="anova k=90",
#     selector=SelectKBest(score_func=f_classif, k=90)
# )

_ = run_experiment(base_dir, "cVSneg", random_seed, log_file,
    selector_name="mutual_info_classif",
    selector=SelectKBest(score_func=mutual_info_classif)
)

# _ = run_experiment(base_dir, "cVSneg", random_seed, log_file,
#     selector_name="mutual_info_classif k=90",
#     selector=SelectKBest(score_func=mutual_info_classif, k=90)
# )

ACPA-negative

=== NONE ===
Average accuracy: 0.637500 ± 0.120221
Average AUC     : 0.765625 ± 0.155074

=== KBEST_ANOVA ===
Average accuracy: 0.800000 ± 0.068465
Average AUC     : 0.884375 ± 0.050967

=== FPR_ANOVA_Q0.05 ===
Average accuracy: 0.762500 ± 0.189572
Average AUC     : 0.834375 ± 0.183379

=== MUTUAL_INFO_CLASSIF ===
Average accuracy: 0.787500 ± 0.104583
Average AUC     : 0.879687 ± 0.057111


In [26]:
random_seed = 18
print ("ACPA-positive")

# -------- Run A) No feature selection --------
_ = run_experiment(base_dir, "cVSpos", random_seed, log_file, selector_name="none", selector=None)


_ = run_experiment(base_dir, "cVSpos", random_seed, log_file,
    selector_name="kbest_anova",
    selector=SelectKBest(score_func=f_classif)
)

# -------- Run B) SelectKBest (ANOVA) --------
_ = run_experiment(base_dir, "cVSpos", random_seed, log_file,
    selector_name="fpr_anova_q0.05",
    selector=SelectFpr(score_func=f_classif, alpha=0.05)
)

# # -------- Run B) SelectKBest (ANOVA) --------
# _ = run_experiment(base_dir, "cVSpos", random_seed, log_file,
#     selector_name="anova k=70",
#     selector=SelectKBest(score_func=f_classif, k=70)
# )

_ = run_experiment(base_dir, "cVSpos", random_seed, log_file,
    selector_name="mutual_info_classif",
    selector=SelectKBest(score_func=mutual_info_classif)
)

# _ = run_experiment(base_dir, "cVSpos", random_seed, log_file,
#     selector_name="mutual_info_classif k=70",
#     selector=SelectKBest(score_func=mutual_info_classif, k=70)
# )

ACPA-positive

=== NONE ===
Average accuracy: 0.637500 ± 0.120221
Average AUC     : 0.742188 ± 0.114394

=== KBEST_ANOVA ===
Average accuracy: 0.750000 ± 0.044194
Average AUC     : 0.882812 ± 0.064234

=== FPR_ANOVA_Q0.05 ===
Average accuracy: 0.687500 ± 0.088388
Average AUC     : 0.740625 ± 0.087193

=== MUTUAL_INFO_CLASSIF ===
Average accuracy: 0.825000 ± 0.120221
Average AUC     : 0.923438 ± 0.043188


In [27]:
random_seed = 174
print ("RA")
# -------- Run A) No feature selection --------
_ = run_experiment(base_dir, "cVSra", random_seed, log_file, selector_name="none", selector=None)


_ = run_experiment(base_dir, "cVSra", random_seed, log_file,
    selector_name="kbest_anova",
    selector=SelectKBest(score_func=f_classif)
)

# -------- Run B) SelectKBest (ANOVA) --------
_ = run_experiment(base_dir, "cVSra", random_seed, log_file,
    selector_name="fpr_anova_q0.05",
    selector=SelectFpr(score_func=f_classif, alpha=0.05)
)

# # -------- Run B) SelectKBest (ANOVA) --------
# _ = run_experiment(base_dir, "cVSra", random_seed, log_file,
#     selector_name="anova k=70",
#     selector=SelectKBest(score_func=f_classif, k=70)
# )

_ = run_experiment(base_dir, "cVSra", random_seed, log_file,
    selector_name="mutual_info_classif",
    selector=SelectKBest(score_func=mutual_info_classif)
)

# _ = run_experiment(base_dir, "cVSra", random_seed, log_file,
#     selector_name="mutual_info_classif k=70",
#     selector=SelectKBest(score_func=mutual_info_classif, k=70)
# )

RA

=== NONE ===
Average accuracy: 0.708333 ± 0.121478
Average AUC     : 0.725781 ± 0.141175

=== KBEST_ANOVA ===
Average accuracy: 0.758333 ± 0.099478
Average AUC     : 0.870313 ± 0.066768

=== FPR_ANOVA_Q0.05 ===
Average accuracy: 0.741667 ± 0.068465
Average AUC     : 0.826562 ± 0.094709

=== MUTUAL_INFO_CLASSIF ===
Average accuracy: 0.858333 ± 0.095924
Average AUC     : 0.896094 ± 0.053708
