In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif, SelectFpr, mutual_info_classif

In [8]:
def write_line(log_file, text=""):
    """Append `text` plus a trailing newline to `log_file`.

    The file is opened in append mode with UTF-8 encoding, so it is created
    when it does not exist yet and existing content is preserved.
    """
    line = "".join((text, "\n"))
    with open(log_file, mode="a", encoding="utf-8") as handle:
        handle.write(line)

# acpa codes retained (as sample columns) for each supported `group_comparison`.
# NOTE(review): presumably 0 = control, 1 = ACPA-positive, 2 = ACPA-negative — confirm.
_ACPA_CODES_BY_COMPARISON = {
    "cVSneg": [0, 2],
    "cVSpos": [0, 1],
    "cVSra": [0, 1, 2],
}

# Label helper rows that are removed from a table when present.
_LABEL_HELPER_ROWS = ("acpa_neg", "acpa_pos", "control")


def _load_split(tsv_path, keep_codes):
    """Read one train/test TSV and return ``(X, y)`` for the samples whose acpa code is kept."""
    table = pd.read_csv(tsv_path, sep="\t", index_col=0)

    # Columns are chosen from the ORIGINAL codes, before 2 is folded into 1 below.
    kept_columns = table.columns[table.loc["acpa"].isin(keep_codes)]

    # Convert 2 -> 1 so the label is binary.
    table.loc["acpa"] = table.loc["acpa"].replace(2, 1)
    table = table[kept_columns]
    table = table.drop(index=[row for row in _LABEL_HELPER_ROWS if row in table.index])

    # y is the first remaining row; every other row is a feature.
    # NOTE(review): assumes "acpa" is that first row — confirm against the TSV layout.
    y = table.iloc[0].astype(int)
    X = table.iloc[1:].T.copy()  # samples x features
    return X, y


def run_experiment(base_dir, group_comparison, random_seed, n_features, benchmark_auc, permutation_n, log_file):
    """
    Count how often a random feature panel beats a benchmark AUC in 5-fold evaluation.

    For each permutation, `n_features` feature names are drawn without replacement
    (global NumPy RNG, intentionally unseeded) from the fold-1 training features.
    The same panel is used in all five folds: a RandomForestClassifier is fit on the
    train split and scored on the test split. Per-permutation mean/std of AUC and
    accuracy are appended to `log_file`, and permutations whose mean AUC is strictly
    greater than `benchmark_auc` are printed and counted.

    Parameters
    ----------
    base_dir : str
        Directory holding ``{fold}fold/multiplex.train.tsv`` and
        ``{fold}fold/multiplex.test.tsv`` for fold = 1..5. Each TSV has samples as
        columns and an ``acpa`` label row.
    group_comparison : str
        One of ``"cVSneg"`` (acpa 0 or 2), ``"cVSpos"`` (acpa 0 or 1) or
        ``"cVSra"`` (acpa 0, 1 or 2).
    random_seed : int
        ``random_state`` passed to every RandomForestClassifier.
    n_features : int
        Number of features in each random panel.
    benchmark_auc : float
        Mean-AUC threshold a panel must exceed to be counted.
    permutation_n : int
        Number of random panels to evaluate.
    log_file : str or None
        Path the per-permutation TSV rows are appended to; falsy disables logging.

    Returns
    -------
    tuple
        ``(performance_count, permutation_idx)``: the number of permutations whose
        mean AUC exceeded `benchmark_auc`, and the number of permutations run.

    Raises
    ------
    ValueError
        If `group_comparison` is not one of the supported values.
    """
    # Fail fast on a bad comparison name, before any file I/O.
    if group_comparison not in _ACPA_CODES_BY_COMPARISON:
        raise ValueError(
            f"Unknown group_comparison {group_comparison!r}; "
            f"expected one of {sorted(_ACPA_CODES_BY_COMPARISON)}"
        )

    performance_count = 0
    permutation_idx = 0

    # Nothing to run: return without touching the data files.
    if not permutation_idx < permutation_n:
        return performance_count, permutation_idx

    keep_codes = _ACPA_CODES_BY_COMPARISON[group_comparison]

    # The fold data does not depend on the permutation, so load and filter it once
    # instead of re-reading ten TSVs on every permutation.
    folds = []
    for fold in range(1, 6):
        train_file = os.path.join(base_dir, f"{fold}fold/multiplex.train.tsv")
        test_file  = os.path.join(base_dir, f"{fold}fold/multiplex.test.tsv")
        X_train_df, y_train = _load_split(train_file, keep_codes)
        X_test_df, y_test = _load_split(test_file, keep_codes)
        folds.append((X_train_df, y_train, X_test_df, y_test))

    # Panels are drawn from the fold-1 TRAIN features only.
    feature_names = np.array(folds[0][0].columns)

    while permutation_idx < permutation_n:

        # non-deterministic pick (no seed); one panel per permutation, shared by all folds
        selected_features = np.random.choice(feature_names, size=n_features, replace=False)

        accuracies = []
        aucs = []  # may include NaNs

        for X_train_df, y_train, X_test_df, y_test in folds:
            X_train = X_train_df[selected_features]
            X_test  = X_test_df[selected_features]

            # --- Model ---
            clf = RandomForestClassifier(random_state=random_seed)
            clf.fit(X_train, y_train)
            y_pred = clf.predict(X_test)

            # --- Metrics ---
            accuracies.append(accuracy_score(y_test, y_pred))

            # AUC needs at least one positive and one negative in y_test
            if len(np.unique(y_test)) == 2:
                y_prob = clf.predict_proba(X_test)[:, 1]
                aucs.append(roc_auc_score(y_test, y_prob))
            else:
                aucs.append(np.nan)  # undefined if only one class is present

        # --- Summary ---
        avg_acc = statistics.mean(accuracies)
        std_acc = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

        # Use nan-robust stats for AUC
        auc_arr = np.array(aucs, dtype=float)
        has_auc = np.any(~np.isnan(auc_arr))
        avg_auc = float(np.nanmean(auc_arr)) if has_auc else np.nan
        std_auc = float(np.nanstd(auc_arr)) if has_auc else np.nan

        if log_file:
            write_line(log_file, f"{group_comparison}\t{permutation_idx}\t{avg_auc:.6f}\t{std_auc:.6f}\t{avg_acc:.6f}\t{std_acc:.6f}\t{list(selected_features)}")

        # A NaN mean AUC compares False here and is therefore never counted.
        if avg_auc > benchmark_auc:
            print(permutation_idx)
            performance_count += 1

        permutation_idx += 1

    return performance_count, permutation_idx

In [9]:
# Multi-omics permutation run for the "cVSneg" comparison (printed below as ACPA-negative).
# NOTE(review): absolute, user-specific paths — adjust before running on another machine.
base_dir = "/Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet"
log_file = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative/results.multiomics.permutation.cVSneg.log.tsv"
random_seed = 225
n_features = 90
benchmark_auc = 0.92

print("ACPA-negative")

# 10,000 permutations; rows are APPENDED to log_file, so a re-run adds to an existing log.
performance_count, permutation_idx = run_experiment(
    base_dir=base_dir,
    group_comparison="cVSneg",
    random_seed=random_seed,
    n_features=n_features,
    benchmark_auc=benchmark_auc,
    permutation_n=10000,
    log_file=log_file,
)


ACPA-negative
8583
9825


In [10]:
# Multi-omics permutation run for the "cVSpos" comparison (printed below as ACPA-positive).
# NOTE(review): absolute, user-specific paths — adjust before running on another machine.
base_dir = "/Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet"
log_file = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative/results.multiomics.permutation.cVSpos.log.tsv"
random_seed = 18
n_features = 70
benchmark_auc = 0.93

print("ACPA-positive")

# 10,000 permutations; rows are APPENDED to log_file, so a re-run adds to an existing log.
performance_count, permutation_idx = run_experiment(
    base_dir=base_dir,
    group_comparison="cVSpos",
    random_seed=random_seed,
    n_features=n_features,
    benchmark_auc=benchmark_auc,
    permutation_n=10000,
    log_file=log_file,
)


ACPA-positive


In [11]:
# Multi-omics permutation run for the "cVSra" comparison (printed below as RA).
# NOTE(review): absolute, user-specific paths — adjust before running on another machine.
base_dir = "/Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet"
log_file = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative/results.multiomics.permutation.cVSra.log.tsv"
random_seed = 174
n_features = 70
benchmark_auc = 0.93

print("RA")

# 10,000 permutations; rows are APPENDED to log_file, so a re-run adds to an existing log.
performance_count, permutation_idx = run_experiment(
    base_dir=base_dir,
    group_comparison="cVSra",
    random_seed=random_seed,
    n_features=n_features,
    benchmark_auc=benchmark_auc,
    permutation_n=10000,
    log_file=log_file,
)


ACPA-positive
2708
7170
