In [12]:
import os
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import classification_report, accuracy_score, roc_auc_score
from sklearn.feature_selection import SelectKBest, f_classif


In [11]:

root = Path("/Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet")
comparison = "acpa_neg_vs_control"  # "acpa_neg_vs_control" | "acpa_pos_vs_control" | "ra_vs_control"
random_state = 42

results = []

def load_split(fold_dir: Path, split: str):
    f = fold_dir / f"multiplex.{split}.tsv"
    df = pd.read_csv(f, sep="\t", index_col=0)

    # Drop the first row (sample IDs across columns)
    df = df.iloc[1:, :]

    # Label rows (after drop: row0=row2 original, etc.)
    acpa_neg = df.iloc[0, :].astype(int)
    acpa_pos = df.iloc[1, :].astype(int)
    control  = df.iloc[2, :].astype(int)

    if comparison == "acpa_neg_vs_control":
        y = np.where(acpa_neg == 1, "acpa_neg", "control")
    elif comparison == "acpa_pos_vs_control":
        y = np.where(acpa_pos == 1, "acpa_pos", "control")
    elif comparison == "ra_vs_control":
        y = np.where((acpa_neg == 1) | (acpa_pos == 1), "ra", "control")
    else:
        raise ValueError("Unknown comparison")

    # Features = rows 5..end (index 3..end after dropping first row)
    X = df.iloc[3:, :].T
    X = X.apply(pd.to_numeric, errors="coerce").fillna(0.0)
    y = pd.Series(y, index=X.index).astype("category")

    return X, y

for i in range(1, 6):  # 1..5 folds
    fold_dir = root / f"{i}fold"

    # Load data
    X_train, y_train = load_split(fold_dir, "train")
    X_test,  y_test  = load_split(fold_dir, "test")

    # --- Baseline RF on full feature set (train only) ---
    rf = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,
        class_weight="balanced",
    )
    rf.fit(X_train, y_train)

    # --- SelectFromModel using median importance (train only) ---
    selector = SelectFromModel(rf, threshold="median", prefit=True)
    X_train_sel = selector.transform(X_train)
    X_test_sel  = selector.transform(X_test)

    # --- Refit RF on selected features ---
    rf_sel = RandomForestClassifier(
        n_estimators=n_estimators,
        random_state=random_state,
        n_jobs=-1,
        class_weight="balanced",
    )
    rf_sel.fit(X_train_sel, y_train)

    # --- Evaluation ---
    preds = rf_sel.predict(X_test_sel)
    acc = accuracy_score(y_test, preds)

    # AUC (binary): probability of the non-"control" class
    if len(rf_sel.classes_) == 2:
        positive_label = [c for c in rf_sel.classes_ if c != "control"][0]
        pos_idx = list(rf_sel.classes_).index(positive_label)
        proba = rf_sel.predict_proba(X_test_sel)[:, pos_idx]
        auc = roc_auc_score((y_test == positive_label).astype(int), proba)
    else:
        auc = np.nan  # just in case

    print(f"\n=== Fold {i} — {comparison} ===")
    print(f"Selected features: {X_train_sel.shape[1]}")
    print(f"Accuracy: {acc:.3f} | AUC: {auc:.3f}")
    print(classification_report(y_test, preds, digits=3, zero_division=0))





=== Fold 1 — acpa_neg_vs_control ===
Selected features: 8340
Accuracy: 0.667 | AUC: 0.375
              precision    recall  f1-score   support

    acpa_neg      0.000     0.000     0.000         8
     control      0.667     1.000     0.800        16

    accuracy                          0.667        24
   macro avg      0.333     0.500     0.400        24
weighted avg      0.444     0.667     0.533        24






=== Fold 2 — acpa_neg_vs_control ===
Selected features: 8340
Accuracy: 0.667 | AUC: 0.527
              precision    recall  f1-score   support

    acpa_neg      0.000     0.000     0.000         8
     control      0.667     1.000     0.800        16

    accuracy                          0.667        24
   macro avg      0.333     0.500     0.400        24
weighted avg      0.444     0.667     0.533        24






=== Fold 3 — acpa_neg_vs_control ===
Selected features: 8340
Accuracy: 0.667 | AUC: 0.492
              precision    recall  f1-score   support

    acpa_neg      0.000     0.000     0.000         8
     control      0.667     1.000     0.800        16

    accuracy                          0.667        24
   macro avg      0.333     0.500     0.400        24
weighted avg      0.444     0.667     0.533        24






=== Fold 4 — acpa_neg_vs_control ===
Selected features: 8340
Accuracy: 0.667 | AUC: 0.707
              precision    recall  f1-score   support

    acpa_neg      0.000     0.000     0.000         8
     control      0.667     1.000     0.800        16

    accuracy                          0.667        24
   macro avg      0.333     0.500     0.400        24
weighted avg      0.444     0.667     0.533        24






=== Fold 5 — acpa_neg_vs_control ===
Selected features: 8340
Accuracy: 0.667 | AUC: 0.691
              precision    recall  f1-score   support

    acpa_neg      0.000     0.000     0.000         8
     control      0.667     1.000     0.800        16

    accuracy                          0.667        24
   macro avg      0.333     0.500     0.400        24
weighted avg      0.444     0.667     0.533        24

