In [1]:
import os
import pandas as pd
import numpy as np
import statistics
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_selection import SelectKBest, f_classif

In [None]:
# Raw codes from the "acpa" row that are kept for each supported comparison.
# NOTE(review): the comparison names suggest 0 = control, 1 = ACPA-positive,
# 2 = ACPA-negative -- confirm against the data dictionary.
_ACPA_CODES_BY_COMPARISON = {
    "cVSneg": (0, 2),
    "cVSpos": (0, 1),
    "cVSra": (0, 1, 2),
}

# Annotation rows that are neither the label nor a feature.
_NON_FEATURE_ROWS = ["acpa_neg", "acpa_pos", "control"]

# Historical output location, kept as the default so existing calls keep
# writing their per-fold feature tables to the same place.
_DEFAULT_FEATURES_DIR = "/Users/m221138/RA_ACPA_multiomics/analysis/machine_learning_r1.1/5fold/enet_2condition_alternative/"


def _split_features_labels(table, acpa_codes):
    """Return ``(X, y)`` for one fold table restricted to samples with the given ACPA codes."""
    # Samples are columns; the "acpa" row carries the raw group code per sample.
    keep = table.loc["acpa"].isin(acpa_codes)
    subset = table.loc[:, keep]

    # Collapse code 2 into 1 so the task is binary (0 vs 1).
    y = subset.loc["acpa"].replace(2, 1)

    # Select the label row by name rather than by position, so a change in row
    # order can neither pick the wrong label nor leak "acpa" into the features.
    X = subset.drop(index=["acpa", *_NON_FEATURE_ROWS]).T.copy()
    return X, y


def run_experiment(base_dir, group_comparison, selector_name="none", selector=None, k=None,
                   random_state=None, output_dir=_DEFAULT_FEATURES_DIR, n_folds=5):
    """
    Train and evaluate a RandomForestClassifier on pre-split cross-validation folds.

    For each fold ``i`` in ``1..n_folds`` the function reads
    ``<base_dir>/<i>fold/multiplex.train.tsv`` and ``multiplex.test.tsv``
    (tab-separated, first column used as the row index, samples in columns),
    keeps the samples belonging to ``group_comparison``, optionally applies a
    feature selector fitted on the training split only, fits the classifier
    and scores it on the test split. A per-fold and average accuracy summary
    is printed at the end.

    Parameters
    ----------
    base_dir : str
        Directory containing the ``<i>fold`` sub-directories.
    group_comparison : {"cVSneg", "cVSpos", "cVSra"}
        Which raw ACPA codes to keep: ``cVSneg`` -> 0 and 2, ``cVSpos`` -> 0 and 1,
        ``cVSra`` -> 0, 1 and 2. Code 2 is always relabelled to 1.
    selector_name : str
        Label used in the printed header and in the output file name.
    selector : object or None
        sklearn-style selector exposing ``fit``, ``transform`` and
        ``get_support``. It is fitted on the training split of every fold and
        applied to both splits. ``None`` keeps all features.
    k : int or None
        Number of selected features; used for logging only.
    random_state : int or None
        Seed forwarded to ``RandomForestClassifier``. When ``None`` the
        notebook-level variable ``seed`` is used if it exists (the behaviour of
        earlier versions of this function); otherwise the forest is unseeded.
    output_dir : str or None
        Directory receiving ``<i>fold_selected_features_<selector_name>.tsv``
        (the training matrix restricted to the selected features). Created if
        missing. ``None`` disables writing.
    n_folds : int
        Number of fold directories to process.

    Returns
    -------
    list of dict
        One entry per fold with keys ``"fold"``, ``"accuracy"``, ``"report"``
        (text from ``classification_report``) and ``"importances"`` (a Series
        of feature importances indexed by feature name, sorted descending).

    Raises
    ------
    ValueError
        If ``group_comparison`` is not one of the supported names or
        ``n_folds`` is smaller than 1.
    """
    # Validate before touching the filesystem so mistakes surface immediately.
    if group_comparison not in _ACPA_CODES_BY_COMPARISON:
        raise ValueError(
            f"Unknown group_comparison {group_comparison!r}; "
            f"expected one of {sorted(_ACPA_CODES_BY_COMPARISON)}"
        )
    if n_folds < 1:
        raise ValueError(f"n_folds must be >= 1, got {n_folds!r}")
    acpa_codes = _ACPA_CODES_BY_COMPARISON[group_comparison]

    if random_state is None:
        # Backward compatibility: this function used to read a notebook-level
        # `seed` directly; keep honouring it when no explicit value is passed.
        random_state = globals().get("seed")

    if output_dir is not None:
        os.makedirs(output_dir, exist_ok=True)

    results = []

    for fold in range(1, n_folds + 1):
        # --- Load ---
        train_table = pd.read_csv(
            os.path.join(base_dir, f"{fold}fold", "multiplex.train.tsv"), sep="\t", index_col=0
        )
        test_table = pd.read_csv(
            os.path.join(base_dir, f"{fold}fold", "multiplex.test.tsv"), sep="\t", index_col=0
        )

        X_train_df, y_train = _split_features_labels(train_table, acpa_codes)
        X_test_df, y_test = _split_features_labels(test_table, acpa_codes)

        feature_names = np.array(X_train_df.columns)

        # --- Feature selection (fit on train only, apply to both) ---
        if selector is not None:
            selector.fit(X_train_df, y_train)
            X_train = selector.transform(X_train_df)
            X_test = selector.transform(X_test_df)
            selected_features = feature_names[selector.get_support()]
        else:
            X_train = X_train_df
            X_test = X_test_df
            selected_features = feature_names

        # --- Save the training matrix with the selected features for tracking ---
        if output_dir is not None:
            pd.DataFrame(X_train, columns=selected_features).to_csv(
                os.path.join(output_dir, f"{fold}fold_selected_features_{selector_name}.tsv"),
                index=False,
                sep="\t",
            )

        # --- Model ---
        clf = RandomForestClassifier(random_state=random_state)
        clf.fit(X_train, y_train)
        y_pred = clf.predict(X_test)

        # --- Metrics & importances (mapped to selected feature names) ---
        importances = pd.Series(
            clf.feature_importances_, index=selected_features
        ).sort_values(ascending=False)

        results.append({
            "fold": fold,
            "accuracy": accuracy_score(y_test, y_pred),
            "report": classification_report(y_test, y_pred, zero_division=0),
            "importances": importances,
        })

    # --- Summary ---
    accuracies = [r["accuracy"] for r in results]
    avg_acc = statistics.mean(accuracies)
    # stdev needs at least two data points.
    std_acc = statistics.stdev(accuracies) if len(accuracies) > 1 else 0.0

    print(f"\n=== {selector_name.upper()} RESULTS{f' (k={k})' if k else ''} ===")
    for r in results:
        print(f"Fold {r['fold']} accuracy: {r['accuracy']:.4f}")
    print(f"Average accuracy: {avg_acc:.6f} ± {std_acc:.6f}")

    # Returned so callers can aggregate importances across folds.
    return results



In [154]:
# Root directory of the pre-split cross-validation data; run_experiment reads
# "<fold>fold/multiplex.train.tsv" and "<fold>fold/multiplex.test.tsv" below it.
# A single leading slash is used: a path starting with "//" is
# implementation-defined under POSIX.
base_dir = "/Users/m221138/RA_ACPA_multiomics/analysis/5fold_data_r1.1/network_construction_enet"
# Seed read by run_experiment for RandomForestClassifier(random_state=...).
seed = 225

In [155]:
# ---- Run A: baseline, every feature is passed to the classifier ----
_ = run_experiment(
    base_dir,
    "cVSneg",
    selector_name="none",
    selector=None,
)

# ---- Run B: SelectKBest filter scored with the ANOVA F-test (f_classif) ----
k = 10  # number of features kept per fold; try e.g. 10, 30, 50, 90
_ = run_experiment(
    base_dir,
    "cVSneg",
    selector_name="kbest_anova",
    selector=SelectKBest(f_classif, k=k),
    k=k,
)


=== NONE RESULTS ===
Fold 1 accuracy: 0.5000
Fold 2 accuracy: 0.6875
Fold 3 accuracy: 0.5625
Fold 4 accuracy: 0.6250
Fold 5 accuracy: 0.8125
Average accuracy: 0.637500 ± 0.120221
['pyruvate' 'sphingosine 1-phosphate' 'X-12462' 'X-19438'
 'p_METRNL_21705-33' 'p_CNTN5_3299-29' 'p_RGMB_3331-8' 'p_PGAM1_3896-5'
 'p_NAMPT_5011-11' 'p_COL15A1_8974-172']
['pyruvate' '1-oleoyl-2-docosahexaenoyl-GPC (18:1/22:6)*'
 '3,5-dichloro-2,6-dihydroxybenzoic acid' 'X-12462' 'X-15245' 'X-19438'
 'X-24295' 'p_BLVRB_17148-7' 'p_NAMPT_5011-11' 'p_CTRB2_5648-28']
['ornithine' 'lactate' 'pyruvate' 'sphingosine 1-phosphate'
 '3,5-dichloro-2,6-dihydroxybenzoic acid' 'X-12104' 'X-12462' 'X-19438'
 'X-24295' 'p_NAMPT_5011-11']
['pyruvate' '1-oleoyl-2-docosahexaenoyl-GPC (18:1/22:6)*' 'X-12462'
 'X-15245' 'X-19438' 'p_TNFRSF17_2665-26' 'p_CNTN5_3299-29'
 'p_RGMB_3331-8' 'p_CHRDL1_3362-61' 'p_NAMPT_5011-11']
['pyruvate' 'sarcosine' 'cys-gly, oxidized' 'sphingosine 1-phosphate'
 '3,5-dichloro-2,6-dihydroxybenzoic ac