In [6]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import accuracy_score, roc_auc_score

In [None]:

def load_matrix(path, sep="\t"):
    """
    Load a features-by-samples table and split it into inputs and labels.

    The file is read with ``pandas.read_csv`` using the first column as the
    row index; the first line of the file becomes the column header, so each
    remaining column is one sample. Of the data rows, the first holds the
    class labels, the next three are discarded, and every row from the fifth
    onward is treated as a feature.

    Parameters
    ----------
    path : str or path-like
        Location of the delimited text file (forwarded to ``pandas.read_csv``).
    sep : str, default tab
        Field delimiter (forwarded to ``pandas.read_csv``).

    Returns
    -------
    X_all : numpy.ndarray, shape (n_samples, n_features)
        Feature rows transposed so that samples are rows.
    y : numpy.ndarray, shape (n_samples,)
        Integer class labels taken from the first data row, with every
        label equal to 2 replaced by 1.
    feature_names : numpy.ndarray, shape (n_features,)
        Row-index values of the feature rows, in the same order as the
        columns of ``X_all``.

    Notes
    -----
    The 2 -> 1 remapping is applied unconditionally whenever a 2 is present.
    If the labels are coded as 1/2 (rather than 0/2 or 0/1/2) this collapses
    both classes into a single class; recode such labels before loading.
    """
    df = pd.read_csv(path, sep=sep, index_col=0)

    # Labels live in the first data row; cast to int so 0.0/1.0 become 0/1.
    y = df.iloc[0].astype(int).to_numpy()

    # Fold label 2 into label 1 (no-op when the labels are already 0/1).
    if (y == 2).any():
        y = np.where(y == 2, 1, y)

    # Skip the label row and the three rows after it; the rest are features.
    df_feats = df.iloc[4:]

    # Transpose to samples x features; column j of X_all is df_feats.index[j].
    X_all = df_feats.T.to_numpy()
    return X_all, y, df_feats.index.to_numpy()

def random_k_baseline(
    X_all,
    y,
    k_features,
    target_auc=None,
    base_seed=None,
    n_iterations=100,
    n_splits=5
):
    """
    Score randomly chosen feature subsets with cross-validated random forests.

    For each iteration:
      - draw ``k_features`` distinct feature columns at random
      - run stratified ``n_splits``-fold CV with a RandomForestClassifier
      - record the mean accuracy and mean ROC AUC over the folds

    Parameters
    ----------
    X_all : array, shape (n_samples, n_features)
        Full feature matrix; columns are sampled from it.
    y : array-like, shape (n_samples,)
        Binary class labels (exactly two distinct values).
    k_features : int
        Number of features drawn (without replacement) per iteration.
    target_auc : float or None, optional
        Not used by the computation in this function; accepted so existing
        callers that pass it keep working.
    base_seed : int or None, optional
        Seed for the feature sampler, the classifier and the CV shuffling.
        With an integer seed the whole run is reproducible; with ``None``
        every source of randomness is unseeded.
    n_iterations : int, default 100
        Number of random feature subsets to evaluate.
    n_splits : int, default 5
        Number of stratified CV folds.

    Returns
    -------
    iter_acc : numpy.ndarray, shape (n_iterations,)
        Mean fold accuracy per iteration.
    iter_auc : numpy.ndarray, shape (n_iterations,)
        Mean fold ROC AUC per iteration, ignoring folds where AUC is
        undefined; NaN if it was undefined in every fold.

    Raises
    ------
    ValueError
        If ``k_features`` is smaller than 1 or larger than the number of
        features, or if ``y`` does not contain exactly two classes.
    """
    # Positional indexing below must not depend on a pandas index.
    y = np.asarray(y)

    n_samples, n_features = X_all.shape
    if k_features > n_features:
        raise ValueError(f"k_features ({k_features}) > total features ({n_features})")
    if k_features < 1:
        raise ValueError(f"k_features must be at least 1; got {k_features}")

    # Binary AUC only; check we have two classes overall
    uniq = np.unique(y)
    if uniq.size != 2:
        raise ValueError(f"Expected binary labels; got classes {uniq}")

    rng = np.random.default_rng(base_seed)
    iter_acc = np.zeros(n_iterations, dtype=float)
    iter_auc = np.zeros(n_iterations, dtype=float)

    for it in range(n_iterations):
        # pick k random feature indices and slice the columns once per iteration
        feat_idx = rng.choice(n_features, size=k_features, replace=False)
        X_sub = X_all[:, feat_idx]

        # Seed the shuffle so a run is reproducible, but vary it per iteration
        # so each iteration still sees a different partition of the samples.
        # The modulo keeps the value inside scikit-learn's accepted seed range.
        split_seed = None if base_seed is None else (base_seed + it) % 2**32
        skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=split_seed)
        fold_accs = []
        fold_aucs = []

        for train_idx, test_idx in skf.split(X_sub, y):
            X_train = X_sub[train_idx]
            X_test = X_sub[test_idx]
            y_train = y[train_idx]
            y_test = y[test_idx]

            clf = RandomForestClassifier(random_state=base_seed)
            clf.fit(X_train, y_train)

            fold_accs.append(accuracy_score(y_test, clf.predict(X_test)))

            # AUC needs both classes in the test fold, and predict_proba only
            # has a second column when the classifier was trained on both.
            if np.unique(y_test).size == 2 and clf.classes_.size == 2:
                y_prob = clf.predict_proba(X_test)[:, 1]
                fold_aucs.append(roc_auc_score(y_test, y_prob))
            else:
                # undefined for this fold; excluded from the mean below
                fold_aucs.append(np.nan)

        iter_acc[it] = np.mean(fold_accs)
        # average AUC over folds ignoring NaNs (NaN if every fold was undefined)
        fa = np.asarray(fold_aucs, dtype=float)
        iter_auc[it] = np.nan if np.isnan(fa).all() else np.nanmean(fa)

    return iter_acc, iter_auc

# -------------------
# Example usage
# -------------------
# path = "/path/to/your_matrix.tsv"  # rows=features, cols=samples
# X_all, y, feat_names = load_matrix(path, sep="\t")
# k = 50                   # <-- you set this
# iter_acc, iter_auc = random_k_baseline(X_all, y, k_features=k, target_auc=0.92, base_seed=225, n_iterations=100)

# Quick summary
# print(f"ACC mean={iter_acc.mean():.3f}, std={iter_acc.std(ddof=1):.3f}")
# print(f"AUC mean={np.nanmean(iter_auc):.3f}, std={np.nanstd(iter_auc, ddof=1):.3f}")


In [None]:
# Settings for the random-feature baseline run.
# NOTE: absolute, machine-specific path; the table has rows=features, cols=samples.
path = "/Users/m221138/RA_ACPA_multiomics/preprocessed_data/2_omics/two_omics_multiplex.v3.tsv"
k = 90  # number of random features drawn per iteration
seed = 225
target_auc = 0.92

# Load the matrix, then evaluate 100 random k-feature subsets.
X_all, y, feat_names = load_matrix(path, sep="\t")
iter_acc, iter_auc = random_k_baseline(
    X_all,
    y,
    k_features=k,
    target_auc=target_auc,
    base_seed=seed,
    n_iterations=100,
)

# Summary across iterations: mean and sample std (ddof=1); AUC stats skip NaNs.
print(f"ACC mean={iter_acc.mean():.3f}, std={iter_acc.std(ddof=1):.3f}")
print(f"AUC mean={np.nanmean(iter_auc):.3f}, std={np.nanstd(iter_auc, ddof=1):.3f}")


ACC mean=0.669, std=0.032
AUC mean=0.631, std=0.063
