In [1]:
import transformers
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from joblib import Parallel, delayed
import itertools
import numpy as np
import pandas as pd
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.datasets import load_wine

In [None]:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=1000, n_features = 1000, n_informative=500, random_state=42, n_classes= 3)
X = X - X.min()  # make minimum value 0
X = X * 10        # scale to get values in count-like range
X = np.random.poisson(lam=X).astype(int)

y = y.astype("str")
study_labels = np.random.choice(['A', 'B', 'C'], size=X.shape[0], p=[0.2, 0.5, 0.3])
X

array([[5759, 5712, 5878, ..., 5655, 5409, 5825],
       [5720, 5649, 5650, ..., 5843, 5903, 5437],
       [5595, 5713, 5711, ..., 5839, 5694, 5641],
       ...,
       [5785, 5659, 5645, ..., 5663, 5820, 5639],
       [5667, 5662, 5715, ..., 5770, 5550, 5694],
       [5641, 5769, 5332, ..., 5854, 5535, 5751]])

In [3]:
# CV setup
outer_cv = KFold(n_splits=3, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Hyperparameter grid
param_grid = {
    'n_genes': [200, 500, 999],
    'C': [100, 1e3, 1e4],
    'gamma': [1e-4, 1e-3,'auto'],
    'kernel': ['rbf'],
    'class_weight': ["balanced", None]
}

param_combos = list(ParameterGrid(param_grid))

model = SVC
pipe = Pipeline([
    ('DEseq2', transformers.DESeq2RatioNormalizer()),
    ('feature_selection', transformers.FeatureSelection()),
    ('scaler', StandardScaler())
])

In [4]:
# Function to evaluate one inner fold + hyperparam combo
def evaluate_inner_fold(outer_fold, inner_fold, 
                        train_inner_idx, val_inner_idx, 
                        X_train_outer, y_train_outer, study_labels,
                        model, 
                        pipe,
                        params,
                        type = "standard"):
    
      
    def standard_eval():
        clf.fit(X_train_inner, y_train_inner)
        preds = clf.predict(X_val_inner)
        return {
            'outer_fold': outer_fold,
            'inner_fold': inner_fold,
            'params': params,
            'accuracy': accuracy_score(y_val_inner, preds),
            'f1_macro': f1_score(y_val_inner, preds, average='macro'),
            'f1_per_class': f1_score(y_val_inner, preds, average=None)
        }

    def ovr_eval():
        results = []
        classes = np.unique(y_train_inner)
        for cl in classes:
            y_train_bin = [cl if yy == cl else "other" for yy in y_train_inner]
            y_val_bin = [cl if yy == cl else "other" for yy in y_val_inner]

            clf.fit(X_train_inner, y_train_bin)
            preds = clf.predict(X_val_inner)
            results.append({
                'outer_fold': outer_fold,
                'inner_fold': inner_fold,
                'class': cl,
                'params': params,
                'accuracy': accuracy_score(y_val_bin, preds),
                'f1_binary': f1_score(y_val_bin, preds, average='binary', pos_label=cl)
            })
        return results

    def ovo_eval():
        results = []
        classes = np.unique(y_train_inner)
        for i, j in itertools.combinations(classes, 2):
            train_mask = [(yy == i or yy == j) for yy in y_train_inner]
            val_mask = [(yy == i or yy == j) for yy in y_val_inner]

            X_train_ij = X_train_inner[train_mask]
            y_train_ij = [yy for yy in y_train_inner if yy == i or yy == j]

            X_val_ij = X_val_inner[val_mask]
            y_val_ij = [yy for yy in y_val_inner if yy == i or yy == j]

            clf.fit(X_train_ij, y_train_ij)
            preds = clf.predict(X_val_ij)
            results.append({
                'outer_fold': outer_fold,
                'inner_fold': inner_fold,
                'class_0': i,
                'class_1': j,
                'params': params,
                'accuracy': accuracy_score(y_val_ij, preds),
                'f1_binary': f1_score(y_val_ij, preds, average='binary', pos_label=i)
            })
        return results

    # Dispatch table for clean logic
    eval_dispatch = {
        'standard': standard_eval,
        'OvR': ovr_eval,
        'OvO': ovo_eval
    }

    if type not in eval_dispatch:
        raise ValueError(f"Unsupported evaluation type: {type}")
    
    ### Setup pipeline and classifier
    pipe_inner = clone(pipe)
    # Set n_genes and drop it from params
    #pipe_inner.set_params()
    
    n_genes = params.pop('n_genes')
    ### Subset training data and validation data
    ## And perform normalization and feature selection
    X_train_inner = X_train_outer[train_inner_idx]
    study_labels_inner = study_labels[train_inner_idx]
    X_train_inner = pipe_inner.fit_transform(X_train_inner, 
                                             feature_selection__study_per_patient=study_labels_inner, 
                                             feature_selection__n_genes=n_genes)
    y_train_inner = y_train_outer[train_inner_idx]
    
    X_val_inner = X_train_outer[val_inner_idx]
    X_val_inner = pipe_inner.transform(X_val_inner)
    y_val_inner = y_train_outer[val_inner_idx]
       
    ### Set classifier
    clf = model(**params)
    params['n_genes'] = n_genes
    return eval_dispatch[type]()

In [5]:
all_results = []

for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    X_train_outer, X_test_outer = X[train_idx], X[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]
    study_labels_outer = study_labels[train_idx]
    
    inner_tasks = []
    for inner_fold, (train_inner_idx, val_inner_idx) in enumerate(inner_cv.split(X_train_outer)):
        for params in param_combos:
            inner_tasks.append(delayed(evaluate_inner_fold)(
                outer_fold, inner_fold,
                train_inner_idx, val_inner_idx,
                X_train_outer, y_train_outer, study_labels,
                model,
                pipe,
                params,
                type = "OvR" # standard, OvR, OvO
            ))

    # Run inner CV tasks in parallel (adjust n_jobs to number of CPU cores)
    inner_results = Parallel(n_jobs=-1, verbose=1)(inner_tasks)
    if isinstance(inner_results[0], dict):
        # Flat list of dictionaries
        all_results.extend(inner_results)
    elif isinstance(inner_results[0], list):
        # List of lists of dictionaries
        for res in inner_results:
            all_results.extend(res)
    else:
        raise ValueError("Unexpected structure in inner_results")


# Convert to DataFrame
df_parallel_results = pd.DataFrame(all_results)
df_parallel_results


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    6.6s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   13.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    2.4s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   13.7s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  34 tasks      | elapsed:    3.4s
[Parallel(n_jobs=-1)]: Done 162 out of 162 | elapsed:   11.5s finished


Unnamed: 0,outer_fold,inner_fold,class,params,accuracy,f1_binary
0,0,0,0,"{'C': 100, 'class_weight': 'balanced', 'gamma'...",0.585586,0.445783
1,0,0,1,"{'C': 100, 'class_weight': 'balanced', 'gamma'...",0.590090,0.461538
2,0,0,2,"{'C': 100, 'class_weight': 'balanced', 'gamma'...",0.500000,0.424870
3,0,0,0,"{'C': 100, 'class_weight': 'balanced', 'gamma'...",0.675676,0.485714
4,0,0,1,"{'C': 100, 'class_weight': 'balanced', 'gamma'...",0.644144,0.431655
...,...,...,...,...,...,...
1453,2,2,1,"{'C': 10000.0, 'class_weight': None, 'gamma': ...",0.698198,0.361905
1454,2,2,2,"{'C': 10000.0, 'class_weight': None, 'gamma': ...",0.707207,0.285714
1455,2,2,0,"{'C': 10000.0, 'class_weight': None, 'gamma': ...",0.653153,0.153846
1456,2,2,1,"{'C': 10000.0, 'class_weight': None, 'gamma': ...",0.702703,0.250000


In [6]:
def process_cv_results(df, param_grid, score_col='f1_binary'):
    #  Extract param names and expand 'params'
    param_names = list(param_grid.keys())
    params_df = df['params'].apply(pd.Series)

    # Normalize None values for groupby
    for col in param_names:
        if col in params_df.columns:
            params_df[col] = params_df[col].apply(lambda x: 'none' if x is None else x)

    #Combine expanded params with original DataFrame
    df_with_params = pd.concat([df.drop(columns=['params']), params_df], axis=1)

    # Determine group-by strategy based on evaluation type
    if 'class' in df_with_params.columns:
        # OvR
        group_cols = param_names + ['class']
        summary = df_with_params.groupby(group_cols)[score_col].mean().reset_index()
        best = summary.loc[summary.groupby('class')[score_col].idxmax()].reset_index(drop=True)

    elif 'class_0' in df_with_params.columns and 'class_1' in df_with_params.columns:
        # OvO
        group_cols = param_names + ['class_0', 'class_1']
        summary = df_with_params.groupby(group_cols)[score_col].mean().reset_index()
        best = summary.loc[summary.groupby(['class_0', 'class_1'])[score_col].idxmax()].reset_index(drop=True)

    else:
        # Standard multiclass
        group_cols = param_names
        score_col = 'f1_macro'  # override if not passed
        summary = df_with_params.groupby(group_cols)[score_col].mean().reset_index()
        best = summary.loc[[summary[score_col].idxmax()]].reset_index(drop=True)

    return summary, best

summary_df, best_per_class_df = process_cv_results(
    df_parallel_results,
    param_grid=param_grid
)
best_per_class_df

Unnamed: 0,n_genes,C,gamma,kernel,class_weight,class,f1_binary
0,500,100.0,0.0001,rbf,balanced,0,0.474248
1,500,100.0,0.0001,rbf,none,1,0.481247
2,500,100.0,0.0001,rbf,balanced,2,0.487927


In [62]:
from sklearn.datasets import load_iris
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from joblib import Parallel, delayed
import itertools
import numpy as np
import pandas as pd

# Load data
X, y = load_iris(return_X_y=True)
y = y.astype("str")
print(np.unique(y))
# CV setup
outer_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [10, 20],
    'max_depth': [3, 5]
}
param_combos = list(itertools.product(param_grid['n_estimators'], param_grid['max_depth']))

# Function to evaluate one inner fold + hyperparam combo
def evaluate_inner_fold(outer_fold, inner_fold, train_inner_idx, val_inner_idx, X_train_outer, y_train_outer, n_estimators, max_depth):
    results = []
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        ))
    ])
    X_train_inner = X_train_outer[train_inner_idx]
    y_train_inner = y_train_outer[train_inner_idx]
    X_val_inner = X_train_outer[val_inner_idx]
    y_val_inner = y_train_outer[val_inner_idx]

    classes = np.unique(y_train_inner)
    for cl in classes:
        y_train_inner_ovr = [cl if yy == cl else "other" for yy in y_train_inner]
        y_val_inner_ovr = [cl if yy == cl else "other" for yy in y_val_inner]
        
        pipe.fit(X_train_inner, y_train_inner_ovr)
        preds = pipe.predict(X_val_inner)
        acc = accuracy_score(y_val_inner_ovr, preds)
        f1_bi = f1_score(y_val_inner_ovr, preds, average='binary', pos_label=cl)
        results.append({
            'outer_fold': outer_fold,
            'class': cl,
            'inner_fold': inner_fold,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'accuracy': acc,
            'f1_binary': f1_bi
        })
    return(results)

# Outer CV loop with parallel inner loop
all_results = []

for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):
    X_train_outer, X_test_outer = X[train_idx], X[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]

    inner_tasks = []
    for inner_fold, (train_inner_idx, val_inner_idx) in enumerate(inner_cv.split(X_train_outer, y_train_outer)):
        for n_estimators, max_depth in param_combos:
            inner_tasks.append(delayed(evaluate_inner_fold)(
                outer_fold, inner_fold,
                train_inner_idx, val_inner_idx,
                X_train_outer, y_train_outer,
                n_estimators, max_depth
            ))

    # Run inner CV tasks in parallel (adjust n_jobs to number of CPU cores)
    inner_results = Parallel(n_jobs=-1, verbose=1)(inner_tasks)
    for res in inner_results:
        all_results.extend(res)

# Convert to DataFrame
df_parallel_results = pd.DataFrame(all_results)
df_parallel_results

['0' '1' '2']


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.2s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  20 out of  20 | elapsed:    0.1s finished


Unnamed: 0,outer_fold,class,inner_fold,n_estimators,max_depth,accuracy,f1_binary
0,0,0,0,10,3,1.000000,1.000000
1,0,1,0,10,3,0.875000,0.823529
2,0,2,0,10,3,0.875000,0.800000
3,0,0,0,10,5,1.000000,1.000000
4,0,1,0,10,5,0.875000,0.823529
...,...,...,...,...,...,...,...
295,4,1,4,20,3,0.958333,0.941176
296,4,2,4,20,3,0.958333,0.933333
297,4,0,4,20,5,1.000000,1.000000
298,4,1,4,20,5,0.958333,0.941176


In [63]:
from sklearn.datasets import load_iris
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, f1_score
from joblib import Parallel, delayed
import itertools
import numpy as np
import pandas as pd

# Load data
X, y = load_iris(return_X_y=True)
y = y.astype("str")
# CV setup
outer_cv = KFold(n_splits=3, shuffle=True, random_state=42)
inner_cv = KFold(n_splits=3, shuffle=True, random_state=42)

# Hyperparameter grid
param_grid = {
    'n_estimators': [10, 20],
    'max_depth': [3, 5]
}
param_combos = list(itertools.product(param_grid['n_estimators'], param_grid['max_depth']))

# Function to evaluate one inner fold + hyperparam combo
def evaluate_inner_fold(outer_fold, inner_fold, train_inner_idx, val_inner_idx, X_train_outer, y_train_outer, n_estimators, max_depth):
    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('clf', RandomForestClassifier(
            n_estimators=n_estimators,
            max_depth=max_depth,
            random_state=42
        ))
    ])
    X_train_inner = X_train_outer[train_inner_idx]
    y_train_inner = y_train_outer[train_inner_idx]
    X_val_inner = X_train_outer[val_inner_idx]
    y_val_inner = y_train_outer[val_inner_idx]

    classes = np.unique(y_train_inner)
    results = []
    for i, j in itertools.combinations(classes, r = 2):
        train_mask = [(yy == i or yy == j) for yy in y_train_inner]
        val_mask = [(yy == i or yy == j) for yy in y_val_inner]

        X_train_ij = X_train_inner[train_mask]
        y_train_ij = [yy for yy in y_train_inner if yy == i or yy == j]

        X_val_ij = X_val_inner[val_mask]
        y_val_ij = [yy for yy in y_val_inner if yy == i or yy == j]
        
        pipe.fit(X_train_ij, y_train_ij)
        preds = pipe.predict(X_val_ij)
        acc = accuracy_score(y_val_ij, preds)
        f1_bi = f1_score(y_val_ij, preds, average='binary', pos_label=i)
        results.append({
            'outer_fold': outer_fold,
            'class_0': i,
            'class_1': j,
            'inner_fold': inner_fold,
            'n_estimators': n_estimators,
            'max_depth': max_depth,
            'accuracy': acc,
            'f1_binary': f1_bi
        })
    return(results)

# Outer CV loop with parallel inner loop
all_results = []

for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X)):
    X_train_outer, X_test_outer = X[train_idx], X[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]

    inner_tasks = []
    for inner_fold, (train_inner_idx, val_inner_idx) in enumerate(inner_cv.split(X_train_outer)):
        for n_estimators, max_depth in param_combos:
            inner_tasks.append(delayed(evaluate_inner_fold)(
                outer_fold, inner_fold,
                train_inner_idx, val_inner_idx,
                X_train_outer, y_train_outer,
                n_estimators, max_depth
            ))

    # Run inner CV tasks in parallel (adjust n_jobs to number of CPU cores)
    inner_results = Parallel(n_jobs=-1, verbose=1)(inner_tasks)
    for res in inner_results:
        all_results.extend(res)

# Convert to DataFrame
df_parallel_results = pd.DataFrame(all_results)
df_parallel_results

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  10 out of  12 | elapsed:    0.1s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  12 out of  12 | elapsed:    0.1s finished


Unnamed: 0,outer_fold,class_0,class_1,inner_fold,n_estimators,max_depth,accuracy,f1_binary
0,0,0,1,0,10,3,1.000000,1.000000
1,0,0,2,0,10,3,1.000000,1.000000
2,0,1,2,0,10,3,0.818182,0.818182
3,0,0,1,0,10,5,1.000000,1.000000
4,0,0,2,0,10,5,1.000000,1.000000
...,...,...,...,...,...,...,...,...
103,2,0,2,2,20,3,1.000000,1.000000
104,2,1,2,2,20,3,0.920000,0.909091
105,2,0,1,2,20,5,1.000000,1.000000
106,2,0,2,2,20,5,1.000000,1.000000
