In [1]:
import transformers

from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    cohen_kappa_score,
    matthews_corrcoef
)
from sklearn.model_selection import ParameterGrid
from sklearn.base import clone
from sklearn.utils import class_weight

from joblib import Parallel, delayed
import itertools
import numpy as np
import pandas as pd

# Classifiers
from sklearn.svm import SVC
from xgboost import XGBClassifier

In [2]:
import os

from train_test import load_data, filter_data, encode_labels

# Data location. The original absolute path stays the default so existing runs
# behave the same; set LEUKEMAI_DATA_DIR to run on another machine without
# editing this cell.
DATA_DIR = os.environ.get("LEUKEMAI_DATA_DIR", "/home/jeppe/Documents/Leukem.ai/data")
# Threshold forwarded to filter_data as `min_n`.
MIN_N = 20

X, y, study_labels = load_data(DATA_DIR)
X, y, study_labels = filter_data(X, y, study_labels, min_n=MIN_N)
# `label_mapping` is later inverted (label -> integer code becomes code -> label)
# when results are reported.
y, label_mapping = encode_labels(y)

  studies_series: 2974
  X_df: (60660, 2974)
  y_series: 2974
  Studies: 2974
  X shape: (2974, 60660)
  y: 2974
  Studies: 1914
  X shape: (1914, 60660)
  y: 1914


In [3]:
class WeightedXGBClassifier:
    """Thin wrapper around ``XGBClassifier`` with optional balanced sample weights.

    Parameters
    ----------
    class_weight : bool, default False
        When truthy, ``fit`` passes per-sample weights computed with
        ``compute_sample_weight(class_weight='balanced', ...)`` to the wrapped
        model; otherwise no sample weights are passed.
    **xgb_params
        Forwarded unchanged to ``XGBClassifier``.

    The class exposes ``get_params`` / ``set_params`` so it can be used with
    ``sklearn.base.clone`` and parameter grids.
    """

    def __init__(self, class_weight=False, **xgb_params):
        # Inside __init__ the name `class_weight` is the boolean flag; the
        # sklearn `class_weight` module is only referenced from `fit`.
        self.class_weight = class_weight
        self.xgb_params = xgb_params
        self.model = XGBClassifier(**xgb_params)

    def fit(self, X, y):
        """Fit the wrapped model, weighting samples when `class_weight` is set.

        Returns ``self``.
        """
        if self.class_weight:
            # `class_weight` here resolves to the module imported at file level.
            sample_weights = class_weight.compute_sample_weight(class_weight='balanced', y=y)
        else:
            sample_weights = None

        self.model.fit(X, y, sample_weight=sample_weights)
        return self

    def predict(self, X):
        """Return the wrapped model's class predictions for `X`."""
        return self.model.predict(X)

    def predict_proba(self, X):
        """Return the wrapped model's class probabilities for `X`."""
        return self.model.predict_proba(X)

    def score(self, X, y):
        """Return the wrapped model's ``score`` on `(X, y)`."""
        return self.model.score(X, y)

    def get_params(self, deep=True):
        """Return the wrapped model's parameters plus ``class_weight``."""
        # Include class_weight in params for grid search
        return {'class_weight': self.class_weight, **self.model.get_params(deep)}

    def set_params(self, **params):
        """Update ``class_weight`` and/or parameters of the wrapped model.

        Returns ``self``.
        """
        # Extract and store class_weight separately
        if 'class_weight' in params:
            self.class_weight = params.pop('class_weight')

        self.model.set_params(**params)
        # Keep the recorded constructor kwargs in step with the wrapped model so
        # `xgb_params` never reports values that are no longer in effect.
        self.xgb_params.update(params)
        return self

In [4]:
# CV setup
outer_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)
inner_cv = StratifiedKFold(n_splits=2, shuffle=True, random_state=42)

# Hyperparameter grids, one per candidate classifier. Both are kept under their
# own names; previously the SVC grid was assigned to `param_grid` and then
# immediately overwritten, which made it dead code.
svc_param_grid = {
    'n_genes': [1000, 2000, 3000],
    'C': [10, 100, 1000],
    'gamma': ['auto', 0.001, 0.01],
    'kernel': ['rbf'],
    'class_weight': ["balanced", None],
    'probability': [True]
}
xgb_param_grid = {
    'n_genes': [1000],
    'class_weight': [True, False],
    'max_depth': [2,5]
}

# Active configuration. To evaluate the SVM instead, set `model = SVC` and
# `param_grid = svc_param_grid`.
model = WeightedXGBClassifier
param_grid = xgb_param_grid
param_combos = list(ParameterGrid(param_grid))

# Preprocessing shared by every fold. The 'feature_selection' step receives
# `study_per_patient` and `n_genes` as fit parameters from the CV code.
pipe = Pipeline([
    ('DEseq2', transformers.DESeq2RatioNormalizer()),
    ('feature_selection', transformers.FeatureSelection()),
    ('scaler', StandardScaler())
])

In [12]:
# Study identifiers. Nothing in this cell reads the list; a later cell defines
# a list with the same name and iterates over it.
studies_as_folds = [
    "BEATAML1.0-COHORT",
    "AAML0531",
    "AAML1031",
    "TCGA-LAML",
    "LEUCEGENE",
]
from sklearn.preprocessing import LabelEncoder
# Function to evaluate one inner fold + hyperparam combo
def evaluate_inner_fold(outer_fold, inner_fold,
                        processed_X, y_train_inner, y_val_inner,
                        model,
                        params,
                        type = "standard"):
    """Fit one hyperparameter combination on an inner fold and score it.

    Parameters
    ----------
    outer_fold, inner_fold
        Fold identifiers, copied unchanged into every result record.
    processed_X : dict
        Maps an ``n_genes`` value to a ``[X_train, X_val]`` pair.
    y_train_inner, y_val_inner : array-like
        Labels for the training and validation rows.
    model : callable
        Classifier class; instantiated as ``model(**params)`` without the
        ``'n_genes'`` entry.
    params : dict
        Hyperparameters, including ``'n_genes'``. The dict is not modified.
    type : {"standard", "OvR", "OvO"}
        Evaluation scheme. The name shadows the builtin but is kept because
        callers pass it by keyword.

    Returns
    -------
    dict or list of dict
        One record for ``"standard"``, one record per class for ``"OvR"``,
        one record per class pair for ``"OvO"``.

    Raises
    ------
    ValueError
        If ``type`` is not one of the supported schemes.
    """
    if type not in ('standard', 'OvR', 'OvO'):
        raise ValueError(f"Unsupported evaluation type: {type}")

    # Work on a copy: popping 'n_genes' from the caller's dict and re-inserting
    # it afterwards leaves that dict without the key if anything in between
    # raises.
    model_params = {key: value for key, value in params.items() if key != 'n_genes'}

    # Select preprocessed data
    X_train_inner, X_val_inner = processed_X[params['n_genes']]

    # Set classifier
    clf = clone(model(**model_params))

    y_train_arr = np.asarray(y_train_inner)
    y_val_arr = np.asarray(y_val_inner)
    fold_info = {'outer_fold': outer_fold, 'inner_fold': inner_fold}

    def binary_scores(y_true, y_pred):
        """Metrics shared by the OvR and OvO records."""
        return {
            'accuracy': accuracy_score(y_true, y_pred),
            'f1_binary': f1_score(y_true, y_pred, average='binary', pos_label=1),
            'mcc': matthews_corrcoef(y_true, y_pred),
            'kappa': cohen_kappa_score(y_true, y_pred)
        }

    def predict_positive(X_eval):
        """Hard 0/1 predictions from the positive-class probability (cut at 0.5)."""
        return (clf.predict_proba(X_eval)[:, 1] >= 0.5).astype(int)

    def standard_eval():
        # Labels are re-encoded to consecutive integers for fitting and mapped
        # back to the original codes before scoring.
        le = LabelEncoder()
        clf.fit(X_train_inner, le.fit_transform(y_train_inner))
        preds = le.inverse_transform(clf.predict(X_val_inner))

        return {
            **fold_info,
            'params': params,
            'accuracy': accuracy_score(y_val_inner, preds),
            'f1_macro': f1_score(y_val_inner, preds, average='macro'),
            'mcc': matthews_corrcoef(y_val_inner, preds),
            'kappa': cohen_kappa_score(y_val_inner, preds)
        }

    def ovr_eval():
        results = []
        for cl in np.unique(y_train_arr):
            # Class `cl` is the positive label, every other class is negative.
            y_train_bin = (y_train_arr == cl).astype(np.int32)
            y_val_bin = (y_val_arr == cl).astype(np.int32)

            clf.fit(X_train_inner, y_train_bin)
            preds = predict_positive(X_val_inner)
            results.append({
                **fold_info,
                'class': cl,
                'params': params,
                **binary_scores(y_val_bin, preds)
            })
        return results

    def ovo_eval():
        results = []
        for i, j in itertools.combinations(np.unique(y_train_arr), 2):
            # Keep only the rows belonging to the two classes of this pair.
            train_mask = (y_train_arr == i) | (y_train_arr == j)
            val_mask = (y_val_arr == i) | (y_val_arr == j)

            # Label '1' for class i and '0' for class j.
            y_train_ij = (y_train_arr[train_mask] == i).astype(np.int32)
            y_val_ij = (y_val_arr[val_mask] == i).astype(np.int32)

            clf.fit(X_train_inner[train_mask], y_train_ij)
            preds = predict_positive(X_val_inner[val_mask])
            results.append({
                **fold_info,
                'class_0': i,
                'class_1': j,
                'params': params,
                **binary_scores(y_val_ij, preds)
            })
        return results

    # Dispatch table for clean logic
    eval_dispatch = {
        'standard': standard_eval,
        'OvR': ovr_eval,
        'OvO': ovo_eval
    }
    return eval_dispatch[type]()

def pre_process_data(n_genes_list, X_train_outer, train_inner_idx, val_inner_idx, study_labels_outer, pipe,
                     y_outer=None):
    """Split an outer-training set into one inner fold and preprocess it.

    Parameters
    ----------
    n_genes_list : iterable
        Values passed, one at a time, as ``feature_selection__n_genes``.
    X_train_outer : array
        Outer-training features; indexed with the two index arrays.
    train_inner_idx, val_inner_idx : index arrays
        Row positions of the inner training / validation samples.
    study_labels_outer : array
        Study label per outer-training row; the inner-training slice is passed
        as ``feature_selection__study_per_patient``.
    pipe
        Unfitted pipeline; a fresh clone is fitted for every ``n_genes`` value,
        on the inner training rows only.
    y_outer : array, optional
        Labels per outer-training row. When omitted, the notebook-level
        ``y_train_outer`` variable is used, which is what this function always
        read before the argument existed.

    Returns
    -------
    processed_X : dict
        ``{n_genes: [X_train_processed, X_val_processed]}``.
    y_train_inner, y_val_inner : np.ndarray of int32
        Labels of the inner training / validation rows.
    """
    if y_outer is None:
        # Backward compatibility for callers that do not pass the labels.
        y_outer = y_train_outer

    X_train_inner = X_train_outer[train_inner_idx]
    X_val_inner = X_train_outer[val_inner_idx]

    study_labels_inner = study_labels_outer[train_inner_idx]

    y_train_inner = np.array(y_outer[train_inner_idx], dtype=np.int32)
    y_val_inner = np.array(y_outer[val_inner_idx], dtype=np.int32)

    processed_X = {}
    for n_genes_i in n_genes_list:
        pipe_inner = clone(pipe)

        X_train_inner_proc = pipe_inner.fit_transform(X_train_inner,
                                            feature_selection__study_per_patient=study_labels_inner,
                                            feature_selection__n_genes=n_genes_i)
        # The validation rows are only transformed, never used for fitting.
        X_val_inner_proc = pipe_inner.transform(X_val_inner)

        processed_X[n_genes_i] = [X_train_inner_proc, X_val_inner_proc]
    return processed_X, y_train_inner, y_val_inner

all_results = []

# Stratification key: one string per sample combining its label and its study.
combined = [str(a) + " " + str(b) for a, b in zip(y, study_labels)]

for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, combined)):
    print("outer_fold")
    print(outer_fold)
    X_train_outer = X[train_idx]
    # NOTE: pre_process_data reads `y_train_outer` from the notebook namespace
    # when it is not handed the labels explicitly, so this name must stay.
    y_train_outer = y[train_idx]
    study_labels_outer = study_labels[train_idx]

    combined_outer = [str(a) + " " + str(b) for a, b in zip(y_train_outer, study_labels_outer)]

    inner_tasks = []
    for inner_fold, (train_inner_idx, val_inner_idx) in enumerate(inner_cv.split(X_train_outer, combined_outer)):
        print("inner_fold")
        print(inner_fold)

        # Preprocess once per inner fold; every hyperparameter combination
        # below reuses the result.
        processed_X, y_train_inner, y_val_inner = pre_process_data(
            param_grid["n_genes"],
            X_train_outer,
            train_inner_idx,
            val_inner_idx,
            study_labels_outer,
            pipe)

        for params in param_combos:
            inner_tasks.append(delayed(evaluate_inner_fold)(
                outer_fold, inner_fold,
                processed_X, y_train_inner, y_val_inner,
                model,
                dict(params),  # private copy so a task can never alter the shared grid entry
                type = "standard" # standard, OvR, OvO
            ))

    # Run inner CV tasks in parallel (adjust n_jobs to number of CPU cores)
    inner_results = Parallel(n_jobs=12, verbose=1)(inner_tasks)

    # Flatten item by item: a task returns a dict ("standard") or a list of
    # dicts ("OvR"/"OvO"). Checking each item, instead of only the first one,
    # also copes with an empty result list.
    for res in inner_results:
        if isinstance(res, dict):
            all_results.append(res)
        elif isinstance(res, list):
            all_results.extend(res)
        else:
            raise ValueError("Unexpected structure in inner_results")


# Convert to DataFrame
df_parallel_results = pd.DataFrame(all_results)



outer_fold
0




inner_fold
0
inner_fold
1


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of   8 | elapsed:   27.3s remaining:   45.5s
[Parallel(n_jobs=12)]: Done   8 out of   8 | elapsed:   31.0s finished


outer_fold
1




inner_fold
0
inner_fold
1


[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done   3 out of   8 | elapsed:   19.2s remaining:   32.0s
[Parallel(n_jobs=12)]: Done   8 out of   8 | elapsed:   23.5s finished


In [13]:
def process_cv_results(df, param_grid, label_mapping, score_col):
    """Average a score per hyperparameter setting and keep the best setting.

    Parameters
    ----------
    df : pd.DataFrame
        One row per evaluation, with a 'params' column holding dicts and a
        numeric `score_col`. A 'class' column marks one-vs-rest results; a
        'class_0'/'class_1' pair marks one-vs-one results.
    param_grid : dict
        Its keys name the hyperparameter columns to group by.
    label_mapping : dict
        Label -> integer code; inverted here to add readable label columns.
    score_col : str
        Column to average and maximise.

    Returns
    -------
    pd.DataFrame
        One row for plain multiclass results, otherwise one row per class
        (OvR) or per class pair (OvO), with added ``*_label`` columns.
    """
    param_names = list(param_grid.keys())

    # One column per hyperparameter, expanded from the per-row dicts.
    params_df = df['params'].apply(pd.Series)

    def _none_to_placeholder(value):
        # None is replaced by a string so it can serve as a group key.
        return 'none' if value is None else value

    for name in param_names:
        if name not in params_df.columns:
            continue
        params_df[name] = params_df[name].apply(_none_to_placeholder)

    scores_and_params = pd.concat([df.drop(columns=['params']), params_df], axis=1)

    # The evaluation type is inferred from the identifier columns present.
    available = scores_and_params.columns
    if 'class' in available:
        id_cols = ['class']                     # OvR
    elif 'class_0' in available and 'class_1' in available:
        id_cols = ['class_0', 'class_1']        # OvO
    else:
        id_cols = []                            # Standard multiclass

    summary = scores_and_params.groupby(param_names + id_cols)[score_col].mean().reset_index()

    if id_cols:
        # Best row within each class (or class pair).
        per_group_key = id_cols[0] if len(id_cols) == 1 else id_cols
        best_idx = summary.groupby(per_group_key)[score_col].idxmax()
    else:
        # Single overall best row.
        best_idx = [summary[score_col].idxmax()]
    best = summary.loc[best_idx].reset_index(drop=True)

    int_to_label = {v: k for k, v in label_mapping.items()}
    for col in id_cols:
        best[col + '_label'] = best[col].map(int_to_label)
    return best

# Best setting by mean kappa, averaged over every row of the stratified run
# (all outer and inner folds together).
best_per_class_df = process_cv_results(
    df_parallel_results, param_grid=param_grid, label_mapping=label_mapping, score_col="kappa"
)
# NOTE(review): this line used to carry the annotation
# np.float64(0.5714442426161126), which does not match the output stored with
# this cell; presumably it came from an earlier run - confirm.
best_per_class_df["kappa"].mean()

np.float64(0.7161821410455829)

In [None]:
# Define the studies to use as folds
# Each entry is held out once as the test study by the outer loop below.
studies_as_folds = [
    "BEATAML1.0-COHORT",
    "AAML0531",
    "AAML1031",
    "TCGA-LAML",
    "LEUCEGENE",
]

def pre_process_data_study_based(n_genes_list,
                                 X_train_inner, X_val_inner,
                                 y_train_inner, y_val_inner,
                                 study_labels_inner,
                                 pipe):
    """Preprocess one inner train/validation split for every ``n_genes`` value.

    A fresh clone of `pipe` is fitted on the inner training rows only (with
    `y_train_inner`, the per-row study labels and the current ``n_genes`` as
    fit arguments) and then applied to the validation rows.

    Returns
    -------
    processed_X : dict
        ``{n_genes: [X_train_processed, X_val_processed]}``.
    y_train_inner, y_val_inner
        The label arguments, returned unchanged.
    """
    def _fit_and_transform(n_genes):
        # Each n_genes setting gets its own independently fitted pipeline.
        fitted = clone(pipe)
        train_proc = fitted.fit_transform(
            X_train_inner, y_train_inner,
            feature_selection__study_per_patient=study_labels_inner,
            feature_selection__n_genes=n_genes,
        )
        val_proc = fitted.transform(X_val_inner)
        return [train_proc, val_proc]

    processed_X = {n_genes: _fit_and_transform(n_genes) for n_genes in n_genes_list}
    return processed_X, y_train_inner, y_val_inner


all_results = []
n_jobs = 12 # Set desired number of parallel jobs
X = np.array(X, dtype=np.float32)

# n_genes values for which every inner fold is preprocessed
n_genes_list = param_grid["n_genes"]

# Outer loop: each listed study is held out once as the TEST study.
for test_study_name in studies_as_folds:
    print(f"\n--- Outer Loop: Holding out Study '{test_study_name}' for Testing ---")

    # Outer split: all samples not belonging to the held-out study
    test_mask = (study_labels == test_study_name)
    train_mask = ~test_mask

    X_train_outer = X[train_mask]
    y_train_outer = y[train_mask]
    study_labels_outer = study_labels[train_mask]

    # Studies left in the outer training set; each becomes a validation fold
    train_studies = np.unique(study_labels_outer)
    print(f"Outer training set contains studies: {train_studies.tolist()}")

    outer_fold_tasks = []

    # Inner loop: validate on one remaining study, train on the others.
    for validation_study_name in train_studies:
        print(f"  Inner Loop: Validating on Study '{validation_study_name}'")

        # Masks are relative to the outer training data
        val_inner_mask = (study_labels_outer == validation_study_name)
        train_inner_mask = ~val_inner_mask

        # Inner validation set (the single validation study)
        X_val_inner = X_train_outer[val_inner_mask]
        y_val_inner = y_train_outer[val_inner_mask]

        # Inner training set (everything else in the outer training data)
        X_train_inner = X_train_outer[train_inner_mask]
        y_train_inner = y_train_outer[train_inner_mask]
        study_labels_inner = study_labels_outer[train_inner_mask]

        # Preprocess once per inner fold, for all n_genes values
        processed_X_inner, y_train_inner_proc, y_val_inner_proc = pre_process_data_study_based(
            n_genes_list,
            X_train_inner, X_val_inner,
            y_train_inner, y_val_inner,
            study_labels_inner,
            pipe
        )

        # One delayed evaluation per hyperparameter combination
        for params in param_combos:
            outer_fold_tasks.append(delayed(evaluate_inner_fold)(
                test_study_name,        # outer-fold identifier (held-out test study)
                validation_study_name,  # inner-fold identifier (validation study)
                processed_X_inner,
                y_train_inner_proc,
                y_val_inner_proc,
                model,
                params,
                type = "standard"         # "standard", "OvR" or "OvO"
            ))

    # Nothing to run for this outer fold
    if not outer_fold_tasks:
        print(f"  No evaluation tasks generated for outer fold '{test_study_name}'.")
        continue

    # Run all tasks of the current outer fold in parallel
    inner_results_list = Parallel(n_jobs=n_jobs, verbose=1)(outer_fold_tasks)

    # A task yields a dict (standard) or a list of dicts (OvR / OvO)
    for res_item in inner_results_list:
        if isinstance(res_item, dict):
            all_results.append(res_item)
        elif isinstance(res_item, list):
            all_results.extend(res_item)
        else:
             print(f"Warning: Unexpected result type encountered: {type(res_item)}")
    print(f"  Finished evaluations for outer fold '{test_study_name}'.")

df_parallel_results_study_as_fold = pd.DataFrame(all_results)


--- Outer Loop: Holding out Study 'BEATAML1.0-COHORT' for Testing ---
Outer training set contains studies: ['AAML0531', 'AAML1031', 'LEUCEGENE', 'TCGA-LAML']
  Inner Loop: Validating on Study 'AAML0531'
    Preprocessing data for inner fold (Train N=1042, Val N=486)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: AAML0531, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'AAML1031'
    Preprocessing data for inner fold (Train N=984, Val N=544)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: AAML1031, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'LEUCEGENE'
    Preprocessing data for inner fold (Train N=1159, Val N=369)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'TCGA-LAML'
    Preprocessing data for inner fold (Train N=1399, Val N=129)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: TCGA-LAML, 

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  10 out of  16 | elapsed:  2.9min remaining:  1.7min
[Parallel(n_jobs=12)]: Done  16 out of  16 | elapsed:  3.9min finished


  Finished evaluations for outer fold 'BEATAML1.0-COHORT'.

--- Outer Loop: Holding out Study 'AAML0531' for Testing ---
Outer training set contains studies: ['AAML1031', 'BEATAML1.0-COHORT', 'LEUCEGENE', 'TCGA-LAML']
  Inner Loop: Validating on Study 'AAML1031'
    Preprocessing data for inner fold (Train N=884, Val N=544)...
  For study: AAML0531, mask sum == 0
  For study: AAML1031, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'BEATAML1.0-COHORT'
    Preprocessing data for inner fold (Train N=1042, Val N=386)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: AAML0531, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'LEUCEGENE'
    Preprocessing data for inner fold (Train N=1059, Val N=369)...
  For study: AAML0531, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'TCGA-LAML'
    Preprocessing data for inner fold (Train N=1299, Val N=129)...
  For study: AAML053

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  10 out of  16 | elapsed:   38.6s remaining:   23.1s
[Parallel(n_jobs=12)]: Done  16 out of  16 | elapsed:   50.0s finished


  Finished evaluations for outer fold 'AAML0531'.

--- Outer Loop: Holding out Study 'AAML1031' for Testing ---
Outer training set contains studies: ['AAML0531', 'BEATAML1.0-COHORT', 'LEUCEGENE', 'TCGA-LAML']
  Inner Loop: Validating on Study 'AAML0531'
    Preprocessing data for inner fold (Train N=884, Val N=486)...
  For study: AAML0531, mask sum == 0
  For study: AAML1031, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'BEATAML1.0-COHORT'
    Preprocessing data for inner fold (Train N=984, Val N=386)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: AAML1031, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'LEUCEGENE'
    Preprocessing data for inner fold (Train N=1001, Val N=369)...
  For study: AAML1031, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'TCGA-LAML'
    Preprocessing data for inner fold (Train N=1241, Val N=129)...
  For study: AAML1031, mask su

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  10 out of  16 | elapsed:   38.0s remaining:   22.8s
[Parallel(n_jobs=12)]: Done  16 out of  16 | elapsed:   49.3s finished


  Finished evaluations for outer fold 'AAML1031'.

--- Outer Loop: Holding out Study 'TCGA-LAML' for Testing ---
Outer training set contains studies: ['AAML0531', 'AAML1031', 'BEATAML1.0-COHORT', 'LEUCEGENE']
  Inner Loop: Validating on Study 'AAML0531'
    Preprocessing data for inner fold (Train N=1299, Val N=486)...
  For study: AAML0531, mask sum == 0
  For study: TCGA-LAML, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'AAML1031'
    Preprocessing data for inner fold (Train N=1241, Val N=544)...
  For study: AAML1031, mask sum == 0
  For study: TCGA-LAML, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'BEATAML1.0-COHORT'
    Preprocessing data for inner fold (Train N=1399, Val N=386)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: TCGA-LAML, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'LEUCEGENE'
    Preprocessing data for inner fold (Train N=1416, Val N=369)...
  For study: TCGA-LAML, mas

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  10 out of  16 | elapsed:   56.7s remaining:   34.0s
[Parallel(n_jobs=12)]: Done  16 out of  16 | elapsed:  1.1min finished


  Finished evaluations for outer fold 'TCGA-LAML'.

--- Outer Loop: Holding out Study 'LEUCEGENE' for Testing ---
Outer training set contains studies: ['AAML0531', 'AAML1031', 'BEATAML1.0-COHORT', 'TCGA-LAML']
  Inner Loop: Validating on Study 'AAML0531'
    Preprocessing data for inner fold (Train N=1059, Val N=486)...
  For study: AAML0531, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'AAML1031'
    Preprocessing data for inner fold (Train N=1001, Val N=544)...
  For study: AAML1031, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'BEATAML1.0-COHORT'
    Preprocessing data for inner fold (Train N=1159, Val N=386)...
  For study: BEATAML1.0-COHORT, mask sum == 0
  For study: LEUCEGENE, mask sum == 0
    Preprocessing done.
  Inner Loop: Validating on Study 'TCGA-LAML'
    Preprocessing data for inner fold (Train N=1416, Val N=129)...
  For study: TCGA-LAML, ma

[Parallel(n_jobs=12)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=12)]: Done  10 out of  16 | elapsed:   38.8s remaining:   23.3s


  Finished evaluations for outer fold 'LEUCEGENE'.


[Parallel(n_jobs=12)]: Done  16 out of  16 | elapsed:   52.5s finished


In [15]:
def process_cv_results(df, param_grid, label_mapping, score_col):
    """Average a score per hyperparameter setting and keep the best setting.

    NOTE(review): a function with this name and the same behavior is also
    defined in an earlier cell; once this cell runs, this definition is the
    one in effect. Consider keeping a single definition.

    Parameters
    ----------
    df : pd.DataFrame
        One row per evaluation, with a 'params' column holding dicts and a
        numeric `score_col`. A 'class' column marks one-vs-rest results; a
        'class_0'/'class_1' pair marks one-vs-one results.
    param_grid : dict
        Its keys name the hyperparameter columns to group by.
    label_mapping : dict
        Label -> integer code; inverted here to add readable label columns.
    score_col : str
        Column to average and maximise.

    Returns
    -------
    pd.DataFrame
        One row for plain multiclass results, otherwise one row per class
        (OvR) or per class pair (OvO), with added ``*_label`` columns.
    """
    param_names = list(param_grid.keys())

    # Expand the per-row hyperparameter dicts into columns.
    params_df = df['params'].apply(pd.Series)

    def _none_to_placeholder(value):
        # None is replaced by a string so it can serve as a group key.
        return 'none' if value is None else value

    for name in param_names:
        if name not in params_df.columns:
            continue
        params_df[name] = params_df[name].apply(_none_to_placeholder)

    scores_and_params = pd.concat([df.drop(columns=['params']), params_df], axis=1)

    # Identifier columns decide how the best rows are selected.
    available = scores_and_params.columns
    if 'class' in available:
        id_cols = ['class']                     # OvR
    elif 'class_0' in available and 'class_1' in available:
        id_cols = ['class_0', 'class_1']        # OvO
    else:
        id_cols = []                            # Standard multiclass

    summary = scores_and_params.groupby(param_names + id_cols)[score_col].mean().reset_index()

    if id_cols:
        # Best row within each class (or class pair).
        per_group_key = id_cols[0] if len(id_cols) == 1 else id_cols
        best_idx = summary.groupby(per_group_key)[score_col].idxmax()
    else:
        # Single overall best row.
        best_idx = [summary[score_col].idxmax()]
    best = summary.loc[best_idx].reset_index(drop=True)

    int_to_label = {v: k for k, v in label_mapping.items()}
    for col in id_cols:
        best[col + '_label'] = best[col].map(int_to_label)
    return best

# Best setting by mean kappa, averaged over every row of the study-as-fold run.
# This rebinds `best_per_class_df`, replacing the value assigned in the earlier
# cell that summarised the stratified run.
best_per_class_df = process_cv_results(
    df_parallel_results_study_as_fold, param_grid=param_grid, label_mapping=label_mapping, score_col="kappa"
)
# NOTE(review): this line used to carry the annotation
# np.float64(0.5714442426161126), which does not match the output stored with
# this cell; presumably it came from an earlier run - confirm.
best_per_class_df["kappa"].mean()

np.float64(0.6951718400876972)

In [16]:
# Best setting per outer fold of the stratified run, using only that fold's
# inner-CV rows. Fold ids are read from the results instead of being
# hard-coded, so this stays correct if the number of outer splits changes.
for outer_fold in sorted(set(df_parallel_results['outer_fold'])):
    df2 = df_parallel_results[df_parallel_results['outer_fold'] == outer_fold]
    print(process_cv_results(
        df2,
        param_grid=param_grid,
        label_mapping = label_mapping,
        score_col = "kappa"
    ))

   n_genes  class_weight  max_depth     kappa
0     1000          True          5  0.717737
   n_genes  class_weight  max_depth     kappa
0     1000          True          2  0.715102


In [None]:
type_eval = "OvO"

# Score and label columns of a best-parameter frame; whatever remains after
# dropping them is treated as hyperparameters (plus class identifiers).
columns_to_drop = [
    "f1_macro", "f1_binary",
    "class_label", "class_0_label", "class_1_label",
    "kappa", "mcc", "accuracy",
]
# Restricted to the columns that `best_per_class_df` actually has.
columns_to_drop_existing = [name for name in columns_to_drop if name in best_per_class_df.columns]

# Accumulators for the outer-fold evaluation
per_class_results = []
overall_results = []

# Integer code -> original label
int_to_label = {code: label for label, code in label_mapping.items()}

for outer_fold, (train_idx, test_idx) in enumerate(outer_cv.split(X, y)):

    X_train_outer, X_test_outer = X[train_idx], X[test_idx]
    y_train_outer, y_test_outer = y[train_idx], y[test_idx]
    study_labels_outer = study_labels[train_idx]

    classes_in_fold = np.unique(y_train_outer)
    
    def pre_process_data(df, pipe):
        """Preprocess the current outer split for each distinct ``n_genes`` in `df`.

        Reads `X_train_outer`, `X_test_outer` and `study_labels_outer` from the
        enclosing loop iteration. A fresh clone of `pipe` is fitted on the
        outer training rows and applied to the outer test rows.

        NOTE(review): this rebinds the name `pre_process_data`, which an earlier
        cell defines with a different signature.

        Returns ``{n_genes: [X_train_processed, X_test_processed]}``.
        """
        by_n_genes = {}
        for n_genes_value in set(df["n_genes"]):
            fitted_pipe = clone(pipe)
            train_proc = fitted_pipe.fit_transform(
                X_train_outer,
                feature_selection__study_per_patient=study_labels_outer,
                feature_selection__n_genes=n_genes_value,
            )
            test_proc = fitted_pipe.transform(X_test_outer)
            by_n_genes[n_genes_value] = [train_proc, test_proc]
        return by_n_genes
    
    processed_X = pre_process_data(best_per_class_df, pipe)
    results_per_class = []

    df_outer_fold_results = df_parallel_results[df_parallel_results['outer_fold'] == outer_fold].copy()

    # Find the best parameters based ONLY on this outer fold's inner results
    best_params_for_outer_fold = process_cv_results(
        df_outer_fold_results,
        param_grid=param_grid,
        label_mapping=label_mapping,
        score_col='kappa' 
    )

    def standard_eval():
        """Refit the best inner-CV setting on the outer training split and score the test split.

        Reads `best_params_for_outer_fold`, `processed_X`, `y_train_outer`,
        `y_test_outer`, `model` and `outer_fold` from the enclosing scope.

        Returns
        -------
        results_overall : dict
            Accuracy, macro F1, MCC and kappa on the outer test split.
        results_per_class : dict
            Always empty for the standard (multiclass) evaluation.
        """
        # Drop every score/label column that is present. errors='ignore' is
        # used because `columns_to_drop_existing` was derived from a different
        # frame (`best_per_class_df`) and need not match this one.
        best_row = best_params_for_outer_fold.drop(columns=columns_to_drop, errors='ignore')
        # Fixed: the row was previously read from `params` before that name was assigned.
        params = best_row.iloc[0].to_dict()

        n_genes = params.pop('n_genes')
        X_train_n_genes, X_test_n_genes = processed_X[n_genes]

        # 'none' is the placeholder process_cv_results writes for a None
        # class_weight; dropping the key falls back to the model's default,
        # as the OvR/OvO evaluations do.
        if params.get("class_weight") == 'none':
            params.pop('class_weight')

        # Fixed: use new local names. Assigning to `y_train_outer` /
        # `y_test_outer` here made them function-locals and raised
        # UnboundLocalError on first read.
        y_train = np.array(y_train_outer, dtype=np.int32)
        y_test = np.array(y_test_outer, dtype=np.int32)

        # Same label handling as the inner-CV standard evaluation: fit on
        # consecutive integer codes and map predictions back before scoring.
        le = LabelEncoder()
        clf = clone(model(**params))
        clf.fit(X_train_n_genes, le.fit_transform(y_train))
        preds = le.inverse_transform(clf.predict(X_test_n_genes))

        results_per_class = {}
        results_overall = {
            'outer_fold': outer_fold,
            'accuracy': accuracy_score(y_test, preds),
            'f1_macro': f1_score(y_test, preds, average='macro'),
            'mcc': matthews_corrcoef(y_test, preds),
            'kappa': cohen_kappa_score(y_test, preds)
        }
        return results_overall, results_per_class

    def ovr_eval():
        """One-vs-rest evaluation of the current outer fold.

        For every class in `classes_in_fold`, the best setting found for that
        class is refitted on the outer training split as a binary problem
        (class vs. rest) and scored on the outer test split. The overall
        prediction for a test sample is the class with the highest
        positive-class probability.

        Reads `best_params_for_outer_fold`, `processed_X`, `classes_in_fold`,
        `test_idx`, `y_train_outer`, `y_test_outer`, `model` and `outer_fold`
        from the enclosing scope and appends to the enclosing
        `results_per_class` list.

        Returns
        -------
        results_overall : dict
            Multiclass accuracy, macro F1, MCC and kappa.
        results_per_class : list of dict
            One binary-metric record per class.
        """
        # Drop every score/label column that is present. errors='ignore' is
        # used because `columns_to_drop_existing` was derived from a different
        # frame, so label columns could otherwise leak into `model(**params)`.
        best_rows = best_params_for_outer_fold.drop(columns=columns_to_drop, errors='ignore')
        best_rows = best_rows[best_rows['class'].isin(classes_in_fold)]

        # One probability column per class, one row per outer test sample.
        prob_df = pd.DataFrame(index = test_idx, columns = classes_in_fold)

        y_train_arr = np.asarray(y_train_outer)
        y_test_arr = np.asarray(y_test_outer)

        for cl in classes_in_fold:
            params = best_rows[best_rows["class"] == cl].iloc[0].to_dict()

            n_genes = params.pop('n_genes')
            params.pop('class', None)
            X_train_n_genes, X_test_n_genes = processed_X[n_genes]

            # 'none' is the placeholder written for a None class_weight.
            if params.get("class_weight") == 'none':
                params.pop('class_weight')

            clf = clone(model(**params))

            # Class `cl` is the positive label, every other class is negative.
            y_train_bin = (y_train_arr == cl).astype(np.int32)
            y_test_bin = (y_test_arr == cl).astype(np.int32)

            clf.fit(X_train_n_genes, y_train_bin)

            preds_proba = clf.predict_proba(X_test_n_genes)[:, 1]
            preds = (preds_proba >= 0.5).astype(int)
            results_per_class.append({
                'outer_fold': outer_fold,
                'class': cl,
                'accuracy': accuracy_score(y_test_bin, preds),
                'f1_binary': f1_score(y_test_bin, preds, average='binary', pos_label=1),
                'mcc': matthews_corrcoef(y_test_bin, preds),
                'kappa': cohen_kappa_score(y_test_bin, preds)
            })
            prob_df[cl] = preds_proba

        # Compute the arg-max once instead of once per metric; the frame is
        # cast to float because it was created without data.
        predicted_class = prob_df.astype(float).idxmax(axis=1)
        results_overall = {
            'outer_fold': outer_fold,
            'accuracy': accuracy_score(y_test_outer, predicted_class),
            'f1_macro': f1_score(y_test_outer, predicted_class, average='macro'),
            'mcc': matthews_corrcoef(y_test_outer, predicted_class),
            'kappa': cohen_kappa_score(y_test_outer, predicted_class)
        }
        return results_overall, results_per_class

    def ovo_eval():
        """Evaluate one-vs-one (OvO) classifiers on the current outer fold.

        For every (class_0, class_1) row of the best-parameter table, a fresh
        binary classifier is fitted on the training samples belonging to those
        two classes (label 1 = class_0, label 0 = class_1). It is scored on the
        test samples of the same two classes, and it then casts one vote per
        test sample (all classes) for either class_0 or class_1 using a 0.5
        probability threshold.

        Reads from the enclosing scope: best_params_for_outer_fold,
        columns_to_drop_existing, classes_in_fold, test_idx, processed_X,
        y_train_outer, y_test_outer, model and outer_fold.

        Returns:
            (results_overall, results_per_class) where results_overall is a
            dict with the vote-based 'mcc votes' / 'kappa votes' for the fold
            and results_per_class is a list with one metrics dict per pair.
        """
        pair_params = best_params_for_outer_fold.drop(columns=columns_to_drop_existing)
        both_in_fold = (
            pair_params['class_0'].isin(classes_in_fold)
            & pair_params['class_1'].isin(classes_in_fold)
        )
        pair_params = pair_params[both_in_fold]

        results_per_class = []

        # Vote counts: one row per test sample, one column per class.
        vote_counts = pd.DataFrame(0, index=test_idx, columns=classes_in_fold)

        # Loop-invariant label arrays, so pair membership is a vectorised mask.
        y_train_arr = np.asarray(y_train_outer)
        y_test_arr = np.asarray(y_test_outer)

        for _, df_row in pair_params.iterrows():
            i = df_row["class_0"]
            j = df_row["class_1"]

            params = df_row.to_dict()

            n_genes = params.pop('n_genes')
            params.pop('class_0', None)
            params.pop('class_1', None)

            X_train_n_genes, X_test_n_genes = processed_X[n_genes]

            train_mask = (y_train_arr == i) | (y_train_arr == j)
            val_mask = (y_test_arr == i) | (y_test_arr == j)

            X_train_ij = X_train_n_genes[train_mask]
            # label '1' for class i, and '0' for class j
            y_train_ij = (y_train_arr[train_mask] == i).astype(np.int32)
            y_val_ij = (y_test_arr[val_mask] == i).astype(np.int32)

            # 'none' is a placeholder meaning "use the estimator's default";
            # .get() avoids a KeyError when the column is absent altogether.
            if params.get("class_weight") == 'none':
                params.pop('class_weight')

            clf = clone(model(**params))
            clf.fit(X_train_ij, y_train_ij)

            # Predict every test sample once; the pair-only predictions used
            # for the per-pair metrics are a subset of these rows.
            proba_all_test = clf.predict_proba(X_test_n_genes)[:, 1]
            votes_for_i = (proba_all_test >= 0.5).astype(int)
            preds = votes_for_i[val_mask]

            results_per_class.append({
                'outer_fold': outer_fold,
                'class_0': i,
                'class_1': j,
                'f1_binary': f1_score(y_val_ij, preds, average='binary', pos_label=1),
                'mcc': matthews_corrcoef(y_val_ij, preds),
                'kappa': cohen_kappa_score(y_val_ij, preds)
            })

            # Each test sample gives exactly one vote: to i if predicted 1, else to j.
            vote_counts.loc[:, i] += votes_for_i
            vote_counts.loc[:, j] += 1 - votes_for_i

        # Predicted class = column with the most votes (idxmax keeps the first on ties).
        preds_all = vote_counts.idxmax(axis=1)

        results_overall = {
            'outer_fold': outer_fold,
            #'accuracy': accuracy_score(y_test_outer, preds_all),
            #'f1_macro': f1_score(y_test_outer, preds_all, average='macro'),
            'mcc votes': matthews_corrcoef(y_test_outer, preds_all),
            'kappa votes': cohen_kappa_score(y_test_outer, preds_all)
        }
        return results_overall, results_per_class

    # Dispatch table mapping the evaluation scheme name to its evaluator.
    eval_dispatch = {
        'OvR': ovr_eval,
        'OvO': ovo_eval
    }

    if type_eval not in eval_dispatch:
        # Report the offending value (previously this formatted the builtin `type`).
        raise ValueError(f"Unsupported evaluation type: {type_eval}")

    # Run the selected evaluator for this outer fold and collect its results.
    results_overall, results_per_class = eval_dispatch[type_eval]()
    per_class_results.append(results_per_class)
    print(results_overall)
    overall_results.append(results_overall)

{'outer_fold': 0, 'mcc votes': np.float64(0.8450958861113496), 'kappa votes': np.float64(0.8447055763390022)}


KeyboardInterrupt: 

In [None]:
# OvO: {'outer_fold': 0, 'mcc votes': np.float64(0.8450958861113496), 'kappa votes': np.float64(0.8447055763390022)}

In [41]:
# Inspect the per-pair metric records (class_0 vs class_1) that were appended
# to per_class_results first, i.e. for the first evaluated outer fold.
per_class_results[0]

[{'outer_fold': 0,
  'class_0': 0,
  'class_1': 1,
  'f1_binary': 0.9158878504672897,
  'mcc': np.float64(0.8066855435212543),
  'kappa': np.float64(0.8049792531120332)},
 {'outer_fold': 0,
  'class_0': 0,
  'class_1': 2,
  'f1_binary': 0.9636363636363636,
  'mcc': np.float64(0.7414141414141414),
  'kappa': np.float64(0.7414141414141414)},
 {'outer_fold': 0,
  'class_0': 0,
  'class_1': 3,
  'f1_binary': 1.0,
  'mcc': np.float64(1.0),
  'kappa': np.float64(1.0)},
 {'outer_fold': 0,
  'class_0': 0,
  'class_1': 4,
  'f1_binary': 0.9818181818181818,
  'mcc': np.float64(0.7318181818181818),
  'kappa': np.float64(0.7318181818181818)},
 {'outer_fold': 0,
  'class_0': 0,
  'class_1': 5,
  'f1_binary': 0.9557522123893806,
  'mcc': np.float64(0.9163448298943221),
  'kappa': np.float64(0.915156744319816)},
 {'outer_fold': 0,
  'class_0': 0,
  'class_1': 6,
  'f1_binary': 0.9345794392523364,
  'mcc': np.float64(0.7504283707412239),
  'kappa': np.float64(0.7459677419354839)},
 {'outer_fold': 0,
 

In [40]:
# Number of unordered class pairs, k * (k - 1) / 2 for k distinct labels in y,
# i.e. how many pairwise classifiers a complete one-vs-one scheme contains.
len(np.unique(y)) * (len(np.unique(y))-1) / 2

136.0