In [1]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import train_test, transformers, classifiers

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid, ParameterSampler
import datetime
import pandas as pd
import argparse
import random

from pathlib import Path


# --- Run configuration -------------------------------------------------------
n_jobs = 1  # Number of parallel jobs handed to the cross-validation runner
# Number of outer and inner cross-validation folds
k_out = 2
k_in = 2

# Timestamp (minute resolution) used to tag the output directory and result files.
# NOTE(review): the name `time` would shadow the stdlib module if it were ever imported here.
time = datetime.datetime.now().strftime("%Y%m%d_%H%M")

# Project root, defined exactly once so the data and output locations cannot drift
# apart. Set the LEUKEM_AI_BASE_PATH environment variable to run on another
# machine; the fallback is the original hard-coded location.
base_path = os.environ.get(
    "LEUKEM_AI_BASE_PATH", "/Users/jsevere2/Documents/AML_PhD/leukem_ai"
)
data_path = base_path + "/data"

# Create the output directory if it doesn't exist
model_type = "NN"
output_dir = f"{base_path}/out_test/{model_type}/{time}"
os.makedirs(output_dir, exist_ok=True)
print(f"Output dir is {output_dir}")

# Load and prepare data
print("Loading and preparing data")

X, y, study_labels = train_test.load_data(data_path)
# filter_data reduces the sample count (2834 -> 2206 in the recorded run);
# presumably it drops classes with fewer than `min_n` samples -- confirm in train_test.
X, y, study_labels = train_test.filter_data(X, y, study_labels, min_n = 10)
# Replace `y` with its encoded form and keep the mapping, which is needed later
# to translate predictions back to the original class names.
y, label_mapping = train_test.encode_labels(y)

# Select the estimator class and its hyper-parameter search grid for `model_type`.
# Unsupported model types are rejected up front.
if model_type not in ("XGBOOST", "SVM", "NN"):
    raise ValueError(f"Model type {model_type} not supported")

if model_type == "NN":
    model = classifiers.NeuralNet
    param_grid = dict(
        n_genes=[10000],
        # Hidden-layer widths, one list per candidate architecture.
        # Currently disabled candidates: [800, 400, 100], [200, 100, 25],
        # [800, 400], [400, 200], [200, 100], [100, 50].
        n_neurons=[[400, 200, 50]],
        use_batch_norm=[False],
        dropout_rate=[0.5],
        batch_size=[32],
        patience=[1],
        l2_reg=[0.001],
        class_weight=[True],
        min_delta=[0.001],
        learning_rate=[0.0001],
        loss_function=["focal"],
    )
elif model_type == "SVM":
    # Imported lazily so the dependency is only touched when an SVM is requested.
    from sklearn.svm import SVC
    model = SVC
    param_grid = dict(
        n_genes=[1000, 2000, 3000],
        C=[0.1, 1, 10, 100, 1000],
        gamma=['auto', 'scale', 0.0001, 0.001, 0.01, 0.1],
        class_weight=["balanced", None],
        probability=[True],
    )
else:
    # Only "XGBOOST" can reach this branch because of the guard above.
    model = classifiers.WeightedXGBClassifier
    param_grid = dict(
        n_genes=[2000, 3000, 5000],
        class_weight=[True],
        max_depth=[2, 3, 5],
        learning_rate=[0.05, 0.1],
        n_estimators=[100, 200],
        min_child_weight=[1, 3, 5],
        gamma=[0, 0.1],
        subsample=[0.8],
        colsample_bytree=[0.8],
        reg_alpha=[0, 0.1],
        reg_lambda=[1.0],
    )

# Expand the grid into an explicit list with one dict per parameter combination
full_param_list = list(ParameterGrid(param_grid))

# Batch norm and dropout do not play nicely together, waste of compute:
# drop every NN combination that enables both at once.
if model_type == "NN":
    full_param_list = [
        params for params in full_param_list
        if not (params['use_batch_norm'] and params['dropout_rate'] > 0)
    ]

# Downsample if needed: evaluate at most `n_downsample` combinations.
# The subset is drawn from a dedicated, seeded generator so that a
# Restart & Run All selects the same combinations every time; change
# `param_seed` to explore a different subset.
n_downsample = 1
param_seed = 42
if len(full_param_list) > n_downsample:
    param_list = random.Random(param_seed).sample(full_param_list, k=n_downsample)
else:
    param_list = full_param_list


Output dir is /Users/jsevere2/Documents/AML_PhD/leukem_ai/out_test/NN/20250623_1622
Loading and preparing data


  studies_series: 2834
  X_df: (60660, 2834)
  y_series: 2834
  Studies: 2834
  X shape: (2834, 60660)
  y: 2834


  Studies: 2206
  X shape: (2206, 60660)
  y: 2206


In [2]:
# Preprocessing pipeline applied before the classifier: the project's
# DESeq2-style normaliser, then feature selection, then standard scaling.
pipe = Pipeline(steps=[
    ('DEseq2', transformers.DESeq2RatioNormalizer()),
    ('feature_selection', transformers.FeatureSelection2()),
    ('scaler', StandardScaler()),
])
print("Pipeline set up")

# Start the inner cross-validation process
print("Starting inner cross-validation process.")

# Multiclass strategies to iterate over:
#   standard - the classifier's default multiclass handling
#   OvO      - One-vs-One: a binary classifier for each pair of classes
#   OvR      - One-vs-Rest: a binary classifier for each class against all others
# The neural net is only run with its native multiclass handling.
multi_types = ["standard"] if model_type == "NN" else ["standard", "OvO", "OvR"]

# NOTE(review): this unconditional assignment overrides the selection above,
# so only "standard" is ever run regardless of model type -- confirm whether
# this is a deliberate quick-test restriction.
multi_types = ["standard"]

Pipeline set up
Starting inner cross-validation process.


In [4]:
fold_type = "CV"

# Resolve everything that differs between the two fold schemes once, so the
# loop below is shared: the runner function, its scheme-specific keyword
# arguments, and the tag embedded in the result file name.
if fold_type == "CV":
    run_cv = train_test.run_inner_cv
    fold_kwargs = {"k_out": k_out, "k_in": k_in}
    csv_tag = "inner_cv"
elif fold_type == "loso":
    run_cv = train_test.run_inner_cv_loso
    fold_kwargs = {}
    csv_tag = "inner_cv_loso"
else:
    raise ValueError(f"Fold type {fold_type} not supported.")

for multi_type in multi_types:
    df = run_cv(
        X, y, study_labels, model, param_list, n_jobs, pipe,
        multi_type=multi_type, model_type=model_type, **fold_kwargs
    )

    # Convert encoded labels back to original class names
    df = train_test.restore_labels(df, label_mapping)

    # Save results to CSV file with model type, fold scheme, strategy and timestamp
    df.to_csv(f"{output_dir}/{model_type}_{csv_tag}_{multi_type}_{time}.csv")

print("Cross-validation process finished.")



outer_fold
0




inner_fold
0
Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0163 - loss: 6.7333 - val_accuracy: 0.0525 - val_loss: 4.1163
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0425 - loss: 5.5716 - val_accuracy: 0.0707 - val_loss: 3.9033
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0478 - loss: 5.1112 - val_accuracy: 0.1105 - val_loss: 3.7639
Epoch 4/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0413 - loss: 4.5513 - val_accuracy: 0.1359 - val_loss: 3.6546
Epoch 5/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0554 - loss: 5.0359 - val_accuracy: 0.1757 - val_loss: 3.5976
Epoch 6/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.0736 - loss: 4.4527 - val_accuracy: 0.2246 - val_loss: 3.5296
Epo



Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 16ms/step - accuracy: 0.0334 - loss: 6.2985 - val_accuracy: 0.0833 - val_loss: 3.8778
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.0901 - loss: 4.5235 - val_accuracy: 0.1576 - val_loss: 3.6820
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1092 - loss: 4.6340 - val_accuracy: 0.2428 - val_loss: 3.5571
Epoch 4/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.0926 - loss: 4.4959 - val_accuracy: 0.2627 - val_loss: 3.4757
Epoch 5/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1179 - loss: 4.7128 - val_accuracy: 0.3025 - val_loss: 3.3947
Epoch 6/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.1395 - loss: 4.0033 - val_accuracy: 0.3297 - val_loss: 3.3356
Epoch 7/10000
[