In [8]:
import sys
import os
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  
#sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

import train_test, transformers, classifiers

from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import ParameterGrid, ParameterSampler
import datetime
import pandas as pd
import argparse
import random

from pathlib import Path


# Number of parallel workers handed to the cross-validation runner.
n_jobs = 4
# Number of outer and inner cross-validation folds.
k_out = 2
k_in = 2

# Run timestamp (YYYYmmdd_HHMM); used in the output directory and result file names.
# The variable keeps the name `time` because later cells reference it.
time = datetime.datetime.now().strftime("%Y%m%d_%H%M")

# Project root, defined once so the data and output locations cannot drift apart.
# Defaults to the original location; set LEUKEM_AI_BASE_PATH to run on another machine.
base_path = os.environ.get(
    "LEUKEM_AI_BASE_PATH", "/Users/jsevere2/Documents/AML_PhD/leukem_ai"
)
data_path = base_path + "/data"

# Create the per-run output directory if it doesn't exist
model_type = "NN"
output_dir = f"{base_path}/out/{model_type}/{time}"
os.makedirs(output_dir, exist_ok=True)
print(f"Output dir is {output_dir}")

# Load and prepare data
print("Loading and preparing data")

X, y, study_labels = train_test.load_data(data_path)
# presumably drops classes with fewer than `min_n` samples — confirm in train_test.filter_data
X, y, study_labels = train_test.filter_data(X, y, study_labels, min_n = 20)
# `label_mapping` is kept so results can be mapped back with train_test.restore_labels.
y, label_mapping = train_test.encode_labels(y)

# Select the classifier class and its hyper-parameter search grid.
# Unknown model types are rejected up front.
if model_type not in ("XGBOOST", "SVM", "NN"):
    raise ValueError(f"Model type {model_type} not supported")

if model_type == "XGBOOST":
    model = classifiers.WeightedXGBClassifier
    param_grid = dict(
        n_genes=[2000, 3000, 5000],
        class_weight=[True],
        max_depth=[2, 3, 5],
        learning_rate=[0.05, 0.1],
        n_estimators=[100, 200],
        min_child_weight=[1, 3, 5],
        gamma=[0, 0.1],
        subsample=[0.8],
        colsample_bytree=[0.8],
        reg_alpha=[0, 0.1],
        reg_lambda=[1.0],
    )
elif model_type == "SVM":
    # Imported lazily: only this branch needs scikit-learn's SVC.
    from sklearn.svm import SVC
    model = SVC
    param_grid = dict(
        n_genes=[1000, 2000, 3000],
        C=[0.1, 1, 10, 100, 1000],
        gamma=['auto', 'scale', 0.0001, 0.001, 0.01, 0.1],
        class_weight=["balanced", None],
        probability=[True],
    )
else:
    # Only "NN" can reach this point thanks to the guard above.
    model = classifiers.NeuralNet
    param_grid = dict(
        n_genes=[2000],
        # Candidate hidden-layer layouts (three-layer, then two-layer variants).
        n_neurons=[
            [800, 400, 100],
            [400, 200, 50],
            [200, 100, 25],
            [800, 400],
            [400, 200],
            [200, 100],
        ],
        use_batch_norm=[True, False],
        dropout_rate=[0, 0.2, 0.5],
        batch_size=[32],
        patience=[2],
        l2_reg=[0.001, 0],
        class_weight=[True, False],
        min_delta=[0.001],
        learning_rate=[0.0001],
        loss_function=["standard", "focal"],
    )

# Expand the grid into an explicit list of parameter combinations.
full_param_list = list(ParameterGrid(param_grid))

# Batch norm and dropout do not play nicely together, waste of compute
if model_type == "NN":
    full_param_list = [
        params for params in full_param_list
        if not (params['use_batch_norm'] and params['dropout_rate'] > 0)
    ]

# Downsample if needed: evaluate at most `n_downsample` randomly chosen combinations.
# A dedicated, seeded generator keeps the chosen subset identical across runs
# (so results are reproducible) without touching the global `random` state.
n_downsample = 10
random_seed = 42
if len(full_param_list) > n_downsample:
    param_list = random.Random(random_seed).sample(full_param_list, k=n_downsample)
else:
    param_list = full_param_list


Output dir is /Users/jsevere2/Documents/AML_PhD/leukem_ai/out/NN/20250603_1913
Loading and preparing data


  studies_series: 2834
  X_df: (60660, 2834)
  y_series: 2834
  Studies: 2834
  X shape: (2834, 60660)
  y: 2834


  Studies: 2268
  X shape: (2268, 60660)
  y: 2268


In [9]:
# Preprocessing pipeline: the named steps below run in the listed order.
preprocessing_steps = [
    ('DEseq2', transformers.DESeq2RatioNormalizer()),
    ('feature_selection', transformers.FeatureSelection2()),
    ('scaler', StandardScaler()),
]
pipe = Pipeline(preprocessing_steps)
print("Pipeline set up")

# Kick off the inner cross-validation process.
print("Starting inner cross-validation process.")
# Multiclass strategies to iterate over:
#   standard - the classifier's default multiclass handling
#   OvO      - One-vs-One: a binary classifier for each pair of classes
#   OvR      - One-vs-Rest: a binary classifier for each class against all others
# The neural network is only run with its default ("standard") handling.
multi_types = ["standard"] if model_type == "NN" else ["standard", "OvO", "OvR"]


Pipeline set up
Starting inner cross-validation process.


In [10]:
# Fold scheme for the inner cross-validation: "CV" or "loso".
fold_type = "CV"

# Resolve what differs between the fold schemes once, so the loop below is
# shared instead of being copy-pasted per scheme:
#   run_fold    - the train_test runner to call
#   fold_kwargs - extra keyword arguments only that runner takes
#   file_tag    - the fold-scheme part of the result file name
if fold_type == "CV":
    run_fold = train_test.run_inner_cv
    fold_kwargs = {"k_out": k_out, "k_in": k_in}
    file_tag = "inner_cv"
elif fold_type == "loso":
    run_fold = train_test.run_inner_cv_loso
    fold_kwargs = {}
    file_tag = "inner_cv_loso"
else:
    raise ValueError(f"Fold type {fold_type} not supported.")

for multi_type in multi_types:
    df = run_fold(
        X, y, study_labels, model, param_list, n_jobs, pipe,
        multi_type=multi_type,
        model_type=model_type,
        **fold_kwargs,
    )

    # Convert encoded labels back to original class names
    df = train_test.restore_labels(df, label_mapping)

    # Save results to CSV file with model type, fold scheme, strategy and timestamp
    df.to_csv(f"{output_dir}/{model_type}_{file_tag}_{multi_type}_{time}.csv")

print("Cross-validation process finished.")

[2000]
outer_fold
0
inner_fold
0


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.0873 - loss: 2.6401 - val_accuracy: 0.4374 - val_loss: 2.2433
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5400 - loss: 1.9913 - val_accuracy: 0.5697 - val_loss: 1.8022
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6633 - loss: 1.3788 - val_accuracy: 0.6349 - val_loss: 1.5284
Epoch 4/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 21ms/step - accuracy: 0.2101 - loss: 2.7627 - val_accuracy: 0.4956 - val_loss: 2.0327
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.7475 - loss: 1.1459 - val_accuracy: 0.6526 - val_loss: 1.3693
Epoch 5/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.6586 - loss: 1.2098 - val_accuracy: 

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   44.6s finished


inner_fold
1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2127 - loss: 2.4631 - val_accuracy: 0.4974 - val_loss: 2.0620
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2491 - loss: 2.9838 - val_accuracy: 0.5168 - val_loss: 2.2285
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.5905 - loss: 1.8631 - val_accuracy: 0.5785 - val_loss: 1.7149
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5812 - loss: 2.0406 - val_accuracy: 0.6473 - val_loss: 1.9082
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.2594 - loss: 2.6861 - val_accuracy: 0.4444 - val_loss: 2.1042
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step - accuracy: 0.6493 - loss: 1.5678 - val_accuracy: 0

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   38.2s finished


outer_fold
1




inner_fold
0


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.1209 - loss: 2.7821 - val_accuracy: 0.4374 - val_loss: 2.1698
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 17ms/step - accuracy: 0.2765 - loss: 2.9910 - val_accuracy: 0.5556 - val_loss: 2.1883
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - accuracy: 0.5900 - loss: 1.9457 - val_accuracy: 0.5855 - val_loss: 1.6932
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.6008 - loss: 1.9771 - val_accuracy: 0.6437 - val_loss: 1.8942
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.1888 - loss: 2.9431 - val_accuracy: 0.4974 - val_loss: 1.9856
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.7034 - loss: 1.3244 - val_accuracy: 0.6720 - val_l

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   40.6s finished


inner_fold
1


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.


Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
Epoch 1/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 21ms/step - accuracy: 0.1754 - loss: 2.7084 - val_accuracy: 0.4656 - val_loss: 2.0514
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 19ms/step - accuracy: 0.2687 - loss: 3.0094 - val_accuracy: 0.5714 - val_loss: 2.1800
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.5599 - loss: 1.8710 - val_accuracy: 0.5556 - val_loss: 1.6485
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - accuracy: 0.6015 - loss: 2.0257 - val_accuracy: 0.6702 - val_loss: 1.8993
Epoch 3/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 23ms/step - accuracy: 0.2309 - loss: 2.7794 - val_accuracy: 0.4444 - val_loss: 1.9969
Epoch 2/10000
[1m18/18[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.7252 - loss: 1.7144 - val_accuracy: 

[Parallel(n_jobs=4)]: Done  10 out of  10 | elapsed:   50.6s finished
