In [1]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, precision_score, f1_score
import numpy as np

# Load and merge CSVs

In [None]:
# Function to merge CSV files
def merge_csvs(csv_list):
    """Load several CSV files and join them on their shared ``id`` column.

    Parameters
    ----------
    csv_list : iterable of str, or a single str
        Paths of the CSV files to merge, in merge order. Every file must
        contain an ``id`` column. A single path given as a bare string is
        treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        The first file merged successively with each following file via
        ``DataFrame.merge(..., on="id")``. This is pandas' default inner
        join, so only ids present in every file are kept, and non-key
        columns that share a name receive the ``_x`` / ``_y`` suffixes.

    Raises
    ------
    ValueError
        If ``csv_list`` contains no paths.
    """
    # A bare string would otherwise be indexed character by character
    # (csv_list[0] == first letter of the path).
    if isinstance(csv_list, str):
        csv_list = [csv_list]

    csv_paths = list(csv_list)
    if not csv_paths:
        raise ValueError("merge_csvs() needs at least one CSV path")

    # Load the first CSV, then fold the remaining ones into it
    merged_df = pd.read_csv(csv_paths[0])
    for csv in csv_paths[1:]:
        merged_df = merged_df.merge(pd.read_csv(csv), on="id")

    return merged_df

# merge_csvs takes ONE collection of paths, not one positional argument per file
merged_df = merge_csvs(["radiomics.csv", "cnn.csv"])

# Train and compare models

### 1. Hyperparameter Tuning

We used **RandomizedSearchCV**, which randomly samples hyperparameter combinations in order to optimize each machine learning model.
The tuning function defined below accepts a classifier, a set of hyperparameter distributions to search over, the input data X and labels y, and a name for the model.

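> **Preprocessing:** We created a **ColumnTransformer**, which applies **StandardScaler()** to scale the numeric features; columns that are not selected as numeric are dropped by the transformer. Tree-based models such as XGBoost and Random Forest are largely insensitive to feature scaling, so this step matters mainly for scale-sensitive algorithms (e.g. SVM or logistic regression) that may be added to the comparison.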

> **Pipeline Setup:** We created a pipeline that consists of the preprocessing step and the classifying step.

> **Cross-Validation:** We used **StratifiedKFold** cross-validation, which splits the data into 5 folds while ensuring that each fold preserves the class proportions of the full dataset.

> **RandomizedSearchCV:** The next step is to perform hyperparameter tuning by randomly selecting hyperparameter combinations from the provided **param_distributions**.

> **Fitting the Model and Displaying Results:** After the tuning process, the function prints the best hyperparameters and the best mean AUC score achieved during cross-validation, and returns the best pipeline refitted on the full dataset.

In [None]:
# Function to perform RandomizedSearchCV
def tune_model(classifier, param_distributions, X, y, model_name,
               n_iter=50, n_splits=5, random_state=42):
    """Tune ``classifier`` with a randomized hyperparameter search.

    The classifier is wrapped in a pipeline that first standardizes the
    numeric columns of ``X``. Candidates are scored by ROC AUC under shuffled,
    stratified k-fold cross-validation.

    Parameters
    ----------
    classifier : estimator
        Unfitted scikit-learn compatible classifier; it becomes the
        ``"classifier"`` step of the pipeline.
    param_distributions : dict
        Search space passed to ``RandomizedSearchCV``. Keys must address the
        pipeline step, e.g. ``"classifier__max_depth"``.
    X : pandas.DataFrame
        Feature table. Only numeric columns are used; all other columns are
        dropped by the ``ColumnTransformer`` (its default ``remainder``).
    y : array-like
        Target labels.
    model_name : str
        Label used in the printed summary.
    n_iter : int, default 50
        Number of random hyperparameter combinations to try.
    n_splits : int, default 5
        Number of stratified cross-validation folds.
    random_state : int, default 42
        Seed for both the fold shuffling and the parameter sampling.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the search: the best pipeline, refitted on all
        of ``X`` / ``y``.

    Raises
    ------
    ValueError
        If ``X`` has no numeric columns.
    """
    # Select every numeric dtype. Listing only float64/int64 would silently
    # lose float32/int32 features, because ColumnTransformer drops every
    # column that is not handed to a transformer.
    numeric_features = X.select_dtypes(include="number").columns
    if len(numeric_features) == 0:
        raise ValueError("tune_model() found no numeric feature columns in X")

    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features)  # Standardize numeric features
        ]
    )

    # Preprocessing followed by the classifier passed as parameter
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),
        ("classifier", classifier)
    ])

    # Shuffled, stratified folds with a fixed seed for reproducibility
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=n_iter,  # Number of random combinations to try
        scoring="roc_auc",  # Use AUC as the scoring metric
        cv=kf,
        random_state=random_state,
        n_jobs=-1,  # Use all available cores
        return_train_score=True
    )

    # Perform RandomizedSearchCV
    random_search.fit(X, y)

    # Display the best parameters and the best mean cross-validated score
    print(f"{model_name} - Best parameters: {random_search.best_params_}")
    print(f"{model_name} - Best AUC score: {random_search.best_score_:.4f}")

    return random_search.best_estimator_

###  XGBoost (xgb_param_distributions): 


- **n_estimators:** Number of boosting rounds or trees to grow, with values ranging from 50 to 300 in steps of 50.
- **max_depth:** Maximum depth of each tree, which controls the complexity of the model, sampled between 3 and 15.
- **learning_rate:** The step size at each iteration, ranging from 0.01 to 0.2 to control the contribution of each tree.
- **subsample:** Fraction of samples to be used in each boosting round, with values of 0.5, 0.7, and 1.0.
- **colsample_bytree:** Fraction of features used when building each tree, with values of 0.5, 0.7, and 1.0.
- **gamma:** Minimum loss reduction required to make a further partition in a leaf node, ranging from 0 to 5 in steps of 0.5.

In [None]:
# Parameter distributions for the XGBoost classifier.
# np.arange excludes its stop value, so every stop below is one step past
# the largest value that should be sampled.
xgb_param_distributions = {
    "classifier__n_estimators": np.arange(50, 301, 50),  # 50, 100, ..., 300
    "classifier__max_depth": np.arange(3, 16),  # 3, 4, ..., 15
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "classifier__subsample": [0.5, 0.7, 1.0],
    "classifier__colsample_bytree": [0.5, 0.7, 1.0],
    "classifier__gamma": np.arange(0, 5.5, 0.5),  # 0.0, 0.5, ..., 5.0
}

### Random Forest (rf_param_distributions):

- **n_estimators:** Number of trees in the forest, sampled between 50 and 300 in steps of 50.
- **max_depth:** Maximum depth of each tree, sampled between 3 and 15.
- **min_samples_split:** Minimum number of samples required to split an internal node, with options of 2, 5, and 10.
- **min_samples_leaf:** Minimum number of samples required to be at a leaf node, with values of 1, 2, and 4.
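- **max_features:** The number of features to consider when looking for the best split; effectively the square root of the feature count ("sqrt"), since "auto" is only a legacy alias of "sqrt" for classifiers (removed in scikit-learn 1.3).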

In [None]:
# Parameter distributions for the Random Forest classifier.
# np.arange excludes its stop value, so every stop below is one step past
# the largest value that should be sampled.
rf_param_distributions = {
    "classifier__n_estimators": np.arange(50, 301, 50),  # 50, 100, ..., 300
    "classifier__max_depth": np.arange(3, 16),  # 3, 4, ..., 15
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
    # "auto" was only an alias of "sqrt" for classifiers and is rejected by
    # scikit-learn >= 1.3, so listing it added nothing but failed fits.
    "classifier__max_features": ["sqrt"],
}

In [None]:
# Define features and target
X = merged_df.drop(columns=["id", "class"])  # Exclude "id" and "class" from features
y = merged_df["class"]  # Target variable

# Tune models
# Both classifiers get a fixed seed: XGBoost row/column subsampling is
# stochastic, so without random_state the tuned result is not reproducible.
# NOTE(review): use_label_encoder is deprecated in recent XGBoost releases and
# "mlogloss" is the multi-class log-loss, while tuning scores with "roc_auc";
# confirm both against the installed XGBoost version and the number of classes.
best_xgb_model = tune_model(
    XGBClassifier(use_label_encoder=False, eval_metric="mlogloss", random_state=42),
    xgb_param_distributions,
    X,
    y,
    "XGBoost",
)
best_rf_model = tune_model(
    RandomForestClassifier(random_state=42),
    rf_param_distributions,
    X,
    y,
    "Random Forest",
)
# TODO: SVM?

In [None]:
# Define scoring metrics
scoring = {
    "AUC": "roc_auc",
    "Precision": make_scorer(precision_score, average="weighted"),
    "F1": make_scorer(f1_score, average="weighted"),
}

# Same CV scheme as the tuning step (stratified, shuffled, seeded) so the
# reported numbers are reproducible and comparable with the tuning scores.
eval_cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# `pipeline` is local to tune_model and does not exist at notebook level;
# evaluate the tuned pipelines that tune_model returned instead.
tuned_models = {
    "XGBoost": best_xgb_model,
    "Random Forest": best_rf_model,
}

# NOTE: these scores come from the same data that was used for tuning, so
# they are optimistic; a held-out set or nested CV gives an unbiased estimate.
cv_results_by_model = {}
for model_name, model in tuned_models.items():
    # Perform cross-validation and calculate all metrics
    cv_results = cross_validate(model, X, y, cv=eval_cv, scoring=scoring)
    cv_results_by_model[model_name] = cv_results

    # Display the results (single quotes inside the f-strings keep this valid
    # on Python versions older than 3.12)
    print(model_name)
    print(f"AUC: {cv_results['test_AUC'].mean():.4f} ± {cv_results['test_AUC'].std():.4f}")
    print(f"Precision: {cv_results['test_Precision'].mean():.4f} ± {cv_results['test_Precision'].std():.4f}")
    print(f"F1-score: {cv_results['test_F1'].mean():.4f} ± {cv_results['test_F1'].std():.4f}")