In [None]:
import pandas as pd
from sklearn.model_selection import StratifiedKFold, RandomizedSearchCV, cross_validate
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import make_scorer, precision_score, f1_score
import numpy as np

# Load and merge CSVs

In [None]:
# Function to merge CSV files
def merge_csvs(csv_list, *, on="id", how="inner"):
    """Read several CSV files and join them column-wise on a shared key.

    The first file is loaded as the base table; every following file is
    merged into it in the order given.

    Parameters
    ----------
    csv_list : sequence of str, or a single str
        Paths of the CSV files to load. A single path given as a plain
        string is treated as a one-element list.
    on : str or list of str, keyword-only, default "id"
        Column name(s) present in every file and used as the join key.
    how : str, keyword-only, default "inner"
        Join type forwarded to ``DataFrame.merge``. With the default
        "inner", rows whose key is missing from any file are dropped.

    Returns
    -------
    pandas.DataFrame
        The merged table.

    Raises
    ------
    ValueError
        If ``csv_list`` is empty.
    """
    # A bare string is indexable, so without this guard csv_list[0] would be
    # its first character rather than the path itself.
    paths = [csv_list] if isinstance(csv_list, str) else list(csv_list)
    if not paths:
        raise ValueError("merge_csvs needs at least one CSV path, got an empty list")

    # Load the first CSV as the base table
    merged_df = pd.read_csv(paths[0])

    # Merge the remaining CSVs onto it, one at a time
    for path in paths[1:]:
        merged_df = merged_df.merge(pd.read_csv(path), on=on, how=how)

    return merged_df

# merge_csvs takes a single list of paths, not one positional argument per file
merged_df = merge_csvs(["radiomics.csv", "cnn.csv"])

# Train and compare models

In [None]:
# Function to perform RandomizedSearchCV
def tune_model(classifier, param_distributions, X, y, model_name,
               n_iter=50, n_splits=5, random_state=42):
    """Tune a classifier inside a scaling pipeline with RandomizedSearchCV.

    The numeric columns of ``X`` are standardized, the classifier is appended
    as the final pipeline step, and hyperparameters are sampled from
    ``param_distributions`` using shuffled stratified k-fold cross-validation
    scored by ROC AUC. The best parameters and score are printed.

    Parameters
    ----------
    classifier : estimator
        Unfitted classifier placed in the pipeline step named "classifier".
    param_distributions : dict
        Search space handed to RandomizedSearchCV. Keys must address pipeline
        steps, e.g. "classifier__max_depth".
    X : pandas.DataFrame
        Feature table. Non-numeric columns are not passed to the classifier,
        because ColumnTransformer drops unlisted columns by default.
    y : array-like
        Target labels.
        NOTE(review): the plain "roc_auc" scorer only supports binary targets;
        if ``y`` has more than two classes, "roc_auc_ovr" or "roc_auc_ovo" is
        needed - confirm against the data.
    model_name : str
        Label used as a prefix in the printed summary.
    n_iter : int, default 50
        Number of random parameter combinations to try.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int, default 42
        Seed shared by the fold shuffling and the parameter sampling.

    Returns
    -------
    sklearn.pipeline.Pipeline
        ``best_estimator_`` of the finished search.

    Raises
    ------
    ValueError
        If ``X`` has no numeric columns.
    """
    # Select every numeric dtype. Listing only float64/int64 would let the
    # ColumnTransformer silently drop float32/int32/uint8 feature columns.
    numeric_features = X.select_dtypes(include="number").columns
    if len(numeric_features) == 0:
        raise ValueError(f"{model_name}: X has no numeric feature columns to train on")

    # Define the preprocessor: here, we'll standardize the numeric features
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", StandardScaler(), numeric_features)  # Standardize numeric features
        ]
    )

    # Create the pipeline
    pipeline = Pipeline(steps=[
        ("preprocessor", preprocessor),  # Preprocessing step
        ("classifier", classifier)  # Classifier passed as parameter
    ])

    # Setup StratifiedKFold cross-validation
    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Setup RandomizedSearchCV
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions,
        n_iter=n_iter,  # Number of random combinations to try
        scoring="roc_auc",  # Use AUC as the scoring metric
        cv=kf,
        random_state=random_state,
        n_jobs=-1,  # Use all available cores
        return_train_score=True
    )

    # Perform RandomizedSearchCV
    random_search.fit(X, y)

    # Display the best parameters and score
    print(f"{model_name} - Best parameters: {random_search.best_params_}")
    print(f"{model_name} - Best AUC score: {random_search.best_score_:.4f}")

    return random_search.best_estimator_

In [None]:
# Parameter distributions for each classifier.
# Every key carries the "classifier__" prefix so that it is routed to the
# pipeline step named "classifier" that tune_model builds.
xgb_param_distributions = {
    "classifier__n_estimators": np.arange(50, 300, 50),
    "classifier__max_depth": np.arange(3, 15),
    "classifier__learning_rate": [0.01, 0.05, 0.1, 0.2],
    "classifier__subsample": [0.5, 0.7, 1.0],
    "classifier__colsample_bytree": [0.5, 0.7, 1.0],
    "classifier__gamma": np.arange(0, 5, 0.5),
}

rf_param_distributions = {
    "classifier__n_estimators": np.arange(50, 300, 50),
    "classifier__max_depth": np.arange(3, 15),
    "classifier__min_samples_split": [2, 5, 10],
    "classifier__min_samples_leaf": [1, 2, 4],
    # "auto" was only an alias of "sqrt" for RandomForestClassifier and was
    # removed in scikit-learn 1.3, so it is replaced by a distinct option.
    "classifier__max_features": ["sqrt", "log2"],
}

# Split the merged table into the label vector and the feature matrix
y = merged_df["class"]  # Column the models learn to predict
X = merged_df.drop(columns=["id", "class"])  # All remaining columns; the key and the label are excluded

# Run the randomized search once per candidate classifier.
# NOTE(review): eval_metric="mlogloss" is a multiclass metric, while tune_model
# scores with "roc_auc" - confirm whether "class" is binary or multiclass.
best_xgb_model = tune_model(
    classifier=XGBClassifier(use_label_encoder=False, eval_metric="mlogloss"),
    param_distributions=xgb_param_distributions,
    X=X,
    y=y,
    model_name="XGBoost",
)
best_rf_model = tune_model(
    classifier=RandomForestClassifier(random_state=42),
    param_distributions=rf_param_distributions,
    X=X,
    y=y,
    model_name="Random Forest",
)
# TODO: SVM?

In [None]:
# Define scoring metrics
scoring = {
    "AUC": "roc_auc",
    "Precision": make_scorer(precision_score, average="weighted"),
    "F1": make_scorer(f1_score, average="weighted"),
}

# `pipeline` exists only inside tune_model, so the tuned pipelines it returned
# are evaluated here instead.
# NOTE(review): the hyperparameters were selected on this same X / y, so these
# cross-validated scores are likely optimistic - consider a held-out set.
tuned_models = {
    "XGBoost": best_xgb_model,
    "Random Forest": best_rf_model,
}

# Perform cross-validation and calculate all metrics for each tuned model
cv_results_by_model = {}
for model_name, model in tuned_models.items():
    cv_results = cross_validate(model, X, y, cv=5, scoring=scoring)
    cv_results_by_model[model_name] = cv_results

    # Pull the per-fold arrays out first: nesting double quotes inside a
    # double-quoted f-string is a syntax error before Python 3.12.
    auc_scores = cv_results["test_AUC"]
    precision_scores = cv_results["test_Precision"]
    f1_scores = cv_results["test_F1"]

    # Display the results
    print(f"{model_name} - AUC: {auc_scores.mean():.4f} ± {auc_scores.std():.4f}")
    print(f"{model_name} - Precision: {precision_scores.mean():.4f} ± {precision_scores.std():.4f}")
    print(f"{model_name} - F1-score: {f1_scores.mean():.4f} ± {f1_scores.std():.4f}")