In [50]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import GridSearchCV, KFold

In [51]:
# Cell 2: Paths
BASE_DIR = Path("../../data/preprocessed")
SAVE_DIR = Path("../../results")
SAVE_DIR.mkdir(parents=True, exist_ok=True)

# Paths for different join types, scaling methods, and transformations
JOIN_TYPES = ['outer', 'left', 'right']
SCALING_METHODS = ['standard', 'minmax']
TRANSFORMATIONS = ['original', 'log']

In [52]:
# Dataset names and their corresponding filenames
datasets = {}
for join_type in JOIN_TYPES:
    for scaling_method in SCALING_METHODS:
        for transformation in TRANSFORMATIONS:
            dataset_name = f"{join_type}_{scaling_method}_{transformation}"
            train_file = BASE_DIR / join_type / scaling_method / transformation / "train_data_scaled.csv"
            test_file = BASE_DIR / join_type / scaling_method / transformation / "test_data_scaled.csv"
            datasets[dataset_name] = (train_file, test_file)

In [53]:
# Load target labels
y_train = pd.read_excel("../../data/raw/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx")
y_train = y_train.set_index("participant_id")
y_train.head()

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
UmrK0vMLopoR,1,1
CPaeQkhcjg7d,1,0
Nb4EetVPm3gs,1,0
p4vPhVu91o4b,1,1
M09PXs7arQ5E,1,1


In [54]:
def load_data(train_file, test_file):
    """Load training and test datasets."""
    # Create X_train and X_test
    X_train = pd.read_csv(train_file)
    X_test = pd.read_csv(test_file)

    # Extract participant IDs and set index
    participant_id = X_test["participant_id"]
    X_train = X_train.set_index("participant_id")
    X_test = X_test.set_index("participant_id")

    return X_train, X_test, participant_id

In [55]:
# def train_model(X_train, y_train):
#     """Train an XGBoost MultiOutputClassifier model with GridSearchCV."""
#     # Define the base XGBoost classifier
#     xgb_classifier = XGBClassifier(objective='binary:logistic', random_state=42)

#     # Wrap in MultiOutputClassifier
#     multioutput_classifier = MultiOutputClassifier(xgb_classifier)

#     # Define hyperparameters for GridSearchCV
#     param_grid = {
#         'estimator__n_estimators': [100, 200, 300],
#         'estimator__learning_rate': [0.01, 0.1, 0.2],
#         'estimator__max_depth': [3, 5, 7],
#         'estimator__colsample_bytree': [0.7, 0.9, 1.0],
#         'estimator__subsample': [0.7, 0.9, 1.0],
#         'estimator__gamma': [0, 0.1, 0.2],
#         'estimator__reg_lambda': [0, 1, 10],
#         'estimator__reg_alpha': [0, 1, 10],
#     }

#     # Define KFold cross-validation
#     kfold = KFold(n_splits=5, shuffle=True, random_state=42)

#     # Perform GridSearchCV
#     grid_search = GridSearchCV(
#         estimator=multioutput_classifier,
#         param_grid=param_grid,
#         cv=kfold,
#         scoring='accuracy',
#         n_jobs=-1,
#         verbose=1
#     )

#     # Fit the model
#     grid_search.fit(X_train, y_train)

#     # Print the best parameters
#     print(f"Best parameters found: {grid_search.best_params_}")

#     return grid_search.best_estimator_

In [56]:
def train_model(X_train, y_train):
    """Train an XGBoost MultiOutputClassifier model without GridSearchCV."""
    # Define the base XGBoost classifier with chosen hyperparameters
    xgb_classifier = XGBClassifier(
        objective='binary:logistic',
        random_state=42,
        n_estimators=200,
        learning_rate=0.1,
        max_depth=5,
        colsample_bytree=0.9,
        subsample=0.9,
        gamma=0.1,
        reg_lambda=1,
        reg_alpha=1
    )

    # Wrap in MultiOutputClassifier
    multioutput_classifier = MultiOutputClassifier(xgb_classifier)

    # Fit the model directly
    multioutput_classifier.fit(X_train, y_train)

    return multioutput_classifier


In [57]:
def predict_and_save(model, X_test, participant_id, dataset_name):
    """Generate predictions and save results."""
    y_pred = model.predict(X_test)

    # Convert predictions to a DataFrame
    predictions_df = pd.DataFrame(
        y_pred, columns=['Predicted_Gender', 'Predicted_ADHD'], index=X_test.index
    )

    result_df = predictions_df.reset_index()

    # Save the results
    result_file = SAVE_DIR / f"{dataset_name}_xgb.csv"
    result_df.to_csv(result_file, index=False)

    print(f"Results saved to {result_file}")

In [58]:
for dataset_name, (train_file, test_file) in datasets.items():
    print(f"Processing dataset: {dataset_name}")

    # Load data
    X_train, X_test, participant_id = load_data(train_file, test_file)

    # Train model 
    model = train_model(X_train, y_train)

    # Predict and save results
    predict_and_save(model, X_test, participant_id, dataset_name)

Processing dataset: outer_standard_original
Results saved to ../../results/outer_standard_original_xgb.csv
Processing dataset: outer_standard_log


FileNotFoundError: [Errno 2] No such file or directory: '../../data/preprocessed/outer/standard/log/train_data_scaled.csv'