In [26]:
import os
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from xgboost import XGBClassifier
from sklearn.multioutput import MultiOutputClassifier

In [27]:
# Input / output locations, relative to the notebook's working directory.
DATA_DIR = Path("../../data/preprocessed")  # preprocessed train/test CSVs are read from here
SAVE_DIR = Path("../../results")  # prediction CSVs are written here

# Create the output folder (and any missing parents) up front; a folder that
# already exists is left untouched.
SAVE_DIR.mkdir(exist_ok=True, parents=True)

In [28]:
# Registry of feature sets to run: name -> (train CSV, test CSV).
# The name is also used as the prefix of the saved predictions file.
datasets = dict(
    pca=("train_data_pca.csv", "test_data_pca.csv"),
    scaled=("train_data_scaled.csv", "test_data_scaled.csv"),
    raw=("train_data.csv", "test_data.csv"),
)

In [29]:
# Read the training targets and key them by participant so they can be
# matched against the feature tables.
y_train = pd.read_excel(
    "../../data/raw/widsdatathon2025/TRAIN/TRAINING_SOLUTIONS.xlsx"
).set_index("participant_id")
y_train.head()

Unnamed: 0_level_0,ADHD_Outcome,Sex_F
participant_id,Unnamed: 1_level_1,Unnamed: 2_level_1
UmrK0vMLopoR,1,1
CPaeQkhcjg7d,1,0
Nb4EetVPm3gs,1,0
p4vPhVu91o4b,1,1
M09PXs7arQ5E,1,1


In [30]:
def load_data(train_file, test_file):
    """Read one train/test CSV pair from DATA_DIR.

    Args:
        train_file: File name of the training features CSV inside DATA_DIR.
        test_file: File name of the test features CSV inside DATA_DIR.

    Returns:
        A tuple ``(X_train, X_test, participant_id)``: both feature frames
        indexed by their ``participant_id`` column, plus the test file's
        ``participant_id`` column as it appeared before indexing.
    """
    train_df, test_df = (
        pd.read_csv(DATA_DIR / file_name) for file_name in (train_file, test_file)
    )

    # Grab the test IDs while they are still an ordinary column.
    test_ids = test_df["participant_id"]

    return (
        train_df.set_index("participant_id"),
        test_df.set_index("participant_id"),
        test_ids,
    )

In [31]:
def train_model(X_train, y_train):
    """Fit an XGBoost-based multi-output classifier.

    One XGBClassifier configuration is wrapped in scikit-learn's
    MultiOutputClassifier, which fits it once per target column of
    ``y_train``.

    Args:
        X_train: Training features.
        y_train: Training targets, one column per output.

    Returns:
        The fitted MultiOutputClassifier.
    """
    xgb_params = {
        "objective": 'binary:logistic',
        "n_estimators": 100,
        "learning_rate": 0.1,
        "max_depth": 5,
    }
    base_estimator = XGBClassifier(**xgb_params)
    # fit() returns the estimator itself, so the fitted wrapper is returned directly.
    return MultiOutputClassifier(base_estimator).fit(X_train, y_train)

In [32]:

def predict_and_save(model, X_test, participant_id, dataset_name):
    """Generate predictions for ``X_test`` and write them to a CSV in SAVE_DIR.

    Args:
        model: Fitted multi-output classifier. Its ``predict`` output is
            expected to have two columns, in the order the targets had when
            the model was fit.
        X_test: Test features. Its index is written out as the ID column of
            the results file.
        participant_id: Test participant IDs. Not used here (the IDs come
            from ``X_test.index``); kept so existing callers keep working.
        dataset_name: Prefix of the output file, ``<dataset_name>_xgb.csv``.
    """
    y_pred = model.predict(X_test)

    # The targets this notebook trains on are ordered (ADHD_Outcome, Sex_F),
    # and a multi-output model predicts in fit order: column 0 is the ADHD
    # prediction and column 1 the sex prediction. The previous labels were
    # swapped, so each saved column carried the other target's values.
    predictions_df = pd.DataFrame(
        y_pred, columns=['Predicted_ADHD', 'Predicted_Gender'], index=X_test.index
    )

    # Keep the file layout unchanged: ID, Predicted_Gender, Predicted_ADHD.
    result_df = predictions_df[['Predicted_Gender', 'Predicted_ADHD']].reset_index()

    # Save the results
    result_file = SAVE_DIR / f"{dataset_name}_xgb.csv"
    result_df.to_csv(result_file, index=False)

    print(f"Results saved to {result_file}")

In [33]:
# Train and predict once per registered feature set.
for dataset_name, (train_file, test_file) in datasets.items():
    print(f"Processing dataset: {dataset_name}")

    # Load data
    X_train, X_test, participant_id = load_data(train_file, test_file)

    # scikit-learn pairs X and y rows by position, not by index label, so
    # reorder the labels to follow this dataset's participant order. An ID
    # that is missing from y_train raises KeyError here instead of silently
    # training on mismatched rows.
    y_train_aligned = y_train.loc[X_train.index]

    # Train model
    model = train_model(X_train, y_train_aligned)

    # Predict and save results
    predict_and_save(model, X_test, participant_id, dataset_name)

Processing dataset: pca
Results saved to ../../results/pca_xgb.csv
Processing dataset: scaled
Results saved to ../../results/scaled_xgb.csv
Processing dataset: raw
