In [1]:
import os
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Load and Preprocess Data
def load_and_preprocess(directory, label=None):
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label  # Assign class labels
            data_frames.append(df)
    df = pd.concat(data_frames, ignore_index=True)
    
    # Encode categorical data
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])
    
    # Handle missing values
    df.fillna(df.median(), inplace=True)
    
    return df

# Load Data
dataset_path_healthy = "/home/admincit/Desktop/Team_4/split_fif/healthy_csv"
dataset_path_mdd = "/home/admincit/Desktop/Team_4/split_fif/mdd_processed"

df_healthy = load_and_preprocess(dataset_path_healthy, label=0)  # 0 for Healthy
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)  # 1 for MDD

df = pd.concat([df_healthy, df_mdd], axis=0).reset_index(drop=True)

# Split Features and Labels
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Standardize Data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
joblib.dump(scaler, "binary_xgb_scaler.pkl")

# Split into Train, Validation & Test
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)

# Define XGBoost Model
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    tree_method="hist",
    device="cuda",
    eval_metric="logloss",
    learning_rate=0.005,
    max_depth=25,
    gamma=0.2,
    subsample=0.95,
    colsample_bytree=0.97,
    min_child_weight=1,
    reg_alpha=0.8,
    reg_lambda=3.0,
    n_estimators=6000,
    verbosity=1
)

# Perform Cross-Validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(xgb_model, X_train, y_train, cv=cv, scoring="accuracy")

# Train Model
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, "binary_xgb_model.pkl")

# Evaluate on Validation Set
val_preds = xgb_model.predict(X_val)
val_probs = xgb_model.predict_proba(X_val)[:, 1]
accuracy = accuracy_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, val_probs)
report = classification_report(y_val, val_preds)

# Print Results
print(f"Validation Accuracy: {accuracy:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:\n", report)


Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




Validation Accuracy: 0.8667
ROC AUC Score: 0.9630

Classification Report:
               precision    recall  f1-score   support

           0       0.83      0.83      0.83        12
           1       0.89      0.89      0.89        18

    accuracy                           0.87        30
   macro avg       0.86      0.86      0.86        30
weighted avg       0.87      0.87      0.87        30

