## Importing all required libraries

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder


## Load and Preprocess Data

In [None]:
def load_and_preprocess(directory, label=None):
    """Load every ``.csv`` file in *directory* into one preprocessed DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path of the folder to scan. Only its direct entries whose
            name ends with ``.csv`` are read.
        label: Optional class label. When it is not ``None`` it is written to
            a ``"Label"`` column on every row of every file.

    Returns:
        One DataFrame with a fresh integer index in which object-typed
        columns are integer-encoded and missing values in numeric columns are
        replaced by that column's median.

    Raises:
        ValueError: If *directory* contains no ``.csv`` files.
    """
    csv_files = sorted(
        name for name in os.listdir(directory) if name.endswith(".csv")
    )
    if not csv_files:
        # pd.concat([]) would otherwise fail with the opaque message
        # "No objects to concatenate"; say which directory is the problem.
        raise ValueError(f"No .csv files found in directory: {directory}")

    data_frames = []
    for file in csv_files:
        df = pd.read_csv(os.path.join(directory, file))
        if label is not None:
            df["Label"] = label  # Assign class labels
        data_frames.append(df)
    df = pd.concat(data_frames, ignore_index=True)

    # ✅ Handle categorical data
    # NOTE(review): a new encoder is fitted on every call, so the integer
    # codes of two separately loaded directories are not guaranteed to agree.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # ✅ Handle missing values
    # numeric_only=True keeps this from raising on any column that is still
    # non-numeric after the encoding step above.
    df = df.fillna(df.median(numeric_only=True))

    return df


In [None]:
# Folders holding the per-class CSV files. The paths are relative, so they are
# resolved against the notebook's current working directory.
healthy_dir = "dataset_grouped_47_healthy_cleaned"
mdd_dir = "dataset_grouped_70_mdd_cleaned"


In [None]:
# Read every CSV of each class folder and tag all of its rows with the class id.
# NOTE(review): the two calls are preprocessed independently (separate label
# encoders and separate median imputation per folder).
df_healthy = load_and_preprocess(healthy_dir, label=0)  # 0 for Healthy
df_mdd = load_and_preprocess(mdd_dir, label=1)  # 1 for MDD

In [None]:
# Unfitted scaler instance; it is fitted in a later cell.
scaler = StandardScaler()

In [None]:
# Standardize the features and split them into train / validation / test.
# NOTE(review): `X` and `y` are expected to exist already; no cell visible in
# this file defines them — TODO confirm where they are built.

# train_test_split is used below but is not imported by any earlier cell.
from sklearn.model_selection import train_test_split

# Collapse all trailing axes so every sample becomes one flat feature vector.
X_flattened = X.reshape(X.shape[0], -1)  # (samples, features)

# ================================
# 🔹 Split into Train, Validation & Test
# ================================
# The split happens BEFORE the scaler is fitted so that validation/test rows
# do not contribute to the mean/std used for standardization (data leakage).
X_train, X_temp, y_train, y_temp = train_test_split(
    X_flattened, y,
    test_size=0.2,  # Reserve 20% for validation & test
    random_state=42,
    stratify=y
)

X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.3,  # 30% of temp set → 6% of total data (validation keeps 14%)
    random_state=42,
    stratify=y_temp
)

# ✅ Apply StandardScaler: fit on the training rows only, then reuse the same
# statistics for every other subset.
X_train = scaler.fit_transform(X_train)
X_temp = scaler.transform(X_temp)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Keep the notebook-level names `X_scaled` / `X` available as the standardized
# 2D matrix for any later cell that reads them.
X_scaled = scaler.transform(X_flattened)
X = X_scaled  # Keeps it 2D for ML models

print(f"✅ Data Shape after Standardization: {X.shape}")  # Should be (samples, features)

In [None]:
import numpy as np
import xgboost as xgb
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Hyperparameter search for the XGBoost classifier, followed by a check of the
# best model on the validation split.

# ✅ Convert to NumPy (if not already) and ensure correct dtype.
# Features are float32; the targets are class ids, so they are kept as
# integers rather than being cast to float.
X_train_np = np.asarray(X_train, dtype=np.float32)
y_train_np = np.asarray(y_train, dtype=np.int32)
X_val_np = np.asarray(X_val, dtype=np.float32)
y_val_np = np.asarray(y_val, dtype=np.int32)

# ✅ XGBoost Model with GPU Optimization
# The values below are only starting points: every key that also appears in
# `param_grid` is overridden by GridSearchCV for each candidate.
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    tree_method="hist",  # ✅ Correct method for GPU in XGBoost 2.0+
    device="cuda",  # ✅ Explicitly set device to GPU
    eval_metric="logloss",
    learning_rate=0.005,
    max_depth=15,
    gamma=0.2,
    subsample=0.8,
    colsample_bytree=0.8,
    min_child_weight=2,
    reg_alpha=0.3,
    reg_lambda=1.5,
    n_estimators=2000,  # ✅ Lowered for efficiency
    verbosity=1
)

# ✅ Define Parameter Grid (For Fine-Tuning)
# Only n_estimators, max_depth and gamma vary → 2 * 2 * 2 = 8 candidates.
param_grid = {
    'n_estimators': [1500, 2000],
    'learning_rate': [0.005],
    'max_depth': [10, 15],
    'gamma': [0.1, 0.2],
    'subsample': [0.8],
    'colsample_bytree': [0.8],
    'reg_alpha': [0.3],
    'reg_lambda': [1.5],
    'min_child_weight': [2]
}

# ✅ Use StratifiedKFold for Cross-Validation
cv = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)

# ✅ GridSearchCV for Hyperparameter Tuning
grid_search = GridSearchCV(
    estimator=xgb_model,
    param_grid=param_grid,
    scoring="accuracy",
    cv=cv,  # ✅ Use StratifiedKFold
    verbose=3,
    n_jobs=1  # One fit at a time, so parallel workers do not compete for the GPU
)

# ✅ Train with GridSearchCV
grid_search.fit(X_train_np, y_train_np)

# ✅ Best Hyperparameters
print("🔥 Best Parameters:", grid_search.best_params_)

# ✅ Test on Validation Set
best_model = grid_search.best_estimator_
val_preds = best_model.predict(X_val_np)

# Score the predictions; previously they were computed but never compared
# with the validation labels.
val_accuracy = accuracy_score(y_val_np, val_preds)
print(f"🔥 Validation Accuracy: {val_accuracy:.4f}")


# Full Code

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
from xgboost import XGBClassifier
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import StandardScaler, LabelEncoder

# ✅ Function to Load and Preprocess Data
def load_and_preprocess(directory, label=None):
    """Load every ``.csv`` file in *directory* into one preprocessed DataFrame.

    Files are read in sorted filename order so the row order of the result is
    reproducible (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path of the folder to scan. Only its direct entries whose
            name ends with ``.csv`` are read.
        label: Optional class label. When it is not ``None`` it is written to
            a ``"Label"`` column on every row of every file.

    Returns:
        One DataFrame with a fresh integer index in which object-typed
        columns are integer-encoded and missing values in numeric columns are
        replaced by that column's median.

    Raises:
        ValueError: If *directory* contains no ``.csv`` files.
    """
    csv_files = sorted(
        name for name in os.listdir(directory) if name.endswith(".csv")
    )
    if not csv_files:
        # pd.concat([]) would otherwise fail with the opaque message
        # "No objects to concatenate"; say which directory is the problem.
        raise ValueError(f"No .csv files found in directory: {directory}")

    data_frames = []
    for file in csv_files:
        df = pd.read_csv(os.path.join(directory, file))
        if label is not None:
            df["Label"] = label  # Assign class labels
        data_frames.append(df)
    df = pd.concat(data_frames, ignore_index=True)

    # ✅ Handle categorical data
    # NOTE(review): a new encoder is fitted on every call, so the integer
    # codes of two separately loaded directories are not guaranteed to agree.
    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    # ✅ Handle missing values
    # numeric_only=True keeps this from raising on any column that is still
    # non-numeric after the encoding step above.
    df = df.fillna(df.median(numeric_only=True))

    return df

# Training pipeline: load both classes, standardize, cross-validate, fit on all
# rows, and persist the scaler and the model under "model_weights/".

# ✅ Load MDD and Healthy Data (Training)
# NOTE(review): absolute, machine-specific paths; adjust when running elsewhere.
healthy_dir = "/home/admincit/Desktop/Team_4/dataset_grouped_47/dataset_grouped_47_healthy_cleaned"
mdd_dir = "/home/admincit/Desktop/Team_4/dataset_grouped_47/dataset_grouped_70_mdd_cleaned"

df_healthy = load_and_preprocess(healthy_dir, label=0)  # 0 for Healthy
df_mdd = load_and_preprocess(mdd_dir, label=1)  # 1 for MDD

# ✅ Combine into a Single Training Dataset
df_train = pd.concat([df_healthy, df_mdd], ignore_index=True)

# ✅ Separate Features and Labels
X_train = df_train.drop(columns=["Label"])
y_train = df_train["Label"]

# ✅ Scale Data (Standardization)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# ✅ Save the Scaler for Later Use
# joblib.dump does not create missing parent folders, so make sure the output
# directory exists before the first artifact is written.
os.makedirs("model_weights", exist_ok=True)
joblib.dump(scaler, "model_weights/binary_xgb_scaler.pkl")

# ✅ Define Cross-Validation Strategy
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# 🔥 **Train XGBoost Model with Best Hyperparameters**
xgb_model = XGBClassifier(
    objective="binary:logistic",  # Binary classification
    tree_method="hist",  # Optimized method
    device="cuda",  # Use GPU
    eval_metric="logloss",  # Binary log loss
    learning_rate=0.005,
    max_depth=25,
    gamma=0.2,
    subsample=0.95,
    colsample_bytree=0.97,
    min_child_weight=1,
    reg_alpha=0.8,
    reg_lambda=3.0,
    n_estimators=6000,  # High number for better learning
    verbosity=1  # Output progress
)

# ✅ Perform Stratified K-Fold Cross-Validation
# cross_val_score fits clones of the estimator, so `xgb_model` itself is still
# unfitted after this call.
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=cv, scoring="accuracy")

# ✅ Train XGBoost Model on Full Data
xgb_model.fit(X_train_scaled, y_train)

# ✅ Save Trained Model
joblib.dump(xgb_model, "model_weights/binary_xgb_model.pkl")
print(f"🔥 XGBoost Model Trained! Cross-Validation Accuracy: {cv_scores.mean():.4f} ± {cv_scores.std():.4f}")

# Evaluation: reload the data, apply the already-fitted scaler and model, and
# print accuracy, ROC AUC and a per-class report.

# ✅ Load MDD and Healthy Data (Validation)
val_healthy_dir = "/home/admincit/Desktop/Team_4/dataset_grouped_47/dataset_grouped_47_healthy_cleaned"
val_mdd_dir = "/home/admincit/Desktop/Team_4/dataset_grouped_47/dataset_grouped_70_mdd_cleaned"

# These paths currently match the training directories, in which case the
# numbers printed below measure fit on data the model has already seen. Make
# that visible at runtime instead of silently reporting it as validation.
if (val_healthy_dir, val_mdd_dir) == (healthy_dir, mdd_dir):
    print("⚠️ Validation directories are identical to the training directories; "
          "the metrics below are in-sample, not held-out estimates.")

df_val_healthy = load_and_preprocess(val_healthy_dir, label=0)
df_val_mdd = load_and_preprocess(val_mdd_dir, label=1)

# ✅ Combine Validation Data
df_validation = pd.concat([df_val_healthy, df_val_mdd], ignore_index=True)

# ✅ Prepare Validation Data
X_val = df_validation.drop(columns=["Label"])
y_val = df_validation["Label"]

# ✅ Apply Same Scaling as Training (transform only; the scaler is not refitted)
X_val_scaled = scaler.transform(X_val)

# ✅ Make Predictions on Validation Data
val_preds = xgb_model.predict(X_val_scaled)
val_probs = xgb_model.predict_proba(X_val_scaled)[:, 1]  # Probabilities for ROC AUC

# ✅ Evaluate Performance
accuracy = accuracy_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, val_probs)
report = classification_report(y_val, val_preds)

print(f"🔥 Validation Accuracy: {accuracy:.4f}")
print(f"🔥 ROC AUC Score: {roc_auc:.4f}")
print("\n🔹 Classification Report:\n", report)
