In [4]:
import os
import joblib
import numpy as np
import pandas as pd
import cupy as cp
import xgboost as xgb
import re
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, log_loss,
    balanced_accuracy_score, cohen_kappa_score
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# Load and Preprocess Data
def load_and_preprocess(directory, label=None):
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label  # Assign class labels
            # Extract subject ID from filename (e.g., sub-88000489)
            match = re.search(r"(sub-\d+)", file)
            df["subject_id"] = match.group(1) if match else "unknown"
            data_frames.append(df)

    df = pd.concat(data_frames, ignore_index=True)
    
    # Encode categorical columns except subject_id
    for col in df.select_dtypes(include=['object']).columns:
        if col != "subject_id":
            df[col] = LabelEncoder().fit_transform(df[col])
    
    # Handle missing values ONLY for numeric columns
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    return df


# Load Data
dataset_path_healthy = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/healthy"
dataset_path_mdd = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/mdd"

df_healthy = load_and_preprocess(dataset_path_healthy, label=0)
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)

df = pd.concat([df_healthy, df_mdd], axis=0).reset_index(drop=True)

# Subject-level Split
unique_subjects = df["subject_id"].unique()
train_ids, test_ids = train_test_split(unique_subjects, test_size=0.2, random_state=42, stratify=None)
val_ids, test_ids = train_test_split(test_ids, test_size=0.5, random_state=42)

train_df = df[df["subject_id"].isin(train_ids)]
val_df = df[df["subject_id"].isin(val_ids)]
test_df = df[df["subject_id"].isin(test_ids)]

# Drop subject ID
X_train = train_df.drop(columns=["Label", "subject_id"]).values
y_train = train_df["Label"].values

original_feature_names = train_df.drop(columns=["Label", "subject_id"]).columns.tolist()
joblib.dump(original_feature_names, "original_feature_names.pkl")

X_val = val_df.drop(columns=["Label", "subject_id"]).values
y_val = val_df["Label"].values

X_test = test_df.drop(columns=["Label", "subject_id"]).values
y_test = test_df["Label"].values

# Scale data
scaler = MinMaxScaler(feature_range=(0, 100))
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)
joblib.dump(scaler, "best_xgb_scaler.pkl")

# Define XGBoost Model
xgb_model = xgb.XGBClassifier(
    objective="binary:logistic",
    tree_method="hist",
    device="cuda",
    eval_metric="logloss",
    learning_rate=0.005,
    max_depth=25,
    gamma=0.2,
    subsample=0.95,
    colsample_bytree=0.97,
    min_child_weight=1,
    reg_alpha=0.8,
    reg_lambda=3.0,
    n_estimators=6000,
    verbosity=1
)

# Calibrated Model
calibrated_model = CalibratedClassifierCV(estimator=xgb_model, method='sigmoid', cv=3)
calibrated_model.fit(X_train_scaled, y_train)
joblib.dump(calibrated_model, "best_xgb_calibrated_model.pkl")

# Cross-Validation (on train set)
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=100)
cv_scores = cross_val_score(xgb_model, X_train_scaled, y_train, cv=cv, scoring="accuracy")

# Train original model for evaluation
xgb_model.fit(X_train_scaled, y_train)
joblib.dump(xgb_model, "best_xgb_model.pkl")

# ✅ Transfer input to GPU using CuPy
X_val_gpu = cp.array(X_val_scaled)
val_preds = xgb_model.predict(X_val_gpu)
val_probs = xgb_model.predict_proba(X_val_gpu)[:, 1]


accuracy = accuracy_score(y_val, val_preds)
roc_auc = roc_auc_score(y_val, val_probs)
logloss_val = log_loss(y_val, val_probs)
balanced_acc = balanced_accuracy_score(y_val, val_preds)
kappa = cohen_kappa_score(y_val, val_preds)
report = classification_report(y_val, val_preds)

# Print Results
print(f"\nValidation Accuracy: {accuracy:.4f}")
print(f"Balanced Accuracy: {balanced_acc:.4f}")
print(f"Cohen's Kappa: {kappa:.4f}")
print(f"Log Loss: {logloss_val:.4f}")
print(f"ROC AUC Score: {roc_auc:.4f}")
print("\nClassification Report:\n", report)
print(f"Cross-Validation Accuracy Scores: {cv_scores}")



Validation Accuracy: 0.9167
Balanced Accuracy: 0.9000
Cohen's Kappa: 0.8235
Log Loss: 0.4183
ROC AUC Score: 0.8714

Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.93        14
           1       1.00      0.80      0.89        10

    accuracy                           0.92        24
   macro avg       0.94      0.90      0.91        24
weighted avg       0.93      0.92      0.91        24

Cross-Validation Accuracy Scores: [0.74285714 0.82857143 0.76470588 0.79411765 0.76470588]


# Training with multiple random states for mdd 0 to 80

In [None]:
import os
import joblib
import numpy as np
import pandas as pd
import cupy as cp
import xgboost as xgb
import re
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, log_loss,
    balanced_accuracy_score, cohen_kappa_score
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# ------------------- Load and Preprocess -------------------
def load_and_preprocess(directory, label=None):
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label
            match = re.search(r"(sub-\d+)", file)
            df["subject_id"] = match.group(1) if match else "unknown"
            data_frames.append(df)

    df = pd.concat(data_frames, ignore_index=True)
    
    for col in df.select_dtypes(include=['object']).columns:
        if col != "subject_id":
            df[col] = LabelEncoder().fit_transform(df[col])
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    return df

# ------------------- Data Paths -------------------
dataset_path_healthy = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/healthy"
dataset_path_mdd = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/mdd"

df_healthy = load_and_preprocess(dataset_path_healthy, label=0)
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)

df = pd.concat([df_healthy, df_mdd], axis=0).reset_index(drop=True)

# ------------------- Random States to Evaluate -------------------
random_states = list(range(1, 11)) + [42, 100, 123, 2021, 7]
results = []
# ------------------- Loop Over Random States -------------------
for state in random_states:
    print(f"\n================ Random State: {state} ================\n")
    
    unique_subjects = df["subject_id"].unique()
    train_ids, test_ids = train_test_split(unique_subjects, test_size=0.2, random_state=state, stratify=None)
    val_ids, test_ids = train_test_split(test_ids, test_size=0.5, random_state=state)
    
    train_df = df[df["subject_id"].isin(train_ids)]
    val_df = df[df["subject_id"].isin(val_ids)]
    
    X_train = train_df.drop(columns=["Label", "subject_id"]).values
    y_train = train_df["Label"].values
    X_val = val_df.drop(columns=["Label", "subject_id"]).values
    y_val = val_df["Label"].values

    # Scaling
    scaler = MinMaxScaler(feature_range=(0, 100))
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Define XGBoost Model
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        device="cuda",
        eval_metric="logloss",
        learning_rate=0.005,
        max_depth=25,
        gamma=0.2,
        subsample=0.95,
        colsample_bytree=0.97,
        min_child_weight=1,
        reg_alpha=0.8,
        reg_lambda=3.0,
        n_estimators=6000,
        verbosity=0
    )

    # Calibrated model
    calibrated_model = CalibratedClassifierCV(estimator=xgb_model, method='sigmoid', cv=3)
    calibrated_model.fit(X_train_scaled, y_train)

    # Train original model (not calibrated) for validation evaluation
    xgb_model.fit(X_train_scaled, y_train)

    # Validation
    X_val_gpu = cp.array(X_val_scaled)
    val_preds = xgb_model.predict(X_val_gpu)
    val_probs = xgb_model.predict_proba(X_val_gpu)[:, 1]

    accuracy = accuracy_score(y_val, val_preds)
    roc_auc = roc_auc_score(y_val, val_probs)
    logloss_val = log_loss(y_val, val_probs)
    balanced_acc = balanced_accuracy_score(y_val, val_preds)
    kappa = cohen_kappa_score(y_val, val_preds)
    report = classification_report(y_val, val_preds)

        # Store results
    results.append({
        "random_state": state,
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "kappa": kappa,
        "log_loss": logloss_val,
        "roc_auc": roc_auc
    })

    # Print results
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(f"Log Loss: {logloss_val:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:\n", report)

best_result = max(results, key=lambda x: x["accuracy"])
print("\n================== BEST RANDOM STATE ==================\n")
print(f"Best Random State: {best_result['random_state']}")
print(f"Validation Accuracy: {best_result['accuracy']:.4f}")
print(f"Balanced Accuracy: {best_result['balanced_accuracy']:.4f}")
print(f"Cohen's Kappa: {best_result['kappa']:.4f}")
print(f"Log Loss: {best_result['log_loss']:.4f}")
print(f"ROC AUC Score: {best_result['roc_auc']:.4f}")
print("\n========================================================\n")



Validation Accuracy: 0.7200
Balanced Accuracy: 0.8158
Cohen's Kappa: 0.4514
Log Loss: 0.5685
ROC AUC Score: 0.9474

Classification Report:
               precision    recall  f1-score   support

           0       0.46      1.00      0.63         6
           1       1.00      0.63      0.77        19

    accuracy                           0.72        25
   macro avg       0.73      0.82      0.70        25
weighted avg       0.87      0.72      0.74        25



Validation Accuracy: 0.5500
Balanced Accuracy: 0.5500
Cohen's Kappa: 0.1000
Log Loss: 0.8575
ROC AUC Score: 0.4400

Classification Report:
               precision    recall  f1-score   support

           0       0.67      0.20      0.31        10
           1       0.53      0.90      0.67        10

    accuracy                           0.55        20
   macro avg       0.60      0.55      0.49        20
weighted avg       0.60      0.55      0.49        20



Validation Accuracy: 0.3182
Balanced Accuracy: 0.3182
Cohen'

In [4]:
import os
import joblib
import numpy as np
import pandas as pd
import cupy as cp
import xgboost as xgb
import re
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, log_loss,
    balanced_accuracy_score, cohen_kappa_score
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# ------------------- Load and Preprocess -------------------
def load_and_preprocess(directory, label=None):
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label
            match = re.search(r"(sub-\d+)", file)
            df["subject_id"] = match.group(1) if match else "unknown"
            data_frames.append(df)

    df = pd.concat(data_frames, ignore_index=True)
    
    for col in df.select_dtypes(include=['object']).columns:
        if col != "subject_id":
            df[col] = LabelEncoder().fit_transform(df[col])
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    return df

# ------------------- Data Paths -------------------
dataset_path_healthy = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/healthy"
dataset_path_mdd = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/mdd2"

df_healthy = load_and_preprocess(dataset_path_healthy, label=0)
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)

df = pd.concat([df_healthy, df_mdd], axis=0).reset_index(drop=True)

# ------------------- Random States to Evaluate -------------------
random_states = list(range(1, 11)) + [42, 100, 123, 2021, 7]
results = []
# ------------------- Loop Over Random States -------------------
for state in random_states:
    print(f"\n================ Random State: {state} ================\n")
    
    unique_subjects = df["subject_id"].unique()
    train_ids, test_ids = train_test_split(unique_subjects, test_size=0.2, random_state=state, stratify=None)
    val_ids, test_ids = train_test_split(test_ids, test_size=0.5, random_state=state)
    
    train_df = df[df["subject_id"].isin(train_ids)]
    val_df = df[df["subject_id"].isin(val_ids)]
    
    X_train = train_df.drop(columns=["Label", "subject_id"]).values
    y_train = train_df["Label"].values
    X_val = val_df.drop(columns=["Label", "subject_id"]).values
    y_val = val_df["Label"].values

    # Scaling
    scaler = MinMaxScaler(feature_range=(0, 100))
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Define XGBoost Model
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        device="cuda",
        eval_metric="logloss",
        learning_rate=0.005,
        max_depth=25,
        gamma=0.2,
        subsample=0.95,
        colsample_bytree=0.97,
        min_child_weight=1,
        reg_alpha=0.8,
        reg_lambda=3.0,
        n_estimators=6000,
        verbosity=0
    )

    # Calibrated model
    calibrated_model = CalibratedClassifierCV(estimator=xgb_model, method='sigmoid', cv=3)
    calibrated_model.fit(X_train_scaled, y_train)

    # Train original model (not calibrated) for validation evaluation
    xgb_model.fit(X_train_scaled, y_train)

    # Validation
    X_val_gpu = cp.array(X_val_scaled)
    val_preds = xgb_model.predict(X_val_gpu)
    val_probs = xgb_model.predict_proba(X_val_gpu)[:, 1]

    accuracy = accuracy_score(y_val, val_preds)
    roc_auc = roc_auc_score(y_val, val_probs)
    logloss_val = log_loss(y_val, val_probs)
    balanced_acc = balanced_accuracy_score(y_val, val_preds)
    kappa = cohen_kappa_score(y_val, val_preds)
    report = classification_report(y_val, val_preds)

        # Store results
    results.append({
        "random_state": state,
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "kappa": kappa,
        "log_loss": logloss_val,
        "roc_auc": roc_auc
    })

    # Print results
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(f"Log Loss: {logloss_val:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:\n", report)

best_result = max(results, key=lambda x: x["accuracy"])
print("\n================== BEST RANDOM STATE ==================\n")
print(f"Best Random State: {best_result['random_state']}")
print(f"Validation Accuracy: {best_result['accuracy']:.4f}")
print(f"Balanced Accuracy: {best_result['balanced_accuracy']:.4f}")
print(f"Cohen's Kappa: {best_result['kappa']:.4f}")
print(f"Log Loss: {best_result['log_loss']:.4f}")
print(f"ROC AUC Score: {best_result['roc_auc']:.4f}")
print("\n========================================================\n")



Validation Accuracy: 0.6429
Balanced Accuracy: 0.5938
Cohen's Kappa: 0.2045
Log Loss: 0.6155
ROC AUC Score: 0.8438

Classification Report:
               precision    recall  f1-score   support

           0       0.75      0.25      0.38        12
           1       0.62      0.94      0.75        16

    accuracy                           0.64        28
   macro avg       0.69      0.59      0.56        28
weighted avg       0.68      0.64      0.59        28



Validation Accuracy: 0.5417
Balanced Accuracy: 0.4643
Cohen's Kappa: -0.0820
Log Loss: 0.7923
ROC AUC Score: 0.5500

Classification Report:
               precision    recall  f1-score   support

           0       0.00      0.00      0.00        10
           1       0.57      0.93      0.70        14

    accuracy                           0.54        24
   macro avg       0.28      0.46      0.35        24
weighted avg       0.33      0.54      0.41        24



Validation Accuracy: 0.4800
Balanced Accuracy: 0.4872
Cohen

In [3]:
import os
import joblib
import numpy as np
import pandas as pd
import cupy as cp
import xgboost as xgb
import re
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, roc_auc_score, log_loss,
    balanced_accuracy_score, cohen_kappa_score
)
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.calibration import CalibratedClassifierCV

# ------------------- Load and Preprocess -------------------
def load_and_preprocess(directory, label=None):
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label
            match = re.search(r"(sub-\d+)", file)
            df["subject_id"] = match.group(1) if match else "unknown"
            data_frames.append(df)

    df = pd.concat(data_frames, ignore_index=True)
    
    for col in df.select_dtypes(include=['object']).columns:
        if col != "subject_id":
            df[col] = LabelEncoder().fit_transform(df[col])
    
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

    return df

# ------------------- Data Paths -------------------
dataset_path_healthy = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/healthy"
dataset_path_mdd = "/mnt/data/saikrishna/Team_4/preprocessed_data_new/mdd3"

df_healthy = load_and_preprocess(dataset_path_healthy, label=0)
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)

df = pd.concat([df_healthy, df_mdd], axis=0).reset_index(drop=True)

# ------------------- Random States to Evaluate -------------------
random_states = list(range(1, 11)) + [42, 100, 123, 2021, 7]
results = []
# ------------------- Loop Over Random States -------------------
for state in random_states:
    print(f"\n================ Random State: {state} ================\n")
    
    unique_subjects = df["subject_id"].unique()
    train_ids, test_ids = train_test_split(unique_subjects, test_size=0.2, random_state=state, stratify=None)
    val_ids, test_ids = train_test_split(test_ids, test_size=0.5, random_state=state)
    
    train_df = df[df["subject_id"].isin(train_ids)]
    val_df = df[df["subject_id"].isin(val_ids)]
    
    X_train = train_df.drop(columns=["Label", "subject_id"]).values
    y_train = train_df["Label"].values
    X_val = val_df.drop(columns=["Label", "subject_id"]).values
    y_val = val_df["Label"].values

    # Scaling
    scaler = MinMaxScaler(feature_range=(0, 100))
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)

    # Define XGBoost Model
    xgb_model = xgb.XGBClassifier(
        objective="binary:logistic",
        tree_method="hist",
        device="cuda",
        eval_metric="logloss",
        learning_rate=0.005,
        max_depth=25,
        gamma=0.2,
        subsample=0.95,
        colsample_bytree=0.97,
        min_child_weight=1,
        reg_alpha=0.8,
        reg_lambda=3.0,
        n_estimators=6000,
        verbosity=0
    )

    # Calibrated model
    calibrated_model = CalibratedClassifierCV(estimator=xgb_model, method='sigmoid', cv=3)
    calibrated_model.fit(X_train_scaled, y_train)

    # Train original model (not calibrated) for validation evaluation
    xgb_model.fit(X_train_scaled, y_train)

    # Validation
    X_val_gpu = cp.array(X_val_scaled)
    val_preds = xgb_model.predict(X_val_gpu)
    val_probs = xgb_model.predict_proba(X_val_gpu)[:, 1]

    accuracy = accuracy_score(y_val, val_preds)
    roc_auc = roc_auc_score(y_val, val_probs)
    logloss_val = log_loss(y_val, val_probs)
    balanced_acc = balanced_accuracy_score(y_val, val_preds)
    kappa = cohen_kappa_score(y_val, val_preds)
    report = classification_report(y_val, val_preds)

        # Store results
    results.append({
        "random_state": state,
        "accuracy": accuracy,
        "balanced_accuracy": balanced_acc,
        "kappa": kappa,
        "log_loss": logloss_val,
        "roc_auc": roc_auc
    })

    # Print results
    print(f"Validation Accuracy: {accuracy:.4f}")
    print(f"Balanced Accuracy: {balanced_acc:.4f}")
    print(f"Cohen's Kappa: {kappa:.4f}")
    print(f"Log Loss: {logloss_val:.4f}")
    print(f"ROC AUC Score: {roc_auc:.4f}")
    print("\nClassification Report:\n", report)

best_result = max(results, key=lambda x: x["accuracy"])
print("\n================== BEST RANDOM STATE ==================\n")
print(f"Best Random State: {best_result['random_state']}")
print(f"Validation Accuracy: {best_result['accuracy']:.4f}")
print(f"Balanced Accuracy: {best_result['balanced_accuracy']:.4f}")
print(f"Cohen's Kappa: {best_result['kappa']:.4f}")
print(f"Log Loss: {best_result['log_loss']:.4f}")
print(f"ROC AUC Score: {best_result['roc_auc']:.4f}")
print("\n========================================================\n")



Validation Accuracy: 0.6667
Balanced Accuracy: 0.5341
Cohen's Kappa: 0.0741
Log Loss: 0.6958
ROC AUC Score: 0.5568

Classification Report:
               precision    recall  f1-score   support

           0       0.33      0.25      0.29         8
           1       0.75      0.82      0.78        22

    accuracy                           0.67        30
   macro avg       0.54      0.53      0.53        30
weighted avg       0.64      0.67      0.65        30



Validation Accuracy: 0.6207
Balanced Accuracy: 0.6130
Cohen's Kappa: 0.2276
Log Loss: 0.7786
ROC AUC Score: 0.6250

Classification Report:
               precision    recall  f1-score   support

           0       0.58      0.54      0.56        13
           1       0.65      0.69      0.67        16

    accuracy                           0.62        29
   macro avg       0.62      0.61      0.61        29
weighted avg       0.62      0.62      0.62        29



Validation Accuracy: 0.6667
Balanced Accuracy: 0.4964
Cohen'