# Imports

In [4]:
import os
import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from scipy.stats import mode
import cupy as cp

# Load Preprocess And Scalling the data

In [5]:
# Load dataset paths
dataset_path_healthy = "../../split_fif/healthy_csv"
dataset_path_mdd = "../../split_fif/mdd_csv"
dataset_path_other = "../../split_fif/other_csv"

def load_and_preprocess(directory, label=None):
    """ Load and preprocess dataset """
    data_frames = []
    for file in os.listdir(directory):
        if file.endswith(".csv"):
            df = pd.read_csv(os.path.join(directory, file))
            if label is not None:
                df["Label"] = label
            data_frames.append(df)
    df = pd.concat(data_frames, ignore_index=True)

    for col in df.select_dtypes(include=['object']).columns:
        df[col] = LabelEncoder().fit_transform(df[col])

    df.fillna(df.median(), inplace=True)
    return df

# Load and merge datasets
df_healthy = load_and_preprocess(dataset_path_healthy, label=0)
df_mdd = load_and_preprocess(dataset_path_mdd, label=1)
df_other = load_and_preprocess(dataset_path_other, label=2)

df = pd.concat([df_healthy, df_mdd, df_other], axis=0).reset_index(drop=True)

# Split features and labels
X = df.drop(columns=['Label']).values
y = df['Label'].values

# Scale features
scaler = MinMaxScaler(feature_range=(0, 100))
X_scaled = scaler.fit_transform(X)

# Train test and validation split

In [6]:
# Split data
X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y, test_size=0.2, random_state=42, stratify=y)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.3, random_state=42, stratify=y_temp)

X_train_gpu = cp.array(X_train)
y_train_gpu = cp.array(y_train)


joblib.dump(scaler, "multi_class_best_scaler.pkl")

# ---- Train Models with Optimized Hyperparameters ---- #

from sklearn.impute import SimpleImputer

# Ensure missing values are handled
imputer = SimpleImputer(strategy="median")  # Replace NaN with median value
X_train = imputer.fit_transform(X_train)
X_val = imputer.transform(X_val)
X_test = imputer.transform(X_test)

# Save imputer for future predictions
joblib.dump(imputer, "multi_class_imputer.pkl")

# Reapply MinMax Scaling (in case imputer changed values)
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(X_test)

# Models

In [7]:
# Optimized XGBoost Model
xgb_model = xgb.XGBClassifier(
    objective="multi:softmax",
    tree_method="hist",
    device="cuda",
    eval_metric="mlogloss",
    learning_rate=0.01,  # Prevent overfitting
    max_depth=15,  # Balanced depth
    gamma=0.2,
    subsample=0.9,  # Avoid overfitting
    colsample_bytree=0.8,
    min_child_weight=3,  # Prevent overfitting
    reg_alpha=0.5,
    reg_lambda=2.0,
    n_estimators=3000,  # Reduce to prevent overfitting
    verbosity=1,
    num_class=3
)
xgb_model.fit(X_train, y_train)
joblib.dump(xgb_model, "xgb_model.pkl")

# Optimized Random Forest Model
rf_model = RandomForestClassifier(
    n_estimators=200,  # Sufficient trees for stability
    max_depth=20,  # Avoid overfitting
    min_samples_split=5,  # Avoid too many splits
    min_samples_leaf=2,  # Prevent deep trees
    max_features="sqrt",  # Balance feature selection
    random_state=42
)
rf_model.fit(X_train, y_train)
joblib.dump(rf_model, "rf_model.pkl")

# Optimized Support Vector Machine (SVM)
svm_model = SVC(
    C=1.5,  # Regularization to prevent overfitting
    kernel="rbf",  # Best for non-linear problems
    gamma="scale",  # Auto-tuned gamma
    probability=True,  # Needed for predict_proba()
    random_state=42
)
svm_model.fit(X_train, y_train)
joblib.dump(svm_model, "svm_model.pkl")

# Optimized Logistic Regression
log_reg = LogisticRegression(
    C=1.0,  # Standard regularization
    solver="lbfgs",  # Handles multi-class well
    max_iter=500,  # Ensure convergence
    random_state=42
)
log_reg.fit(X_train, y_train)
joblib.dump(log_reg, "log_reg.pkl")

# Load models for ensemble
models = {
    "XGBoost": xgb_model,
    "RandomForest": rf_model,
    "SVM": svm_model,
    "LogisticRegression": log_reg
}

# Evaluate the model

In [None]:
# Evaluate models
model_accuracies = {}
for name, model in models.items():
    preds = model.predict(X_val)
    accuracy = accuracy_score(y_val, preds)
    model_accuracies[name] = accuracy
    print(f"{name} Accuracy: {accuracy:.4f}")

# Select best model
best_model_name = max(model_accuracies, key=model_accuracies.get)
best_model = models[best_model_name]
print(f"\nBest Model: {best_model_name} with Accuracy: {model_accuracies[best_model_name]:.4f}")

# Ensemble Learning - Soft Voting
val_preds_prob = np.zeros((X_val.shape[0], len(models)))

for idx, (name, model) in enumerate(models.items()):
    val_preds_prob[:, idx] = np.argmax(model.predict_proba(X_val), axis=1)

ensemble_preds = mode(val_preds_prob, axis=1)[0].flatten()
ensemble_accuracy = accuracy_score(y_val, ensemble_preds)
print(f"\nEnsemble Model Accuracy: {ensemble_accuracy:.4f}")

XGBoost Accuracy: 0.8108
RandomForest Accuracy: 0.8649
SVM Accuracy: 0.7838
LogisticRegression Accuracy: 0.8649

Best Model: RandomForest with Accuracy: 0.8649

Ensemble Model Accuracy: 0.8649


In [9]:
# Function to Predict Using Ensemble
def predict_class(X_predict_scaled):
    model_probs = {}

    for name, model in models.items():
        try:
            prob = model.predict_proba(X_predict_scaled)[0]
            model_probs[name] = prob
        except AttributeError:
            print(f"Model {name} does not support probability prediction.")

    # Averaging Probabilities (Soft Voting)
    avg_probs = np.mean(list(model_probs.values()), axis=0)

    # Get final prediction
    final_class = np.argmax(avg_probs)

    # Print Probabilities
    class_probabilities = {f"Class {i}": avg_probs[i] * 100 for i in range(len(avg_probs))}

    print(f"\nFinal Prediction: Class {final_class}")
    for cls, prob in class_probabilities.items():
        print(f"{cls}: {prob:.2f}%")

    return final_class

In [35]:
# Example Usage:
X_predict_scaled = scaler.transform(X_test[:-1])  # Test with one sample
predict_class(X_predict_scaled)



Final Prediction: Class 1
Class 0: 12.92%
Class 1: 83.99%
Class 2: 3.09%


np.int64(1)