In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('/Users/vanhome/Downloads/EEG.machinelearing_data_BRMH.csv') # Load data

In [3]:
# Manually change these
disorder1 = 'Addictive disorder'
disorder2 = 'Obsessive compulsive disorder'
filtered = df[df['main.disorder'].isin([disorder1, disorder2])]

bands = ['delta', 'theta', 'alpha', 'beta', 'highbeta', 'gamma']
datatypes = ['AB', 'COH', 'AB+COH']
classifiers = {
    'SVM': SVC(kernel='linear', class_weight='balanced'),
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

In [4]:
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for band in bands:
    for dtype in datatypes:
        if dtype == 'AB+COH':
            cols = [col for col in filtered.columns if band in col and ('AB' in col or 'COH' in col)]
        else:
            cols = [col for col in filtered.columns if band in col and dtype in col]
        if not cols:
            print(f"⚠️ Skipping: {band} + {dtype} — no matching features.")
            continue

        X = filtered[cols].values
        y = filtered['main.disorder'].apply(lambda x: 1 if x == disorder2 else 0).values

        # Data cleaning
        X[np.isinf(X)] = np.nan
        X = X[:, ~np.isnan(X).all(axis=0)]  # remove all-NaN columns
        if X.shape[1] == 0:
            print(f"⚠️ Skipping: {band} + {dtype} — all columns were NaN.")
            continue
        
        X = SimpleImputer(strategy="mean").fit_transform(X)
        X = VarianceThreshold(threshold=0.0).fit_transform(X)
        X = StandardScaler().fit_transform(X)

        pca = PCA(n_components=0.95, svd_solver='full')
        X = pca.fit_transform(X)
        
        print(f"\n=== Band: {band}, Type: {dtype} ===")
        for name, clf in classifiers.items():
            scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
            print(f"{name} AUC: {scores.mean():.3f} ± {scores.std():.3f}")


=== Band: delta, Type: AB ===
SVM AUC: 0.465 ± 0.060
LogisticRegression AUC: 0.434 ± 0.052
RandomForest AUC: 0.532 ± 0.036
KNN AUC: 0.554 ± 0.090

=== Band: delta, Type: COH ===
SVM AUC: 0.688 ± 0.102
LogisticRegression AUC: 0.649 ± 0.110
RandomForest AUC: 0.634 ± 0.052
KNN AUC: 0.651 ± 0.079

=== Band: delta, Type: AB+COH ===
SVM AUC: 0.623 ± 0.109
LogisticRegression AUC: 0.604 ± 0.119
RandomForest AUC: 0.636 ± 0.083
KNN AUC: 0.649 ± 0.072

=== Band: theta, Type: AB ===
SVM AUC: 0.609 ± 0.056
LogisticRegression AUC: 0.587 ± 0.057
RandomForest AUC: 0.633 ± 0.080
KNN AUC: 0.514 ± 0.092

=== Band: theta, Type: COH ===
SVM AUC: 0.689 ± 0.119
LogisticRegression AUC: 0.672 ± 0.117
RandomForest AUC: 0.635 ± 0.101
KNN AUC: 0.635 ± 0.103

=== Band: theta, Type: AB+COH ===
SVM AUC: 0.668 ± 0.110
LogisticRegression AUC: 0.681 ± 0.109
RandomForest AUC: 0.709 ± 0.046
KNN AUC: 0.622 ± 0.123

=== Band: alpha, Type: AB ===
SVM AUC: 0.609 ± 0.074
LogisticRegression AUC: 0.620 ± 0.080
RandomForest AUC