In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold, cross_val_score
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import SGDClassifier
from sklearn.decomposition import PCA

In [2]:
df = pd.read_csv('/Users/vanhome/Downloads/EEG.machinelearing_data_BRMH.csv') # Load data

In [3]:
# Manually change these
disorder1 = 'Addictive disorder'
disorder2 = 'Trauma and stress related disorder'
filtered = df[df['main.disorder'].isin([disorder1, disorder2])]

bands = ['delta', 'theta', 'alpha', 'beta', 'highbeta', 'gamma']
datatypes = ['AB', 'COH', 'AB+COH']
classifiers = {
    'SVM': SVC(kernel='linear', class_weight='balanced'),
    'LogisticRegression': LogisticRegression(max_iter=1000, class_weight='balanced'),
    'RandomForest': RandomForestClassifier(random_state=42, class_weight='balanced'),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

In [4]:
# Cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

def band_match(col, band):
    parts = col.split('.')
    return band in parts

for band in bands:
    for dtype in datatypes:
        if dtype == 'AB+COH':
            cols = [col for col in filtered.columns if band_match(col, band) and ('AB' in col or 'COH' in col)]
        else:
            cols = [col for col in filtered.columns if band_match(col, band) and dtype in col]
        if not cols:
            print(f"⚠️ Skipping: {band} + {dtype} — no matching features.")
            continue
        
        X = filtered[cols].values
        y = filtered['main.disorder'].apply(lambda x: 1 if x == disorder2 else 0).values

        # Data cleaning
        X[np.isinf(X)] = np.nan
        X = X[:, ~np.isnan(X).all(axis=0)]  # remove all-NaN columns
        if X.shape[1] == 0:
            print(f"⚠️ Skipping: {band} + {dtype} — all columns were NaN.")
            continue
        
        X = SimpleImputer(strategy="mean").fit_transform(X)
        X = VarianceThreshold(threshold=0.0).fit_transform(X)
        X = StandardScaler().fit_transform(X)

        pca = PCA(n_components=0.95, svd_solver='full')
        X = pca.fit_transform(X)
        
        print(f"\n=== Band: {band}, Type: {dtype} ===")
        for name, clf in classifiers.items():
            scores = cross_val_score(clf, X, y, cv=cv, scoring='roc_auc')
            print(f"{name} AUC: {scores.mean():.3f} ± {scores.std():.3f}")


=== Band: delta, Type: AB ===
SVM AUC: 0.638 ± 0.061
LogisticRegression AUC: 0.624 ± 0.058
RandomForest AUC: 0.593 ± 0.057
KNN AUC: 0.624 ± 0.055

=== Band: delta, Type: COH ===
SVM AUC: 0.613 ± 0.098
LogisticRegression AUC: 0.621 ± 0.096
RandomForest AUC: 0.599 ± 0.093
KNN AUC: 0.594 ± 0.068

=== Band: delta, Type: AB+COH ===
SVM AUC: 0.655 ± 0.068
LogisticRegression AUC: 0.654 ± 0.052
RandomForest AUC: 0.604 ± 0.070
KNN AUC: 0.631 ± 0.067

=== Band: theta, Type: AB ===
SVM AUC: 0.618 ± 0.013
LogisticRegression AUC: 0.612 ± 0.009
RandomForest AUC: 0.576 ± 0.084
KNN AUC: 0.538 ± 0.067

=== Band: theta, Type: COH ===
SVM AUC: 0.613 ± 0.113
LogisticRegression AUC: 0.604 ± 0.101
RandomForest AUC: 0.630 ± 0.052
KNN AUC: 0.582 ± 0.036

=== Band: theta, Type: AB+COH ===
SVM AUC: 0.624 ± 0.097
LogisticRegression AUC: 0.642 ± 0.099
RandomForest AUC: 0.686 ± 0.069
KNN AUC: 0.640 ± 0.040

=== Band: alpha, Type: AB ===
SVM AUC: 0.592 ± 0.063
LogisticRegression AUC: 0.594 ± 0.069
RandomForest AUC