In [1]:
import pandas as pd
# Location of the EEG dataset CSV. The path can be overridden without editing
# the notebook by setting the EEG_CSV_PATH environment variable; when it is
# not set, the original local path is used.
import os

EEG_CSV_PATH = os.environ.get(
    'EEG_CSV_PATH',
    '/Users/vanhome/Downloads/EEG.machinelearing_data_BRMH.csv',
)

# Load the full table (metadata columns plus EEG feature columns).
dataparse = pd.read_csv(EEG_CSV_PATH)

In [2]:
# Map each distinct 'main.disorder' value to a NumPy array holding the rows of
# that group. Only the 'main.disorder' column itself is removed, so the arrays
# still contain every other column of the table.
# Built as a comprehension so no temporary (previously named `X`) is left
# behind in the notebook namespace, where it would shadow the feature matrix
# defined in a later cell.
disorder_groups = {
    disorder: (
        dataparse[dataparse['main.disorder'] == disorder]
        .drop(columns=['main.disorder'])
        .values
    )
    for disorder in dataparse['main.disorder'].unique()
}

# (printed label, dictionary key) pairs. The printed labels intentionally
# differ in capitalisation from some keys, so both are listed explicitly to
# keep the output text unchanged.
shape_report = [
    ("Addictive Disorder", "Addictive disorder"),
    ("Trauma and stress related disorder", "Trauma and stress related disorder"),
    ("Mood Disorder", "Mood disorder"),
    ("Healthy control", "Healthy control"),
    ("Obsessive compulsive disorder", "Obsessive compulsive disorder"),
    ("Schizophrenia", "Schizophrenia"),
    ("Anxiety Disorder", "Anxiety disorder"),
]
for display_name, disorder_key in shape_report:
    print(f"{display_name} shape:", disorder_groups[disorder_key].shape)

Addictive Disorder shape: (186, 1148)
Trauma and stress related disorder shape: (128, 1148)
Mood Disorder shape: (266, 1148)
Healthy control shape: (95, 1148)
Obsessive compulsive disorder shape: (46, 1148)
Schizophrenia shape: (117, 1148)
Anxiety Disorder shape: (107, 1148)


In [3]:
# Show the first 20 column names to see where the metadata columns end and
# the EEG feature columns begin.
leading_columns = dataparse.columns[0:20]
print(leading_columns)

Index(['no.', 'sex', 'age', 'eeg.date', 'education', 'IQ', 'main.disorder',
       'specific.disorder', 'AB.A.delta.a.FP1', 'AB.A.delta.b.FP2',
       'AB.A.delta.c.F7', 'AB.A.delta.d.F3', 'AB.A.delta.e.Fz',
       'AB.A.delta.f.F4', 'AB.A.delta.g.F8', 'AB.A.delta.h.T3',
       'AB.A.delta.i.C3', 'AB.A.delta.j.Cz', 'AB.A.delta.k.C4',
       'AB.A.delta.l.T4'],
      dtype='object')


In [4]:
# Columns that describe the subject/recording or carry the diagnosis rather
# than EEG measurements; everything else is treated as a feature.
metadata_columns = [
    'no.', 'sex', 'age', 'eeg.date', 'education', 'IQ',
    'main.disorder', 'specific.disorder',
]

# Feature table: the original frame minus the metadata columns.
eeg_features = dataparse.drop(columns=metadata_columns)

# Classification target: the main disorder category of each row.
labels = dataparse['main.disorder']

print("EEG shape (X):", eeg_features.shape)
print("Labels shape (y):", labels.shape)

EEG shape (X): (945, 1141)
Labels shape (y): (945,)


In [5]:
from sklearn.preprocessing import LabelEncoder

# Learn the mapping from disorder names to integer codes, then apply it to
# the label column. `le.classes_` keeps the names in code order.
le = LabelEncoder()
le.fit(labels)
y = le.transform(labels)

# Feature matrix as a plain NumPy array for scikit-learn.
X = eeg_features.to_numpy()

print("Encoded labels:", y[:10])
print("Class names:", le.classes_)

Encoded labels: [0 0 0 0 0 0 0 0 0 0]
Class names: ['Addictive disorder' 'Anxiety disorder' 'Healthy control' 'Mood disorder'
 'Obsessive compulsive disorder' 'Schizophrenia'
 'Trauma and stress related disorder']


In [6]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Baseline: random forest on the raw EEG feature matrix.
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Seed the forest as well: without random_state the bootstrap samples and
# feature subsets differ on every run, so the reported accuracy would not be
# reproducible even though the split is fixed.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy:", accuracy_score(y_test, y_pred))

Accuracy: 0.30158730158730157


In [7]:
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import VarianceThreshold

# Clean the raw feature matrix and standardize it.
# NOTE(review): every transformer below is fit on all rows of X, i.e. before
# any train/test split - confirm this is acceptable for the later evaluation.

# Work on a copy so the original matrix X is left untouched.
X_safe = X.copy()

# Treat infinite entries as missing values.
inf_mask = np.isinf(X_safe)
X_safe[inf_mask] = np.nan

# Keep only the columns that contain at least one non-missing value
# (equivalently: discard columns that are NaN in every row).
non_all_nan_cols = (~np.isnan(X_safe)).any(axis=0)
X_safe = X_safe[:, non_all_nan_cols]

# The three preprocessing steps, applied in order below.
imputer = SimpleImputer(strategy="mean")     # fill remaining NaNs with the column mean
selector = VarianceThreshold(threshold=0.0)  # remove constant (zero-variance) columns
scaler = StandardScaler()                    # standardize each remaining column

X_imputed = imputer.fit_transform(X_safe)
X_nonconstant = selector.fit_transform(X_imputed)
X_scaled = scaler.fit_transform(X_nonconstant)

print("Final cleaned X shape:", X_scaled.shape)

Final cleaned X shape: (945, 1140)


In [8]:
from sklearn.decomposition import PCA

# Fraction of the total variance the retained principal components should
# explain; passed to PCA as a float so the component count is chosen for us.
variance_to_keep = 0.95

pca = PCA(svd_solver='full', n_components=variance_to_keep)
X_pca = pca.fit_transform(X_scaled)

# Report the reduced matrix size and how many components were selected.
print("PCA-reduced shape:", X_pca.shape)
print("Number of components kept:", pca.n_components_)

PCA-reduced shape: (945, 95)
Number of components kept: 95


In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Random forest on the PCA-reduced features, using the same split settings
# (20% test, seed 42) as the baseline on the raw features.
# NOTE(review): X_pca comes from imputation, scaling and PCA that were fit on
# all rows, including the rows placed in the test split here - confirm this is
# acceptable, or refit those steps on the training rows only.
X_train, X_test, y_train, y_test = train_test_split(
    X_pca, y, test_size=0.2, random_state=42
)

# Seed the forest: without random_state the bootstrap samples and feature
# subsets differ on every run, so the reported accuracy would not be
# reproducible even though the split is fixed.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

# Evaluate on the held-out rows.
y_pred = clf.predict(X_test)
print("Accuracy with PCA:", accuracy_score(y_test, y_pred))

Accuracy with PCA: 0.328042328042328
