<a href="https://colab.research.google.com/github/injaamam/ADHD/blob/main/ADHD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Step 1: Setup environment & import libraries

### Details about the Dataset:

The dataset has total 21 column. The first 19 columns are channel names. The other 2 columns are Class and ID. (Class: ADHD/Control, ID: Patient ID)

Participants were 61 children with ADHD and 60 healthy controls (boys and girls, ages 7-12). EEG recording was performed based on 10-20 standard by 19 channels (Fz, Cz, Pz, C3, T3, C4, T4, Fp1, Fp2, F3, F4, F7, F8, P3, P4, T5, T6, O1, O2) at 128 Hz sampling frequency.

First row contains 21 column names.
ID: v10p starts from row 2,
ID: v12p starts from row 14305
ID: v14p starts from row 31910
ID: v15p starts from row 49472
and so on


link:
https://www.kaggle.com/datasets/danizo/eeg-dataset-for-adhd

In [5]:
# Install any missing packages (Colab usually has most)
!pip install pandas numpy scipy scikit-learn matplotlib --quiet

# Import
import pandas as pd
import numpy as np
from scipy.signal import welch
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import matplotlib.pyplot as plt
print("Done")


Done


#Step 2: Mount drive

In [14]:
# from google.colab import drive
# drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


#Step 3: Load dataset

In [7]:
df = pd.read_csv('/content/drive/MyDrive/ML/Datasets/adhdata.csv')
df.head()

Unnamed: 0,Fp1,Fp2,F3,F4,C3,C4,P3,P4,O1,O2,...,F8,T7,T8,P7,P8,Fz,Cz,Pz,Class,ID
0,261.0,402.0,16.0,261.0,126.0,384.0,126.0,236.0,52.0,236.0,...,16.0,200.0,494.0,126.0,236.0,121.0,367.0,121.0,ADHD,v10p
1,121.0,191.0,-94.0,85.0,16.0,200.0,126.0,52.0,347.0,273.0,...,-57.0,126.0,347.0,52.0,52.0,15.0,121.0,-19.0,ADHD,v10p
2,-55.0,85.0,-204.0,15.0,-57.0,200.0,52.0,126.0,236.0,200.0,...,-94.0,126.0,420.0,52.0,126.0,-55.0,261.0,85.0,ADHD,v10p
3,191.0,85.0,52.0,50.0,89.0,236.0,163.0,89.0,89.0,89.0,...,-57.0,236.0,420.0,126.0,126.0,15.0,85.0,-55.0,ADHD,v10p
4,-55.0,-125.0,-204.0,-160.0,-204.0,16.0,-241.0,-241.0,89.0,16.0,...,-131.0,89.0,310.0,-57.0,52.0,-55.0,15.0,-336.0,ADHD,v10p


#Step 4:

All unique patient id print:

In [12]:
# Get unique ID values
unique_ids = df["ID"].unique()

# Print them as an array
print("Unique IDs:", unique_ids.tolist())

# Total count
print("Total IDs:", len(unique_ids))

Unique IDs: ['v10p', 'v12p', 'v14p', 'v15p', 'v173', 'v18p', 'v19p', 'v1p', 'v20p', 'v21p', 'v22p', 'v24p', 'v25p', 'v27p', 'v28p', 'v29p', 'v30p', 'v31p', 'v32p', 'v33p', 'v34p', 'v35p', 'v36p', 'v37p', 'v38p', 'v39p', 'v3p', 'v40p', 'v6p', 'v8p', 'v177', 'v179', 'v181', 'v183', 'v190', 'v196', 'v198', 'v200', 'v204', 'v206', 'v209', 'v213', 'v215', 'v219', 'v227', 'v231', 'v234', 'v236', 'v238', 'v244', 'v246', 'v250', 'v254', 'v263', 'v265', 'v270', 'v274', 'v279', 'v284', 'v286', 'v288', 'v107', 'v108', 'v109', 'v110', 'v111', 'v112', 'v113', 'v114', 'v115', 'v116', 'v41p', 'v42p', 'v43p', 'v44p', 'v45p', 'v46p', 'v47p', 'v48p', 'v49p', 'v50p', 'v51p', 'v52p', 'v53p', 'v54p', 'v55p', 'v56p', 'v57p', 'v58p', 'v59p', 'v60p', 'v117', 'v118', 'v120', 'v121', 'v123', 'v125', 'v127', 'v129', 'v131', 'v133', 'v134', 'v138', 'v140', 'v143', 'v147', 'v149', 'v151', 'v297', 'v298', 'v299', 'v300', 'v302', 'v303', 'v304', 'v305', 'v306', 'v307', 'v308', 'v309', 'v310']
Total IDs: 121


#Step 5: Epoching & feature extraction

In [17]:
# ---------- Simple feature extraction: ONE value per channel (mean over 1s epoch) ----------
channels = ['Fp1','Fp2','F3','F4','C3','C4','P3','P4','O1','O2',
            'F7','F8','T7','T8','P7','P8','Fz','Cz','Pz']
fs = 128          # sampling rate in Hz
win = fs          # 128 samples = 1 second
n_epochs = len(df) // win

rows = []
labels = []
ids = []
for e in range(n_epochs):
    start = e * win
    seg = df.iloc[start:start+win]
    feat = {}
    for ch in channels:
        feat[ch] = seg[ch].astype(float).mean()   # single value per channel
    rows.append(feat)
    labels.append(seg['Class'].mode()[0])
    ids.append(seg['ID'].mode()[0])

X = pd.DataFrame(rows)         # columns are exactly the 19 channel names
y = pd.Series(labels, name='label')
X['ID'] = ids
X.head()


Unnamed: 0,Fp1,Fp2,F3,F4,C3,C4,P3,P4,O1,O2,F7,F8,T7,T8,P7,P8,Fz,Cz,Pz,ID
0,245.132812,284.46875,140.53125,187.265625,102.554688,186.125,112.054688,137.375,139.453125,156.960938,185.015625,164.0625,160.757812,184.992188,138.625,147.539062,158.257812,163.796875,104.15625,v10p
1,183.6875,169.90625,158.117188,124.578125,137.96875,131.09375,142.203125,131.664062,145.890625,134.679688,124.5,161.398438,146.179688,149.695312,127.789062,132.726562,150.945312,113.578125,133.875,v10p
2,209.5625,205.695312,194.34375,114.757812,232.460938,159.21875,233.625,168.125,207.367188,149.429688,136.25,160.710938,172.65625,149.6875,189.8125,172.078125,188.898438,153.398438,227.929688,v10p
3,77.351562,96.59375,126.789062,168.234375,105.84375,162.328125,82.0625,124.515625,81.414062,167.1875,188.8125,148.523438,126.710938,152.507812,127.296875,157.429688,88.65625,155.03125,68.046875,v10p
4,150.132812,154.484375,153.25,189.984375,162.140625,161.234375,172.398438,212.601562,170.851562,155.625,114.960938,142.5,167.171875,153.695312,155.945312,150.210938,130.960938,156.148438,162.804688,v10p


#Step 6: Train a Random Forest

In [19]:
pipe = make_pipeline(StandardScaler(), RandomForestClassifier(n_estimators=100, random_state=42))
pipe.fit(X_train, y_train)
y_pred = pipe.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion matrix:\n", confusion_matrix(y_test, y_pred))


NameError: name 'X_train' is not defined

# Step 7 (interactive prediction + full-feature display)

In [None]:
# ----- Step 8 (demo + manual input) -----
import random
from sklearn.model_selection import train_test_split

# Ensure model is trained on 90/10 split
X_full = X.copy()               # X has channel cols + 'ID'
X_features = X_full.drop(columns=['ID'])
X_train, X_test, y_train, y_test = train_test_split(
    X_features, y, test_size=0.1, random_state=42, stratify=y
)
pipe.fit(X_train, y_train)

# Pick a random test sample and show ALL 19 channel values
i = random.randint(0, len(X_test) - 1)
sample = X_test.iloc[i:i+1]
sample_index = sample.index[0]
patient_id = X_full.loc[sample_index, 'ID']

print("=== 19-channel feature vector (means for one epoch) ===")
display(sample.T)

# Predict the sample
pred = pipe.predict(sample)[0]
proba = pipe.predict_proba(sample)[0]
conf = max(proba) * 100

print("\n--- Prediction Result ---")
print(f"Predicted: {pred}")
print(f"Actual:    {y_test.iloc[i]}")
print(f"Confidence: {conf:.2f}%")
print("✅ Correct" if pred == y_test.iloc[i] else "❌ Incorrect")
print(f"Patient ID: {patient_id}")

# Manual input template (19 values) - prefilled from this sample
manual_values = sample.iloc[0].to_dict()

print("\n--- Manual input template (edit any of these 19 values) ---")
for k, v in manual_values.items():
    print(f"{k}: {v}")

# Example: to change a channel manually, uncomment and edit:
# manual_values['Fp1'] = 150.0

# Predict using manual 19-channel input
manual_df = pd.DataFrame([manual_values])
manual_pred = pipe.predict(manual_df)[0]
manual_proba = pipe.predict_proba(manual_df)[0]
manual_conf = max(manual_proba) * 100

print("\n--- Prediction for manual input ---")
print(f"Predicted: {manual_pred}")
print(f"Confidence: {manual_conf:.2f}%")
