In [None]:
import os, cv2, numpy as np, h5py, pickle
from glob import glob
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
from skimage.feature import graycomatrix, graycoprops
import warnings; warnings.filterwarnings('ignore')
# Size handed to cv2.resize in preprocess_image and to cv2.warpAffine in augment.
TARGET_SIZE = (128, 128)
# HDF5 file the trained model and scaler are written to and later reloaded from.
MODEL_H5_PATH = 'parkinson_model.h5'
# Dataset root; load_data globs <DATA_DIR>/{train,test}/{parkinson,healthy}/*.*
DATA_DIR = 'Parkinson-s-Disease-Classifier/dataset'

In [None]:
def preprocess_image(path):
    """Read an image as grayscale, equalise its contrast with CLAHE, and resize it.

    Args:
        path: Location of the image file on disk.

    Returns:
        The contrast-equalised image resized to TARGET_SIZE, or None when
        cv2.imread yields nothing for ``path``.
    """
    gray = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    if gray is None:
        return None
    # Equalise at the source resolution first, then resize the result.
    equalizer = cv2.createCLAHE(clipLimit=2.0, tileGridSize=(8, 8))
    equalized = equalizer.apply(gray)
    return cv2.resize(equalized, TARGET_SIZE)

def extract_features(img):
    """Build a fixed-length, hand-crafted feature vector for one grayscale image.

    The vector always has 40 entries, in this order:
      * 10 intensity statistics (mean, std, median, min, max, 10/25/75/90th
        percentiles, variance),
      * 16 normalised histogram bins over the range [0, 256),
      * 6 GLCM texture properties (distance 1, angle 0), or six zeros when the
        GLCM computation fails,
      * 4 Canny edge statistics,
      * 4 Sobel gradient statistics.

    Args:
        img: 2-D grayscale image. Assumed to be uint8, as returned by
            preprocess_image -- TODO confirm for other callers.

    Returns:
        list: the 40 feature values.
    """
    # First-order intensity statistics.
    f = [np.mean(img), np.std(img), np.median(img), np.min(img), np.max(img),
         np.percentile(img, 10), np.percentile(img, 25), np.percentile(img, 75),
         np.percentile(img, 90), np.var(img)]

    # Intensity histogram, normalised to sum to ~1 (epsilon guards an empty image).
    hist, _ = np.histogram(img.ravel(), bins=16, range=(0, 256))
    f.extend((hist / (hist.sum() + 1e-7)).tolist())

    # GLCM texture descriptors. They are collected in a temporary list so that a
    # failure part-way through cannot leave a partially filled block behind and
    # change the length of the feature vector.
    glcm_props = ['contrast', 'dissimilarity', 'homogeneity', 'energy', 'correlation', 'ASM']
    try:
        glcm = graycomatrix(img, [1], [0], levels=256, symmetric=True, normed=True)
        texture = [graycoprops(glcm, p)[0, 0] for p in glcm_props]
    except Exception:
        # Best-effort fallback kept from the original design, but no longer a bare
        # ``except`` so KeyboardInterrupt/SystemExit still propagate.
        texture = [0] * len(glcm_props)
    f.extend(texture)

    # Edge statistics from a Canny edge map.
    edges = cv2.Canny(img, 50, 150)
    f.extend([np.mean(edges), np.std(edges), np.sum(edges > 0) / edges.size, np.max(edges)])

    # Horizontal / vertical Sobel gradient statistics.
    sx = cv2.Sobel(img, cv2.CV_64F, 1, 0, ksize=3)
    sy = cv2.Sobel(img, cv2.CV_64F, 0, 1, ksize=3)
    f.extend([np.mean(np.abs(sx)), np.mean(np.abs(sy)), np.std(sx), np.std(sy)])
    return f

In [None]:
def load_data():
    """Load and preprocess every image under DATA_DIR's train/ and test/ folders.

    Both splits are pooled. Within each split the ``parkinson`` folder
    (label 1) is read before the ``healthy`` folder (label 0). Files are
    visited in sorted order because glob makes no ordering guarantee, and a
    seeded split downstream is only reproducible if the input order is.

    Returns:
        tuple: ``(imgs, lbls)`` -- a list of preprocessed images and the
        parallel list of integer labels. Files that preprocess_image cannot
        read are left out and counted in a printed notice.
    """
    imgs, lbls = [], []
    skipped = 0
    for split in ['train', 'test']:
        for folder, label in (('parkinson', 1), ('healthy', 0)):
            for path in sorted(glob(f'{DATA_DIR}/{split}/{folder}/*.*')):
                img = preprocess_image(path)
                if img is None:
                    skipped += 1
                    continue
                imgs.append(img)
                lbls.append(label)
    print(f'Loaded: {sum(lbls)} PD, {len(lbls)-sum(lbls)} Healthy')
    if skipped:
        # Make dropped files visible instead of discarding them silently.
        print(f'Skipped {skipped} unreadable file(s)')
    return imgs, lbls

def augment(imgs, lbls):
    """Return the inputs plus three augmented copies of every image.

    Each image contributes a horizontal flip and rotations by +5 and -5
    degrees about its own centre. The originals come first in the output,
    followed by the augmented copies grouped per source image.

    Args:
        imgs: Sequence of 2-D images (NumPy arrays).
        lbls: Labels parallel to ``imgs``.

    Returns:
        tuple: ``(aug_i, aug_l)`` lists, four times the input length.
    """
    aug_i, aug_l = list(imgs), list(lbls)
    for img, lbl in zip(imgs, lbls):
        # Take the geometry from the image itself rather than hard-coding (64, 64),
        # so the rotation stays centred if TARGET_SIZE is ever changed.
        h, w = img.shape[:2]
        aug_i.append(cv2.flip(img, 1)); aug_l.append(lbl)
        for angle in (5, -5):
            M = cv2.getRotationMatrix2D((w / 2, h / 2), angle, 1.0)
            aug_i.append(cv2.warpAffine(img, M, (w, h))); aug_l.append(lbl)
    print(f'Augmented: {len(imgs)} -> {len(aug_i)}')
    return aug_i, aug_l

In [None]:
# Load the raw images and hold out the test set BEFORE augmenting.
# Augmenting first and splitting afterwards puts flipped/rotated copies of the
# same drawing on both sides of the split, so the test score would be measured
# on near-duplicates of training samples.
imgs, lbls = load_data()
idx_tr, idx_te = train_test_split(
    np.arange(len(lbls)), test_size=0.2, stratify=lbls, random_state=42)

# Only the training portion is augmented; the hold-out stays untouched originals.
imgs_tr, lbls_tr = augment([imgs[k] for k in idx_tr], [lbls[k] for k in idx_tr])
imgs_te, lbls_te = [imgs[k] for k in idx_te], [lbls[k] for k in idx_te]

print('Extracting features...')
X_tr = np.array([extract_features(i) for i in imgs_tr])
X_te = np.array([extract_features(i) for i in imgs_te])
y_tr, y_te = np.array(lbls_tr), np.array(lbls_te)
# X / y remain available as the full feature matrix and label vector.
X = np.vstack([X_tr, X_te])
y = np.concatenate([y_tr, y_te])
print(f'Features: {X.shape}')

# Fit the scaler on training features only, then apply it to the hold-out.
scaler = StandardScaler()
X_tr_s = scaler.fit_transform(X_tr)
X_te_s = scaler.transform(X_te)
print(f'Train: {len(X_tr)} | Test: {len(X_te)}')

In [None]:
print('Training Ensemble...')

# Three tree-based learners, all seeded with the same random_state.
rf = RandomForestClassifier(
    n_estimators=300,
    max_depth=20,
    min_samples_split=3,
    random_state=42,
    n_jobs=-1,
)
xgb = XGBClassifier(
    n_estimators=300,
    max_depth=8,
    learning_rate=0.1,
    subsample=0.8,
    random_state=42,
    n_jobs=-1,
)
gb = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=6,
    learning_rate=0.1,
    random_state=42,
)

# Soft voting over the three members, fitted on the scaled training features.
model = VotingClassifier(
    estimators=[('rf', rf), ('xgb', xgb), ('gb', gb)],
    voting='soft',
    n_jobs=-1,
)
model.fit(X_tr_s, y_tr)
print('Done!')

In [None]:
# Score the fitted ensemble on the scaled hold-out features.
y_pred = model.predict(X_te_s)
acc = accuracy_score(y_te, y_pred)
print(f'Accuracy: {acc:.2%}')

# Per-class precision / recall / F1; label 0 is reported as Healthy, 1 as Parkinson.
print(classification_report(
    y_te,
    y_pred,
    target_names=['Healthy','Parkinson'],
))

In [None]:
# Persist the run to a single HDF5 file:
#   attrs         -> model name and hold-out accuracy
#   scaler/       -> the fitted scaler's mean_ and scale_ as plain datasets
#   pickle_model/ -> the pickled {'model', 'scaler'} dict stored as a uint8 byte array
with h5py.File(MODEL_H5_PATH, 'w') as h5f:
    h5f.attrs['model_name'] = 'Enhanced_Ensemble'
    h5f.attrs['accuracy'] = float(acc)

    sg = h5f.create_group('scaler')
    sg.create_dataset('mean', data=scaler.mean_)
    sg.create_dataset('scale', data=scaler.scale_)

    pg = h5f.create_group('pickle_model')
    pg.create_dataset(
        'data',
        data=np.frombuffer(
            pickle.dumps({'model':model,'scaler':scaler}),
            dtype=np.uint8,
        ),
    )
print(f'Model saved to {MODEL_H5_PATH}')

In [None]:
def load_h5(path):
    """Unpickle the object stored in the 'pickle_model/data' dataset of an HDF5 file.

    WARNING: this calls pickle.loads, which can execute arbitrary code while
    deserialising. Only open files that come from a trusted source.

    Args:
        path: Path to the HDF5 file.

    Returns:
        Whatever object was pickled into the dataset.
    """
    with h5py.File(path, 'r') as h5f:
        raw = h5f['pickle_model']['data'][:]
    return pickle.loads(raw.tobytes())

# Round-trip check: reload the bundle and score it on the hold-out set.
data = load_h5(MODEL_H5_PATH)
m, s = data['model'], data['scaler']
# Scale the raw hold-out features with the RELOADED scaler so the check covers
# both persisted objects, not just the model.
print(f'Loaded accuracy: {accuracy_score(y_te, m.predict(s.transform(X_te))):.2%}')

In [None]:
def predict(path, model, scaler):
    """Classify a single image file.

    Args:
        path: Image file to classify.
        model: Fitted classifier exposing ``predict`` and ``predict_proba``.
        scaler: Fitted scaler exposing ``transform``.

    Returns:
        dict with ``'pred'`` ('Parkinson' when the predicted label is 1,
        otherwise 'Healthy') and ``'conf'`` (the predicted probability of that
        label), or None when the image cannot be read.
    """
    img = preprocess_image(path)
    if img is None:
        return None
    feats = np.array(extract_features(img)).reshape(1, -1)
    scaled = scaler.transform(feats)  # scale once and reuse for both calls
    p = model.predict(scaled)[0]
    prob = model.predict_proba(scaled)[0]

    # predict_proba columns follow model.classes_, so look the column up rather
    # than assuming the label value doubles as its index.
    classes = getattr(model, 'classes_', None)
    if classes is not None:
        col = int(np.flatnonzero(np.asarray(classes) == p)[0])
    else:
        col = p
    return {'pred': 'Parkinson' if p == 1 else 'Healthy', 'conf': prob[col]}

print('Sample predictions:')
# Show the first two files (by sorted name) from each test class folder; sorting
# keeps the displayed samples stable because glob's order is not guaranteed.
for folder in [f'{DATA_DIR}/test/parkinson', f'{DATA_DIR}/test/healthy']:
    for path in sorted(glob(f'{folder}/*.*'))[:2]:
        result = predict(path, m, s)
        if result:
            print(f'  {os.path.basename(path)[:25]} -> {result["pred"]} ({result["conf"]:.1%})')

## Summary
- Accuracy: 92.68% (figure from a previous run; the evaluation cell prints the value for the current run)
- Model: Ensemble (RF+XGB+GB)
- Output: parkinson_model.h5