### Test Model

In [2]:
"""
Created on Tuesday 3 April 2022
Author: ZMW
"""
#import libraries

import pandas as pd
from sklearn.preprocessing import LabelEncoder
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings("ignore")
from sklearn import svm
from sklearn import metrics
from sklearn.metrics import roc_curve,roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import learning_curve
from sklearn.pipeline import make_pipeline
from numpy import mean
from numpy import std
from xgboost import plot_importance
from hyperopt import STATUS_OK, Trials, fmin, hp, tpe
import shap

In [3]:
#Load the fitted dataset
data = pd.read_csv('data.csv')

In [4]:
data.head()

Unnamed: 0,Label,Patient,A1_1Deg,A1_1Deg_err,B1_1Deg,B1_1Deg_err,C1_1Deg,C1_1Deg_err,1_Deg_ResSoS_1,1Deg_AdjR2_1,...,8_Deg_ResSoS_1,8Deg_AdjR2_1,A2_8Deg,A2_8Deg_err,B2_8Deg,B2_8Deg_err,C2_8Deg,C2_8Deg_err,8_Deg_ResSoS_2,8Deg_AdjR2_2
0,Not_Parkinson's,PD001,10.38819,0.006132,0.049329,6e-06,0.011989,0.000337,8928.709194,0.990598,...,6698.838281,0.951413,10.273124,0.044206,0.386347,0.000384,0.037356,0.002439,6809.4558,0.939095
1,Not_Parkinson's,PD002,10.429055,0.006929,0.049478,7e-06,0.002837,0.000379,11388.604704,0.988092,...,4769.901822,0.96736,11.4825,0.049154,0.395412,0.00038,-0.002486,0.002419,8383.48184,0.939586
2,Not_Parkinson's,PD003,10.097088,0.005091,0.049368,6e-06,0.005707,0.000287,6140.292312,0.993141,...,3488.747176,0.968709,10.084338,0.019838,0.394595,0.000174,-0.035466,0.001106,1349.96224,0.987259
3,Not_Parkinson's,PD004,10.482881,0.022179,0.049585,2.3e-05,0.016056,0.001212,117748.97971,0.879372,...,10972.045844,0.920137,11.493414,0.048867,0.401386,0.000377,-0.042463,0.002395,8242.136178,0.94037
4,Not_Parkinson's,PD005,10.181273,0.008833,0.049368,1e-05,0.001569,0.000494,18447.340517,0.979692,...,18268.960666,0.809586,10.370348,0.036121,0.395392,0.000308,-0.030769,0.00196,4485.300264,0.960035


In [5]:
# Features and labels
x = data.iloc[:,2:].values
y = data.iloc[:,0].values

In [6]:
## Convert the categorical variables to number
LabelEncoder_gender = LabelEncoder()
y = LabelEncoder_gender.fit_transform(y)

In [7]:
# spliting feature
x_train, x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=0)

### Hyperparameter tuning

In [8]:
# parameter list
params = {'C': hp.uniform('C', 0.1, 100), 'gamma': hp.uniform('gamma', 0, 0.2)}

In [9]:
# Hyperparameter Tuning function
def hyperparameter_tuning(params):
    clf = svm.SVC(**params, kernel='rbf', random_state=1, probability=True, class_weight='balanced',
                        cache_size=2000, max_iter=10000)
    clf.fit(x_train, y_train)
    pred = clf.predict(x_test)
    accuracy = accuracy_score(y_test, pred>0.5)
#   print ("SCORE:", accuracy)
    return {'loss': -accuracy, 'status': STATUS_OK }


In [10]:
# Optimizing
trials = Trials()

best_hyperparams = fmin(fn = hyperparameter_tuning,
                        space = params,
                        algo = tpe.suggest,
                        max_evals = 50,
                        trials = trials)

100%|██████████| 50/50 [00:00<00:00, 145.89trial/s, best loss: -0.6666666666666666]


In [11]:
# Model with tuned parameters
best_model = svm.SVC(C=best_hyperparams['C'], gamma=best_hyperparams['gamma'], kernel='rbf', random_state=1,
                         probability=True, class_weight='balanced', cache_size=2000, max_iter=10000)

In [12]:
def run_best_model(arg):
    # data set preparation
    # data = pd.read_csv('data.csv')
    data = arg

    ## Convert the categorical variables to number
    LabelEncoder_gender = LabelEncoder()
    data['Label'] = LabelEncoder_gender.fit_transform(data['Label'])
    
    # run 50 repetitions
    splits = 50
    all_set = []
    for split in range(1,splits+1):
        
        seed = split

        # data set preparation
        train , rest = train_test_split(data,train_size = 0.8,shuffle=True)
        validate , test = train_test_split(rest, train_size = 0.5, shuffle=True)

        # training set
        data_tra_x = train.iloc[:,2:].values
        data_tra_y = train.iloc[:,0].values


        # validation set
        data_val_x = validate.iloc[:,2:].values
        data_val_y = validate.iloc[:,0].values


        # test set
        data_tes_x = test.iloc[:,2:].values
        data_tes_y =  test.iloc[:,0].values

        best_model = svm.SVC(C=best_hyperparams['C'], gamma=best_hyperparams['gamma'], kernel='rbf', random_state=1,
                         probability=True, class_weight='balanced', cache_size=2000, max_iter=10000)
    
        best_model.fit(data_tra_x,data_tra_y)

        # training error
        tra_pred = best_model.predict_proba(data_tra_x)
        try:
            tra_results = [split,'tra',roc_auc_score(data_tra_y,tra_pred[:,1])]
        except ValueError:
            pass

        # validation error
        val_pred = best_model.predict_proba(data_val_x)
        try:
            val_results = [split,'val',roc_auc_score(data_val_y,val_pred[:,1])]
        except ValueError:
            pass

        # testing error
        tes_pred = best_model.predict_proba(data_tes_x)
        try:
            tes_results = [' ','tes',roc_auc_score(data_tes_y,tes_pred[:,1])]
        except ValueError:
            pass

        all_set += tra_results,val_results,tes_results

    all_set_df = pd.DataFrame(all_set,columns=['split','set','acc'])

    print('train accuracy',all_set_df[all_set_df['set'] == 'tra']['acc'].mean())
    print('validation accuracy',all_set_df[all_set_df['set'] == 'val']['acc'].mean())
    print('test accuracy',all_set_df[all_set_df['set'] == 'tes']['acc'].mean())

#run_best_model(data)

In [17]:
run_best_model(data)

train accuracy 0.34
validation accuracy 0.455
test accuracy 0.435


### Model with custom features

In [22]:
data = data[['Label','Patient','C2_6Deg','B2_4Deg','B2_6Deg','C2_4Deg',
            'C1_2Deg','C1_8Deg','A1_4Deg_err','C2_1Deg','B1_8Deg','B2_8Deg']]


def run_best_model(arg):
    # data set preparation
    # data = pd.read_csv('data.csv')
    data = arg

    ## Convert the categorical variables to number
    LabelEncoder_gender = LabelEncoder()
    data['Label'] = LabelEncoder_gender.fit_transform(data['Label'])
    
    # run 50 repetitions
    splits = 50
    all_set = []
    for split in range(1,splits+1):
        
        seed = split

        # data set preparation
        train , rest = train_test_split(data,train_size = 0.8,shuffle=True)
        validate , test = train_test_split(rest, train_size = 0.5, shuffle=True)

        # training set
        data_tra_x = train.iloc[:,2:].values
        data_tra_y = train.iloc[:,0].values


        # validation set
        data_val_x = validate.iloc[:,2:].values
        data_val_y = validate.iloc[:,0].values


        # test set
        data_tes_x = test.iloc[:,2:].values
        data_tes_y =  test.iloc[:,0].values

        best_model = svm.SVC(C=best_hyperparams['C'], gamma=best_hyperparams['gamma'], kernel='rbf', random_state=1,
                         probability=True, class_weight='balanced', cache_size=2000, max_iter=10000)
    
        best_model.fit(data_tra_x,data_tra_y)

        # training error
        tra_pred = best_model.predict_proba(data_tra_x)
        try:
            tra_results = [split,'tra',roc_auc_score(data_tra_y,tra_pred[:,1])]
        except ValueError:
            pass

        # validation error
        val_pred = best_model.predict_proba(data_val_x)
        try:
            val_results = [split,'val',roc_auc_score(data_val_y,val_pred[:,1])]
        except ValueError:
            pass

        # testing error
        tes_pred = best_model.predict_proba(data_tes_x)
        try:
            tes_results = [' ','tes',roc_auc_score(data_tes_y,tes_pred[:,1])]
        except ValueError:
            pass

        all_set += tra_results,val_results,tes_results

    all_set_df = pd.DataFrame(all_set,columns=['split','set','acc'])

    print('train accuracy',all_set_df[all_set_df['set'] == 'tra']['acc'].mean())
    print('validation accuracy',all_set_df[all_set_df['set'] == 'val']['acc'].mean())
    print('test accuracy',all_set_df[all_set_df['set'] == 'tes']['acc'].mean())

#run_best_model(data)

In [25]:
run_best_model(data)

train accuracy 0.19539271839271838
validation accuracy 0.25
test accuracy 0.18
