In [76]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier

In [77]:
parkinsons_data = pd.read_csv('../datasets/parkinsons_train_udprs.csv')

In [78]:
parkinsons_data.head()

Unnamed: 0,subject#,age,sex,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,...,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE,motor_UPDRS,total_UPDRS
0,35,71,0,111.4,0.00477,2.8e-05,0.00231,0.00268,0.00694,0.042,...,0.02585,0.03523,0.06608,0.012196,20.642,0.42525,0.70398,0.20987,36.42,54.613
1,16,65,0,145.42,0.00746,6.5e-05,0.00416,0.00384,0.01247,0.0326,...,0.02139,0.02702,0.05281,0.056589,19.099,0.63142,0.70052,0.28203,9.2401,21.72
2,34,59,0,24.54,0.00914,9.1e-05,0.00479,0.00444,0.01438,0.03524,...,0.01826,0.03347,0.04969,0.049628,16.267,0.69867,0.69256,0.35247,26.843,32.921
3,7,72,0,51.54,0.0074,7.9e-05,0.00289,0.00403,0.00866,0.03861,...,0.02448,0.0325,0.05766,0.02814,18.561,0.62262,0.65295,0.25549,16.663,24.108
4,26,49,0,52.967,0.00499,5e-05,0.00245,0.00283,0.00734,0.04487,...,0.02934,0.0312,0.07128,0.017165,20.485,0.64292,0.71314,0.21553,26.177,32.943


In [79]:
parkinsons_data.shape

(4112, 22)

In [80]:
parkinsons_data.isnull().sum()

subject#         0
age              0
sex              0
test_time        0
Jitter(%)        0
Jitter(Abs)      0
Jitter:RAP       0
Jitter:PPQ5      0
Jitter:DDP       0
Shimmer          0
Shimmer(dB)      0
Shimmer:APQ3     0
Shimmer:APQ5     0
Shimmer:APQ11    0
Shimmer:DDA      0
NHR              0
HNR              0
RPDE             0
DFA              0
PPE              0
motor_UPDRS      0
total_UPDRS      0
dtype: int64

In [81]:
target_variable = parkinsons_data[['motor_UPDRS','total_UPDRS']]
scaler = StandardScaler()
normalized_target = scaler.fit_transform(target_variable)
num_clusters = 3
n_init=10
kmeans = KMeans(n_clusters=num_clusters,n_init=n_init, random_state=0)
kmeans.fit(normalized_target)
cluster_labels = kmeans.labels_
data['target_label'] = cluster_labels

In [82]:
x = parkinsons_data.drop(['subject#','motor_UPDRS','total_UPDRS'], axis=1)
y = data['target_label']

In [83]:
y

0       2
1       1
2       0
3       0
4       0
       ..
4107    1
4108    2
4109    1
4110    1
4111    2
Name: target_label, Length: 4112, dtype: int32

In [84]:
xTrain, xTest, yTrain, yTest = train_test_split(x, y, test_size=0.2)

In [85]:
xTrain

Unnamed: 0,age,sex,test_time,Jitter(%),Jitter(Abs),Jitter:RAP,Jitter:PPQ5,Jitter:DDP,Shimmer,Shimmer(dB),Shimmer:APQ3,Shimmer:APQ5,Shimmer:APQ11,Shimmer:DDA,NHR,HNR,RPDE,DFA,PPE
1661,65,0,26.473,0.00384,0.000025,0.00188,0.00219,0.00565,0.02497,0.209,0.01374,0.01448,0.01935,0.04123,0.018668,24.222,0.53995,0.68519,0.18035
3881,71,0,104.500,0.00480,0.000030,0.00224,0.00267,0.00673,0.03206,0.278,0.01617,0.01959,0.02698,0.04850,0.013778,20.115,0.51737,0.71389,0.23910
1775,49,0,24.844,0.00829,0.000079,0.00415,0.00456,0.01246,0.07609,0.672,0.04148,0.04830,0.05361,0.12443,0.046915,16.882,0.63665,0.71589,0.27668
3504,55,0,95.506,0.01219,0.000087,0.00610,0.00600,0.01830,0.04382,0.420,0.02255,0.02645,0.03340,0.06765,0.034874,16.884,0.57039,0.73834,0.37631
798,85,1,138.470,0.00653,0.000046,0.00344,0.00325,0.01033,0.02459,0.208,0.01284,0.01470,0.01918,0.03853,0.025830,19.083,0.62792,0.66858,0.21312
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3407,73,0,137.840,0.00753,0.000066,0.00402,0.00457,0.01205,0.08503,0.713,0.04603,0.04800,0.05958,0.13809,0.064536,18.625,0.64239,0.62077,0.30083
2133,74,1,20.363,0.00587,0.000026,0.00314,0.00333,0.00941,0.03236,0.308,0.01313,0.01684,0.03409,0.03939,0.018469,18.954,0.65559,0.68755,0.18702
92,61,0,100.760,0.00626,0.000051,0.00328,0.00260,0.00984,0.01771,0.158,0.00893,0.01007,0.01509,0.02679,0.020865,22.581,0.50420,0.58777,0.18769
3831,57,1,168.380,0.00407,0.000023,0.00200,0.00216,0.00599,0.02920,0.260,0.01555,0.01874,0.02425,0.04664,0.019498,21.822,0.53500,0.72125,0.20168


In [None]:
param_grid = {
    'linear': {'C': [0.01, 0.1, 1, 10, 100]},
    'rbf': {'C': [0.01, 0.1, 1, 10, 100], 'gamma': [0.01, 0.1, 1, 10, 100]},
    'poly': {'C': [0.01, 0.1, 1, 10, 100], 'degree': [2, 3, 4, 5]},
}

kernels = {'linear': 'linear', 'rbf': 'rbf', 'poly': 'poly'}

for name, kernel in kernels.items():
    clf = GridSearchCV(SVC(kernel=kernel), param_grid[name], cv=5, n_jobs=-1)
    clf.fit(xTrain, yTrain)
    print(f"{name}: best hyperparameters: {clf.best_params_}, best score: {clf.best_score_}")

    # Evaluate the performance of the SVM model on the testing set
    yPred = clf.predict(xTest)
    accuracy = accuracy_score(yTest, yPred)
    print(f"{name}: accuracy on testing set: {accuracy:.3f}")

In [75]:
rf_model = RandomForestClassifier(random_state=42)
svm_model = SVC(kernel="rbf",random_state=42, probability=True)
xgb_model = XGBClassifier()

kfold = KFold(n_splits=5, shuffle=True, random_state=42)
svm_scores = cross_val_score(svm_model, x, y, cv=kfold)
print(f"SVM CV Scores: {svm_scores}")
print(f"SVM CV Mean Score: {svm_scores.mean()}")
print("--"*40)

rf_scores = cross_val_score(rf_model, x, y, cv=kfold)
print(f"Random Forest CV Scores: {rf_scores}")
print(f"Random Forest CV Mean Score: {rf_scores.mean()}")
print("--"*40)

xgb_scores = cross_val_score(xgb_model, x, y, cv=kfold)
print(f"XGBoost CV Scores: {xgb_scores}")
print(f"XGBoost CV Mean Score: {xgb_scores.mean()}")
print("--"*40)

SVM CV Scores: [0.44349939 0.42527339 0.45863747 0.44403893 0.43309002]
SVM CV Mean Score: 0.4409078411721404
--------------------------------------------------------------------------------
Random Forest CV Scores: [0.87970838 0.86998785 0.9026764  0.88686131 0.85644769]
Random Forest CV Mean Score: 0.879136326950537
--------------------------------------------------------------------------------
XGBoost CV Scores: [0.98298906 0.9671932  0.986618   0.97201946 0.98296837]
XGBoost CV Mean Score: 0.978357619888072
--------------------------------------------------------------------------------


In [None]:
svm_model.fit(xTrain, yTrain)
preds_test = svm_model.predict(xTest)
preds_train = svm_model.predict(xTrain)

print(f"Accuracy on train data by SVM Classifier\
: {accuracy_score(yTrain, svm_model.predict(xTrain))*100}")
cf_matrix_train = confusion_matrix(yTrain, preds_train)
plt.figure(figsize=(6,4))
sns.heatmap(cf_matrix_train, annot=True, cmap='Blues')
plt.title("Confusion Matrix on train data for SVM Classifier")
plt.show()

print(f"Accuracy on test data by SVM Classifier\
: {accuracy_score(yTest, preds_test)*100}")
cf_matrix = confusion_matrix(yTest, preds_test)
plt.figure(figsize=(6,4))
sns.heatmap(cf_matrix, annot=True, cmap='Blues')
plt.title("Confusion Matrix on test data for SVM Classifier")
plt.show()