<h1 style="font-size:4rem">Detection of Parkinson's Disease Using Vocal Biomarkers and Machine Learning</h1>

# Data Acquisition

We acquired our dataset from the mPower public research portal. mPower is a clinical study of Parkinson's disease.

## Loading the Dataset


In [21]:
import pandas as pd
import numpy as np
# Load the dataset; a comma is already read_csv's default field separator.
df = pd.read_csv("Parkinsson disease.csv")

# Show (rows, columns) followed by a blank line, then preview the first rows.
print(df.shape, '\n')
df.head()

(195, 24) 



Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335


## Reordering and renaming the columns

In [24]:
# Move the target column to the last position: pop it out of the frame and
# assign it back, which appends it after all remaining columns.
targetCol = 'status'
targetColDf = df.pop(targetCol)
df[targetCol] = targetColDf

# Give the first four columns new names (matched by position).
new_column_names = ['Name', 'Avg_VFF', 'Max_VFF', 'Min_VFF']
leading_renames = {old: new for old, new in zip(df.columns[:4], new_column_names)}
df = df.rename(columns=leading_renames)

# Tidy the remaining names in one pass: ':' -> '_', '(' -> '_', ')' -> ''.
name_cleanup = str.maketrans({':': '_', '(': '_', ')': ''})
df.columns = [label.translate(name_cleanup) for label in df.columns]

df.head()


Unnamed: 0,Name,Avg_VFF,Max_VFF,Min_VFF,MDVP_Jitter_%,MDVP_Jitter_Abs,MDVP_RAP,MDVP_PPQ,Jitter_DDP,MDVP_Shimmer,...,Shimmer_DDA,NHR,HNR,RPDE,DFA,spread1,spread2,D2,PPE,status
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,7e-05,0.0037,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654,1
1,phon_R01_S01_2,122.4,148.65,113.819,0.00968,8e-05,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,0.458359,0.819521,-4.075192,0.33559,2.486855,0.368674,1
2,phon_R01_S01_3,116.682,131.111,111.555,0.0105,9e-05,0.00544,0.00781,0.01633,0.05233,...,0.0827,0.01309,20.651,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634,1
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,9e-05,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975,1
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.1047,0.01767,19.649,0.417356,0.823484,-3.747787,0.234513,2.33218,0.410335,1


# Data Preprocessing

Displaying the data quality report (summary statistics plus the cardinality of each column):

In [10]:
# Number of distinct values per column (integer counts, so no rounding needed).
cardinality = df.nunique()

# Summary statistics from describe(), rounded to 2 decimal places, with the
# cardinality appended as an extra row.
stats = df.describe().round(2)
stats.loc['cardinality'] = cardinality

print("Data Quality Report:\n", stats)


Data Quality Report:
              MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
count             195.00        195.00        195.00          195.00   
mean              154.23        197.10        116.32            0.01   
std                41.39         91.49         43.52            0.00   
min                88.33        102.14         65.48            0.00   
25%               117.57        134.86         84.29            0.00   
50%               148.79        175.83        104.32            0.00   
75%               182.77        224.21        140.02            0.01   
max               260.10        592.03        239.17            0.03   
cardinality       195.00        195.00        195.00          173.00   

             MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
count                   195.0    195.00    195.00      195.00        195.00   
mean                      0.0      0.00      0.00        0.01          0.03   
std                 

Handling outliers: any value above the upper IQR fence (Q3 + 1.5 × IQR) of its column is replaced with that column's median:

In [14]:
def replace_upper_outliers_with_median(frame, skip_columns=("status",), whisker=1.5):
    """Return a copy of ``frame`` with high outliers replaced by the column median.

    For every numeric column not listed in ``skip_columns``, a value counts as
    an outlier when it is greater than ``Q3 + whisker * IQR``. Each outlier is
    replaced by the median of that column, computed before any replacement.
    Only the upper fence is applied; low values are left unchanged.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input data; it is not modified.
    skip_columns : iterable of str
        Columns to leave untouched. Defaults to the target column, because a
        class label must never be overwritten by outlier handling.
    whisker : float
        Multiplier of the interquartile range used to place the upper fence.

    Returns
    -------
    pandas.DataFrame
        A new frame with the same index and columns as ``frame``.
    """
    cleaned = frame.copy()
    # Select columns by dtype rather than by position so that any non-numeric
    # column (such as the recording name) is skipped wherever it sits.
    numeric_columns = [
        column
        for column in cleaned.select_dtypes(include="number").columns
        if column not in skip_columns
    ]
    for column in numeric_columns:
        values = cleaned[column]
        q1 = values.quantile(0.25)
        q3 = values.quantile(0.75)
        upper_fence = q3 + whisker * (q3 - q1)
        # A boolean mask works on the real row labels, so this is correct for
        # any index, not only the default 0..n-1 one.
        cleaned[column] = values.mask(values > upper_fence, values.median())
    return cleaned


# NOTE: `df` is rebound to the cleaned frame, so re-running this cell computes
# the fences again on already-cleaned data.
df = replace_upper_outliers_with_median(df)

stats = df.describe().round(2)
print("Data Quality Report:\n", stats)

Data Quality Report:
        MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
count       195.00        195.00        195.00          195.00   
mean        154.23        177.32        104.66            0.01   
std          41.39         45.04         27.07            0.00   
min          88.33        102.14         65.48            0.00   
25%         117.57        134.86         84.29            0.00   
50%         148.79        175.83        104.32            0.00   
75%         182.77        211.78        112.47            0.01   
max         260.10        272.21        182.79            0.01   

       MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
count             195.0    195.00    195.00      195.00        195.00   
mean                0.0      0.00      0.00        0.01          0.02   
std                 0.0      0.00      0.00        0.00          0.01   
min                 0.0      0.00      0.00        0.00          0.01   
25%               