In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [2]:
heart_data = pd.read_csv('Datasets/heart.csv')
heart_data

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
298,57,0,0,140,241,0,1,123,1,0.2,1,0,3,0
299,45,1,3,110,264,0,1,132,0,1.2,1,0,3,0
300,68,1,0,144,193,1,1,141,0,3.4,1,2,3,0
301,57,1,0,130,131,0,1,115,1,1.2,1,1,3,0


In [3]:
heart_data.isnull().sum()

age         0
sex         0
cp          0
trestbps    0
chol        0
fbs         0
restecg     0
thalach     0
exang       0
oldpeak     0
slope       0
ca          0
thal        0
target      0
dtype: int64

In [4]:
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

In [9]:
numerical_columns = ['age','trestbps','chol','thalach','oldpeak']
z_scores = heart_data[numerical_columns].apply(zscore)

In [11]:
threshold = 3
outliers = (z_scores.abs() > threshold).any(axis=1)

In [13]:
data_without_outliers = heart_data[~outliers]

In [15]:
scaler = StandardScaler()
scaled_features = scaler.fit_transform(data_without_outliers[numerical_columns])

In [17]:
scaled_data = pd.DataFrame(scaled_features, columns = [col + '_scaled' for col in numerical_columns])
categorical_columns = [col for col in heart_data.columns if col not in numerical_columns]
final_data = pd.concat([data_without_outliers[categorical_columns].reset_index(drop=True), scaled_data], axis =1)

In [19]:
X= final_data.drop(columns='target', axis =1)
Y=final_data['target']

In [21]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [23]:
model = LogisticRegression()
model.fit(X_train, Y_train)

In [25]:
feature_importance = pd.DataFrame({
    'Feature': X_train.columns,
    'Coefficient': model.coef_[0]
}).sort_values(by='Coefficient', ascending=False)
print(feature_importance)

            Feature  Coefficient
1                cp     0.690385
5             slope     0.600146
3           restecg     0.436675
11   thalach_scaled     0.407591
8        age_scaled    -0.057269
9   trestbps_scaled    -0.162498
10      chol_scaled    -0.223754
2               fbs    -0.320205
12   oldpeak_scaled    -0.518883
6                ca    -0.760129
4             exang    -0.767884
7              thal    -0.905907
0               sex    -1.261647


In [27]:
model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
model.fit(X_train, Y_train)

# Evaluate
print("Training Accuracy:", model.score(X_train, Y_train))
print("Testing Accuracy:", model.score(X_test, Y_test))

Training Accuracy: 0.8468085106382979
Testing Accuracy: 0.8813559322033898


In [29]:
from sklearn import svm
classifier = svm.SVC(kernel = 'linear')

In [31]:
classifier.fit(X_train, Y_train)

In [33]:
print("Training Accuracy:", classifier.score(X_train, Y_train))
print("Testing Accuracy:", classifier.score(X_test, Y_test))

Training Accuracy: 0.8468085106382979
Testing Accuracy: 0.864406779661017


In [35]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [69]:
param_grid = {
    'C': [1,10,100,1000],
    'gamma': [0.0001,0.001,0.01,0.1],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2,3]
}

In [71]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=9, scoring='accuracy')
grid_search.fit(X_train_scaled, Y_train)

In [72]:
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Train Accuracy:", best_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", best_model.score(X_test_scaled, Y_test))

Best Parameters: {'C': 100, 'degree': 2, 'gamma': 0.001, 'kernel': 'rbf'}
Train Accuracy: 0.8425531914893617
Test Accuracy: 0.8813559322033898


In [83]:
##Combining SVM with other models using Stacking

In [91]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('svm', SVC(kernel='rbf', C=100, gamma=0.001))
]

model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())
model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", model.score(X_train_scaled, Y_train))
print("Test Accuracy:", model.score(X_test_scaled, Y_test))

Train Accuracy: 0.9063829787234042
Test Accuracy: 0.8983050847457628


In [77]:

rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
rf_grid_search.fit(X_train_scaled, Y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 300}


In [93]:
svm_param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf']
}
svm_grid_search = GridSearchCV(
    SVC(),
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
svm_grid_search.fit(X_train_scaled, Y_train)
print("Best SVM Parameters:", svm_grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best SVM Parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [95]:
optimized_rf = rf_grid_search.best_estimator_
optimized_svm = svm_grid_search.best_estimator_

base_estimators = [
    ('rf', optimized_rf),
    ('svm', optimized_svm)
]

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression()
)

stacking_model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", stacking_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", stacking_model.score(X_test_scaled, Y_test))

Train Accuracy: 0.8680851063829788
Test Accuracy: 0.8813559322033898


In [97]:
input_data = (62,0,0,140,268,0,0,160,0,3.6,0,2,2)
np_array=np.asarray(input_data)

input_data_reshaped= np_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if(prediction[0]==0):
    print("The person does not have a Heart Disease")
else:
    print("The Person has Heart Disease")


[1]
The Person has Heart Disease


In [99]:
for col in X.columns:
    print(col)

sex
cp
fbs
restecg
exang
slope
ca
thal
age_scaled
trestbps_scaled
chol_scaled
thalach_scaled
oldpeak_scaled


In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [2]:
diabetes_dataset = pd.read_csv("Datasets/diabetes.csv")
diabetes_dataset

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101,76,48,180,32.9,0.171,63,0
764,2,122,70,27,0,36.8,0.340,27,0
765,5,121,72,23,112,26.2,0.245,30,0
766,1,126,60,0,0,30.1,0.349,47,1


In [3]:
diabetes_dataset.shape

(768, 9)

In [4]:
diabetes_dataset.describe()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
count,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0,768.0
mean,3.845052,120.894531,69.105469,20.536458,79.799479,31.992578,0.471876,33.240885,0.348958
std,3.369578,31.972618,19.355807,15.952218,115.244002,7.88416,0.331329,11.760232,0.476951
min,0.0,0.0,0.0,0.0,0.0,0.0,0.078,21.0,0.0
25%,1.0,99.0,62.0,0.0,0.0,27.3,0.24375,24.0,0.0
50%,3.0,117.0,72.0,23.0,30.5,32.0,0.3725,29.0,0.0
75%,6.0,140.25,80.0,32.0,127.25,36.6,0.62625,41.0,1.0
max,17.0,199.0,122.0,99.0,846.0,67.1,2.42,81.0,1.0


In [5]:
diabetes_dataset['Outcome'].value_counts()

Outcome
0    500
1    268
Name: count, dtype: int64

In [6]:
diabetes_dataset.groupby('Outcome').mean()

Unnamed: 0_level_0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
Outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
0,3.298,109.98,68.184,19.664,68.792,30.3042,0.429734,31.19
1,4.865672,141.257463,70.824627,22.164179,100.335821,35.142537,0.5505,37.067164


In [12]:
from scipy.stats import zscore
from sklearn.preprocessing import StandardScaler

In [14]:
numerical_columns= ['Pregnancies','Glucose','BloodPressure','SkinThickness','Insulin','BMI','DiabetesPedigreeFunction','Age']
z_score = diabetes_dataset[numerical_columns].apply(zscore)

In [17]:
threshold = 3
outliers = (z_score.abs() > threshold).any(axis=1)
outliers

0      False
1      False
2      False
3      False
4       True
       ...  
763    False
764    False
765    False
766    False
767    False
Length: 768, dtype: bool

In [19]:
data_without_outliers = diabetes_dataset[~outliers]

In [21]:
scaler= StandardScaler()
scaled_features = scaler.fit_transform(data_without_outliers[numerical_columns])

In [23]:
scaled_data = pd.DataFrame(scaled_features, columns = [col + '_scaled' for col in numerical_columns])
categorical_columns = [col for col in diabetes_dataset.columns if col not in numerical_columns]
final_data = pd.concat([data_without_outliers[categorical_columns].reset_index(drop=True), scaled_data],axis =1)
final_data

Unnamed: 0,Outcome,Pregnancies_scaled,Glucose_scaled,BloodPressure_scaled,SkinThickness_scaled,Insulin_scaled,BMI_scaled,DiabetesPedigreeFunction_scaled,Age_scaled
0,1,0.657355,0.924040,-0.028115,0.923219,-0.805266,0.210285,0.606516,1.479220
1,0,-0.868490,-1.177082,-0.515765,0.533462,-0.805266,-0.848063,-0.364220,-0.183265
2,1,1.267694,2.091330,-0.678315,-1.350366,-0.805266,-1.346999,0.764788,-0.095766
3,0,-0.868490,-1.043678,-0.515765,0.143704,0.238698,-0.621274,-1.011378,-1.058257
4,0,0.352186,-0.143197,0.134435,-1.350366,-0.805266,-0.999256,-0.891795,-0.270764
...,...,...,...,...,...,...,...,...,...
683,0,1.878032,-0.643464,0.296984,1.767693,1.193814,0.104450,-0.997309,2.616709
684,0,-0.563321,0.056910,-0.190665,0.403542,-0.805266,0.694102,-0.402909,-0.533262
685,0,0.352186,0.023559,-0.028115,0.143704,0.438606,-0.908540,-0.737040,-0.270764
686,1,-0.868490,0.190315,-1.003415,-1.350366,-0.805266,-0.318889,-0.371255,1.216722


In [25]:
X = final_data.drop(columns = 'Outcome')
Y = final_data['Outcome']

In [27]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.2, stratify=Y, random_state=2)

In [29]:
print(X.shape,X_train.shape, X_test.shape)

(688, 8) (550, 8) (138, 8)


In [31]:
classifier = svm.SVC(kernel = 'linear')

In [33]:
classifier.fit(X_train, Y_train)

In [35]:
X_train_prediction = classifier.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.7963636363636364


In [37]:
X_test_prediction = classifier.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.782608695652174


In [39]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()
model.fit(X_train, Y_train)

In [41]:
print("Training Accuracy:", model.score(X_train, Y_train))
print("Testing Accuracy:", model.score(X_test, Y_test))

Training Accuracy: 0.7981818181818182
Testing Accuracy: 0.7898550724637681


In [43]:
model = LogisticRegression(penalty='l2', C=1.0, solver='liblinear')
model.fit(X_train, Y_train)

In [45]:
print("Training Accuracy:", model.score(X_train, Y_train))
print("Testing Accuracy:", model.score(X_test, Y_test))

Training Accuracy: 0.7981818181818182
Testing Accuracy: 0.7898550724637681


In [47]:
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [49]:
param_grid = {
    'C': [1,10,100,1000],
    'gamma': [0.0001,0.001,0.01,0.1],
    'kernel': ['linear', 'rbf', 'poly'],
    'degree': [2,3]
}

In [51]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
grid_search = GridSearchCV(SVC(), param_grid, cv=9, scoring='accuracy')
grid_search.fit(X_train_scaled, Y_train)

In [53]:
best_model = grid_search.best_estimator_
print("Best Parameters:", grid_search.best_params_)
print("Train Accuracy:", best_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", best_model.score(X_test_scaled, Y_test))

Best Parameters: {'C': 1, 'degree': 2, 'gamma': 0.01, 'kernel': 'rbf'}
Train Accuracy: 0.8
Test Accuracy: 0.7898550724637681


In [55]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

base_estimators = [
    ('rf', RandomForestClassifier(n_estimators=100)),
    ('svm', SVC(kernel='rbf', C=100, gamma=0.001))
]

model = StackingClassifier(estimators=base_estimators, final_estimator=LogisticRegression())
model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", model.score(X_train_scaled, Y_train))
print("Test Accuracy:", model.score(X_test_scaled, Y_test))

Train Accuracy: 0.8254545454545454
Test Accuracy: 0.7753623188405797


In [57]:
rf_param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    rf_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
rf_grid_search.fit(X_train_scaled, Y_train)
print("Best Random Forest Parameters:", rf_grid_search.best_params_)

Fitting 5 folds for each of 81 candidates, totalling 405 fits
Best Random Forest Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 10, 'n_estimators': 100}


In [59]:
svm_param_grid = {
    'C': [1, 10, 100],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['rbf']
}
svm_grid_search = GridSearchCV(
    SVC(),
    svm_param_grid,
    cv=5,
    scoring='accuracy',
    verbose=1
)
svm_grid_search.fit(X_train_scaled, Y_train)
print("Best SVM Parameters:", svm_grid_search.best_params_)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
Best SVM Parameters: {'C': 1, 'gamma': 0.01, 'kernel': 'rbf'}


In [61]:
optimized_rf = rf_grid_search.best_estimator_
optimized_svm = svm_grid_search.best_estimator_

base_estimators = [
    ('rf', optimized_rf),
    ('svm', optimized_svm)
]

stacking_model = StackingClassifier(
    estimators=base_estimators,
    final_estimator=LogisticRegression()
)

stacking_model.fit(X_train_scaled, Y_train)
print("Train Accuracy:", stacking_model.score(X_train_scaled, Y_train))
print("Test Accuracy:", stacking_model.score(X_test_scaled, Y_test))

Train Accuracy: 0.8145454545454546
Test Accuracy: 0.7753623188405797


In [63]:
input_data = (5, 160, 90, 35, 200, 33.0, 1.5, 50)
data_np_array = np.asarray(input_data)
input_data_reshaped = data_np_array.reshape(1,-1)
prediction = classifier.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[1]
The person is diabetic




In [65]:
import pickle

In [67]:
filename = 'diabetes_model.sav'
pickle.dump(classifier, open(filename, 'wb'))

In [69]:
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))

In [71]:
for column in X.columns:
  print(column)

Pregnancies_scaled
Glucose_scaled
BloodPressure_scaled
SkinThickness_scaled
Insulin_scaled
BMI_scaled
DiabetesPedigreeFunction_scaled
Age_scaled


In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn import svm
from sklearn.metrics import accuracy_score

In [6]:
parkinsons_data = pd.read_csv('Datasets/parkinsons.csv')
parkinsons_data

Unnamed: 0,name,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
0,phon_R01_S01_1,119.992,157.302,74.997,0.00784,0.00007,0.00370,0.00554,0.01109,0.04374,...,0.06545,0.02211,21.033,1,0.414783,0.815285,-4.813031,0.266482,2.301442,0.284654
1,phon_R01_S01_2,122.400,148.650,113.819,0.00968,0.00008,0.00465,0.00696,0.01394,0.06134,...,0.09403,0.01929,19.085,1,0.458359,0.819521,-4.075192,0.335590,2.486855,0.368674
2,phon_R01_S01_3,116.682,131.111,111.555,0.01050,0.00009,0.00544,0.00781,0.01633,0.05233,...,0.08270,0.01309,20.651,1,0.429895,0.825288,-4.443179,0.311173,2.342259,0.332634
3,phon_R01_S01_4,116.676,137.871,111.366,0.00997,0.00009,0.00502,0.00698,0.01505,0.05492,...,0.08771,0.01353,20.644,1,0.434969,0.819235,-4.117501,0.334147,2.405554,0.368975
4,phon_R01_S01_5,116.014,141.781,110.655,0.01284,0.00011,0.00655,0.00908,0.01966,0.06425,...,0.10470,0.01767,19.649,1,0.417356,0.823484,-3.747787,0.234513,2.332180,0.410335
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
190,phon_R01_S50_2,174.188,230.978,94.261,0.00459,0.00003,0.00263,0.00259,0.00790,0.04087,...,0.07008,0.02764,19.517,0,0.448439,0.657899,-6.538586,0.121952,2.657476,0.133050
191,phon_R01_S50_3,209.516,253.017,89.488,0.00564,0.00003,0.00331,0.00292,0.00994,0.02751,...,0.04812,0.01810,19.147,0,0.431674,0.683244,-6.195325,0.129303,2.784312,0.168895
192,phon_R01_S50_4,174.688,240.005,74.287,0.01360,0.00008,0.00624,0.00564,0.01873,0.02308,...,0.03804,0.10715,17.883,0,0.407567,0.655683,-6.787197,0.158453,2.679772,0.131728
193,phon_R01_S50_5,198.764,396.961,74.904,0.00740,0.00004,0.00370,0.00390,0.01109,0.02296,...,0.03794,0.07223,19.020,0,0.451221,0.643956,-6.744577,0.207454,2.138608,0.123306


In [8]:
parkinsons_data.shape

(195, 24)

In [10]:
parkinsons_data.describe()

Unnamed: 0,MDVP:Fo(Hz),MDVP:Fhi(Hz),MDVP:Flo(Hz),MDVP:Jitter(%),MDVP:Jitter(Abs),MDVP:RAP,MDVP:PPQ,Jitter:DDP,MDVP:Shimmer,MDVP:Shimmer(dB),...,Shimmer:DDA,NHR,HNR,status,RPDE,DFA,spread1,spread2,D2,PPE
count,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,...,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0,195.0
mean,154.228641,197.104918,116.324631,0.00622,4.4e-05,0.003306,0.003446,0.00992,0.029709,0.282251,...,0.046993,0.024847,21.885974,0.753846,0.498536,0.718099,-5.684397,0.22651,2.381826,0.206552
std,41.390065,91.491548,43.521413,0.004848,3.5e-05,0.002968,0.002759,0.008903,0.018857,0.194877,...,0.030459,0.040418,4.425764,0.431878,0.103942,0.055336,1.090208,0.083406,0.382799,0.090119
min,88.333,102.145,65.476,0.00168,7e-06,0.00068,0.00092,0.00204,0.00954,0.085,...,0.01364,0.00065,8.441,0.0,0.25657,0.574282,-7.964984,0.006274,1.423287,0.044539
25%,117.572,134.8625,84.291,0.00346,2e-05,0.00166,0.00186,0.004985,0.016505,0.1485,...,0.024735,0.005925,19.198,1.0,0.421306,0.674758,-6.450096,0.174351,2.099125,0.137451
50%,148.79,175.829,104.315,0.00494,3e-05,0.0025,0.00269,0.00749,0.02297,0.221,...,0.03836,0.01166,22.085,1.0,0.495954,0.722254,-5.720868,0.218885,2.361532,0.194052
75%,182.769,224.2055,140.0185,0.007365,6e-05,0.003835,0.003955,0.011505,0.037885,0.35,...,0.060795,0.02564,25.0755,1.0,0.587562,0.761881,-5.046192,0.279234,2.636456,0.25298
max,260.105,592.03,239.17,0.03316,0.00026,0.02144,0.01958,0.06433,0.11908,1.302,...,0.16942,0.31482,33.047,1.0,0.685151,0.825288,-2.434031,0.450493,3.671155,0.527367


In [12]:
parkinsons_data['status'].value_counts()

status
1    147
0     48
Name: count, dtype: int64

In [16]:
X = parkinsons_data.drop(columns=['name','status'], axis = 1)
Y = parkinsons_data['status']

X,Y

(     MDVP:Fo(Hz)  MDVP:Fhi(Hz)  MDVP:Flo(Hz)  MDVP:Jitter(%)  \
 0        119.992       157.302        74.997         0.00784   
 1        122.400       148.650       113.819         0.00968   
 2        116.682       131.111       111.555         0.01050   
 3        116.676       137.871       111.366         0.00997   
 4        116.014       141.781       110.655         0.01284   
 ..           ...           ...           ...             ...   
 190      174.188       230.978        94.261         0.00459   
 191      209.516       253.017        89.488         0.00564   
 192      174.688       240.005        74.287         0.01360   
 193      198.764       396.961        74.904         0.00740   
 194      214.289       260.277        77.973         0.00567   
 
      MDVP:Jitter(Abs)  MDVP:RAP  MDVP:PPQ  Jitter:DDP  MDVP:Shimmer  \
 0             0.00007   0.00370   0.00554     0.01109       0.04374   
 1             0.00008   0.00465   0.00696     0.01394       0.06134   
 2

In [18]:
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.2, random_state=2)

In [24]:
print(X.shape,X_train.shape, X_test.shape)

(195, 22) (156, 22) (39, 22)


In [26]:
model = svm.SVC(kernel = 'linear')

In [28]:
model.fit(X_train, Y_train)

In [30]:
X_train_prediction = model.predict(X_train)
training_data_accuracy = accuracy_score(X_train_prediction, Y_train)
print('Accuracy score of the training data : ', training_data_accuracy)

Accuracy score of the training data :  0.8717948717948718


In [32]:
X_test_prediction = model.predict(X_test)
test_data_accuracy = accuracy_score(X_test_prediction, Y_test)
print('Accuracy score of the test data : ', test_data_accuracy)

Accuracy score of the test data :  0.8717948717948718


In [40]:
input_data = (160, 220, 85, 0.004, 0.00003, 0.002, 0.003, 0.009, 0.015, 0.15, 0.007, 0.010, 0.015, 0.020, 0.01, 25, 0.4, 0.65, -3.0, 0.3, 2.0, 0.1)
data_np_array = np.asarray(input_data)
input_data_reshaped = data_np_array.reshape(1,-1)
prediction = model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person does not have Parkinsons Disease')
else:
  print('The person has Parkinsons Disease')

[1]
The person has Parkinsons Disease




In [42]:
import pickle

In [46]:
filename = 'parkisons_model.sav'
pickle.dump(model, open(filename, 'wb'))

In [48]:
loaded_model = pickle.load(open('diabetes_model.sav', 'rb'))

In [50]:
for column in X.columns:
  print(column)

MDVP:Fo(Hz)
MDVP:Fhi(Hz)
MDVP:Flo(Hz)
MDVP:Jitter(%)
MDVP:Jitter(Abs)
MDVP:RAP
MDVP:PPQ
Jitter:DDP
MDVP:Shimmer
MDVP:Shimmer(dB)
Shimmer:APQ3
Shimmer:APQ5
MDVP:APQ
Shimmer:DDA
NHR
HNR
RPDE
DFA
spread1
spread2
D2
PPE
