In [3]:
import pandas as pd
import numpy as np


In [4]:
df = pd.read_csv('diabetes_eda.csv')
x = df.drop("Outcome", axis=1)
y = df['Outcome']


In [5]:
x = x.drop(["Unnamed: 0", "Pregnancies","Insulin","Insulin_me"], axis=1)

In [6]:
x

Unnamed: 0,Glucose,BloodPressure,SkinThickness,BMI,DiabetesPedigreeFunction,Age
0,148,72,35,33.6,0.627,50
1,85,66,29,26.6,0.351,31
2,183,64,0,23.3,0.672,32
3,89,66,23,28.1,0.167,21
4,137,40,35,43.1,2.288,33
...,...,...,...,...,...,...
763,101,76,48,32.9,0.171,63
764,122,70,27,36.8,0.340,27
765,121,72,23,26.2,0.245,30
766,126,60,0,30.1,0.349,47


In [7]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.pipeline import  Pipeline, make_pipeline


In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV


In [9]:
x_train, x_test, y_train, y_test = train_test_split(x,y)

In [10]:
def scores(y_test, y_pred):
    accs = accuracy_score(y_test, y_pred)
    cr = classification_report(y_test, y_pred)
    cm = confusion_matrix(y_test, y_pred)
    print(accs,"\n")
    print(cr,"\n")
    print(cm)

In [11]:
log_reg = LogisticRegression()
log_reg.fit(x_train, y_train)

In [12]:
y_pred = log_reg.predict(x_test)
scores(y_test, y_pred)

0.7708333333333334 

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       120
           1       0.75      0.58      0.66        72

    accuracy                           0.77       192
   macro avg       0.76      0.73      0.74       192
weighted avg       0.77      0.77      0.76       192
 

[[106  14]
 [ 30  42]]


In [21]:
log_reg_grid = LogisticRegression()
log_param = {
    "penalty": ['l1', 'l2', 'elasticnet'],
    "C": [1,2,3,4,5]
}
log_grid = GridSearchCV(log_reg_grid, param_grid=log_param, n_jobs=-1)
log_grid.fit(x_train, y_train)

50 fits failed out of a total of 75.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
25 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\anaconda3\envs\practice\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\anaconda3\envs\practice\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\user\anaconda3\envs\practice\lib\site-packages\sklearn\linear_model\_logistic.py", line 1169, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "c:\Users\user\anaconda3\envs\practice\lib\site-packages\sk

In [22]:
scores(y_test, log_grid.predict(x_test))

0.7708333333333334 

              precision    recall  f1-score   support

           0       0.78      0.88      0.83       120
           1       0.75      0.58      0.66        72

    accuracy                           0.77       192
   macro avg       0.76      0.73      0.74       192
weighted avg       0.77      0.77      0.76       192
 

[[106  14]
 [ 30  42]]


In [13]:
svc_class= SVC()
svc_class.fit(x_train, y_train)
scores(y_test, svc_class.predict(x_test))

0.75 

              precision    recall  f1-score   support

           0       0.73      0.95      0.83       120
           1       0.83      0.42      0.56        72

    accuracy                           0.75       192
   macro avg       0.78      0.68      0.69       192
weighted avg       0.77      0.75      0.72       192
 

[[114   6]
 [ 42  30]]


In [25]:
svc_est = SVC()
params = {
    "kernel": ['linear', 'poly', 'rbf', 'sigmoid'],
    "degree": [1,2,3,4,5],
    "C":[1,2,3,4,5,6]
}
svc_grid = GridSearchCV(svc_est, param_grid=params, n_jobs=-1)
svc_grid.fit(x_train, y_train)
scores(y_test, svc_grid.predict(x_test))

0.765625 

              precision    recall  f1-score   support

           0       0.75      0.93      0.83       120
           1       0.81      0.49      0.61        72

    accuracy                           0.77       192
   macro avg       0.78      0.71      0.72       192
weighted avg       0.78      0.77      0.75       192
 

[[112   8]
 [ 37  35]]


In [14]:
dt_clf = DecisionTreeClassifier()
dt_clf.fit(x_train, y_train)
scores(y_test, dt_clf.predict(x_test))

0.6979166666666666 

              precision    recall  f1-score   support

           0       0.75      0.78      0.76       120
           1       0.60      0.57      0.59        72

    accuracy                           0.70       192
   macro avg       0.68      0.67      0.67       192
weighted avg       0.69      0.70      0.70       192
 

[[93 27]
 [31 41]]


In [15]:
ada_clf = AdaBoostClassifier()
ada_clf.fit(x_train, y_train)
scores(y_test, ada_clf.predict(x_test))

0.7239583333333334 

              precision    recall  f1-score   support

           0       0.76      0.82      0.79       120
           1       0.65      0.57      0.61        72

    accuracy                           0.72       192
   macro avg       0.71      0.69      0.70       192
weighted avg       0.72      0.72      0.72       192
 

[[98 22]
 [31 41]]


In [16]:
grad_clf = GradientBoostingClassifier()
grad_clf.fit(x_train, y_train)
scores(y_test, grad_clf.predict(x_test))

0.7395833333333334 

              precision    recall  f1-score   support

           0       0.76      0.85      0.80       120
           1       0.69      0.56      0.62        72

    accuracy                           0.74       192
   macro avg       0.73      0.70      0.71       192
weighted avg       0.73      0.74      0.73       192
 

[[102  18]
 [ 32  40]]


In [17]:
rf_clf = RandomForestClassifier()
rf_clf.fit(x_train, y_train)
scores(y_test, rf_clf.predict(x_test))

0.7552083333333334 

              precision    recall  f1-score   support

           0       0.77      0.88      0.82       120
           1       0.73      0.56      0.63        72

    accuracy                           0.76       192
   macro avg       0.75      0.72      0.72       192
weighted avg       0.75      0.76      0.75       192
 

[[105  15]
 [ 32  40]]


In [18]:
nb_clf = GaussianNB()
nb_clf.fit(x_train, y_train)
scores(y_test, nb_clf.predict(x_test))

0.7864583333333334 

              precision    recall  f1-score   support

           0       0.79      0.89      0.84       120
           1       0.77      0.61      0.68        72

    accuracy                           0.79       192
   macro avg       0.78      0.75      0.76       192
weighted avg       0.78      0.79      0.78       192
 

[[107  13]
 [ 28  44]]


In [19]:
knn_clf = KNeighborsClassifier()
knn_clf.fit(x_train, y_train)
scores(y_test, knn_clf.predict(x_test))

0.6666666666666666 

              precision    recall  f1-score   support

           0       0.70      0.82      0.76       120
           1       0.58      0.40      0.48        72

    accuracy                           0.67       192
   macro avg       0.64      0.61      0.62       192
weighted avg       0.65      0.67      0.65       192
 

[[99 21]
 [43 29]]
