In [1]:
#import lib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
# import data set
diabetes_set = pd.read_csv('./diabetes.csv')

In [8]:
# create x (feature matrix)
x = diabetes_set.drop('Outcome', axis=1)

# create y (label)
y = diabetes_set['Outcome']

In [35]:
# choose right model and hyperparameter
from sklearn.linear_model import SGDClassifier

clf = SGDClassifier()


In [19]:
# split data into train, test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)


In [36]:
# fit model to the train dataset
clf.fit(x_train, y_train)

SGDClassifier()

In [21]:
# make prediction
y_pred = clf.predict(x_test)

In [41]:
# evaluate model on the test set
clf.score(x_test, y_test)

0.6428571428571429

In [25]:
# evaluate model on the train set
clf.score(x_train, y_train)

0.5814332247557004

In [28]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.73      0.71      0.72        98
           1       0.52      0.54      0.53        56

    accuracy                           0.65       154
   macro avg       0.62      0.62      0.62       154
weighted avg       0.65      0.65      0.65       154



In [27]:
confusion_matrix(y_test, y_pred)

array([[70, 28],
       [26, 30]], dtype=int64)

In [30]:
accuracy_score(y_test, y_pred)

0.6493506493506493

In [43]:
# improve a model
param_loss = ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron']

np.random.seed(49)
for i in param_loss:
    print(f"trying model with {i} param_loss...")
    clf = SGDClassifier(loss=i).fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test) *100:.2f}%")
    print("")

trying model with hinge param_loss...
Model accuracy on test set: 66.88%

trying model with log param_loss...
Model accuracy on test set: 36.36%

trying model with modified_huber param_loss...
Model accuracy on test set: 62.99%

trying model with squared_hinge param_loss...
Model accuracy on test set: 68.18%

trying model with perceptron param_loss...
Model accuracy on test set: 63.64%



In [44]:
# trying different model
from sklearn import svm

clf = svm.SVC()

In [45]:
# fit model
clf.fit(x_train, y_train)

SVC()

In [48]:
y_preds = clf.predict(x_test)

In [51]:
clf.score(x_test, y_test)

0.7532467532467533

In [50]:
# evaluating model
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.76      0.90      0.82        98
           1       0.74      0.50      0.60        56

    accuracy                           0.75       154
   macro avg       0.75      0.70      0.71       154
weighted avg       0.75      0.75      0.74       154



In [52]:
confusion_matrix(y_test, y_preds)

array([[88, 10],
       [28, 28]], dtype=int64)

In [53]:
accuracy_score(y_test, y_preds)

0.7532467532467533

In [60]:
# improve a model
kernel_list = ['linear', 'poly', 'rbf', 'sigmoid']

np.random.seed(42)
for i in kernel_list:
    print(f"trying model with `{i}` kernel_list...")
    clf = svm.SVC(kernel=i).fit(x_train, y_train)
    print(f"Model accuracy on test set: {clf.score(x_test, y_test) *100:.2f}%")
    print("")

trying model with `linear` kernel_list...
Model accuracy on test set: 77.92%

trying model with `poly` kernel_list...
Model accuracy on test set: 75.97%

trying model with `rbf` kernel_list...
Model accuracy on test set: 75.32%

trying model with `sigmoid` kernel_list...
Model accuracy on test set: 44.81%



In [61]:
# trying different model (RandomForestClassifier)
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier()

In [62]:
# fit model 
clf.fit(x_train, y_train)

RandomForestClassifier()

In [70]:
# predict
y_preds = clf.predict(x_test)
y_preds.shape, y_test.shape

((154,), (154,))

In [71]:
# evaluating model
clf.score(x_test, y_test)

0.7922077922077922

In [72]:
# evaluating model
print(classification_report(y_test, y_preds))

              precision    recall  f1-score   support

           0       0.80      0.90      0.85        98
           1       0.77      0.61      0.68        56

    accuracy                           0.79       154
   macro avg       0.79      0.75      0.76       154
weighted avg       0.79      0.79      0.79       154



In [73]:
confusion_matrix(y_test, y_preds)

array([[88, 10],
       [22, 34]], dtype=int64)

In [74]:
accuracy_score(y_test, y_preds)

0.7922077922077922

In [82]:
# improve
from sklearn.ensemble import GradientBoostingClassifier

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"trying model with `{i}` n_estimators...")
    clf = GradientBoostingClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy (auto) on test set: {clf.score(x_test, y_test) *100:.2f}%")
    print(f"model  accuracy (hands on) : {accuracy_score(y_test, clf.predict(x_test) ) * 100:.2f}%")
    print("")

trying model with `10` n_estimators...
Model accuracy (auto) on test set: 77.92%
model  accuracy (hands on) : 77.92%

trying model with `20` n_estimators...
Model accuracy (auto) on test set: 81.17%
model  accuracy (hands on) : 81.17%

trying model with `30` n_estimators...
Model accuracy (auto) on test set: 78.57%
model  accuracy (hands on) : 78.57%

trying model with `40` n_estimators...
Model accuracy (auto) on test set: 76.62%
model  accuracy (hands on) : 76.62%

trying model with `50` n_estimators...
Model accuracy (auto) on test set: 78.57%
model  accuracy (hands on) : 78.57%

trying model with `60` n_estimators...
Model accuracy (auto) on test set: 77.27%
model  accuracy (hands on) : 77.27%

trying model with `70` n_estimators...
Model accuracy (auto) on test set: 77.92%
model  accuracy (hands on) : 77.92%

trying model with `80` n_estimators...
Model accuracy (auto) on test set: 77.92%
model  accuracy (hands on) : 77.92%

trying model with `90` n_estimators...
Model accuracy (a

In [83]:
# save model
import pickle

pickle.dump(clf, open('GradientBoosting_model_1.pkl', "wb"))

In [84]:
# load model
loaded_model = pickle.load(open("GradientBoosting_model_1.pkl", "rb"))
loaded_model.score(x_test, y_test)

0.7792207792207793

## Feature scale 

In [85]:
# import data set
diabetes_set = pd.read_csv('./diabetes.csv')

In [86]:
# split x, y
x = diabetes_set.drop('Outcome', axis=1)
y = diabetes_set['Outcome']

In [94]:
from sklearn import preprocessing

# Standard Scaler
scaler = preprocessing.StandardScaler()
scaled_x = scaler.fit_transform(x)
scaled_x = pd.DataFrame(scaled_x, columns=x.columns)
scaled_x

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.848324,0.149641,0.907270,-0.692891,0.204013,0.468492,1.425995
1,-0.844885,-1.123396,-0.160546,0.530902,-0.692891,-0.684422,-0.365061,-0.190672
2,1.233880,1.943724,-0.263941,-1.288212,-0.692891,-1.103255,0.604397,-0.105584
3,-0.844885,-0.998208,-0.160546,0.154533,0.123302,-0.494043,-0.920763,-1.041549
4,-1.141852,0.504055,-1.504687,0.907270,0.765836,1.409746,5.484909,-0.020496
...,...,...,...,...,...,...,...,...
763,1.827813,-0.622642,0.356432,1.722735,0.870031,0.115169,-0.908682,2.532136
764,-0.547919,0.034598,0.046245,0.405445,-0.692891,0.610154,-0.398282,-0.531023
765,0.342981,0.003301,0.149641,0.154533,0.279594,-0.735190,-0.685193,-0.275760
766,-0.844885,0.159787,-0.470732,-1.288212,-0.692891,-0.240205,-0.371101,1.170732


In [95]:
# split x_train, x_test, y_train, y_test
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(scaled_x, y, test_size=0.2)

In [96]:
# choose right model
from sklearn.ensemble import GradientBoostingClassifier

clf = GradientBoostingClassifier(n_estimators=20)

In [97]:
# Fit model 
clf.fit(x_train, y_train)

GradientBoostingClassifier(n_estimators=20)

In [98]:
# Evaluating 
clf.score(x_test, y_test)

0.7792207792207793

In [99]:
# prediction
preds = clf.predict(x_test)

In [100]:
confusion_matrix(y_test, y_preds)

array([[72, 28],
       [38, 16]], dtype=int64)

In [101]:
# improve
from sklearn.ensemble import GradientBoostingClassifier

np.random.seed(42)
for i in range(10, 100, 10):
    print(f"trying model with `{i}` n_estimators...")
    clf = GradientBoostingClassifier(n_estimators=i).fit(x_train, y_train)
    print(f"Model accuracy (auto) on test set: {clf.score(x_test, y_test) *100:.2f}%")
    print("")

trying model with `10` n_estimators...
Model accuracy (auto) on test set: 77.92%

trying model with `20` n_estimators...
Model accuracy (auto) on test set: 77.92%

trying model with `30` n_estimators...
Model accuracy (auto) on test set: 76.62%

trying model with `40` n_estimators...
Model accuracy (auto) on test set: 78.57%

trying model with `50` n_estimators...
Model accuracy (auto) on test set: 77.27%

trying model with `60` n_estimators...
Model accuracy (auto) on test set: 77.92%

trying model with `70` n_estimators...
Model accuracy (auto) on test set: 77.27%

trying model with `80` n_estimators...
Model accuracy (auto) on test set: 78.57%

trying model with `90` n_estimators...
Model accuracy (auto) on test set: 78.57%

