In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df= pd.read_csv('diabetes.csv')
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [3]:
df.shape

(768, 9)

In [4]:
X= df.drop('Outcome', axis= 1)
y= df.Outcome

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 20, stratify = y)

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
model_1 = RandomForestClassifier()
model_1.fit(X_train, y_train)
y_pred1 = model_1.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, mean_squared_error
accuracy_score(y_test,y_pred1)

0.7922077922077922

In [9]:
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 110, num = 11)]
#max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [11]:
#Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [12]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state = 20, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   23.5s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.7min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [1, 11, 22, 33, 44, 55, 66,
                                                      77, 88, 99, 110],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=20, verbose=2)

In [13]:
rf_random.best_params_

{'n_estimators': 400,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 88,
 'bootstrap': False}

In [14]:
# Manually provide the best parameters to model for training
model_12 = RandomForestClassifier(**{'n_estimators': 800,
 'min_samples_split': 5,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 99,
 'bootstrap': False})

result_12= model_12.fit(X_train, y_train)

In [15]:
pred_12 = result_12.predict(X_test)

In [16]:
accuracy_score(y_test, pred_12)

0.7727272727272727

In [17]:
np.sqrt(mean_squared_error(y_test, pred_12))

0.4767312946227962

# KNN

In [18]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [19]:
estimators_21 = []
estimators_21.append(('standardize', StandardScaler()))
estimators_21.append(('knn', KNeighborsClassifier()))
model_21 = Pipeline(estimators_21)

In [20]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state=10)
results = cross_val_score(model_21, X_train, y_train, cv=kfold)
print(results.mean())
print(results.std())

0.7377842411422527
0.05141337672960019


In [21]:
search_space = [{
                 'knn__n_neighbors': range(1,20),
                 'knn__weights': ['distance']
                }]


In [22]:
from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state=10)
clf = GridSearchCV(model_21, search_space, cv=kfold, return_train_score=True ,verbose=False)
clf = clf.fit(X_train, y_train)
clf.best_estimator_

Pipeline(steps=[('standardize', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=19, weights='distance'))])

In [23]:
estimators_22 = []
estimators_22.append(('standardize', StandardScaler()))
estimators_22.append(('knn', KNeighborsClassifier(n_neighbors = 19)))
model_22 = Pipeline(estimators_22)
model_22=model_22.fit(X_train, y_train)

In [24]:
pred_22 = model_22.predict(X_test)
accuracy_score(y_test, pred_22)

0.8376623376623377

In [25]:
import pickle

In [26]:
# open a file, where you ant to store the data
file = open('Diabetes_model.pkl', 'wb')

# dump information to that file
pickle.dump(model_22, file)