In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df= pd.read_csv('heart.csv')
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,63,1,3,145,233,1,0,150,0,2.3,0,0,1,1
1,37,1,2,130,250,0,1,187,0,3.5,0,0,2,1
2,41,0,1,130,204,0,0,172,0,1.4,2,0,2,1
3,56,1,1,120,236,0,1,178,0,0.8,2,0,2,1
4,57,0,0,120,354,0,1,163,1,0.6,2,0,2,1


In [3]:
df.shape

(303, 14)

In [4]:
X= df.drop('target', axis= 1)
y= df.target

In [5]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state = 10, stratify= y)

# Random Forest

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [7]:
model_1 = RandomForestClassifier()
model_1.fit(X_train, y_train)
y_pred1 = model_1.predict(X_test)

In [8]:
from sklearn.metrics import accuracy_score, mean_squared_error
accuracy_score(y_test, y_pred1)

0.8852459016393442

In [9]:
from sklearn.model_selection import RandomizedSearchCV

In [10]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]

# Number of features to consider at every split
max_features = ['auto', 'sqrt']

# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(1, 110, num = 11)]
#max_depth.append(None)

# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]

# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]

# Method of selecting samples for training each tree
bootstrap = [True, False]

In [11]:
#Create the random grid
param_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [12]:
# Use the random grid to search for best hyperparameters

# First create the base model to tune
rf = RandomForestClassifier()

# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = param_grid, 
                               n_iter = 100, cv = 3, verbose=2, random_state = 10, n_jobs = -1)

# Fit the random search model
rf_random.fit(X_train, y_train)

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  25 tasks      | elapsed:   17.6s
[Parallel(n_jobs=-1)]: Done 146 tasks      | elapsed:  1.6min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  3.2min finished


RandomizedSearchCV(cv=3, estimator=RandomForestClassifier(), n_iter=100,
                   n_jobs=-1,
                   param_distributions={'bootstrap': [True, False],
                                        'max_depth': [1, 11, 22, 33, 44, 55, 66,
                                                      77, 88, 99, 110],
                                        'max_features': ['auto', 'sqrt'],
                                        'min_samples_leaf': [1, 2, 4],
                                        'min_samples_split': [2, 5, 10],
                                        'n_estimators': [200, 400, 600, 800,
                                                         1000, 1200, 1400, 1600,
                                                         1800, 2000]},
                   random_state=10, verbose=2)

In [13]:
rf_random.best_params_

{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 1,
 'bootstrap': True}

In [18]:
# Manually provide the best parameters to model for training
model_12 = RandomForestClassifier(**{'n_estimators': 1000,
 'min_samples_split': 2,
 'min_samples_leaf': 4,
 'max_features': 'auto',
 'max_depth': 1,
 'bootstrap': True})

result_12= model_12.fit(X_train, y_train)

In [19]:
pred_12 = result_12.predict(X_test)

In [20]:
from sklearn.metrics import accuracy_score, mean_squared_error
accuracy_score(y_test, pred_12)

0.819672131147541

In [21]:
np.sqrt(mean_squared_error(y_test, pred_12))

0.4246502900652006

# KNN

In [22]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier

In [23]:
estimators_21 = []
estimators_21.append(('standardize', StandardScaler()))
estimators_21.append(('knn', KNeighborsClassifier()))
model_21 = Pipeline(estimators_21)

In [24]:
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state=10)
results = cross_val_score(model_21, X_train, y_train, cv=kfold)
print(results.mean())
print(results.std())

0.8008333333333333
0.11650524547084659


In [25]:
search_space = [{
                 'knn__n_neighbors': range(1,20),
                 'knn__weights': ['distance']
                }]


In [28]:
from sklearn.model_selection import GridSearchCV

kfold = StratifiedKFold(n_splits=10, shuffle = True, random_state=10)
clf = GridSearchCV(model_21, search_space, cv=kfold, return_train_score=True ,verbose=False)
clf = clf.fit(X_train, y_train)
clf.best_estimator_

Pipeline(steps=[('standardize', StandardScaler()),
                ('knn',
                 KNeighborsClassifier(n_neighbors=19, weights='distance'))])

In [31]:
estimators_22 = []
estimators_22.append(('standardize', StandardScaler()))
estimators_22.append(('knn', KNeighborsClassifier(n_neighbors = 19)))
model_22 = Pipeline(estimators_22)
model_22= model_22.fit(X_train, y_train)

In [32]:
pred_22 = model_22.predict(X_test)
accuracy_score(y_test, pred_22)

0.8852459016393442

In [35]:
import pickle

In [36]:
# open a file, where you ant to store the data
file = open('Heart_model.pkl', 'wb')

# dump information to that file
pickle.dump(model_22, file)