## Initial

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import MinMaxScaler
plt.style.use('ggplot')
import warnings
warnings.filterwarnings("ignore")

In [2]:
data=pd.read_csv('diabetes.csv')

In [26]:
data.head(10)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,,33.6,0.627,50,1
1,1,85.0,66.0,29.0,,26.6,0.351,31,0
2,8,183.0,64.0,,,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5,116.0,74.0,,,25.6,0.201,30,0
6,3,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10,115.0,,,,35.3,0.134,29,0
8,2,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8,125.0,96.0,,,,0.232,54,1


In [3]:
data['BMI'].replace(0,np.nan,inplace=True)
data['Glucose'].replace(0,np.nan,inplace=True)
data['BloodPressure'].replace(0,np.nan,inplace=True)
data['SkinThickness'].replace(0,np.nan,inplace=True)
data['Insulin'].replace(0,np.nan,inplace=True)

In [4]:
pd.isnull(data).sum()

Pregnancies                   0
Glucose                       5
BloodPressure                35
SkinThickness               227
Insulin                     374
BMI                          11
DiabetesPedigreeFunction      0
Age                           0
Outcome                       0
dtype: int64

In [5]:
imputer = KNNImputer(n_neighbors=7)
Imputed_Dataset = pd.DataFrame(imputer.fit_transform(data))
Imputed_Dataset.columns =data.columns

In [6]:
Imputed_Dataset.isnull().sum()

Pregnancies                 0
Glucose                     0
BloodPressure               0
SkinThickness               0
Insulin                     0
BMI                         0
DiabetesPedigreeFunction    0
Age                         0
Outcome                     0
dtype: int64

In [7]:
X = Imputed_Dataset[["Pregnancies","Glucose","BloodPressure","SkinThickness","Insulin","BMI","DiabetesPedigreeFunction","Age"]]
Y = Imputed_Dataset["Outcome"]
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.7, test_size=0.3,random_state=0)

In [8]:
Bayes_Model = GaussianNB()
Bayes_Model.fit(X_train,Y_train)
predictions = Bayes_Model.predict(X_test)
print("Accuracy for Naive Beyes is : ",accuracy_score(predictions,Y_test))


Accuracy for Naive Beyes is :  0.7489177489177489


In [9]:
RF_model = RandomForestClassifier()
RF_model.fit(X_train,Y_train)
RF_predictions = RF_model.predict(X_test)
print("Accuracy for Random Forest Classifier is : ",accuracy_score(RF_predictions,Y_test))


Accuracy for Random Forest Classifier is :  0.7489177489177489


In [10]:
X_new = X.drop(["DiabetesPedigreeFunction","SkinThickness","Insulin","Pregnancies"],1)
X_new.columns

Index(['Glucose', 'BloodPressure', 'BMI', 'Age'], dtype='object')

In [11]:
X_train1, X_test1, Y_train1, Y_test1 = train_test_split(X_new, Y, train_size=0.7, test_size=0.3,random_state=0)

In [12]:
Bayes_Model.fit(X_train1,Y_train1)
predictions = Bayes_Model.predict(X_test1)
print("Accuracy for Naive Beyes is : ",accuracy_score(predictions,Y_test1))


Accuracy for Naive Beyes is :  0.7575757575757576


In [13]:
RF_model.fit(X_train1,Y_train1)
RF_predictions = RF_model.predict(X_test1)
print("Accuracy for Random Forest Classifier is : ",accuracy_score(RF_predictions,Y_test1))

Accuracy for Random Forest Classifier is :  0.7662337662337663


In [14]:
over_sampler = RandomOverSampler(sampling_strategy='minority')
X_Over,Y_Over = over_sampler.fit_resample(X_new,Y)
Sampled_Dataset = X_Over.merge(Y_Over,left_index=True,right_index=True)
Sampled_Dataset

Unnamed: 0,Glucose,BloodPressure,BMI,Age,Outcome
0,148.0,72.000000,33.6,50.0,1.0
1,85.0,66.000000,26.6,31.0,0.0
2,183.0,64.000000,23.3,32.0,1.0
3,89.0,66.000000,28.1,21.0,0.0
4,137.0,40.000000,43.1,33.0,1.0
...,...,...,...,...,...
995,134.0,70.000000,28.9,23.0,1.0
996,162.0,76.000000,49.6,26.0,1.0
997,164.0,78.000000,32.8,45.0,1.0
998,182.0,74.000000,30.5,29.0,1.0


In [15]:
X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_Over, Y_Over, train_size=0.7, test_size=0.3,random_state=0)

In [16]:
Bayes_Model.fit(X_train3,Y_train3)
predictions = Bayes_Model.predict(X_test3)
print("Accuracy for Naive Beyes is : ",accuracy_score(predictions,Y_test3))


Accuracy for Naive Beyes is :  0.73


In [17]:
RF_model.fit(X_train3,Y_train3)
RF_predictions = RF_model.predict(X_test3)
print("Accuracy for Random Forest Classifier is : ",accuracy_score(RF_predictions,Y_test3))

Accuracy for Random Forest Classifier is :  0.8466666666666667


In [18]:
for i in range(15):
    X_train3, X_test3, Y_train3, Y_test3 = train_test_split(X_Over, Y_Over, train_size=0.7, test_size=0.3)
    Bayes_Model.fit(X_train3,Y_train3)
    predictions = Bayes_Model.predict(X_test1)
    print("Accuracy for Naive Beyes is : ",accuracy_score(predictions,Y_test1))
    RF_model.fit(X_train3,Y_train3)
    RF_predictions = RF_model.predict(X_test1)
    print("Accuracy for Random Forest Classifier is : ",accuracy_score(RF_predictions,Y_test1))

Accuracy for Naive Beyes is :  0.7229437229437229
Accuracy for Random Forest Classifier is :  0.922077922077922
Accuracy for Naive Beyes is :  0.7402597402597403
Accuracy for Random Forest Classifier is :  0.9567099567099567
Accuracy for Naive Beyes is :  0.7402597402597403
Accuracy for Random Forest Classifier is :  0.935064935064935
Accuracy for Naive Beyes is :  0.7359307359307359
Accuracy for Random Forest Classifier is :  0.948051948051948
Accuracy for Naive Beyes is :  0.7359307359307359
Accuracy for Random Forest Classifier is :  0.9307359307359307
Accuracy for Naive Beyes is :  0.7186147186147186
Accuracy for Random Forest Classifier is :  0.9307359307359307
Accuracy for Naive Beyes is :  0.7229437229437229
Accuracy for Random Forest Classifier is :  0.9437229437229437
Accuracy for Naive Beyes is :  0.7316017316017316
Accuracy for Random Forest Classifier is :  0.935064935064935
Accuracy for Naive Beyes is :  0.7445887445887446
Accuracy for Random Forest Classifier is :  0.9567

In [19]:
RF_model.fit(X_train3,Y_train3)
RF_predictions = RF_model.predict(X_test1)
print("Accuracy for Random Forest Classifier is : ",accuracy_score(RF_predictions,Y_test1))

Accuracy for Random Forest Classifier is :  0.935064935064935


In [20]:
input_data = (130,176,30,20)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = RF_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0.]
The person is not diabetic


In [21]:
import pickle

In [22]:
filename = 'mydiabetes_model.sav'
pickle.dump(RF_model, open(filename, 'wb'))

In [23]:
loaded_model = pickle.load(open('mydiabetes_model.sav', 'rb'))

In [24]:
input_data = (5,166,72,19)

# changing the input_data to numpy array
input_data_as_numpy_array = np.asarray(input_data)

# reshape the array as we are predicting for one instance
input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)

prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0] == 0):
  print('The person is not diabetic')
else:
  print('The person is diabetic')

[0.]
The person is not diabetic


In [25]:
from sklearn.metrics import confusion_matrix
confusion_matrix(RF_predictions,Y_test1)

array([[148,   6],
       [  9,  68]])