# Detection of Diabetes using Random Forest Algorithm

In [2]:
# NumPy: supplies the NaN sentinel used during imputation and the global RNG seeded below
import numpy as np
# pandas: loads the CSV file into a DataFrame
import pandas as pd
# Seed NumPy's global random number generator so NumPy-based randomness is repeatable
np.random.seed(0)

# train_test_split: partitions the data into training and held-out test subsets
from sklearn.model_selection import train_test_split
# accuracy_score: compares predicted labels against the true labels
from sklearn.metrics import accuracy_score

In [3]:
# Read the diabetes CSV into a DataFrame and preview its first five rows
df = pd.read_csv(filepath_or_buffer="diabetes.csv")
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


### Replace zero values with the column mean

In [4]:
# Columns in which a reading of 0 is treated as a missing measurement.
zero_not_accepted = ['Glucose', 'BloodPressure', 'SkinThickness', 'BMI', 'Insulin']

for column in zero_not_accepted:
    # Mask zeros as NaN first so they are excluded from the mean.
    # np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
    observed = df[column].replace(0, np.nan)
    # The mean is truncated to a whole number before being used as the fill value.
    fill_value = int(observed.mean(skipna=True))
    df[column] = observed.fillna(fill_value)

In [5]:
# Preview the first five rows again to confirm the zeros were filled in
df.head(n=5)

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,155.0,33.6,0.627,50,1
1,1,85.0,66.0,29.0,155.0,26.6,0.351,31,0
2,8,183.0,64.0,29.0,155.0,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


### Split the dataset into training and test sets
The first eight columns (positions 0 to 7) are used as the features; the column at position 8 is the label to predict.

In [6]:
# Features are the columns at positions 0-7; the label is the column at position 8
X, y = df.iloc[:, :8], df.iloc[:, 8]

# Hold out 25% of the rows for testing; random_state makes the shuffle repeatable
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=0,
    test_size=0.25,
)

### Import and train a RandomForestClassifier

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Build the forest (two parallel jobs, fixed random_state) and fit it on the training split
clf = RandomForestClassifier(random_state=0, n_jobs=2).fit(x_train, y_train)
# Echo the fitted estimator as the cell output
clf



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=2,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

### Predict on the test set

In [40]:
# Run the trained classifier on every row of the held-out test features
clf.predict(X=x_test)

array([1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0,
       1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0])

### Viewing the predicted probabilities of the first 10 observations

In [8]:
# Predicted class probabilities for the first ten test rows
clf.predict_proba(x_test)[:10]

array([[0.2, 0.8],
       [0.7, 0.3],
       [1. , 0. ],
       [0.6, 0.4],
       [0.9, 0.1],
       [1. , 0. ],
       [0.1, 0.9],
       [0.2, 0.8],
       [0.7, 0.3],
       [0.5, 0.5]])

### Predicted class (0 or 1) for each test observation

In [9]:
# Keep the test-set predictions for scoring, then show the first five
preds = clf.predict(x_test)
preds[:5]

array([1, 0, 0, 0, 0])

### Replace True and False values with 1s and 0s

In [10]:
# Cast the label column to integers so boolean True/False become 1/0.
# The result is assigned back to the column instead of calling
# replace(..., inplace=True) on df.Outcome: in-place replacement on a column
# pulled out of the frame is chained assignment, which pandas warns about and
# which does not update df when Copy-on-Write is enabled.
# NOTE(review): the train/test split and the model fit ran in earlier cells,
# so this only changes df, not y_train / y_test.
df["Outcome"] = df["Outcome"].astype(int)

### Check accuracy_score for the model

In [11]:
# Compare the test-set predictions against the true test labels
accuracy_score(y_true=y_test, y_pred=preds)

0.75

### Predict whether a patient suffers from diabetes using the provided inputs

In [12]:
# Score one hypothetical patient. The values are listed in the same order as the
# training columns and wrapped in a DataFrame that reuses x_train's column names,
# so the model receives the same feature names it was fitted with.
patient = pd.DataFrame([[0, 140, 90, 28, 174, 25, 0.140, 43]], columns=x_train.columns)
# Use a separate name so the test-set predictions held in `preds` are not
# overwritten and the accuracy cell can still be re-run afterwards.
patient_pred = clf.predict(patient)
if patient_pred[0] == 1:
    print("Diabetes Result: True")
else:
    print("Diabetes Result: False")

Diabetes Result: False
