# Introduction


**What?** Diabate prediction



# Import modules

In [1]:
import pandas as pd
import numpy as np
from pandas_profiling import ProfileReport
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from joblib import dump, load

# Import dataset

In [2]:
data = pd.read_csv('../data/datasets_228_482_diabetes.csv')

In [3]:
data.Outcome.value_counts()

0    500
1    268
Name: Outcome, dtype: int64

In [4]:
eda_profiling = ProfileReport(data)

In [5]:
eda_profiling

Summarize dataset:   0%|          | 0/22 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]



In [6]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


# Data Preprocessing


- There are considerable zeros.
- To fix this we use imputation by median.
- We also scale the inputs to maintain the range and significance between numeric variables.
- We could probably do more, but for the sake of this demo, we'll it as it stands.



In [7]:
data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']] = data[['Glucose','BloodPressure','SkinThickness','Insulin','BMI']].replace(0,np.NaN)

In [8]:
def impute_median(data, var):   
    temp = data[data[var].notnull()]
    temp = temp[[var, 'Outcome']].groupby(['Outcome'])[[var]].median()
    data.loc[(data['Outcome'] == 0 ) & (data[var].isnull()), var] = temp.loc[0 ,var]
    data.loc[(data['Outcome'] == 1 ) & (data[var].isnull()), var] = temp.loc[1 ,var]
    return data

In [9]:
data = impute_median(data, 'Glucose')
data = impute_median(data, 'BloodPressure')
data = impute_median(data, 'SkinThickness')
data = impute_median(data, 'Insulin')
data = impute_median(data, 'BMI')

In [10]:
data.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.0,169.5,33.6,0.627,50,1
1,1,85.0,66.0,29.0,102.5,26.6,0.351,31,0
2,8,183.0,64.0,32.0,169.5,23.3,0.672,32,1
3,1,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,0,137.0,40.0,35.0,168.0,43.1,2.288,33,1


In [11]:
y = data['Outcome']
x = data.drop('Outcome', axis = 1)
columns = x.columns

scaler = StandardScaler()
scaler = scaler.fit(x)
X = scaler.transform(x)
features = pd.DataFrame(X, columns = columns)

In [12]:
dump(scaler, '../models/scaler.joblib')

['../models/scaler.joblib']

In [13]:
features.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age
0,0.639947,0.864625,-0.03218,0.665181,0.311604,0.169483,0.468492,1.425995
1,-0.844885,-1.204727,-0.528124,-0.010112,-0.440843,-0.848549,-0.365061,-0.190672
2,1.23388,2.014265,-0.693438,0.327535,0.311604,-1.328478,0.604397,-0.105584
3,-0.844885,-1.073339,-0.528124,-0.685405,-0.536303,-0.630399,-0.920763,-1.041549
4,-1.141852,0.50331,-2.677212,0.665181,0.294758,1.551096,5.484909,-0.020496


# Modelling

In [14]:
x_train, x_test, y_train, y_test = train_test_split(features, y, test_size = 0.2, random_state = 42)

In [15]:
model = RandomForestClassifier(n_estimators=300, bootstrap = True, max_features = 'sqrt')
model.fit(x_train, y_train)
y_pred = model.predict(x_test)

In [16]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.92      0.87      0.90        99
           1       0.79      0.87      0.83        55

    accuracy                           0.87       154
   macro avg       0.86      0.87      0.86       154
weighted avg       0.88      0.87      0.87       154



In [17]:
dump(model, '../models/model.joblib')

['../models/model.joblib']

# Inference

In [18]:
pregnancies = 2
glucose = 13
bloodpressure = 30
skinthickness = 4
insulin = 5
bmi = 5
dpf = 0.55
age = 34
feat_cols = features.columns

row = [pregnancies, glucose, bloodpressure, skinthickness, insulin, bmi, dpf, age]

In [19]:
scaler = load('../models/scaler.joblib')

In [20]:
model = load('../models/model.joblib')

In [21]:
feat_cols

Index(['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 'Insulin',
       'BMI', 'DiabetesPedigreeFunction', 'Age'],
      dtype='object')

In [22]:
df = pd.DataFrame([row], columns = feat_cols)
X = scaler.transform(df)
features = pd.DataFrame(X, columns = feat_cols)

In [23]:
if (model.predict(features)==0):
    print("This is a healthy person!")
else: 
    print("This person has high chances of having diabetics!")

This is a healthy person!


# References


- [Link to article](https://pub.towardsai.net/how-i-build-machine-learning-apps-in-hours-a1b1eaa642ed) 
- [Link to code](https://github.com/arunnthevapalan/diabetes-prediction-app)

