# Diabetes Predictor Model - Preprocessing and Training Data Development

### In this notebook we will:
### 1. Create dummy or indicator features for categorical features
### 2. Standardize the magnitude of numeric features using scaler
### 3. Split the data into testing and training datasets

In [40]:
# Importing modules

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
import statsmodels.api as sm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

#show plots inline
%matplotlib inline

In [2]:
# Supress future warnings

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)

In [3]:
# Loading the dataset

df = pd.read_csv('cleaned_diabetes_df.csv')

In [4]:
# Looking at first few row of the dataset

df.head()

Unnamed: 0,DiabetesDiagnosis,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,...,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,diabetes,1.0,1.0,1.0,25.0,0.0,0.0,1.0,1.0,1.0,...,1.0,0.0,2.0,0.0,0.0,0.0,0.0,9.0,6.0,2.0
1,diabetes,0.0,0.0,1.0,29.0,0.0,0.0,0.0,0.0,1.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,11.0,4.0,4.0
2,diabetes,1.0,1.0,1.0,35.0,1.0,0.0,0.0,0.0,1.0,...,1.0,0.0,4.0,1.0,30.0,0.0,0.0,10.0,6.0,8.0
3,diabetes,1.0,0.0,1.0,31.0,0.0,0.0,0.0,1.0,0.0,...,1.0,0.0,3.0,0.0,0.0,0.0,0.0,11.0,6.0,6.0
4,diabetes,1.0,1.0,1.0,26.0,1.0,0.0,0.0,1.0,1.0,...,1.0,0.0,3.0,0.0,10.0,1.0,1.0,12.0,4.0,7.0


In [9]:
# Looking at the data types

df.dtypes

DiabetesDiagnosis        object
HighBP                  float64
HighChol                float64
CholCheck               float64
BMI                     float64
Smoker                  float64
Stroke                  float64
HeartDiseaseorAttack    float64
PhysActivity            float64
Fruits                  float64
Veggies                 float64
HvyAlcoholConsump       float64
AnyHealthcare           float64
NoDocbcCost             float64
GenHlth                 float64
MentHlth                float64
PhysHlth                float64
DiffWalk                float64
Sex                     float64
Age                     float64
Education               float64
Income                  float64
dtype: object

In [11]:
# Making MentHlth and PhysHlth into categorical features using buckets

df['MentHlth'] = df['MentHlth'].astype(int)
df['PhysHlth'] = df['PhysHlth'].astype(int)

df['MentHlth'] = df['MentHlth'].clip(0, 30)
df['PhysHlth'] = df['PhysHlth'].clip(0, 30)

# Starting first with 'MentHlth'
df['MentHlth'] = pd.cut(df['MentHlth'], bins=[0, 7, 14, 21, 28, 30], labels=[1, 2, 3, 4, 5], right=True, include_lowest=True)

# Moving onto 'PhysHlth'
df['PhysHlth'] = pd.cut(df['PhysHlth'], bins=[0, 7, 14, 21, 28, 30], labels=[1, 2, 3, 4, 5], right=True, include_lowest=True)

In [13]:
print(df[['MentHlth', 'PhysHlth']].head(10))

  MentHlth PhysHlth
0        1        1
1        1        1
2        1        5
3        1        1
4        1        2
5        5        3
6        1        5
7        1        1
8        1        1
9        1        1


In [15]:
# Looking at the data types again

df.dtypes

DiabetesDiagnosis         object
HighBP                   float64
HighChol                 float64
CholCheck                float64
BMI                      float64
Smoker                   float64
Stroke                   float64
HeartDiseaseorAttack     float64
PhysActivity             float64
Fruits                   float64
Veggies                  float64
HvyAlcoholConsump        float64
AnyHealthcare            float64
NoDocbcCost              float64
GenHlth                  float64
MentHlth                category
PhysHlth                category
DiffWalk                 float64
Sex                      float64
Age                      float64
Education                float64
Income                   float64
dtype: object

In [17]:
# Checking the BMI feature to see if it can be converted to an integer
# BMI will be my only continuous feature

df['BMI'].head(30)

0     25.0
1     29.0
2     35.0
3     31.0
4     26.0
5     36.0
6     31.0
7     43.0
8     29.0
9     28.0
10    41.0
11    23.0
12    37.0
13    32.0
14    27.0
15    28.0
16    40.0
17    34.0
18    23.0
19    42.0
20    31.0
21    35.0
22    39.0
23    37.0
24    30.0
25    36.0
26    41.0
27    22.0
28    37.0
29    36.0
Name: BMI, dtype: float64

In [19]:
# Changing the dtype of certain features to integers

features_to_convert = ['HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', \
                       'Fruits', 'Veggies', 'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth', 'MentHlth', \
                       'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income']

df[features_to_convert] = df[features_to_convert].astype(int)

df.dtypes

DiabetesDiagnosis       object
HighBP                   int64
HighChol                 int64
CholCheck                int64
BMI                      int64
Smoker                   int64
Stroke                   int64
HeartDiseaseorAttack     int64
PhysActivity             int64
Fruits                   int64
Veggies                  int64
HvyAlcoholConsump        int64
AnyHealthcare            int64
NoDocbcCost              int64
GenHlth                  int64
MentHlth                 int64
PhysHlth                 int64
DiffWalk                 int64
Sex                      int64
Age                      int64
Education                int64
Income                   int64
dtype: object

In [21]:
# Per my Exploratory Data Analysis notebook, going to make DiabetesDiagnosis into a just a binary categorical target feature

# Defining category grouping
group_mapping = {'diabetes' : 'diabetes', 'pre_diabetes' : 'diabetes', 'no_diabetes' : 'no_diabetes'}

# Applying the mapping to the 'DiabetesDiagnosis' column
df['DiabetesDiagnosis'] = df['DiabetesDiagnosis'].map(group_mapping)

In [23]:
# Exploring the Diabetes Diagnosis feature 

print(df['DiabetesDiagnosis'].unique())

['diabetes' 'no_diabetes']


#### With the above work, I grouped my pre-diabetes along with diabetes making this problem a binary classification problem

In [26]:
# Manually mapping to make Diabetes diagnosis = 1

df['DiabetesDiagnosis'] = df['DiabetesDiagnosis'].map({'no_diabetes': 0, 'diabetes': 1})


In [28]:
# Exploring my target feature

print(df['DiabetesDiagnosis'].unique())

[1 0]


#### Diabetes = 1
#### No Diabetes = 0

In [32]:
# Checking dtypes

df.dtypes

DiabetesDiagnosis       int64
HighBP                  int64
HighChol                int64
CholCheck               int64
BMI                     int64
Smoker                  int64
Stroke                  int64
HeartDiseaseorAttack    int64
PhysActivity            int64
Fruits                  int64
Veggies                 int64
HvyAlcoholConsump       int64
AnyHealthcare           int64
NoDocbcCost             int64
GenHlth                 int64
MentHlth                int64
PhysHlth                int64
DiffWalk                int64
Sex                     int64
Age                     int64
Education               int64
Income                  int64
dtype: object

#### My problem is a binary classification problem and all my features including my target have been changed into numerical format.
#### I am going to go with a Random Forest model which I do not need to scale my features (standardize).

In [44]:
# Splitting my dataframe into training and test sets

# Separating features and targets

X = df.drop('DiabetesDiagnosis', axis=1)
y = df['DiabetesDiagnosis']

# Splitting into train and test sets
X_train, y_train, X_test, y_test = train_test_split(
    X, y, 
    test_size=0.2, 
    random_state=42,
    stratify=y
)

In [46]:
# Checking for class balance between my test and train split

print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))

HighBP  HighChol  CholCheck  BMI  Smoker  Stroke  HeartDiseaseorAttack  PhysActivity  Fruits  Veggies  HvyAlcoholConsump  AnyHealthcare  NoDocbcCost  GenHlth  MentHlth  PhysHlth  DiffWalk  Sex  Age  Education  Income
0       0         1          21   0       0       0                     1             1       1        0                  1              0            1        1         1         0         0    4    6          8         0.000374
                             23   0       0       0                     1             1       1        0                  1              0            1        1         1         0         0    6    6          8         0.000355
                                                                                                                                                                                                  5    6          8         0.000335
                                                                                                