In [None]:
import pandas as pd
# Notebook display limits: render up to 22 columns and 100 rows of a DataFrame.
pd.options.display.max_columns = 22
pd.options.display.max_rows = 100

In [None]:
# Load the dataset from CSV, then swap the numeric codes in the
# Diabetes_012 column for readable string labels.
diabetes_labels = {0.0: 'nodiab', 1.0: 'prediab', 2.0: 'diab'}

df = pd.read_csv('/content/diabetes_012_health_indicators_BRFSS2015.csv')
df['Diabetes_012'] = df['Diabetes_012'].replace(to_replace=diabetes_labels)
df

Unnamed: 0,Diabetes_012,HighBP,HighChol,CholCheck,BMI,Smoker,Stroke,HeartDiseaseorAttack,PhysActivity,Fruits,Veggies,HvyAlcoholConsump,AnyHealthcare,NoDocbcCost,GenHlth,MentHlth,PhysHlth,DiffWalk,Sex,Age,Education,Income
0,nodiab,1.0,1.0,1.0,40.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,5.0,18.0,15.0,1.0,0.0,9.0,4.0,3.0
1,nodiab,0.0,0.0,0.0,25.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,3.0,0.0,0.0,0.0,0.0,7.0,6.0,1.0
2,nodiab,1.0,1.0,1.0,28.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,5.0,30.0,30.0,1.0,0.0,9.0,4.0,8.0
3,nodiab,1.0,0.0,1.0,27.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,0.0,0.0,0.0,0.0,11.0,3.0,6.0
4,nodiab,1.0,1.0,1.0,24.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,0.0,2.0,3.0,0.0,0.0,0.0,11.0,5.0,4.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
253675,nodiab,1.0,1.0,1.0,45.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,5.0,0.0,1.0,5.0,6.0,7.0
253676,diab,1.0,1.0,1.0,18.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,4.0,0.0,0.0,1.0,0.0,11.0,2.0,4.0
253677,nodiab,0.0,0.0,1.0,28.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,5.0,2.0
253678,nodiab,1.0,0.0,1.0,23.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,3.0,0.0,0.0,0.0,1.0,7.0,5.0,1.0


Subsetting the dataset for training and prediction

In [None]:
# Columns that are NOT used as model inputs. Everything listed here is removed,
# leaving the Diabetes_012 target plus the remaining predictor columns.
col_to_drop = [
    'CholCheck',
    'HeartDiseaseorAttack',
    'PhysActivity',
    'MentHlth',
    'HvyAlcoholConsump',
    'AnyHealthcare',
    'NoDocbcCost',
    'DiffWalk',
    'Sex',
    'Age',
    'Education',
    'Income',
    'BMI',
    'Fruits',
    'Veggies'
]

# `columns=` already identifies the axis, so the redundant `axis=1` is omitted.
# `fillna` is chained (not in-place) so the cell builds `df_final` in a single
# expression and leaves `df` untouched; missing values are replaced with 0.
df_final = df.drop(columns=col_to_drop).fillna(0)
df_final.head(100)

Unnamed: 0,Diabetes_012,HighBP,HighChol,Smoker,Stroke,GenHlth,PhysHlth
0,nodiab,1.0,1.0,1.0,0.0,5.0,15.0
1,nodiab,0.0,0.0,1.0,0.0,3.0,0.0
2,nodiab,1.0,1.0,0.0,0.0,5.0,30.0
3,nodiab,1.0,0.0,0.0,0.0,2.0,0.0
4,nodiab,1.0,1.0,0.0,0.0,2.0,0.0
5,nodiab,1.0,1.0,1.0,0.0,2.0,2.0
6,nodiab,1.0,0.0,1.0,0.0,3.0,14.0
7,nodiab,1.0,1.0,1.0,0.0,3.0,0.0
8,diab,1.0,1.0,1.0,0.0,5.0,30.0
9,nodiab,0.0,0.0,0.0,0.0,2.0,0.0


## **Training the model using 80% of the dataset with DecisionTree**

In [None]:
# Train a DecisionTreeClassifier on 80% of the rows and measure its accuracy
# on the held-out 20%.
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.metrics import accuracy_score

# X holds the input features; y holds the Diabetes_012 label to predict.
X = df_final.drop(columns='Diabetes_012')
y = df_final['Diabetes_012']

# A fixed random_state makes the split, the fitted tree and the reported
# accuracy reproducible across kernel restarts. stratify=y keeps the class
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

model = DecisionTreeClassifier(random_state=42)
model.fit(X_train, y_train)
predictions = model.predict(X_test)
print(predictions)

# NOTE(review): the printed predictions are dominated by 'nodiab'. If the
# labels are imbalanced, plain accuracy can overstate model quality; consider
# adding per-class metrics.
accuracy = accuracy_score(y_test, predictions)
accuracy

['nodiab' 'nodiab' 'nodiab' ... 'nodiab' 'nodiab' 'nodiab']


0.8400741091138442

### Saving the trained model and loading it to make predictions

In [None]:
import joblib as jb

# Refit the decision tree on every available row (no hold-out) before
# persisting it.
X = df_final.drop(columns='Diabetes_012')
y = df_final['Diabetes_012']

# Fixed seed so the saved tree is the same on every run.
model = DecisionTreeClassifier(random_state=42)
model.fit(X, y)

# Export the fitted tree in Graphviz .dot format. Feature and class names are
# taken from the training data / fitted model rather than hard-coded, so they
# cannot drift out of sync with the columns actually used for fitting.
tree.export_graphviz(model, out_file='diabetes-predictor.dot',
                     feature_names=list(X.columns),
                     class_names=list(model.classes_),
                     label='all',
                     rounded=True,
                     filled=True)

# Persist the fitted model; the returned list of written file names is the
# cell output.
jb.dump(model, 'diabetes predictor.joblib')

['diabetes predictor.joblib']

In [None]:
# Reload the persisted decision tree and classify one hand-written sample.
model = jb.load('diabetes predictor.joblib')

# The model was fitted on a DataFrame, so the sample is passed as a DataFrame
# with the same column names. This makes the meaning of each value explicit
# and avoids scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[1.0, 1.0, 0.0, 0.0, 2.0, 3.0]],
    columns=['HighBP', 'HighChol', 'Smoker', 'Stroke', 'GenHlth', 'PhysHlth'],
)
predictions = model.predict(sample)
print(predictions)

['nodiab']




## **Training the model using 70% of the dataset with RandomForest**

In [None]:
# Train a RandomForestClassifier on 70% of the rows and measure its accuracy
# on the held-out 30%.
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# X holds the input features; y holds the Diabetes_012 label to predict.
X = df_final.drop(columns='Diabetes_012')
y = df_final['Diabetes_012']

# A fixed random_state makes the split, the fitted forest and the reported
# accuracy reproducible across kernel restarts. stratify=y keeps the class
# proportions the same in the train and test partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42, stratify=y
)

model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X_train, y_train)
predictions = model_rf.predict(X_test)
print(predictions)

# NOTE(review): the printed predictions are dominated by 'nodiab'. If the
# labels are imbalanced, plain accuracy can overstate model quality; consider
# adding per-class metrics.
acc = accuracy_score(y_test, predictions)
acc

['nodiab' 'nodiab' 'nodiab' ... 'nodiab' 'nodiab' 'nodiab']


0.8445416798065805

**Saving the model**

In [None]:
import joblib as jb

# Refit the random forest on every available row (no hold-out) before
# persisting it.
X = df_final.drop(columns='Diabetes_012')
y = df_final['Diabetes_012']

# Fixed seed so the saved forest is the same on every run.
model_rf = RandomForestClassifier(random_state=42)
model_rf.fit(X, y)

# Persist the fitted model; the returned list of written file names is the
# cell output.
jb.dump(model_rf, 'diabetes-predictor-rf.joblib')


['diabetes-predictor-rf.joblib']

In [None]:
# Reload the persisted random forest and classify one hand-written sample.
model = jb.load('diabetes-predictor-rf.joblib')

# The model was fitted on a DataFrame, so the sample is passed as a DataFrame
# with the same column names. This makes the meaning of each value explicit
# and avoids scikit-learn's "X does not have valid feature names" warning.
sample = pd.DataFrame(
    [[1.0, 1.0, 1.0, 0.0, 5.0, 30.0]],
    columns=['HighBP', 'HighChol', 'Smoker', 'Stroke', 'GenHlth', 'PhysHlth'],
)
predictions = model.predict(sample)
print(predictions)

['nodiab']




# **Visualizing data from the models**