# Machine Learning Diabetes Classification

## Read csv and perform basic data cleaning

In [1]:
# Install zipfile36 if you haven't already
#!pip install zipfile36

In [3]:
# Import our dependencies
import pandas as pd
import numpy as np
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen
from matplotlib import pyplot as plt

# SQL
from sqlalchemy import create_engine
import sqlite3 as sql

# Machine learning
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import accuracy_score
import tensorflow as tf

In [4]:
# Create dataframe
z = urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip')
myzip = ZipFile(BytesIO(z.read())).extract('dataset_diabetes/diabetic_data.csv')
df = pd.read_csv(myzip)
df.head(5)

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,weight,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),?,6,25,1,1,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),?,1,1,7,3,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),?,1,1,7,2,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),?,1,1,7,2,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),?,1,1,7,1,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [5]:
# Drop the non-beneficial ID column 'patient_nbr'
df = df.drop(['patient_nbr'],1)

# Drop mostly empty columns, 'weight', 'payer_code', 'max_glu_serum', and 'medical_specialty'
df = df.drop(['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'],1)

# Replace '?' values to nulls
df.replace({'?': np.nan}, inplace=True)

# Replace 'None' values to nulls
df.replace({'None': np.nan}, inplace=True)

# Drop the null rows
df = df.dropna()

  
  """


In [6]:
# Convert the target column values to normal and high based on their values
x = {'Norm': 'low'}   
df = df.replace(x)
x = dict.fromkeys(['>7', '>8'], 'high')    
df = df.replace(x)
df.reset_index(inplace=True, drop=True)

In [7]:
# Determine the number of unique values in each column.
df.nunique()

encounter_id                16193
race                            5
gender                          2
age                            10
admission_type_id               8
discharge_disposition_id       21
admission_source_id            15
time_in_hospital               14
num_lab_procedures            114
num_procedures                  7
num_medications                67
number_outpatient              24
number_emergency               19
number_inpatient               18
diag_1                        490
diag_2                        486
diag_3                        539
number_diagnoses               12
A1Cresult                       2
metformin                       4
repaglinide                     4
nateglinide                     4
chlorpropamide                  2
glimepiride                     4
acetohexamide                   1
glipizide                       4
glyburide                       4
tolbutamide                     2
pioglitazone                    4
rosiglitazone 

In [8]:
# Drop columns with only 1 value
df = df.drop(['acetohexamide', 'troglitazone', 'examide', 'citoglipton','glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'],1)
df.nunique()

  


encounter_id                16193
race                            5
gender                          2
age                            10
admission_type_id               8
discharge_disposition_id       21
admission_source_id            15
time_in_hospital               14
num_lab_procedures            114
num_procedures                  7
num_medications                67
number_outpatient              24
number_emergency               19
number_inpatient               18
diag_1                        490
diag_2                        486
diag_3                        539
number_diagnoses               12
A1Cresult                       2
metformin                       4
repaglinide                     4
nateglinide                     4
chlorpropamide                  2
glimepiride                     4
glipizide                       4
glyburide                       4
tolbutamide                     2
pioglitazone                    4
rosiglitazone                   4
acarbose      

In [9]:
# Create SQLite engine using SQLAlechmey
engine = create_engine('sqlite:///diabetes_data.db', echo=False)
conn = engine.connect()
df.to_sql('diabetes', conn, index=False, if_exists='replace')

In [10]:
# Check dataset
pd.read_sql('SELECT * FROM diabetes LIMIT 10', conn)

Unnamed: 0,encounter_id,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,...,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide-metformin,glipizide-metformin,change,diabetesMed,readmitted
0,236316,Caucasian,Male,[80-90),1,3,7,6,64,3,...,No,No,No,No,No,No,No,Ch,Yes,NO
1,955884,Caucasian,Female,[70-80),1,3,7,5,34,0,...,No,No,No,No,Up,No,No,Ch,Yes,>30
2,1257282,Other,Female,[50-60),1,1,7,2,53,0,...,No,No,No,No,Up,No,No,Ch,Yes,NO
3,1270524,Caucasian,Male,[60-70),1,2,7,1,59,0,...,No,No,No,No,Steady,No,No,No,Yes,NO
4,1455252,Caucasian,Female,[80-90),1,1,7,3,34,0,...,No,No,No,No,No,No,No,No,No,>30
5,1810752,Caucasian,Male,[70-80),1,3,7,14,78,1,...,No,No,No,No,Up,No,No,Ch,Yes,<30
6,1881372,Caucasian,Male,[60-70),1,2,7,4,65,2,...,No,No,No,No,Steady,No,No,No,Yes,<30
7,1968528,Caucasian,Female,[70-80),6,25,1,10,56,2,...,No,No,No,No,Down,No,No,Ch,Yes,>30
8,2092362,Caucasian,Female,[70-80),6,25,7,11,88,1,...,No,No,No,No,Down,No,No,Ch,Yes,>30
9,2095932,AfricanAmerican,Female,[30-40),6,25,7,8,62,0,...,No,No,No,No,Steady,No,No,Ch,Yes,>30


In [11]:
# Create cleanup table
engine.execute('CREATE TABLE "cleaned_columns" ('
               'id BIGINT NOT NULL,'
               'diag_1c VARCHAR, '
               'diag_2c VARCHAR, '
               'diag_3c VARCHAR, '
               'PRIMARY KEY (id));')

OperationalError: (sqlite3.OperationalError) table "cleaned_columns" already exists
[SQL: CREATE TABLE "cleaned_columns" (id BIGINT NOT NULL,diag_1c VARCHAR, diag_2c VARCHAR, diag_3c VARCHAR, PRIMARY KEY (id));]
(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [12]:
# Insert columns into new table to clean
engine.execute('INSERT INTO "cleaned_columns" '
               'SELECT encounter_id AS id, '
               'CASE '
                   'WHEN (diag_1 BETWEEN 390 AND 459) OR (diag_1 = 785) THEN "circulatory" '
                   'WHEN (diag_1 BETWEEN 460 AND 519) OR (diag_1 = 786) THEN "respiratory" '
                   'WHEN (diag_1 BETWEEN 520 AND 579) OR (diag_1 = 787) THEN "digestive" '
                   'WHEN (diag_1 BETWEEN 250 AND 251) THEN "diabetes" '
                   'WHEN (diag_1 BETWEEN 800 AND 999) THEN "injury" '
                   'WHEN (diag_1 BETWEEN 710 AND 739) THEN "musculoskeletal" '
                   'WHEN (diag_1 BETWEEN 580 AND 629) OR (diag_1 = 788) THEN "genitourinary" '
                   'WHEN (diag_1 BETWEEN 140 AND 239) THEN "neoplasms" '
                   'ELSE "other" '
               'END AS diag_1c, '
               'CASE '
                   'WHEN (diag_2 BETWEEN 390 AND 459) OR (diag_2 = 785) THEN "circulatory" '
                   'WHEN (diag_2 BETWEEN 460 AND 519) OR (diag_2 = 786) THEN "respiratory" '
                   'WHEN (diag_2 BETWEEN 520 AND 579) OR (diag_2 = 787) THEN "digestive" '
                   'WHEN (diag_2 BETWEEN 250 AND 251) THEN "diabetes" '
                   'WHEN (diag_2 BETWEEN 800 AND 999) THEN "injury" '
                   'WHEN (diag_2 BETWEEN 710 AND 739) THEN "musculoskeletal" '
                   'WHEN (diag_2 BETWEEN 580 AND 629) OR (diag_2 = 788) THEN "genitourinary" '
                   'WHEN (diag_2 BETWEEN 140 AND 239) THEN "neoplasms" '
                   'ELSE "other" '
               'END AS diag_2c, '
               'CASE '
                   'WHEN (diag_3 BETWEEN 390 AND 459) OR (diag_3 = 785) THEN "circulatory" '
                   'WHEN (diag_3 BETWEEN 460 AND 519) OR (diag_3 = 786) THEN "respiratory" '
                   'WHEN (diag_3 BETWEEN 520 AND 579) OR (diag_3 = 787) THEN "digestive" '
                   'WHEN (diag_3 BETWEEN 250 AND 251) THEN "diabetes" '
                   'WHEN (diag_3 BETWEEN 800 AND 999) THEN "injury" '
                   'WHEN (diag_3 BETWEEN 710 AND 739) THEN "musculoskeletal" '
                   'WHEN (diag_3 BETWEEN 580 AND 629) OR (diag_3 = 788) THEN "genitourinary" '
                   'WHEN (diag_3 BETWEEN 140 AND 239) THEN "neoplasms" '
                   'ELSE "other" '
               'END AS diag_3c '
               'FROM diabetes;')

IntegrityError: (sqlite3.IntegrityError) UNIQUE constraint failed: cleaned_columns.id
[SQL: INSERT INTO "cleaned_columns" SELECT encounter_id AS id, CASE WHEN (diag_1 BETWEEN 390 AND 459) OR (diag_1 = 785) THEN "circulatory" WHEN (diag_1 BETWEEN 460 AND 519) OR (diag_1 = 786) THEN "respiratory" WHEN (diag_1 BETWEEN 520 AND 579) OR (diag_1 = 787) THEN "digestive" WHEN (diag_1 BETWEEN 250 AND 251) THEN "diabetes" WHEN (diag_1 BETWEEN 800 AND 999) THEN "injury" WHEN (diag_1 BETWEEN 710 AND 739) THEN "musculoskeletal" WHEN (diag_1 BETWEEN 580 AND 629) OR (diag_1 = 788) THEN "genitourinary" WHEN (diag_1 BETWEEN 140 AND 239) THEN "neoplasms" ELSE "other" END AS diag_1c, CASE WHEN (diag_2 BETWEEN 390 AND 459) OR (diag_2 = 785) THEN "circulatory" WHEN (diag_2 BETWEEN 460 AND 519) OR (diag_2 = 786) THEN "respiratory" WHEN (diag_2 BETWEEN 520 AND 579) OR (diag_2 = 787) THEN "digestive" WHEN (diag_2 BETWEEN 250 AND 251) THEN "diabetes" WHEN (diag_2 BETWEEN 800 AND 999) THEN "injury" WHEN (diag_2 BETWEEN 710 AND 739) THEN "musculoskeletal" WHEN (diag_2 BETWEEN 580 AND 629) OR (diag_2 = 788) THEN "genitourinary" WHEN (diag_2 BETWEEN 140 AND 239) THEN "neoplasms" ELSE "other" END AS diag_2c, CASE WHEN (diag_3 BETWEEN 390 AND 459) OR (diag_3 = 785) THEN "circulatory" WHEN (diag_3 BETWEEN 460 AND 519) OR (diag_3 = 786) THEN "respiratory" WHEN (diag_3 BETWEEN 520 AND 579) OR (diag_3 = 787) THEN "digestive" WHEN (diag_3 BETWEEN 250 AND 251) THEN "diabetes" WHEN (diag_3 BETWEEN 800 AND 999) THEN "injury" WHEN (diag_3 BETWEEN 710 AND 739) THEN "musculoskeletal" WHEN (diag_3 BETWEEN 580 AND 629) OR (diag_3 = 788) THEN "genitourinary" WHEN (diag_3 BETWEEN 140 AND 239) THEN "neoplasms" ELSE "other" END AS diag_3c FROM diabetes;]
(Background on this error at: https://sqlalche.me/e/14/gkpj)

In [13]:
# Check data
pd.read_sql('SELECT * FROM cleaned_columns LIMIT 10', conn)

Unnamed: 0,id,diag_1c,diag_2c,diag_3c
0,236316,circulatory,circulatory,circulatory
1,955884,circulatory,circulatory,other
2,1257282,genitourinary,diabetes,circulatory
3,1270524,circulatory,circulatory,respiratory
4,1455252,circulatory,circulatory,circulatory
5,1810752,circulatory,diabetes,diabetes
6,1881372,circulatory,circulatory,respiratory
7,1968528,circulatory,circulatory,diabetes
8,2092362,diabetes,other,genitourinary
9,2095932,diabetes,circulatory,other


In [14]:
# Join diabetes and cleaned_columns tables
diabetes_df = pd.read_sql('SELECT * FROM diabetes JOIN cleaned_columns ON diabetes.encounter_id = cleaned_columns.id', conn)

In [15]:
diabetes_df.dtypes

encounter_id                 int64
race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone       

In [16]:
# Close SQLite connection
conn.close()

In [17]:
# Drop the non-beneficial ID and diag columns, 'encounter_id', 'id', 'diag_1', 'diag_2', and 'diag_3'
diabetes_2_df = diabetes_df.drop(['encounter_id', 'id', 'diag_1', 'diag_2', 'diag_3'],1)

  


In [18]:
# Check data
diabetes_2_df.dtypes

race                        object
gender                      object
age                         object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
glipizide                   object
glyburide                   object
tolbutamide                 object
pioglitazone                object
rosiglitazone               object
acarbose                    object
miglitol                    object
tolazamide                  object
insulin             

In [20]:
# # Generate our categorical variable list
diabetes_2_df_cat = diabetes_2_df.dtypes[diabetes_2_df.dtypes == "object"].index.tolist()
diabetes_2_df_cat

['race',
 'gender',
 'age',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'tolazamide',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'change',
 'diabetesMed',
 'readmitted',
 'diag_1c',
 'diag_2c',
 'diag_3c']

In [21]:

#OneHot encode the categorical data. 
from sklearn.preprocessing import OneHotEncoder

# Create a OneHotEncoder instance
enc = OneHotEncoder(sparse=False)

# Fit and transform the OneHotEncoder using the categorical variable list
encode_df = pd.DataFrame(enc.fit_transform(diabetes_2_df[diabetes_2_df_cat]))

# Add the encoded variable names to the DataFrame
encode_df.columns = enc.get_feature_names_out(diabetes_2_df_cat)
encode_df.head()

Unnamed: 0,race_AfricanAmerican,race_Asian,race_Caucasian,race_Hispanic,race_Other,gender_Female,gender_Male,age_[0-10),age_[10-20),age_[20-30),...,diag_2c_respiratory,diag_3c_circulatory,diag_3c_diabetes,diag_3c_digestive,diag_3c_genitourinary,diag_3c_injury,diag_3c_musculoskeletal,diag_3c_neoplasms,diag_3c_other,diag_3c_respiratory
0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
# # Merge one-hot encoded features and drop the originals
diabetes_2_df = diabetes_2_df.merge(encode_df,left_index=True, right_index=True)
diabetes_2_df = diabetes_2_df.drop(diabetes_2_df_cat,1)
diabetes_2_df.head()

  This is separate from the ipykernel package so we can avoid doing imports until


Unnamed: 0,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,...,diag_2c_respiratory,diag_3c_circulatory,diag_3c_diabetes,diag_3c_digestive,diag_3c_genitourinary,diag_3c_injury,diag_3c_musculoskeletal,diag_3c_neoplasms,diag_3c_other,diag_3c_respiratory
0,1,3,7,6,64,3,18,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,1,3,7,5,34,0,17,0,0,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
2,1,1,7,2,53,0,6,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,1,2,7,1,59,0,12,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,1,1,7,3,34,0,11,0,0,0,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
#remove target from features data
y = diabetes_2_df['A1Cresult_high']
X = diabetes_2_df.drop(columns=["A1Cresult_high","A1Cresult_low"])
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.705


In [25]:
#Create new dataframe with top only 8 features
top_8_df = diabetes_2_df[['num_lab_procedures','num_medications','time_in_hospital','num_procedures','number_diagnoses','discharge_disposition_id','admission_type_id','admission_source_id']] 
top_8_df.head(20)

Unnamed: 0,num_lab_procedures,num_medications,time_in_hospital,num_procedures,number_diagnoses,discharge_disposition_id,admission_type_id,admission_source_id
0,64,18,6,3,7,3,1,7
1,34,17,5,0,7,3,1,7
2,53,6,2,0,3,1,1,7
3,59,12,1,0,7,2,1,7
4,34,11,3,0,8,1,1,7
5,78,19,14,1,8,3,1,7
6,65,19,4,2,7,2,1,7
7,56,24,10,2,9,25,6,1
8,88,16,11,1,9,25,6,7
9,62,21,8,0,9,25,6,7


In [26]:
#remove target from features data
y = diabetes_2_df['A1Cresult_high']
X = top_8_df
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [27]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.661


In [28]:
importances = rf_model.feature_importances_

In [29]:
# List the features sorted in descending order by feature importance
importances = rf_model.feature_importances_
importances
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.31764033064813757, 'num_lab_procedures'),
 (0.24568759084847203, 'num_medications'),
 (0.15557628673490595, 'time_in_hospital'),
 (0.08200127908277177, 'num_procedures'),
 (0.0609788534619337, 'discharge_disposition_id'),
 (0.056156345651080915, 'number_diagnoses'),
 (0.04579760909909611, 'admission_type_id'),
 (0.036161704473602, 'admission_source_id')]

In [30]:
#Trying PCA on original data 
# //95% of variance
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.95)
pca.fit(diabetes_2_df)
reduced = pca.transform(diabetes_2_df)

In [31]:
#Call into dataframe 
transform_pca = pd.DataFrame(reduced)
transform_pca

Unnamed: 0,0,1,2,3,4
0,9.571776,-0.565898,-0.545421,0.673425,0.764878
1,-19.961442,4.701161,-0.726394,1.936615,1.164190
2,-4.140112,-10.897443,-1.703849,0.556176,-0.592887
3,2.993498,-6.365386,-1.309386,1.204356,-2.892209
4,-21.394756,-1.469669,-2.240689,1.829517,0.367222
...,...,...,...,...,...
16188,-24.489013,18.120871,1.131322,-2.217557,-3.790786
16189,32.819944,43.769514,-1.340973,-2.933543,-2.055186
16190,-0.636850,17.117703,-3.885378,1.739019,1.912936
16191,20.720805,6.611781,-2.780556,-0.444794,7.025070


In [32]:
#Trying PCA on original data 
# //99% of variance
from sklearn.decomposition import PCA
pca = PCA(n_components = 0.99)
pca.fit(diabetes_2_df)
reduced_2 = pca.transform(diabetes_2_df)

In [33]:
#Call into dataframe 
pd.DataFrame(reduced_2)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
0,9.571776,-0.565898,-0.545421,0.673425,0.764878,-1.029208,1.102480,-1.349393,0.078296,-0.489281,0.621829,-0.215455,0.666302,-0.631057,1.290230,-1.294395,-0.083483,0.136756,0.303367,0.079376
1,-19.961442,4.701161,-0.726394,1.936615,1.164190,0.155666,-1.634472,-0.415685,-0.076701,0.455420,0.451711,-0.500498,-0.711043,-0.962595,-0.204869,0.136145,-0.024247,-0.307537,-0.931633,1.121510
2,-4.140112,-10.897443,-1.703849,0.556176,-0.592887,-3.378896,-1.533734,-1.082999,0.392624,-0.216587,0.749215,-0.063358,-0.688480,0.000919,-0.089839,-0.335842,0.366212,0.440547,-0.907121,-0.718245
3,2.993498,-6.365386,-1.309386,1.204356,-2.892209,0.028069,-0.596524,-0.898729,-0.510989,-0.381075,-0.713220,-0.311498,0.819420,0.326787,0.770375,0.452860,1.189121,-0.587995,-0.055774,0.544943
4,-21.394756,-1.469669,-2.240689,1.829517,0.367222,1.229772,-0.278579,-0.480601,-0.626505,-0.364016,-1.793581,0.114869,-0.593812,-1.164817,0.784702,0.380913,-0.019435,-0.077944,-0.578354,-0.043829
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16188,-24.489013,18.120871,1.131322,-2.217557,-3.790786,0.860282,-0.803168,1.117019,-1.298711,-0.082916,0.319795,1.082111,-0.569522,0.524846,1.319951,-0.049328,0.431796,-0.479462,0.485148,0.417599
16189,32.819944,43.769514,-1.340973,-2.933543,-2.055186,4.260477,1.748860,0.694988,-2.134273,-0.602926,-0.710708,0.164640,0.781103,0.735098,0.359271,0.169038,0.051549,0.352423,-0.482140,0.524964
16190,-0.636850,17.117703,-3.885378,1.739019,1.912936,0.556847,-1.119528,-0.696689,-0.676600,-0.439679,0.198809,-0.046533,-0.707627,-0.723394,-0.637956,0.304579,0.244045,-0.675436,0.118181,-0.009918
16191,20.720805,6.611781,-2.780556,-0.444794,7.025070,-0.117233,2.975402,-1.333104,0.590848,0.032287,1.031418,1.120440,-0.858963,-0.659213,-0.635234,0.336577,-0.263421,0.312530,-0.208931,-0.233231


In [34]:
#running Random Forest with the top 5 components from PCA
#remove target from features data
y = diabetes_2_df['A1Cresult_high']
X = transform_pca
# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler = scaler.fit(X_train)

# Scale the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [35]:
# Create a random forest classifier.
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.661


In [36]:
# Define the model - deep neural net
number_input_features = len(X_train_scaled[0])
hidden_nodes_layer1 = 24
hidden_nodes_layer2 = 12

nn = tf.keras.models.Sequential()

# First hidden layer
nn.add(
    tf.keras.layers.Dense(units=hidden_nodes_layer1, input_dim=number_input_features, activation="relu")
)

# Second hidden layer
nn.add(tf.keras.layers.Dense(units=hidden_nodes_layer2, activation="relu"))


# Output layer
nn.add(tf.keras.layers.Dense(units=1, activation="sigmoid"))

# Compile the Sequential model together and customize metrics
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Train the model
fit_model = nn.fit(X_train_scaled, y_train, epochs=50)

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

2022-02-10 19:36:10.239573: I tensorflow/core/platform/cpu_feature_guard.cc:151] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
127/127 - 0s - loss: 0.6127 - accuracy: 0.7004 - 162ms/epoch - 1ms/step
Loss: 0.6127306818962097, Accuracy: 0.7004198431968689


In [37]:
#
# Import checkpoint dependencies
import os
from tensorflow.keras.callbacks import ModelCheckpoint

# Define the checkpoint path and filenames
os.makedirs("checkpoints/",exist_ok=True)
checkpoint_path = "checkpoints/weights.{epoch:02d}.hdf5"

In [38]:
# Compile the model
nn.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])

# Create a callback that saves the model's weights every epoch
cp_callback = ModelCheckpoint(
    filepath=checkpoint_path,
    verbose=1,
    save_weights_only=True,
    save_freq='epoch')

# Train the model
fit_model = nn.fit(X_train_scaled,y_train,epochs=100,callbacks=[cp_callback])

# Evaluate the model using the test data
model_loss, model_accuracy = nn.evaluate(X_test_scaled,y_test,verbose=2)
print(f"Loss: {model_loss}, Accuracy: {model_accuracy}")

Epoch 1/100
Epoch 00001: saving model to checkpoints/weights.01.hdf5
Epoch 2/100
Epoch 00002: saving model to checkpoints/weights.02.hdf5
Epoch 3/100
Epoch 00003: saving model to checkpoints/weights.03.hdf5
Epoch 4/100
Epoch 00004: saving model to checkpoints/weights.04.hdf5
Epoch 5/100
Epoch 00005: saving model to checkpoints/weights.05.hdf5
Epoch 6/100
Epoch 00006: saving model to checkpoints/weights.06.hdf5
Epoch 7/100
Epoch 00007: saving model to checkpoints/weights.07.hdf5
Epoch 8/100
Epoch 00008: saving model to checkpoints/weights.08.hdf5
Epoch 9/100
Epoch 00009: saving model to checkpoints/weights.09.hdf5
Epoch 10/100
Epoch 00010: saving model to checkpoints/weights.10.hdf5
Epoch 11/100
Epoch 00011: saving model to checkpoints/weights.11.hdf5
Epoch 12/100
Epoch 00012: saving model to checkpoints/weights.12.hdf5
Epoch 13/100
Epoch 00013: saving model to checkpoints/weights.13.hdf5
Epoch 14/100
Epoch 00014: saving model to checkpoints/weights.14.hdf5
Epoch 15/100
Epoch 00015: sav

Epoch 00067: saving model to checkpoints/weights.67.hdf5
Epoch 68/100
Epoch 00068: saving model to checkpoints/weights.68.hdf5
Epoch 69/100
Epoch 00069: saving model to checkpoints/weights.69.hdf5
Epoch 70/100
Epoch 00070: saving model to checkpoints/weights.70.hdf5
Epoch 71/100
Epoch 00071: saving model to checkpoints/weights.71.hdf5
Epoch 72/100
Epoch 00072: saving model to checkpoints/weights.72.hdf5
Epoch 73/100
Epoch 00073: saving model to checkpoints/weights.73.hdf5
Epoch 74/100
Epoch 00074: saving model to checkpoints/weights.74.hdf5
Epoch 75/100
Epoch 00075: saving model to checkpoints/weights.75.hdf5
Epoch 76/100
Epoch 00076: saving model to checkpoints/weights.76.hdf5
Epoch 77/100
Epoch 00077: saving model to checkpoints/weights.77.hdf5
Epoch 78/100
Epoch 00078: saving model to checkpoints/weights.78.hdf5
Epoch 79/100
Epoch 00079: saving model to checkpoints/weights.79.hdf5
Epoch 80/100
Epoch 00080: saving model to checkpoints/weights.80.hdf5
Epoch 81/100
Epoch 00081: saving 

In [None]:
# This should have merged like 18 times ago 