# Machine Learning Diabetes Classification

## Read CSV and perform basic data cleaning

In [1]:
# Install zipfile36 if you haven't already.
# NOTE(review): this notebook imports ZipFile from the standard-library `zipfile`
# module, not from zipfile36, so this install looks unnecessary - confirm before enabling.
#!pip install zipfile36

In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import tensorflow as tf
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [3]:
# Create dataframe
# Download the zipped dataset, extract the CSV member into the current working
# directory, and load it with pandas. Both the HTTP response and the archive are
# opened with context managers so their handles are released once the cell finishes.
with urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip') as z:
    archive_bytes = BytesIO(z.read())

with ZipFile(archive_bytes) as archive:
    # extract() writes the member to disk and returns the path of the extracted file
    myzip = archive.extract('dataset_diabetes/diabetic_data.csv')

df = pd.read_csv(myzip)

# Name of the column used as the classification target
target = ['A1Cresult']

# Keep the preview as the last expression so the notebook actually renders it
df.head(5)

In [4]:
# Drop the non-beneficial ID columns, 'encounter_id' and 'patient_nbr'.
# `columns=` replaces the positional axis argument: passing `1` positionally is
# deprecated in pandas 1.x and raises TypeError from pandas 2.0 onwards.
df = df.drop(columns=['encounter_id', 'patient_nbr'])

# Drop mostly empty columns, 'weight', 'payer_code', 'max_glu_serum', and 'medical_specialty'
df = df.drop(columns=['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'])

# Replace the '?' and 'None' placeholder strings with nulls
# (returns a new frame instead of mutating in place)
df = df.replace({'?': np.nan, 'None': np.nan})

# Drop every row that still contains at least one null
df = df.dropna()

  df = df.drop(['encounter_id','patient_nbr'],1)
  df = df.drop(['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'],1)


In [5]:
# Collapse the target column's labels into two classes:
# 'Norm' becomes 'low'; '>7' and '>8' both become 'high'.
# The replacement is applied to 'A1Cresult' only, so identical strings that might
# occur in any other column are left untouched.
a1c_labels = {'Norm': 'low', '>7': 'high', '>8': 'high'}
df = df.assign(A1Cresult=df['A1Cresult'].replace(a1c_labels))

# Renumber the rows 0..n-1, discarding the old index
df = df.reset_index(drop=True)

In [6]:
# Count the distinct values found in each column of the frame.
unique_counts = df.nunique(axis=0)
unique_counts

race                          5
gender                        2
age                          10
admission_type_id             8
discharge_disposition_id     21
admission_source_id          15
time_in_hospital             14
num_lab_procedures          114
num_procedures                7
num_medications              67
number_outpatient            24
number_emergency             19
number_inpatient             18
diag_1                      490
diag_2                      486
diag_3                      539
number_diagnoses             12
A1Cresult                     2
metformin                     4
repaglinide                   4
nateglinide                   4
chlorpropamide                2
glimepiride                   4
acetohexamide                 1
glipizide                     4
glyburide                     4
tolbutamide                   2
pioglitazone                  4
rosiglitazone                 4
acarbose                      4
miglitol                      4
troglita

In [7]:
# Drop columns with only 1 value.
# `columns=` replaces the positional axis argument: passing `1` positionally is
# deprecated in pandas 1.x and raises TypeError from pandas 2.0 onwards.
single_value_columns = [
    'acetohexamide',
    'troglitazone',
    'examide',
    'citoglipton',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]
df = df.drop(columns=single_value_columns)

# Re-display the per-column distinct counts for the remaining columns
df.nunique()

  df = df.drop(['acetohexamide', 'troglitazone', 'examide', 'citoglipton','glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone'],1)


race                          5
gender                        2
age                          10
admission_type_id             8
discharge_disposition_id     21
admission_source_id          15
time_in_hospital             14
num_lab_procedures          114
num_procedures                7
num_medications              67
number_outpatient            24
number_emergency             19
number_inpatient             18
diag_1                      490
diag_2                      486
diag_3                      539
number_diagnoses             12
A1Cresult                     2
metformin                     4
repaglinide                   4
nateglinide                   4
chlorpropamide                2
glimepiride                   4
glipizide                     4
glyburide                     4
tolbutamide                   2
pioglitazone                  4
rosiglitazone                 4
acarbose                      4
miglitol                      4
tolazamide                    3
insulin 