# Machine Learning Diabetes Classification

## Read csv and perform basic data cleaning

In [6]:
# Install zipfile36 if you haven't already
#!pip install zipfile36

In [42]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import tensorflow as tf
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [47]:
# Create dataframe
z = urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip')
myzip = ZipFile(BytesIO(z.read())).extract('dataset_diabetes/diabetic_data.csv')
df = pd.read_csv(myzip)
df.head(5)
target = ['A1Cresult']

In [48]:
# Drop the non-beneficial ID columns, 'encounter_id' and 'patient_nbr'
df = df.drop(['encounter_id','patient_nbr'],1)

# Drop mostly empty columns, 'weight', 'payer_code', 'max_glu_serum', and 'medical_specialty'
df = df.drop(['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'],1)

# Replace '?' values to nulls
df.replace({'?': np.nan}, inplace=True)

# Drop the null rows
df = df.dropna()

# Drop rows where A1c status is 'None'
df = df[df.A1Cresult != 'None']

  df = df.drop(['encounter_id','patient_nbr'],1)
  df = df.drop(['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'],1)


In [49]:
# Convert the target column values to normal and high based on their values
x = {'Norm': 'low'}   
df = df.replace(x)
x = dict.fromkeys(['>7', '>8'], 'high')    
df = df.replace(x)
df.reset_index(inplace=True, drop=True)

In [50]:
df.info

<bound method DataFrame.info of                   race  gender      age  admission_type_id  \
0            Caucasian    Male  [80-90)                  1   
1            Caucasian  Female  [70-80)                  1   
2                Other  Female  [50-60)                  1   
3            Caucasian    Male  [60-70)                  1   
4            Caucasian  Female  [80-90)                  1   
...                ...     ...      ...                ...   
16188        Caucasian  Female  [70-80)                  3   
16189        Caucasian    Male  [70-80)                  3   
16190        Caucasian  Female  [70-80)                  1   
16191            Other  Female  [40-50)                  1   
16192  AfricanAmerican    Male  [70-80)                  1   

       discharge_disposition_id  admission_source_id  time_in_hospital  \
0                             3                    7                 6   
1                             3                    7                 5   
2