# Machine Learning Diabetes Classification

## Read csv and perform basic data cleaning

In [1]:
# Install zipfile36 if you haven't already
!pip install zipfile36

Collecting zipfile36
  Downloading zipfile36-0.1.3-py3-none-any.whl (20 kB)
Installing collected packages: zipfile36
Successfully installed zipfile36-0.1.3


In [2]:
# Import our dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score
import pandas as pd
import numpy as np
import tensorflow as tf
from io import BytesIO
from zipfile import ZipFile
from urllib.request import urlopen

In [3]:
# Create dataframe: download the UCI diabetes archive, extract the CSV, and load it.
with urlopen('https://archive.ics.uci.edu/ml/machine-learning-databases/00296/dataset_diabetes.zip') as z:
    # Read the whole archive into memory while the connection is open;
    # leaving the `with` block closes the HTTP response.
    archive_bytes = BytesIO(z.read())

with ZipFile(archive_bytes) as archive:
    # extract() writes the member to disk (current working directory by default)
    # and returns the path of the extracted file.
    myzip = archive.extract('dataset_diabetes/diabetic_data.csv')

df = pd.read_csv(myzip)

# Name of the column the notebook treats as the target
target = ['A1Cresult']

# Preview the first rows; kept as the last expression so the notebook displays it
df.head(5)

In [4]:
# Drop the non-beneficial ID columns, 'encounter_id' and 'patient_nbr'.
# `columns=` is used instead of a positional axis argument, which newer pandas
# versions no longer accept.
df = df.drop(columns=['encounter_id', 'patient_nbr'])

# Drop mostly empty columns, 'weight', 'payer_code', 'max_glu_serum', and 'medical_specialty'
df = df.drop(columns=['weight', 'payer_code', 'max_glu_serum', 'medical_specialty'])

# Replace the '?' and 'None' placeholder strings with nulls
df = df.replace({'?': np.nan, 'None': np.nan})

# Drop the null rows (any row that still has at least one missing value)
df = df.dropna()

  
  """


In [5]:
# Convert the target column values to 'low' and 'high':
# 'Norm' becomes 'low'; '>7' and '>8' both become 'high'.
a1c_labels = {'Norm': 'low', '>7': 'high', '>8': 'high'}

# Apply the mapping to the target column only, so identical strings in any other
# column are never rewritten by accident. Values not in the mapping are left as-is.
df = df.assign(A1Cresult=df['A1Cresult'].replace(a1c_labels))

# Earlier row drops left gaps in the index; renumber it from 0 and discard the old one
df = df.reset_index(drop=True)

In [6]:
# Count the distinct values held by every column (axis=0 means one count per column).
df.nunique(axis=0)

race                          5
gender                        2
age                          10
admission_type_id             8
discharge_disposition_id     21
admission_source_id          15
time_in_hospital             14
num_lab_procedures          114
num_procedures                7
num_medications              67
number_outpatient            24
number_emergency             19
number_inpatient             18
diag_1                      490
diag_2                      486
diag_3                      539
number_diagnoses             12
A1Cresult                     2
metformin                     4
repaglinide                   4
nateglinide                   4
chlorpropamide                2
glimepiride                   4
acetohexamide                 1
glipizide                     4
glyburide                     4
tolbutamide                   2
pioglitazone                  4
rosiglitazone                 4
acarbose                      4
miglitol                      4
troglita

In [7]:
# Drop columns with only 1 value - they carry no information for a model.
# `columns=` is used instead of a positional axis argument, which newer pandas
# versions no longer accept.
single_value_columns = [
    'acetohexamide',
    'troglitazone',
    'examide',
    'citoglipton',
    'glimepiride-pioglitazone',
    'metformin-rosiglitazone',
    'metformin-pioglitazone',
]
df = df.drop(columns=single_value_columns)

# Re-check the per-column distinct counts after the drop
df.nunique()

  


race                          5
gender                        2
age                          10
admission_type_id             8
discharge_disposition_id     21
admission_source_id          15
time_in_hospital             14
num_lab_procedures          114
num_procedures                7
num_medications              67
number_outpatient            24
number_emergency             19
number_inpatient             18
diag_1                      490
diag_2                      486
diag_3                      539
number_diagnoses             12
A1Cresult                     2
metformin                     4
repaglinide                   4
nateglinide                   4
chlorpropamide                2
glimepiride                   4
glipizide                     4
glyburide                     4
tolbutamide                   2
pioglitazone                  4
rosiglitazone                 4
acarbose                      4
miglitol                      4
tolazamide                    3
insulin 

In [9]:
# Summarize the dataframe: column names, non-null counts, and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16193 entries, 0 to 16192
Data columns (total 37 columns):
 #   Column                    Non-Null Count  Dtype 
---  ------                    --------------  ----- 
 0   race                      16193 non-null  object
 1   gender                    16193 non-null  object
 2   age                       16193 non-null  object
 3   admission_type_id         16193 non-null  int64 
 4   discharge_disposition_id  16193 non-null  int64 
 5   admission_source_id       16193 non-null  int64 
 6   time_in_hospital          16193 non-null  int64 
 7   num_lab_procedures        16193 non-null  int64 
 8   num_procedures            16193 non-null  int64 
 9   num_medications           16193 non-null  int64 
 10  number_outpatient         16193 non-null  int64 
 11  number_emergency          16193 non-null  int64 
 12  number_inpatient          16193 non-null  int64 
 13  diag_1                    16193 non-null  object
 14  diag_2                

In [13]:
# Preview the first five rows of the dataframe.
df.head(5)

Unnamed: 0,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,...,rosiglitazone,acarbose,miglitol,tolazamide,insulin,glyburide-metformin,glipizide-metformin,change,diabetesMed,readmitted
0,Caucasian,Male,[80-90),1,3,7,6,64,3,18,...,No,No,No,No,No,No,No,Ch,Yes,NO
1,Caucasian,Female,[70-80),1,3,7,5,34,0,17,...,No,No,No,No,Up,No,No,Ch,Yes,>30
2,Other,Female,[50-60),1,1,7,2,53,0,6,...,No,No,No,No,Up,No,No,Ch,Yes,NO
3,Caucasian,Male,[60-70),1,2,7,1,59,0,12,...,No,No,No,No,Steady,No,No,No,Yes,NO
4,Caucasian,Female,[80-90),1,1,7,3,34,0,11,...,No,No,No,No,No,No,No,No,No,>30


In [11]:
# Build the list of categorical columns: every column whose dtype is "object",
# kept in the dataframe's column order.
df_cat = [column for column, dtype in df.dtypes.items() if dtype == "object"]
df_cat

['race',
 'gender',
 'age',
 'diag_1',
 'diag_2',
 'diag_3',
 'A1Cresult',
 'metformin',
 'repaglinide',
 'nateglinide',
 'chlorpropamide',
 'glimepiride',
 'glipizide',
 'glyburide',
 'tolbutamide',
 'pioglitazone',
 'rosiglitazone',
 'acarbose',
 'miglitol',
 'tolazamide',
 'insulin',
 'glyburide-metformin',
 'glipizide-metformin',
 'change',
 'diabetesMed',
 'readmitted']

In [14]:
# Count the distinct values in each categorical column.
df.loc[:, df_cat].nunique()

race                     5
gender                   2
age                     10
diag_1                 490
diag_2                 486
diag_3                 539
A1Cresult                2
metformin                4
repaglinide              4
nateglinide              4
chlorpropamide           2
glimepiride              4
glipizide                4
glyburide                4
tolbutamide              2
pioglitazone             4
rosiglitazone            4
acarbose                 4
miglitol                 4
tolazamide               3
insulin                  4
glyburide-metformin      4
glipizide-metformin      2
change                   2
diabetesMed              2
readmitted               3
dtype: int64

In [15]:
# Frequency of each diag_1 code, to judge whether rare codes need binning.
df['diag_1'].value_counts()

428    1144
414     976
786     853
410     806
486     531
       ... 
237       1
617       1
356       1
989       1
893       1
Name: diag_1, Length: 490, dtype: int64

In [16]:
# Frequency of each diag_2 code, to judge whether rare codes need binning.
df['diag_2'].value_counts()

276       1280
428        938
250        753
427        739
250.02     677
          ... 
316          1
E858         1
980          1
725          1
110          1
Name: diag_2, Length: 486, dtype: int64

In [17]:
# Frequency of each diag_3 code, to judge whether rare codes need binning.
df['diag_3'].value_counts()

250    1557
401    1229
276    1057
428     665
414     550
       ... 
550       1
V55       1
314       1
579       1
825       1
Name: diag_3, Length: 539, dtype: int64