In [440]:
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import mode


In [392]:
diabetic_df =pd.read_csv("diabetic_data.csv")

In [393]:
#lets take a peek at the first 20 rows of data
peek = diabetic_df.head(20)
print(peek)

    encounter_id  patient_nbr             race  gender       age weight  admission_type_id  \
0        2278392      8222157        Caucasian  Female    [0-10)      ?                  6   
1         149190     55629189        Caucasian  Female   [10-20)      ?                  1   
2          64410     86047875  AfricanAmerican  Female   [20-30)      ?                  1   
3         500364     82442376        Caucasian    Male   [30-40)      ?                  1   
4          16680     42519267        Caucasian    Male   [40-50)      ?                  1   
5          35754     82637451        Caucasian    Male   [50-60)      ?                  2   
6          55842     84259809        Caucasian    Male   [60-70)      ?                  3   
7          63768    114882984        Caucasian    Male   [70-80)      ?                  1   
8          12522     48330783        Caucasian  Female   [80-90)      ?                  2   
9          15738     63555939        Caucasian  Female  [90-

In [394]:
#lets look at the dimension of the data: 50 columns, 101766 rows
shape = diabetic_df.shape
print(shape)


(101766, 50)


In [395]:
#lets look at the datatypes of each column
types = diabetic_df.dtypes
print(types)

encounter_id                 int64
patient_nbr                  int64
race                        object
gender                      object
age                         object
weight                      object
admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
payer_code                  object
medical_specialty           object
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
diag_1                      object
diag_2                      object
diag_3                      object
number_diagnoses             int64
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride         

In [396]:
#Now let us dive into other variables and see.
#Let us first start with getting the count of different data types.
dtype_df = diabetic_df.dtypes.reset_index()
dtype_df.columns = ["Count", "Column Type"]
dtype_df.groupby("Column Type").aggregate('count').reset_index()

Unnamed: 0,Column Type,Count
0,int64,13
1,object,37


In [397]:
#What is the distribution of categorical features?
pd.set_option('display.width', 100)
pd.set_option('precision', 3)
description = diabetic_df.describe(include=['O'])
print(description)



             race  gender      age  weight payer_code medical_specialty  diag_1  diag_2  diag_3  \
count      101766  101766   101766  101766     101766            101766  101766  101766  101766   
unique          6       3       10      10         18                73     717     749     790   
top     Caucasian  Female  [70-80)       ?          ?                 ?     428     276     250   
freq        76099   54708    26068   98569      40256             49949    6862    6752   11555   

       max_glu_serum    ...     citoglipton insulin glyburide-metformin glipizide-metformin  \
count         101766    ...          101766  101766              101766              101766   
unique             4    ...               1       4                   4                   2   
top             None    ...              No      No                  No                  No   
freq           96420    ...          101766   47383              101060              101753   

       glimepiride-pioglitaz

In [398]:
#What is the distribution of numerical features?
pd.set_option('display.width', 100)
pd.set_option('precision', 3)
description = diabetic_df.describe()
print(description)

       encounter_id  patient_nbr  admission_type_id  discharge_disposition_id  \
count     1.018e+05    1.018e+05         101766.000                101766.000   
mean      1.652e+08    5.433e+07              2.024                     3.716   
std       1.026e+08    3.870e+07              1.445                     5.280   
min       1.252e+04    1.350e+02              1.000                     1.000   
25%       8.496e+07    2.341e+07              1.000                     1.000   
50%       1.524e+08    4.551e+07              1.000                     1.000   
75%       2.303e+08    8.755e+07              3.000                     4.000   
max       4.439e+08    1.895e+08              8.000                    28.000   

       admission_source_id  time_in_hospital  num_lab_procedures  num_procedures  num_medications  \
count           101766.000        101766.000          101766.000      101766.000       101766.000   
mean                 5.754             4.396              43.096    

In [399]:
#The nominal variables such as diagnosis, race, 
#medical specialty, and discharge disposition had missing values.
#These missing values need to be handled prior to being fed into the model as they may introduce instability and bias in models

In [400]:
#The weight contains a lot of missing values. lets see the percentage of missing values
#
(sum(diabetic_df["weight"]=="?")/len(df))*100


96.858479256333155

In [401]:
(sum(diabetic_df["payer_code"]=="?")/len(diabetic_df))*100

39.557416032859699

In [402]:
(sum(diabetic_df["medical_specialty"]=="?")/len(diabetic_df))*100   

49.08220820313268

In [403]:
#The way I deal with missing values is to delete Weight and payer code, 
#since both features have more than 50% missing values
#and they are not relevant to classification

diabetic_df = diabetic_df.drop(["weight","payer_code","medical_specialty"], axis=1)

In [404]:
diabetic_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,31,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),3,1,2,4,70,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),1,1,7,5,73,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),2,1,4,13,68,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),3,3,4,12,33,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [405]:
#replace all questions marks with null values
diabetic_df=diabetic_df.replace('?', np.nan)

In [365]:
diabetic_df

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,...,citoglipton,insulin,glyburide-metformin,glipizide-metformin,glimepiride-pioglitazone,metformin-rosiglitazone,metformin-pioglitazone,change,diabetesMed,readmitted
0,2278392,8222157,Caucasian,Female,[0-10),6,25,1,1,41,...,No,No,No,No,No,No,No,No,No,NO
1,149190,55629189,Caucasian,Female,[10-20),1,1,7,3,59,...,No,Up,No,No,No,No,No,Ch,Yes,>30
2,64410,86047875,AfricanAmerican,Female,[20-30),1,1,7,2,11,...,No,No,No,No,No,No,No,No,Yes,NO
3,500364,82442376,Caucasian,Male,[30-40),1,1,7,2,44,...,No,Up,No,No,No,No,No,Ch,Yes,NO
4,16680,42519267,Caucasian,Male,[40-50),1,1,7,1,51,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
5,35754,82637451,Caucasian,Male,[50-60),2,1,2,3,31,...,No,Steady,No,No,No,No,No,No,Yes,>30
6,55842,84259809,Caucasian,Male,[60-70),3,1,2,4,70,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
7,63768,114882984,Caucasian,Male,[70-80),1,1,7,5,73,...,No,No,No,No,No,No,No,No,Yes,>30
8,12522,48330783,Caucasian,Female,[80-90),2,1,4,13,68,...,No,Steady,No,No,No,No,No,Ch,Yes,NO
9,15738,63555939,Caucasian,Female,[90-100),3,3,4,12,33,...,No,Steady,No,No,No,No,No,Ch,Yes,NO


In [411]:
#check if we still have null values in dataset
diabetic_df.isnull().sum()


encounter_id                   0
patient_nbr                    0
race                        2273
gender                         0
age                            0
admission_type_id              0
discharge_disposition_id       0
admission_source_id            0
time_in_hospital               0
num_lab_procedures             0
num_procedures                 0
num_medications                0
number_outpatient              0
number_emergency               0
number_inpatient               0
diag_1                        21
diag_2                       358
diag_3                      1423
number_diagnoses               0
max_glu_serum                  0
A1Cresult                      0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazo

In [422]:
#the variables race had few percentages of missing values and they will be  imputed however
# we can not input missing for diag1,2,3 hence we'll drop diag observations with missing values.
diabetic_df = diabetic_df.dropna(subset = ['diag_1', 'diag_2', 'diag_3'])
diabetic_df.isnull().sum()

encounter_id                   0
patient_nbr                    0
race                        2191
gender                         0
age                            0
admission_type_id              0
discharge_disposition_id       0
admission_source_id            0
time_in_hospital               0
num_lab_procedures             0
num_procedures                 0
num_medications                0
number_outpatient              0
number_emergency               0
number_inpatient               0
diag_1                         0
diag_2                         0
diag_3                         0
number_diagnoses               0
max_glu_serum                  0
A1Cresult                      0
metformin                      0
repaglinide                    0
nateglinide                    0
chlorpropamide                 0
glimepiride                    0
acetohexamide                  0
glipizide                      0
glyburide                      0
tolbutamide                    0
pioglitazo

In [428]:
#Observations with invalid gender values will be deleted deleted.
c = diabetic_df[diabetic_df.gender != 'Unknown/Invalid']
diabetic_df.gender.unique()


array(['Female', 'Male'], dtype=object)

In [445]:
#the variables discharge disposition and race had few percentages of missing values and they will be  imputed however
# we can not input missing for diag1,2,3 hence we'll drop diag observations with missing values.
diabetic_df['race'].value_counts()
diabetic_df['race'].fillna('Caucasian', inplace=True)
# No more missing valus. No
diabetic_df.isnull().sum()


In [459]:
# LETS Convert the target variable to 2 binary numbers
diabetic_df.readmitted.value_counts()
diabetic_df.ix[(diabetic_df.readmitted == ">30")|(diabetic_df.readmitted=="<30"),"readmitted"]="YES"
diabetic_df.readmitted







       
    
   

    
        

1         YES
2          NO
3          NO
4          NO
5         YES
6          NO
7         YES
8          NO
9          NO
10        YES
11        YES
12        YES
13         NO
14        YES
15         NO
16        YES
17         NO
18        YES
19         NO
20         NO
21         NO
22         NO
23        YES
24         NO
25         NO
26         NO
27        YES
28        YES
29        YES
30         NO
         ... 
101736    YES
101737     NO
101738     NO
101739     NO
101740     NO
101741     NO
101742     NO
101743     NO
101744     NO
101745     NO
101746    YES
101747    YES
101748    YES
101749     NO
101750    YES
101751     NO
101752     NO
101753     NO
101754    YES
101755    YES
101756    YES
101757     NO
101758     NO
101759     NO
101760    YES
101761    YES
101762     NO
101763     NO
101764     NO
101765     NO
Name: readmitted, Length: 100241, dtype: object

In [431]:
def show_missing():
    missing = diabetic_df.columns[diabetic_df.isnull().any()].tolist()
    return missing




In [349]:
diabetic_df.columns[show_missing()].isnull().sum()

0

In [214]:
 diabetic_df.columns

Index(['encounter_id', 'patient_nbr', 'race', 'gender', 'age', 'weight', 'admission_type_id',
       'discharge_disposition_id', 'admission_source_id', 'time_in_hospital', 'payer_code',
       'medical_specialty', 'num_lab_procedures', 'num_procedures', 'num_medications',
       'number_outpatient', 'number_emergency', 'number_inpatient', 'diag_1', 'diag_2', 'diag_3',
       'number_diagnoses', 'max_glu_serum', 'A1Cresult', 'metformin', 'repaglinide', 'nateglinide',
       'chlorpropamide', 'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
       'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone', 'tolazamide',
       'examide', 'citoglipton', 'insulin', 'glyburide-metformin', 'glipizide-metformin',
       'glimepiride-pioglitazone', 'metformin-rosiglitazone', 'metformin-pioglitazone', 'change',
       'diabetesMed', 'readmitted'],
      dtype='object')

In [None]:
will