* The chronic kidney disease dataset contains both categorical and numeric features, but contains lots of missing values. The goal here is to predict who has chronic kidney disease given various blood indicators as features.</p>
* The dataset can be download from [UCI Machine Learning Repository](https://archive.ics.uci.edu/ml/datasets/chronic_kidney_disease)

## Library

In [30]:
# Pandas
import pandas as pd

# Numpy
import numpy as np

# Pipeline
from sklearn.pipeline import Pipeline

# DictVectorizer
from sklearn.feature_extraction import DictVectorizer

# XGB
import xgboost as xgb

# cross_val_score
from sklearn.model_selection import cross_val_score

## Load Data

In [2]:
# set display columns
pd.set_option('display.max_columns', 30)

In [3]:
# load data
df = pd.read_csv('datasets/chronic_kidney_disease.csv', header=None, 
                 names=['age', 'bp', 'sg', 'al', 'su', 'rbc', 'pc', 'pcc', 'ba', 'bgr', 'bu', 'sc', 'sod', 'pot', 
                           'hemo', 'pcv', 'wc', 'rc', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane', 'class'])

# head of df
df.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48,80,1.02,1,0,?,normal,notpresent,notpresent,121,36,1.2,?,?,15.4,44,7800,5.2,yes,yes,no,good,no,no,ckd
1,7,50,1.02,4,0,?,normal,notpresent,notpresent,?,18,0.8,?,?,11.3,38,6000,?,no,no,no,good,no,no,ckd
2,62,80,1.01,2,3,normal,normal,notpresent,notpresent,423,53,1.8,?,?,9.6,31,7500,?,no,yes,no,poor,no,yes,ckd
3,48,70,1.005,4,0,normal,abnormal,present,notpresent,117,56,3.8,111,2.5,11.2,32,6700,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80,1.01,2,0,normal,normal,notpresent,notpresent,106,26,1.4,?,?,11.6,35,7300,4.6,no,no,no,good,no,no,ckd
5,60,90,1.015,3,0,?,?,notpresent,notpresent,74,25,1.1,142,3.2,12.2,39,7800,4.4,yes,yes,no,good,yes,no,ckd
6,68,70,1.01,0,0,?,normal,notpresent,notpresent,100,54,24.0,104,4.0,12.4,36,?,?,no,no,no,good,no,no,ckd
7,24,?,1.015,2,4,normal,abnormal,notpresent,notpresent,410,31,1.1,?,?,12.4,44,6900,5,no,yes,no,good,yes,no,ckd
8,52,100,1.015,3,0,normal,abnormal,present,notpresent,138,60,1.9,?,?,10.8,33,9600,4.0,yes,yes,no,good,no,yes,ckd
9,53,90,1.02,2,0,abnormal,abnormal,present,notpresent,70,107,7.2,114,3.7,9.5,29,12100,3.7,yes,yes,no,poor,no,yes,ckd


Columns explain:
* age		-	age	
* bp		-	blood pressure
* sg		-	specific gravity
* al		-   albumin
* su		-	sugar
* rbc		-	red blood cells
* pc		-	pus cell
* pcc		-	pus cell clumps
* ba		-	bacteria
* bgr		-	blood glucose random
* bu		-	blood urea
* sc		-	serum creatinine
* sod		-	sodium
* pot		-	potassium
* hemo	    -	hemoglobin
* pcv		-	packed cell volume
* wc		-	white blood cell count
* rc		-	red blood cell count
* htn		-	hypertension
* dm		-	diabetes mellitus
* cad		-	coronary artery disease
* appet	    -	appetite
* pe		-	pedal edema
* ane		-	anemia
* class	    -	class	

## Exporatory Data Analysis

In [4]:
# df info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      400 non-null object
bp       400 non-null object
sg       400 non-null object
al       400 non-null object
su       400 non-null object
rbc      400 non-null object
pc       400 non-null object
pcc      400 non-null object
ba       400 non-null object
bgr      400 non-null object
bu       400 non-null object
sc       400 non-null object
sod      400 non-null object
pot      400 non-null object
hemo     400 non-null object
pcv      400 non-null object
wc       400 non-null object
rc       400 non-null object
htn      400 non-null object
dm       400 non-null object
cad      400 non-null object
appet    400 non-null object
pe       400 non-null object
ane      400 non-null object
class    400 non-null object
dtypes: object(25)
memory usage: 78.2+ KB


* All columns have dtype object, no matter categorial columns or numerical columns.
* We can see there are some missing values in ?, but pandas does not regard it as missing values.

#### Replace ? with Null

In [5]:
# repace ? values
df.replace('?', np.nan, inplace=True)

df.head(10)

Unnamed: 0,age,bp,sg,al,su,rbc,pc,pcc,ba,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,htn,dm,cad,appet,pe,ane,class
0,48,80.0,1.02,1,0,,normal,notpresent,notpresent,121.0,36,1.2,,,15.4,44,7800.0,5.2,yes,yes,no,good,no,no,ckd
1,7,50.0,1.02,4,0,,normal,notpresent,notpresent,,18,0.8,,,11.3,38,6000.0,,no,no,no,good,no,no,ckd
2,62,80.0,1.01,2,3,normal,normal,notpresent,notpresent,423.0,53,1.8,,,9.6,31,7500.0,,no,yes,no,poor,no,yes,ckd
3,48,70.0,1.005,4,0,normal,abnormal,present,notpresent,117.0,56,3.8,111.0,2.5,11.2,32,6700.0,3.9,yes,no,no,poor,yes,yes,ckd
4,51,80.0,1.01,2,0,normal,normal,notpresent,notpresent,106.0,26,1.4,,,11.6,35,7300.0,4.6,no,no,no,good,no,no,ckd
5,60,90.0,1.015,3,0,,,notpresent,notpresent,74.0,25,1.1,142.0,3.2,12.2,39,7800.0,4.4,yes,yes,no,good,yes,no,ckd
6,68,70.0,1.01,0,0,,normal,notpresent,notpresent,100.0,54,24.0,104.0,4.0,12.4,36,,,no,no,no,good,no,no,ckd
7,24,,1.015,2,4,normal,abnormal,notpresent,notpresent,410.0,31,1.1,,,12.4,44,6900.0,5.0,no,yes,no,good,yes,no,ckd
8,52,100.0,1.015,3,0,normal,abnormal,present,notpresent,138.0,60,1.9,,,10.8,33,9600.0,4.0,yes,yes,no,good,no,yes,ckd
9,53,90.0,1.02,2,0,abnormal,abnormal,present,notpresent,70.0,107,7.2,114.0,3.7,9.5,29,12100.0,3.7,yes,yes,no,poor,no,yes,ckd


In [6]:
# df info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      391 non-null object
bp       388 non-null object
sg       353 non-null object
al       354 non-null object
su       351 non-null object
rbc      248 non-null object
pc       335 non-null object
pcc      396 non-null object
ba       396 non-null object
bgr      356 non-null object
bu       381 non-null object
sc       383 non-null object
sod      313 non-null object
pot      312 non-null object
hemo     348 non-null object
pcv      329 non-null object
wc       294 non-null object
rc       269 non-null object
htn      398 non-null object
dm       398 non-null object
cad      398 non-null object
appet    399 non-null object
pe       399 non-null object
ane      399 non-null object
class    400 non-null object
dtypes: object(25)
memory usage: 78.2+ KB


#### Convert to Numeric

In [7]:
# numerical columns
num_cols = ['age', 'bp', 'sg', 'al', 'su', 'bgr', 'bu', 'sc', 'sod', 'pot', 'hemo', 'pcv', 'wc', 'rc']

In [8]:
# categorical columns
cate_cols = df.columns.drop('class').drop(num_cols)
# display categorical columns
cate_cols

Index(['rbc', 'pc', 'pcc', 'ba', 'htn', 'dm', 'cad', 'appet', 'pe', 'ane'], dtype='object')

In [9]:
# convert numerical data 
df[num_cols] = df[num_cols].apply(pd.to_numeric, errors='coerce')

In [10]:
# df info
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 400 entries, 0 to 399
Data columns (total 25 columns):
age      391 non-null float64
bp       388 non-null float64
sg       353 non-null float64
al       354 non-null float64
su       351 non-null float64
rbc      248 non-null object
pc       335 non-null object
pcc      396 non-null object
ba       396 non-null object
bgr      356 non-null float64
bu       381 non-null float64
sc       383 non-null float64
sod      313 non-null float64
pot      312 non-null float64
hemo     348 non-null float64
pcv      329 non-null float64
wc       294 non-null float64
rc       269 non-null float64
htn      398 non-null object
dm       398 non-null object
cad      398 non-null object
appet    399 non-null object
pe       399 non-null object
ane      399 non-null object
class    400 non-null object
dtypes: float64(14), object(11)
memory usage: 78.2+ KB


In [11]:
# X and y
X = df.drop(columns=['class'])
y= df['class']

In [12]:
# Import necessary modules
from sklearn_pandas import DataFrameMapper
from sklearn_pandas import CategoricalImputer
from sklearn.preprocessing import Imputer

# Check number of nulls in each feature column
nulls_per_column = X.isnull().sum()
print(nulls_per_column)

# Create a boolean mask for categorical columns
categorical_feature_mask = X.dtypes == object

# Get list of categorical column names
categorical_columns = X.columns[categorical_feature_mask].tolist()

# Get list of non-categorical column names
non_categorical_columns = X.columns[~categorical_feature_mask].tolist()

# Apply numeric imputer
numeric_imputation_mapper = DataFrameMapper(
                                            [([numeric_feature],Imputer(strategy="median")) for numeric_feature in non_categorical_columns],
                                            input_df=True,
                                            df_out=True
                                           )

# Apply categorical imputer
categorical_imputation_mapper = DataFrameMapper(
                                                [(category_feature, CategoricalImputer()) for category_feature in categorical_columns],
                                                input_df=True,
                                                df_out=True
                                               )

age        9
bp        12
sg        47
al        46
su        49
rbc      152
pc        65
pcc        4
ba         4
bgr       44
bu        19
sc        17
sod       87
pot       88
hemo      52
pcv       71
wc       106
rc       131
htn        2
dm         2
cad        2
appet      1
pe         1
ane        1
dtype: int64


In [13]:
# Import FeatureUnion
from sklearn.pipeline import FeatureUnion

# Combine the numeric and categorical transformations
numeric_categorical_union = FeatureUnion([
                                          ("num_mapper", numeric_imputation_mapper),
                                          ("cat_mapper", categorical_imputation_mapper)
                                         ])

In [14]:
# Create full pipeline
pipeline = Pipeline([
                     ("featureunion", numeric_categorical_union),
                     ("vectorizer", DictVectorizer(sort=False)),
                     ("clf", xgb.XGBClassifier(max_depth=3))
                    ])

# Perform cross-validation
cross_val_scores = cross_val_score(pipeline, X, y, scoring="roc_auc", cv=3)

# Print avg. AUC
print("3-fold AUC: ", np.mean(cross_val_scores))

NameError: name 'cross_val_score' is not defined

In [15]:
# X and y
X = df.drop(columns=['class'])
y= df['class']
y_df = df[['class']]

In [16]:
X1 = numeric_imputation_mapper.fit_transform(X)

In [17]:
X2 = categorical_imputation_mapper.fit_transform(X)

In [18]:
X3 = pd.concat([X1, X2], axis=1)

In [19]:
X3.head()

Unnamed: 0,age,bp,sg,al,su,bgr,bu,sc,sod,pot,hemo,pcv,wc,rc,rbc,pc,pcc,ba,htn,dm,cad,appet,pe,ane
0,48.0,80.0,1.02,1.0,0.0,121.0,36.0,1.2,138.0,4.4,15.4,44.0,7800.0,5.2,normal,normal,notpresent,notpresent,yes,yes,no,good,no,no
1,7.0,50.0,1.02,4.0,0.0,121.0,18.0,0.8,138.0,4.4,11.3,38.0,6000.0,4.8,normal,normal,notpresent,notpresent,no,no,no,good,no,no
2,62.0,80.0,1.01,2.0,3.0,423.0,53.0,1.8,138.0,4.4,9.6,31.0,7500.0,4.8,normal,normal,notpresent,notpresent,no,yes,no,poor,no,yes
3,48.0,70.0,1.005,4.0,0.0,117.0,56.0,3.8,111.0,2.5,11.2,32.0,6700.0,3.9,normal,abnormal,present,notpresent,yes,no,no,poor,yes,yes
4,51.0,80.0,1.01,2.0,0.0,106.0,26.0,1.4,138.0,4.4,11.6,35.0,7300.0,4.6,normal,normal,notpresent,notpresent,no,no,no,good,no,no


In [20]:
X3_dict = X3.to_dict('records')

In [21]:
X3_dict

[{'age': 48.0,
  'bp': 80.0,
  'sg': 1.02,
  'al': 1.0,
  'su': 0.0,
  'bgr': 121.0,
  'bu': 36.0,
  'sc': 1.2,
  'sod': 138.0,
  'pot': 4.4,
  'hemo': 15.4,
  'pcv': 44.0,
  'wc': 7800.0,
  'rc': 5.2,
  'rbc': 'normal',
  'pc': 'normal',
  'pcc': 'notpresent',
  'ba': 'notpresent',
  'htn': 'yes',
  'dm': 'yes',
  'cad': 'no',
  'appet': 'good',
  'pe': 'no',
  'ane': 'no'},
 {'age': 7.0,
  'bp': 50.0,
  'sg': 1.02,
  'al': 4.0,
  'su': 0.0,
  'bgr': 121.0,
  'bu': 18.0,
  'sc': 0.8,
  'sod': 138.0,
  'pot': 4.4,
  'hemo': 11.3,
  'pcv': 38.0,
  'wc': 6000.0,
  'rc': 4.8,
  'rbc': 'normal',
  'pc': 'normal',
  'pcc': 'notpresent',
  'ba': 'notpresent',
  'htn': 'no',
  'dm': 'no',
  'cad': 'no',
  'appet': 'good',
  'pe': 'no',
  'ane': 'no'},
 {'age': 62.0,
  'bp': 80.0,
  'sg': 1.01,
  'al': 2.0,
  'su': 3.0,
  'bgr': 423.0,
  'bu': 53.0,
  'sc': 1.8,
  'sod': 138.0,
  'pot': 4.4,
  'hemo': 9.6,
  'pcv': 31.0,
  'wc': 7500.0,
  'rc': 4.8,
  'rbc': 'normal',
  'pc': 'normal',
  'pcc'

In [22]:
y_dict = y_df.to_dict('records')

In [23]:
dv = DictVectorizer(sparse=False)

In [34]:
dv_y = DictVectorizer(sparse=False)

In [24]:
X3_encoded = dv.fit_transform(X3_dict)

In [25]:
X3_encoded[:5,:]

array([[4.800e+01, 1.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 1.210e+02, 8.000e+01, 3.600e+01, 1.000e+00,
        0.000e+00, 0.000e+00, 0.000e+00, 1.000e+00, 1.540e+01, 0.000e+00,
        1.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 4.400e+01,
        1.000e+00, 0.000e+00, 4.400e+00, 0.000e+00, 1.000e+00, 5.200e+00,
        1.200e+00, 1.020e+00, 1.380e+02, 0.000e+00, 7.800e+03],
       [7.000e+00, 4.000e+00, 1.000e+00, 0.000e+00, 1.000e+00, 0.000e+00,
        1.000e+00, 0.000e+00, 1.210e+02, 5.000e+01, 1.800e+01, 1.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.130e+01, 1.000e+00,
        0.000e+00, 0.000e+00, 1.000e+00, 1.000e+00, 0.000e+00, 3.800e+01,
        1.000e+00, 0.000e+00, 4.400e+00, 0.000e+00, 1.000e+00, 4.800e+00,
        8.000e-01, 1.020e+00, 1.380e+02, 0.000e+00, 6.000e+03],
       [6.200e+01, 2.000e+00, 0.000e+00, 1.000e+00, 0.000e+00, 1.000e+00,
        1.000e+00, 0.000e+00, 4.230e+02, 8.000e+01, 5.300e

In [26]:
dv.vocabulary_

{'age': 0,
 'bp': 9,
 'sg': 31,
 'al': 1,
 'su': 33,
 'bgr': 8,
 'bu': 10,
 'sc': 30,
 'sod': 32,
 'pot': 26,
 'hemo': 16,
 'pcv': 23,
 'wc': 34,
 'rc': 29,
 'rbc=normal': 28,
 'pc=normal': 20,
 'pcc=notpresent': 21,
 'ba=notpresent': 6,
 'htn=yes': 18,
 'dm=yes': 15,
 'cad=no': 11,
 'appet=good': 4,
 'pe=no': 24,
 'ane=no': 2,
 'htn=no': 17,
 'dm=no': 14,
 'appet=poor': 5,
 'ane=yes': 3,
 'pc=abnormal': 19,
 'pcc=present': 22,
 'pe=yes': 25,
 'rbc=abnormal': 27,
 'cad=yes': 12,
 'ba=present': 7,
 'dm= yes': 13}

In [35]:
y_encoded = dv_y.fit_transform(y_dict)

In [36]:
y_encoded

array([[1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.],
       [1., 0.

In [37]:
dv_y.vocabulary_

{'class=ckd': 0, 'class=notckd': 1}

In [38]:
y_list = y.apply(lambda x: 1 if x=='ckd' else 0)

In [43]:
y_list

0      1
1      1
2      1
3      1
4      1
5      1
6      1
7      1
8      1
9      1
10     1
11     1
12     1
13     1
14     1
15     1
16     1
17     1
18     1
19     1
20     1
21     1
22     1
23     1
24     1
25     1
26     1
27     1
28     1
29     1
      ..
370    0
371    0
372    0
373    0
374    0
375    0
376    0
377    0
378    0
379    0
380    0
381    0
382    0
383    0
384    0
385    0
386    0
387    0
388    0
389    0
390    0
391    0
392    0
393    0
394    0
395    0
396    0
397    0
398    0
399    0
Name: class, Length: 400, dtype: int64

In [27]:
xgb_cl = xgb.XGBClassifier(max_depth=3)

In [28]:
# Setup the pipeline steps: steps
steps = [("ohe_onestep", DictVectorizer(sparse=False)),
         ("xgb_model", xgb.XGBClassifier())]

# Create the pipeline: xgb_pipeline
xgb_pipeline = Pipeline(steps=steps)

# Fit the pipeline
xgb_pipeline.fit(X3_dict, pd.Series(y))

Pipeline(memory=None,
     steps=[('ohe_onestep', DictVectorizer(dtype=<class 'numpy.float64'>, separator='=', sort=True,
        sparse=False)), ('xgb_model', XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_c...
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))])

In [40]:
scores = cross_val_score(xgb_pipeline, X3_dict, y_list, cv=3, scoring='roc_auc')

In [41]:
scores

array([0.99928571, 0.99710843, 0.99975904])

In [42]:
# Print avg. AUC
print("3-fold AUC: ", np.mean(scores))

3-fold AUC:  0.9987177280550775
