## 1. Imports and loading data

In [21]:
import os

import numpy as np
import pandas as pd
from sklearn.decomposition import PCA as sklearn_pca
from sklearn.preprocessing import StandardScaler

# Display options: NumPy floats are printed with 4 decimals, pandas with 5.
np.set_printoptions(formatter={'float':'{:0.4f}'.format})
pd.set_option('display.precision', 5)

# Location of the diabetes CSV. Set the DIABETIC_DATA_CSV environment variable to
# point at a local copy of the file; the original absolute path is kept as the
# fallback so existing setups keep working unchanged.
filePath = os.environ.get(
    "DIABETIC_DATA_CSV",
    "C:/Chang/NEU/courses/cs6220 datamining/project/dataset_diabetes/diabetic_data.csv",
)
# Read dataset using pandas (comma-separated, first row holds the column names)
data = pd.read_csv(filePath, sep=',', header=0)

## 2. Preprocessing
This data set contains both numeric and nominal (categorical) features. Numeric features are standardized to zero mean and unit variance, while categorical features are one-hot encoded.

In [29]:
# Build the working frame without the columns that are not used as features:
# the identifiers encounter_id and patient_nbr, plus weight and payer_code,
# which the original notes describe as ~97% missing (not re-verified here).
df = data.drop(
    columns=[
        'encounter_id',
        'patient_nbr',
        'weight',
        'payer_code',
    ]
)

In [22]:
# TODO: Currently performed binary encoding. Should try data binning.
# Replace each diagnosis code with 1 when it contains '250' and 0 otherwise
# (presumably the ICD-9 diabetes family 250.xx -- confirm against the data dictionary).
# na=False counts a missing code as "no match"; without it str.contains yields NaN,
# which np.where treats as truthy and would encode as 1.
# regex=False matches '250' as a literal substring.
# NOTE: the string codes are overwritten in place, so this cell cannot be re-run
# without first re-creating df.
for diag_col in ['diag_1', 'diag_2', 'diag_3']:
    df[diag_col] = np.where(df[diag_col].str.contains('250', regex=False, na=False), 1, 0)

In [23]:
# Standardizing numeric data
# StandardScaler (imported with the other imports at the top of the notebook)
# rescales every selected column to zero mean and unit variance.
# numData keeps the unscaled selection; numData_std is the scaled result.
numData = df[['time_in_hospital', 'num_lab_procedures', 'num_procedures', 'num_medications', 
              'number_outpatient', 'number_emergency', 'number_inpatient', 'number_diagnoses']]
numData_std = StandardScaler().fit_transform(numData)

In [24]:
# Confirm that the numeric data are standardized (mean ~0 and std ~1 per column).
# The scaled values carry no labels, so re-attach the source column names and row
# index: the summary below is then readable, and the frame lines up with df
# row-for-row when it is written back.
df_numData = pd.DataFrame(numData_std, columns=numData.columns, index=numData.index)
df_numData.describe()

Unnamed: 0,0,1,2,3,4,5,6,7
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,2.35731e-14,3.24394e-16,4.23327e-15,-2.18372e-14,1.00082e-13,2.50927e-14,-4.3145e-14,1.28417e-13
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
min,-1.13765,-2.13963,-0.785398,-1.84827,-0.291461,-0.21262,-0.503276,-3.3216
25%,-0.802651,-0.614795,-0.785398,-0.74092,-0.291461,-0.21262,-0.503276,-0.735733
50%,-0.132655,0.0459666,-0.199162,-0.125726,-0.291461,-0.21262,-0.503276,0.298612
75%,0.537341,0.706728,0.387074,0.489467,-0.291461,-0.21262,0.288579,0.815784
max,3.21732,4.51881,2.73202,7.99483,32.8509,81.4667,16.1257,4.43599


In [25]:
# Overwrite the raw numeric columns of df with their standardized values.
# numData was selected with exactly these eight columns, in this order, so its
# column index is reused instead of repeating the list.
df[list(numData.columns)] = df_numData

In [26]:
# Apply one-hot encoding to categorical columns.
# get_dummies prefixes every indicator column with the name of its source column
# by default, which is exactly what the explicit prefix list spelled out, so only
# the column list is passed. The order of the list fixes the order of the
# generated indicator columns.
categorical_cols = [
    'race', 'gender', 'age',
    'admission_type_id', 'discharge_disposition_id', 'admission_source_id',
    'medical_specialty',
    'diag_1', 'diag_2', 'diag_3',
    'max_glu_serum', 'A1Cresult',
    'metformin', 'repaglinide', 'nateglinide', 'chlorpropamide',
    'glimepiride', 'acetohexamide', 'glipizide', 'glyburide', 'tolbutamide',
    'pioglitazone', 'rosiglitazone', 'acarbose', 'miglitol', 'troglitazone',
    'tolazamide', 'examide', 'citoglipton', 'insulin',
    'glyburide-metformin', 'glipizide-metformin', 'glimepiride-pioglitazone',
    'metformin-rosiglitazone', 'metformin-pioglitazone',
    'change', 'diabetesMed', 'readmitted',
]
oneHotData = pd.get_dummies(df, columns=categorical_cols)
oneHotData.describe()

Unnamed: 0,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,race_?,race_AfricanAmerican,...,metformin-rosiglitazone_Steady,metformin-pioglitazone_No,metformin-pioglitazone_Steady,change_Ch,change_No,diabetesMed_No,diabetesMed_Yes,readmitted_<30,readmitted_>30,readmitted_NO
count,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,...,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0,101766.0
mean,2.35731e-14,3.24394e-16,4.23327e-15,-2.18372e-14,1.00082e-13,2.50927e-14,-4.3145e-14,1.28417e-13,0.02234,0.18877,...,2e-05,0.99999,9.82646e-06,0.46195,0.53805,0.22997,0.77003,0.1116,0.34928,0.53912
std,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.14777,0.39132,...,0.00443,0.00313,0.00313472,0.49855,0.49855,0.42081,0.42081,0.31487,0.47675,0.49847
min,-1.13765,-2.13963,-0.785398,-1.84827,-0.291461,-0.21262,-0.503276,-3.3216,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,-0.802651,-0.614795,-0.785398,-0.74092,-0.291461,-0.21262,-0.503276,-0.735733,0.0,0.0,...,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
50%,-0.132655,0.0459666,-0.199162,-0.125726,-0.291461,-0.21262,-0.503276,0.298612,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
75%,0.537341,0.706728,0.387074,0.489467,-0.291461,-0.21262,0.288579,0.815784,0.0,0.0,...,0.0,1.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,1.0
max,3.21732,4.51881,2.73202,7.99483,32.8509,81.4667,16.1257,4.43599,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


## 3. PCA

In [27]:
# Project the encoded data onto its first 10 principal components.
# random_state pins the result whenever scikit-learn picks a randomized SVD
# solver, so a fresh run of the notebook reproduces the same scores.
# NOTE(review): oneHotData still contains the readmitted_* indicator columns --
# confirm that the outcome is meant to be part of the PCA input.
pca = sklearn_pca(n_components=10, random_state=0)
# Keep the transformed scores instead of discarding them after display.
pca_scores = pca.fit_transform(oneHotData)
pca_scores

array([[-3.4099, -0.7666, -0.1605, ..., 0.7717, -0.4167, 0.7278],
       [0.2984, 0.3564, -1.2648, ..., -0.4434, -0.8187, 0.1261],
       [-0.9564, -0.3818, 1.9668, ..., -0.8102, -0.9108, -2.1515],
       ..., 
       [0.0449, 0.9767, -1.1813, ..., -1.1445, 0.6754, 0.4100],
       [2.0375, -0.2151, -0.1680, ..., 0.8678, -0.1103, -0.2349],
       [-0.9541, -0.5231, 0.2045, ..., 0.3305, 1.1552, -1.4892]])

In [28]:
# Fraction of the total variance captured by each fitted component, in order.
variance_ratios = pca.explained_variance_ratio_
print(variance_ratios)

[0.1280 0.0868 0.0691 0.0597 0.0543 0.0506 0.0456 0.0408 0.0301 0.0288]


PC1 explains only 12.8% of the variance, and the first three components together explain only about 28% (12.8% + 8.7% + 6.9%). :(