In [111]:
import os
import numpy as np
import pandas as pd


from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler
from sklearn.decomposition import PCA

In [112]:
import plotly.graph_objs as go
from plotly.offline import iplot

import matplotlib.pyplot as plt

In [113]:
# Load the raw dataset from a CSV file in the working directory.
dataset_path = 'diabetic_data.csv'
data = pd.read_csv(dataset_path)

In [114]:
# Continuous columns: these are standardised later rather than one-hot encoded.
X_continuous = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
]

# Columns excluded from one-hot encoding: the two id columns, every continuous
# column, and 'medical_specialty', 'payer_code' and 'readmitted'.
no_OH = (
    ['encounter_id', 'patient_nbr']
    + X_continuous
    + ['medical_specialty', 'payer_code', 'readmitted']
)

In [115]:
# One-hot encode every column of `data` that is not listed in `no_OH`.
OH = OneHotEncoder()

# fit_transform returns a sparse matrix; X_OH keeps that sparse result.
X_OH = OH.fit_transform(data.drop(columns=no_OH))

# OneHotEncoder.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on old
# installs. NOTE: when fitted on a DataFrame, get_feature_names_out() prefixes
# with the source column name (e.g. 'race_?') instead of 'x0_?'.
if hasattr(OH, 'get_feature_names_out'):
    OH_columns = OH.get_feature_names_out()
else:
    OH_columns = OH.get_feature_names()

# Densify so the encoded block can be concatenated with the scaled columns.
X_OH_df = pd.DataFrame(X_OH.toarray(), columns=OH_columns)

In [116]:
# Standardise the continuous columns with a StandardScaler fitted on the
# same data, then wrap the result in a DataFrame that keeps the column names.
continuous_values = data[X_continuous]
sscaler = StandardScaler()
sscaler.fit(continuous_values)
X_normed = pd.DataFrame(sscaler.transform(continuous_values), columns=X_continuous)

In [117]:
# Place the one-hot block and the standardised continuous block side by side.
feature_frames = [X_OH_df, X_normed]
final_df = pd.concat(feature_frames, axis='columns')

In [118]:
# Preview the first five rows of the combined feature matrix.
final_df.head(n=5)

Unnamed: 0,x0_?,x0_AfricanAmerican,x0_Asian,x0_Caucasian,x0_Hispanic,x0_Other,x1_Female,x1_Male,x1_Unknown/Invalid,x2_[0-10),...,x36_No,x36_Yes,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses
0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,...,1.0,0.0,-1.137649,-0.106517,-0.785398,-1.848268,-0.291461,-0.21262,-0.503276,-3.321596
1,0.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,-0.467653,0.808384,-0.785398,0.24339,-0.291461,-0.21262,-0.503276,0.815784
2,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,0.0,1.0,-0.802651,-1.631351,2.145781,-0.371804,1.286748,-0.21262,0.288579,-0.735733
3,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,-0.802651,0.045967,-0.199162,-0.002688,-0.291461,-0.21262,-0.503276,-0.218561
4,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.0,1.0,-1.137649,0.401761,-0.785398,-0.986997,-0.291461,-0.21262,-0.503276,-1.252906


In [119]:
# Apply PCA: project the feature matrix onto its first three principal components.
# random_state is fixed because, for a matrix this large with few components,
# svd_solver='auto' may choose the randomized solver, whose output otherwise
# varies from run to run.
pca = PCA(n_components=3, random_state=0)
pca.fit(final_df)
PCA_df = pd.DataFrame(pca.transform(final_df), columns=['pc1', 'pc2', 'pc3'])

PCA_df.head()

Unnamed: 0,pc1,pc2,pc3
0,-3.369981,-0.767084,-0.163688
1,0.295295,0.271358,-1.278667
2,-0.910264,-0.374811,2.017104
3,-0.471569,-0.220365,-0.435092
4,-1.659865,-0.105848,-0.918213


In [120]:
# Boolean label per row: True for any 'readmitted' value other than 'NO'.
PCA_df['labels'] = data['readmitted'] != 'NO'

In [121]:
# Subsample the projected points for plotting.
# - random_state makes the subsample (and therefore the figure) reproducible.
# - n is capped at the frame length, because DataFrame.sample raises
#   ValueError when asked for more rows than exist (without replacement).
PCA_df_ss = PCA_df.sample(n=min(50_000, len(PCA_df)), random_state=0)

In [127]:
# Make plots: one 3D scatter trace per label value, drawn in a single figure.
def _class_trace(points, trace_name):
    """Return a Scatter3d marker trace built from the pc1/pc2/pc3 columns of `points`."""
    return go.Scatter3d(
        x=points.pc1.values,
        y=points.pc2.values,
        z=points.pc3.values,
        name=trace_name,
        mode='markers',
        marker=dict(size=2, opacity=1),
    )


# Boolean mask taken from the subsample's 'labels' column.
readmitted_mask = PCA_df_ss.labels

scatter_1 = _class_trace(PCA_df_ss[readmitted_mask], "Readmitted")
scatter_0 = _class_trace(PCA_df_ss[~readmitted_mask], "Not readmitted")

# Fixed axis ranges for the scene (presumably chosen by eye for this dataset).
axis_ranges = dict(
    xaxis=dict(range=[-6, 8]),
    yaxis=dict(range=[-2, 42]),
    zaxis=dict(range=[-2, 27]),
)

layout = go.Layout(
    title='Plot of data in PCA space',
    scene=axis_ranges,
    scene_aspectmode='cube',
)

fig = go.Figure(data=[scatter_1, scatter_0], layout=layout)
iplot(fig)