In [1]:
## NOTE: This is Python 3 code.
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt # NOTE: This was tested with matplotlib v. 2.1.0
 
from sklearn.preprocessing import StandardScaler

In [2]:
#########################
#
# Data Import
#
#########################
df = pd.read_csv('../_data/operations_imputed_CLEAN.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91862 entries, 0 to 91861
Data columns (total 63 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         91862 non-null  int64  
 1   op_id              91862 non-null  int64  
 2   subject_id         91862 non-null  int64  
 3   hadm_id            91862 non-null  int64  
 4   opdate             91862 non-null  int64  
 5   age                91862 non-null  int64  
 6   sex                91862 non-null  object 
 7   weight             90910 non-null  float64
 8   height             91326 non-null  float64
 9   race               91862 non-null  object 
 10  asa                89532 non-null  float64
 11  emop               91862 non-null  int64  
 12  department         91862 non-null  object 
 13  antype             91862 non-null  object 
 14  icd10_pcs          91862 non-null  object 
 15  category_desc      91862 non-null  object 
 16  desc_short         918

In [11]:
#########################
#
# Perform PCA on the data
#
#########################
# First center and scale the data

## Drop records with NaN in 'asa' 
df = df.dropna(subset=['asa'])
df

cols_to_drop = ['Unnamed: 0','op_id','subject_id','hadm_id','opdate','LOS','is_outlier','prolonged_LOS','category_desc','desc_short','subject_id_y','inhosp_death_time','orin_time','orout_time','opstart_time','opend_time','admission_time','discharge_time','anstart_time','anend_time','cpbon_time','cpboff_time','icuin_time','icuout_time','icd10_pcs','chart_time_x','race','art_dbp','emop']
x_df = df.drop(columns=cols_to_drop, axis=1)
x_df.info()

x_df

<class 'pandas.core.frame.DataFrame'>
Index: 89532 entries, 2 to 91861
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              89532 non-null  int64  
 1   sex              89532 non-null  object 
 2   weight           88611 non-null  float64
 3   height           89054 non-null  float64
 4   asa              89532 non-null  float64
 5   emop             89532 non-null  int64  
 6   department       89532 non-null  object 
 7   antype           89532 non-null  object 
 8   category_id      89532 non-null  object 
 9   art_mbp          89504 non-null  float64
 10  art_sbp          89497 non-null  float64
 11  bt               89479 non-null  float64
 12  cvp              89248 non-null  float64
 13  hr               89515 non-null  float64
 14  pip              89519 non-null  float64
 15  pmean            89508 non-null  float64
 16  rr               89517 non-null  float64
 17  spo2             

Unnamed: 0,age,sex,weight,height,asa,emop,department,antype,category_id,art_mbp,...,glucose,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,icu_visit
2,50,F,66.0,157.0,2.0,0,OS,General,0NQ,100.826720,...,117.007856,11.385262,24.479130,20.414394,223.140988,3.768724,139.116926,0.851504,8.957105,False
3,60,F,62.0,154.0,1.0,0,GS,General,0GT,100.773779,...,118.671026,11.746523,24.746791,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,False
4,35,F,50.0,160.0,1.0,0,OS,Neuraxial,09B,98.084871,...,98.000000,10.400000,22.344397,13.300000,124.000000,3.900000,138.000000,0.600000,6.310000,False
5,20,M,62.0,179.0,1.0,0,OL,General,09Q,92.702290,...,100.845000,14.265657,22.965854,22.134112,237.222222,4.041013,139.824013,1.078906,10.044925,False
6,60,F,52.0,152.0,1.0,0,OL,General,09Q,100.773779,...,118.671026,11.746523,24.746791,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91857,50,F,58.0,162.0,2.0,0,GS,General,0DN,100.826720,...,127.000000,11.385262,23.700000,20.414394,223.140988,3.800000,134.000000,0.851504,8.957105,False
91858,70,F,53.0,162.0,2.0,0,GS,General,0HB,99.036609,...,131.749045,11.550836,24.812058,21.154115,212.690065,3.700000,142.000000,0.760224,8.535719,False
91859,65,F,51.0,152.0,2.0,0,GS,General,0HB,97.106931,...,129.145614,11.638787,24.879181,21.422042,212.006881,3.800000,143.000000,0.780793,8.557983,False
91860,85,M,74.0,171.0,4.0,0,GS,General,0DB,92.000000,...,171.000000,11.360000,22.600000,19.733333,218.800000,3.900000,137.000000,1.800000,9.932500,True


In [12]:
#########################
#
# One-Hot encode categorical Features
#
#########################

# Filter columns with dtype 'object'
object_columns = x_df.select_dtypes(include='object').columns.tolist()
print(object_columns)



['sex', 'department', 'antype', 'category_id']


In [21]:

def encode_cat_vars(x):
    original_cols = set(x.columns)
    
    x = pd.get_dummies(
        x,
        columns=object_columns,
        drop_first=True,
    )
    
    # Get the newly added columns by taking the difference of the sets
    dummy_cols = list(set(x.columns) - original_cols)
    
    return x, dummy_cols


#Dummy variable creation is done before spliting the data , so all the different categories are covered
#create dummy variable
df_encoded, dummy_columns= encode_cat_vars(x_df)

#confirm:
# df_encoded.shape
# df_encoded

df_encoded.to_csv('encoded.csv')

With current categories- there are 164 features after 1-ht. This may be a problem. 
Need to remember that each model will be trained on the category 

In [14]:
# First center and scale the data
scaled_data = preprocessing.scale(df_encoded.T)
 
pca = PCA() # create a PCA object
pca.fit(scaled_data) # do the math
pca_data = pca.transform(scaled_data) # get PCA coordinates for scaled_data
 


ValueError: Input X contains NaN.
PCA does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

In [None]:

#########################
#
# Draw a scree plot and a PCA plot
#
#########################
 
#The following code constructs the Scree plot
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
 
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()
 
#the following code makes a fancy looking plot using PC1 and PC2
pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)
 
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('My PCA Graph')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
 
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
 
plt.show()
 
#########################
#
# Determine which genes had the biggest influence on PC1
#
#########################
 
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
## first, get the loading scores
loading_scores = pd.Series(pca.components_[0], index=genes)
## now sort the loading scores based on their magnitude
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
 
# get the names of the top 10 genes
top_10_genes = sorted_loading_scores[0:10].index.values
 
## print the gene names and their scores (and +/- sign)
print(loading_scores[top_10_genes])