In [1]:
## NOTE: This is Python 3 code.
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt # NOTE: This was tested with matplotlib v. 2.1.0
 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [4]:
#########################
#
# Data Import
#
#########################
df = pd.read_csv('../_data/operations_imputed_CLEAN.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 91862 entries, 0 to 91861
Data columns (total 63 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         91862 non-null  int64  
 1   op_id              91862 non-null  int64  
 2   subject_id         91862 non-null  int64  
 3   hadm_id            91862 non-null  int64  
 4   opdate             91862 non-null  int64  
 5   age                91862 non-null  int64  
 6   sex                91862 non-null  object 
 7   weight             90910 non-null  float64
 8   height             91326 non-null  float64
 9   race               91862 non-null  object 
 10  asa                89532 non-null  float64
 11  emop               91862 non-null  int64  
 12  department         91862 non-null  object 
 13  antype             91862 non-null  object 
 14  icd10_pcs          91862 non-null  object 
 15  category_desc      91862 non-null  object 
 16  desc_short         918

In [16]:
#########################
#
# Drop Target Column, NaN columns and excluded Features
#
#########################
# 
## Drop records with NaN in 'asa' 
df = df.dropna(subset=['asa'])
df

## Drop Targets
y_target_cat = 'prolonged_LOS'
y_target_reg = 'LOS'

## Drop Excluded Features

cols_to_drop = [y_target_reg,'Unnamed: 0','op_id','subject_id','hadm_id','opdate','is_outlier','category_desc','desc_short','subject_id_y','inhosp_death_time','orin_time','orout_time','opstart_time','opend_time','admission_time','discharge_time','anstart_time','anend_time','cpbon_time','cpboff_time','icuin_time','icuout_time','icd10_pcs','chart_time_x','race','art_dbp','emop']

# Confrim
x_df = df.drop(columns=cols_to_drop, axis=1)
x_df.info()
x_df

<class 'pandas.core.frame.DataFrame'>
Index: 89532 entries, 2 to 91861
Data columns (total 35 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              89532 non-null  int64  
 1   sex              89532 non-null  object 
 2   weight           88611 non-null  float64
 3   height           89054 non-null  float64
 4   asa              89532 non-null  float64
 5   department       89532 non-null  object 
 6   antype           89532 non-null  object 
 7   category_id      89532 non-null  object 
 8   art_mbp          89504 non-null  float64
 9   art_sbp          89497 non-null  float64
 10  bt               89479 non-null  float64
 11  cvp              89248 non-null  float64
 12  hr               89515 non-null  float64
 13  pip              89519 non-null  float64
 14  pmean            89508 non-null  float64
 15  rr               89517 non-null  float64
 16  spo2             89517 non-null  float64
 17  vt               

Unnamed: 0,age,sex,weight,height,asa,department,antype,category_id,art_mbp,art_sbp,...,hb,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,LOS,icu_visit
2,50,F,66.0,157.0,2.0,OS,General,0NQ,100.826720,137.244265,...,11.385262,24.479130,20.414394,223.140988,3.768724,139.116926,0.851504,8.957105,2.447917,False
3,60,F,62.0,154.0,1.0,GS,General,0GT,100.773779,138.601562,...,11.746523,24.746791,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,3.493056,False
4,35,F,50.0,160.0,1.0,OS,Neuraxial,09B,98.084871,131.646617,...,10.400000,22.344397,13.300000,124.000000,3.900000,138.000000,0.600000,6.310000,4.236111,False
5,20,M,62.0,179.0,1.0,OL,General,09Q,92.702290,140.826772,...,14.265657,22.965854,22.134112,237.222222,4.041013,139.824013,1.078906,10.044925,1.572917,False
6,60,F,52.0,152.0,1.0,OL,General,09Q,100.773779,138.601562,...,11.746523,24.746791,23.938716,217.282759,3.846584,140.033084,0.744921,8.200501,1.607639,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
91857,50,F,58.0,162.0,2.0,GS,General,0DN,100.826720,137.244265,...,11.385262,23.700000,20.414394,223.140988,3.800000,134.000000,0.851504,8.957105,8.378472,False
91858,70,F,53.0,162.0,2.0,GS,General,0HB,99.036609,142.337732,...,11.550836,24.812058,21.154115,212.690065,3.700000,142.000000,0.760224,8.535719,2.614583,False
91859,65,F,51.0,152.0,2.0,GS,General,0HB,97.106931,138.279011,...,11.638787,24.879181,21.422042,212.006881,3.800000,143.000000,0.780793,8.557983,5.447917,False
91860,85,M,74.0,171.0,4.0,GS,General,0DB,92.000000,152.000000,...,11.360000,22.600000,19.733333,218.800000,3.900000,137.000000,1.800000,9.932500,9.309028,True


In [17]:
#########################
#
# One-Hot encode categorical Features
#
#########################

# Filter columns with dtype 'object'
object_columns = x_df.select_dtypes(include='object').columns.tolist()
print(object_columns)



['sex', 'department', 'antype', 'category_id']


In [25]:
from sklearn.preprocessing import OneHotEncoder
## use OneHot instead of get_dummies
        
def encode_cat_vars(x):
    original_cols = set(x.columns)
    
    # Extract the object columns
    object_columns = [col for col in x.columns if x[col].dtype == 'object']
    
    # Initialize the OneHotEncoder
    encoder = OneHotEncoder(drop='first', sparse=False)
    
    # Fit and transform the encoder on the object columns
    encoded_data = encoder.fit_transform(x[object_columns])
    
    # Create a DataFrame from the encoded data with appropriate column names
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(object_columns))
    
    # Drop the original object columns
    x = x.drop(object_columns, axis=1)
    
    # Concatenate the encoded DataFrame with the original DataFrame
    x = pd.concat([x, encoded_df], axis=1)
    
    # Get the newly added columns
    dummy_cols = list(set(x.columns) - original_cols)
    
    return x, dummy_cols

# Dummy variable creation is done before splitting the data, so all the different categories are covered
# Create dummy variables
df_encoded, dummy_columns = encode_cat_vars(x_df)

#confirm:
df_encoded = df_encoded.dropna(subset=['LOS'])
df_encoded.shape
df_encoded.to_csv('encoded.csv')



With current categories- there are 164 features after 1-ht. This may be a problem. 
Need to remember that each model will be trained on the category 

In [26]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
#########################
#
# Principal Component Analysis - PCA
#
#########################
# 
target_column = y_target_cat  # Replace with the y_regression if want to change model to regression. 
object_columns = [col for col in df_encoded.columns if col != target_column]

# Initialize the HistGradientBoostingClassifier for feature selection
gbdt = HistGradientBoostingClassifier(random_state=0)
gbdt.fit(df_encoded[object_columns], df_encoded[target_column])

# Get feature importances from the trained GBDT model
feature_importances = gbdt.feature_importances_

# Sort features by importance in descending order
sorted_indices = feature_importances.argsort()[::-1]

# Select the top N most important features (you can adjust N as needed)
num_selected_features = 8  # revise thise following our skree plot (below)
selected_feature_indices = sorted_indices[:num_selected_features]

# Extract the selected features from your DataFrame
selected_features = df_encoded.iloc[:, selected_feature_indices]

# Now, you can perform PCA on the selected features
scaled_data = selected_features.values  # Convert to NumPy array
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

KeyError: 'prolonged_LOS'

In [20]:
#########################
#
# Troubleshoot any NaN values in PCA
#
#########################

df_encoded[y_target_reg].isnull().sum()

2274

In [None]:

#########################
#
# Draw a scree plot and a PCA plot
#
#########################
 
import numpy as np
import matplotlib.pyplot as plt


#The following code constructs the Scree plot
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
 
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()
 
#the following code makes a fancy looking plot using PC1 and PC2
pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)
 
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('My PCA Graph')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
 
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
 
plt.show()
 
#########################
#
# Determine which genes had the biggest influence on PC1
#
#########################
 
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
## first, get the loading scores
loading_scores = pd.Series(pca.components_[0], index=genes)
## now sort the loading scores based on their magnitude
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
 
# get the names of the top 10 genes
top_10_genes = sorted_loading_scores[0:10].index.values
 
## print the gene names and their scores (and +/- sign)
print(loading_scores[top_10_genes])