In [1]:
## NOTE: This is Python 3 code.
import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt # NOTE: This was tested with matplotlib v. 2.1.0
 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#########################
#
# Data Import
#
#########################
df = pd.read_csv('../_data/operations_imputed.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58815 entries, 0 to 58814
Data columns (total 64 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Unnamed: 0         58815 non-null  int64  
 1   op_id              58815 non-null  int64  
 2   subject_id         58815 non-null  int64  
 3   hadm_id            58815 non-null  int64  
 4   opdate             58815 non-null  int64  
 5   age                58815 non-null  int64  
 6   sex                58815 non-null  object 
 7   weight             58180 non-null  float64
 8   height             58461 non-null  float64
 9   race               58815 non-null  object 
 10  asa                56842 non-null  float64
 11  emop               58815 non-null  int64  
 12  department         58815 non-null  object 
 13  antype             58815 non-null  object 
 14  icd10_pcs          58815 non-null  object 
 15  category_desc      58815 non-null  object 
 16  desc_short         588

In [3]:
#########################
#
# Drop Target Column, NaN columns and excluded Features
#
#########################
# 
## Drop records with NaN in 'asa' 
df = df.dropna(subset=['asa'])
df

## Drop Targets
y_target_cat = 'prolonged_LOS'
y_target_reg = 'LOS'

## Drop Excluded Features

cols_to_drop = [y_target_reg,'Unnamed: 0','op_id','subject_id','hadm_id','opdate','is_outlier','category_desc','desc_short','subject_id_y','inhosp_death_time','orin_time','orout_time','opstart_time','opend_time','admission_time','discharge_time','anstart_time','anend_time','cpbon_time','cpboff_time','icuin_time','icuout_time','icd10_pcs','chart_time_x','race','art_dbp','emop']

# Confrim
x_df = df.drop(columns=cols_to_drop, axis=1)
x_df.info()
x_df

<class 'pandas.core.frame.DataFrame'>
Index: 56842 entries, 2 to 58814
Data columns (total 36 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              56842 non-null  int64  
 1   sex              56842 non-null  object 
 2   weight           56231 non-null  float64
 3   height           56535 non-null  float64
 4   asa              56842 non-null  float64
 5   department       56842 non-null  object 
 6   antype           56842 non-null  object 
 7   category_id      56842 non-null  object 
 8   art_mbp          56817 non-null  float64
 9   art_sbp          56815 non-null  float64
 10  bt               56805 non-null  float64
 11  cvp              56508 non-null  float64
 12  hr               56832 non-null  float64
 13  pip              56833 non-null  float64
 14  pmean            56829 non-null  float64
 15  rr               56833 non-null  float64
 16  spo2             56836 non-null  float64
 17  vt               

Unnamed: 0,age,sex,weight,height,asa,department,antype,category_id,art_mbp,art_sbp,...,hco3,lymphocyte,platelet,potassium,sodium,total_bilirubin,wbc,icu_visit,or_duration,anesth_duration
2,60,F,62.0,154.0,1.0,GS,General,0GT,98.585052,135.560526,...,24.369110,23.941199,215.854671,3.843406,140.403423,0.825000,7.995625,False,70.0,90.0
3,35,F,50.0,160.0,1.0,OS,Neuraxial,09B,92.767857,130.167421,...,22.356452,13.300000,124.000000,3.900000,138.000000,0.600000,6.310000,False,115.0,150.0
4,60,F,52.0,152.0,1.0,OL,General,09Q,98.585052,135.560526,...,24.369110,23.941199,215.854671,3.843406,140.403423,0.825000,7.995625,False,30.0,90.0
5,25,M,47.0,172.0,2.0,OL,General,09Q,87.596491,135.763636,...,23.760784,19.620690,244.000000,4.001266,138.897436,0.822807,9.888197,True,25.0,55.0
6,70,F,51.0,156.0,2.0,OL,General,0GT,98.824945,142.105145,...,24.330222,23.300000,295.000000,3.828449,139.466418,0.852152,7.390000,False,285.0,305.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58810,50,M,64.0,169.0,2.0,UR,General,0TB,96.824111,135.220408,...,24.075314,19.199569,199.367424,3.500000,136.000000,1.132530,9.246938,False,5.0,25.0
58811,70,F,53.0,162.0,2.0,GS,General,0HB,98.824945,142.105145,...,24.330222,22.205372,206.355948,3.700000,142.000000,0.852152,8.525825,False,60.0,75.0
58812,65,F,51.0,152.0,2.0,GS,General,0HB,94.461538,137.361486,...,24.393939,22.327600,212.325744,3.800000,143.000000,0.766667,8.280235,False,55.0,75.0
58813,85,M,74.0,171.0,4.0,GS,General,0DB,92.000000,152.000000,...,22.600000,18.000000,164.000000,3.900000,137.000000,1.800000,9.695000,True,130.0,170.0


In [4]:
#########################
#
# One-Hot encode categorical Features
#
#########################

# Filter columns with dtype 'object'
object_columns = x_df.select_dtypes(include='object').columns.tolist()
print(object_columns)



['sex', 'department', 'antype', 'category_id']


In [6]:
from sklearn.preprocessing import OneHotEncoder
## use OneHot instead of get_dummies
        
def encode_cat_vars(x):
    original_cols = set(x.columns)
    
    # Extract the object columns
    object_columns = [col for col in x.columns if x[col].dtype == 'object']
    
    # Initialize the OneHotEncoder
    encoder = OneHotEncoder(drop='first', sparse=False)
    
    # Fit and transform the encoder on the object columns
    encoded_data = encoder.fit_transform(x[object_columns])
    
    # Create a DataFrame from the encoded data with appropriate column names
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(object_columns))
    
    # Drop the original object columns
    x = x.drop(object_columns, axis=1)
    
    # Concatenate the encoded DataFrame with the original DataFrame
    x = pd.concat([x, encoded_df], axis=1)
    
    # Get the newly added columns
    dummy_cols = list(set(x.columns) - original_cols)
    
    return x, dummy_cols

# Dummy variable creation is done before splitting the data, so all the different categories are covered
# Create dummy variables
df_encoded, dummy_columns = encode_cat_vars(x_df)

#confirm:
# df_encoded = df_encoded.dropna(subset=['LOS'])
df_encoded.shape
df_encoded.to_csv('encoded.csv')



With current categories- there are 164 features after 1-ht. This may be a problem. 
Need to remember that each model will be trained on the category 

In [26]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
#########################
#
# Principal Component Analysis - PCA
#
#########################
# 
target_column = y_target_cat  # Replace with the y_regression if want to change model to regression. 
object_columns = [col for col in df_encoded.columns if col != target_column]

# Initialize the HistGradientBoostingClassifier for feature selection
gbdt = HistGradientBoostingClassifier(random_state=0)
gbdt.fit(df_encoded[object_columns], df_encoded[target_column])

# Get feature importances from the trained GBDT model
feature_importances = gbdt.feature_importances_

# Sort features by importance in descending order
sorted_indices = feature_importances.argsort()[::-1]

# Select the top N most important features (you can adjust N as needed)
num_selected_features = 8  # revise thise following our skree plot (below)
selected_feature_indices = sorted_indices[:num_selected_features]

# Extract the selected features from your DataFrame
selected_features = df_encoded.iloc[:, selected_feature_indices]

# Now, you can perform PCA on the selected features
scaled_data = selected_features.values  # Convert to NumPy array
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

KeyError: 'prolonged_LOS'

In [20]:
#########################
#
# Troubleshoot any NaN values in PCA
#
#########################

df_encoded[y_target_reg].isnull().sum()

2274

In [None]:

#########################
#
# Draw a scree plot and a PCA plot
#
#########################
 
import numpy as np
import matplotlib.pyplot as plt


#The following code constructs the Scree plot
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
 
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()
 
#the following code makes a fancy looking plot using PC1 and PC2
pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)
 
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('My PCA Graph')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
 
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
 
plt.show()
 
#########################
#
# Determine which genes had the biggest influence on PC1
#
#########################
 
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
## first, get the loading scores
loading_scores = pd.Series(pca.components_[0], index=genes)
## now sort the loading scores based on their magnitude
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
 
# get the names of the top 10 genes
top_10_genes = sorted_loading_scores[0:10].index.values
 
## print the gene names and their scores (and +/- sign)
print(loading_scores[top_10_genes])