In [1]:

import pandas as pd
import numpy as np

from sklearn.decomposition import PCA
from sklearn import preprocessing
import matplotlib.pyplot as plt # NOTE: This was tested with matplotlib v. 2.1.0
 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

In [2]:
#########################
#
# Data Import
#
#########################
df = pd.read_csv('../../_data/operations_imputed_CLEAN.csv', index_col=0)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 91862 entries, 0 to 102290
Data columns (total 64 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   op_id              91862 non-null  int64  
 1   subject_id         91862 non-null  int64  
 2   hadm_id            91862 non-null  int64  
 3   opdate             91862 non-null  int64  
 4   age                91862 non-null  int64  
 5   sex                91862 non-null  object 
 6   weight             90910 non-null  float64
 7   height             91326 non-null  float64
 8   race               91862 non-null  object 
 9   asa                89532 non-null  float64
 10  emop               91862 non-null  int64  
 11  department         91862 non-null  object 
 12  antype             91862 non-null  object 
 13  icd10_pcs          91862 non-null  object 
 14  category_desc      91862 non-null  object 
 15  desc_short         91862 non-null  object 
 16  category_id        91862 n

In [3]:
#########################
#
# Drop Target Column, NaN columns and excluded Features
#
#########################
# 
## Drop cols with many NaN

columns_w_NaN = ['art_dbp']  # Define as a list

## Drop Excluded Features
cols_to_drop = columns_w_NaN + ['op_id', 'subject_id', 'hadm_id', 'opdate', 'is_outlier', 'category_desc', 'desc_short', 'subject_id_y', 'inhosp_death_time', 'orin_time', 'orout_time', 'opstart_time', 'opend_time', 'admission_time', 'discharge_time', 'anstart_time', 'anend_time', 'cpbon_time', 'cpboff_time', 'icuin_time', 'icuout_time', 'icd10_pcs', 'chart_time_x', 'race', 'emop']  # Include 'art_dbp' directly


# Check if the columns exist in the DataFrame before dropping them
missing_columns = [col for col in cols_to_drop if col not in df.columns]
if missing_columns:
    print("Columns not found in DataFrame:", missing_columns)

df = df.drop(columns=cols_to_drop, axis=1)

# Drop Targets
y_target_reg = 'LOS'

# Confirm
x_df = df.drop([y_target_reg], axis=1)

# Check the info of the resulting DataFrame
x_df.info()

# Display the resulting DataFrame
print(x_df)

<class 'pandas.core.frame.DataFrame'>
Index: 91862 entries, 0 to 102290
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              91862 non-null  int64  
 1   sex              91862 non-null  object 
 2   weight           90910 non-null  float64
 3   height           91326 non-null  float64
 4   asa              89532 non-null  float64
 5   department       91862 non-null  object 
 6   antype           91862 non-null  object 
 7   category_id      91862 non-null  object 
 8   art_mbp          89504 non-null  float64
 9   art_sbp          89497 non-null  float64
 10  bt               89479 non-null  float64
 11  cvp              89248 non-null  float64
 12  hr               89515 non-null  float64
 13  pip              89519 non-null  float64
 14  pmean            89508 non-null  float64
 15  rr               89517 non-null  float64
 16  spo2             89517 non-null  float64
 17  vt              

In [4]:
x_df = x_df.dropna()
x_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 87844 entries, 3 to 102290
Data columns (total 37 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   age              87844 non-null  int64  
 1   sex              87844 non-null  object 
 2   weight           87844 non-null  float64
 3   height           87844 non-null  float64
 4   asa              87844 non-null  float64
 5   department       87844 non-null  object 
 6   antype           87844 non-null  object 
 7   category_id      87844 non-null  object 
 8   art_mbp          87844 non-null  float64
 9   art_sbp          87844 non-null  float64
 10  bt               87844 non-null  float64
 11  cvp              87844 non-null  float64
 12  hr               87844 non-null  float64
 13  pip              87844 non-null  float64
 14  pmean            87844 non-null  float64
 15  rr               87844 non-null  float64
 16  spo2             87844 non-null  float64
 17  vt              

In [5]:
#########################
#
# One-Hot encode categorical Features
#
#########################

# Filter columns with dtype 'object'
object_columns = x_df.select_dtypes(include='object').columns.tolist()
numeric_columns = ['asa']
encode_columns = object_columns +numeric_columns

print(encode_columns)



['sex', 'department', 'antype', 'category_id', 'asa']


In [6]:
from sklearn.preprocessing import OneHotEncoder
## use OneHot instead of get_dummies
        
def encode_cat_vars(x, columns_to_encode):
    original_cols = set(x.columns)
    
    # Extract the object columns
    
    # object_columns = [col for col in x.columns if col in columns_to_encode]
    
    # Initialize the OneHotEncoder
    encoder = OneHotEncoder(drop='first', sparse=False)
    
    # Fit and transform the encoder on the object columns
    encoded_data = encoder.fit_transform(x[columns_to_encode])
    
    # Create a DataFrame from the encoded data with appropriate column names
    encoded_df = pd.DataFrame(encoded_data, columns=encoder.get_feature_names_out(columns_to_encode))
    
    # Drop the original object columns
    x = x.drop(columns_to_encode, axis=1)
    
    # Concatenate the encoded DataFrame with the original DataFrame
    x = pd.concat([x, encoded_df], axis=1)
    
    # Get the newly added columns
    dummy_cols = list(set(x.columns) - original_cols)
    
    return x, dummy_cols

# Dummy variable creation is done before splitting the data, so all the different categories are covered
# Create dummy variables
df_encoded, dummy_columns = encode_cat_vars(x_df, encode_columns)

#confirm:
# df_encoded = df_encoded.dropna(subset=['LOS'])
print(dummy_columns)
df_encoded




['category_id_0HX', 'category_id_0DB', 'category_id_0SG', 'category_id_0H8', 'category_id_0DH', 'category_id_09R', 'category_id_0SP', 'category_id_00P', 'category_id_0RB', 'antype_Neuraxial', 'category_id_0N9', 'department_UR', 'category_id_0W1', 'category_id_0J8', 'asa_6.0', 'category_id_0NP', 'category_id_03B', 'category_id_02J', 'category_id_0QS', 'category_id_08D', 'department_PS', 'category_id_0QH', 'category_id_10D', 'category_id_099', 'category_id_0RP', 'sex_M', 'department_GS', 'category_id_0DT', 'category_id_0D1', 'category_id_0Q9', 'department_OT', 'category_id_03C', 'category_id_0DV', 'category_id_0VC', 'category_id_021', 'category_id_0FY', 'category_id_0BN', 'category_id_08R', 'category_id_00X', 'category_id_0HR', 'category_id_0SR', 'category_id_04C', 'department_CTS', 'category_id_0GT', 'category_id_0H9', 'category_id_0RW', 'category_id_0FB', 'category_id_0CR', 'antype_Regional', 'department_RAD', 'category_id_0MB', 'category_id_0WJ', 'category_id_0RH', 'category_id_0SB', 

Unnamed: 0,age,weight,height,art_mbp,art_sbp,bt,cvp,hr,pip,pmean,...,category_id_0WB,category_id_0WJ,category_id_0Y6,category_id_0YH,category_id_10D,asa_2.0,asa_3.0,asa_4.0,asa_5.0,asa_6.0
3,50.0,66.0,157.0,100.826720,137.244265,29.656043,2.457447,78.792763,4.000000,4.893834,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,60.0,62.0,154.0,100.773779,138.601562,28.462637,0.603448,62.000000,5.000000,4.485501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,35.0,50.0,160.0,98.084871,131.646617,29.201952,7.653846,82.000000,13.546358,3.803659,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
6,20.0,62.0,179.0,92.702290,140.826772,35.800000,0.272727,94.000000,12.820417,4.017967,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
7,60.0,52.0,152.0,100.773779,138.601562,28.462637,0.603448,68.000000,4.000000,4.485501,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
87814,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
87818,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
87822,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
87830,,,,,,,,,,,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0


In [9]:
df_encoded=df_encoded.dropna()

df_encoded.to_csv('../../_data/1hot_encoded.csv')

With current categories- there are 164 features after 1-ht. This may be a problem. 
Need to remember that each model will be trained on the category 

In [None]:
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.decomposition import PCA
#########################
#
# Principal Component Analysis - PCA - Demographics
#
#########################
# 
target_column = y_target_cat  # Replace with the y_regression if want to change model to regression. 
object_columns = [col for col in df_encoded.columns if col != target_column]

# Initialize the HistGradientBoostingClassifier for feature selection
gbdt = HistGradientBoostingClassifier(random_state=0)
gbdt.fit(df_encoded[object_columns], df_encoded[target_column])

# Get feature importances from the trained GBDT model
feature_importances = gbdt.feature_importances_

# Sort features by importance in descending order
sorted_indices = feature_importances.argsort()[::-1]

# Select the top N most important features (you can adjust N as needed)
num_selected_features = 3  # revise thise following our skree plot (below)
selected_feature_indices = sorted_indices[:num_selected_features]

# Extract the selected features from your DataFrame
selected_features = df_encoded.iloc[:, selected_feature_indices]

# Now, you can perform PCA on the selected features
scaled_data = selected_features.values  # Convert to NumPy array
pca = PCA()
pca.fit(scaled_data)
pca_data = pca.transform(scaled_data)

In [None]:
#########################
#
# Troubleshoot any NaN values in PCA
#
#########################

# Check for NaN values in the 'y' column
nan_indices = df_encoded[df_encoded.columns].isna()
print(nan_indices)

In [None]:

#########################
#
# Draw a scree plot and a PCA plot
#
#########################
 
import numpy as np
import matplotlib.pyplot as plt


#The following code constructs the Scree plot
per_var = np.round(pca.explained_variance_ratio_* 100, decimals=1)
labels = ['PC' + str(x) for x in range(1, len(per_var)+1)]
 
plt.bar(x=range(1,len(per_var)+1), height=per_var, tick_label=labels)
plt.ylabel('Percentage of Explained Variance')
plt.xlabel('Principal Component')
plt.title('Scree Plot')
plt.show()
 
#the following code makes a fancy looking plot using PC1 and PC2
pca_df = pd.DataFrame(pca_data, index=[*wt, *ko], columns=labels)
 
plt.scatter(pca_df.PC1, pca_df.PC2)
plt.title('My PCA Graph')
plt.xlabel('PC1 - {0}%'.format(per_var[0]))
plt.ylabel('PC2 - {0}%'.format(per_var[1]))
 
for sample in pca_df.index:
    plt.annotate(sample, (pca_df.PC1.loc[sample], pca_df.PC2.loc[sample]))
 
plt.show()
 
#########################
#
# Determine which genes had the biggest influence on PC1
#
#########################
 
## get the name of the top 10 measurements (genes) that contribute
## most to pc1.
## first, get the loading scores
loading_scores = pd.Series(pca.components_[0], index=genes)
## now sort the loading scores based on their magnitude
sorted_loading_scores = loading_scores.abs().sort_values(ascending=False)
 
# get the names of the top 10 genes
top_10_genes = sorted_loading_scores[0:10].index.values
 
## print the gene names and their scores (and +/- sign)
print(loading_scores[top_10_genes])