In [None]:
import pickle
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Input locations: pickled model and the validation feature matrix
fp_simple = '/gpfs/milgram/project/rtaylor/imc33/LOS/output/model_simple.pkl'
fp_val = '/gpfs/milgram/project/rtaylor/imc33/LOS/output/X_val_selected.csv'

# Lookup table pairing each original column name (old_name) with its
# replacement label (new_name)
lab_df = pd.read_csv('/gpfs/milgram/project/rtaylor/imc33/LOS/data/new_label_names.csv')

# old_name -> new_name mapping, used later to relabel the feature columns
rename_dict = lab_df.set_index('old_name')['new_name'].to_dict()

# Deserialize the fitted model.
# NOTE: pickle.load can execute arbitrary code; only load files from a trusted source.
with open(fp_simple, "rb") as model_file:
    model = pickle.load(model_file)

# Validation features, indexed by encounter id
X_test = pd.read_csv(fp_val, index_col='pat_enc_csn_id')

# Column alignment against the model's expected features is currently disabled.
# TODO(review): confirm the CSV column order matches what the model was trained on.
#X_test = X_test[model.feature_names_in_]
#print("Feature order matches:", list(model.feature_names_in_) == list(X_test.columns))

In [None]:
# Rename feature columns using rename_dict while ensuring uniqueness
X_test = X_test.rename(columns=rename_dict)


def _make_unique(names):
    """Return ``names`` as a list in which every entry is distinct.

    The first occurrence of a label is kept unchanged; the k-th repeat becomes
    "<label> (<k>)". If that suffixed form is already taken (either by another
    original label or by a name generated earlier), k is increased until the
    candidate is free, so the result is unique even for inputs such as
    ["A", "A", "A (2)"].
    """
    names = list(names)
    reserved = set(names)  # original labels; generated names must not shadow them
    used = set()           # names already emitted
    counts = {}            # highest suffix number handed out per label
    unique = []
    for name in names:
        if name not in used:
            candidate = name
            counts.setdefault(name, 1)
        else:
            k = counts.get(name, 1) + 1
            candidate = f"{name} ({k})"
            while candidate in used or candidate in reserved:
                k += 1
                candidate = f"{name} ({k})"
            counts[name] = k
        used.add(candidate)
        unique.append(candidate)
    return unique


# Several old names may map to the same new label, so disambiguate repeats
X_test.columns = _make_unique(X_test.columns)
assert X_test.columns.is_unique, "column names must be unique after renaming"


In [None]:
# Build a tree explainer for the loaded model and explain every row of X_test
explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
shap_values = explainer(X_test)

# Attach the (renamed) column labels to the explanation object
shap_values.feature_names = list(X_test.columns)

# Per-feature importance: average of |SHAP| over all rows
mean_abs_shap = np.mean(np.abs(shap_values.values), axis=0)

# Tabulate importance per feature, most important first
importance_df = pd.DataFrame(
    {
        'Feature': X_test.columns,
        'Mean Absolute SHAP': mean_abs_shap,
    }
).sort_values(by='Mean Absolute SHAP', ascending=False)

importance_df

In [None]:
# Generate the beeswarm plot on an explicitly sized figure.
fig_beeswarm = plt.figure(figsize=(20, 20))
# plot_size=None tells shap to leave the current figure's size alone; with the
# default plot_size="auto" shap resizes the figure and the figsize above is lost.
shap.plots.beeswarm(shap_values, max_display=10, show=False, plot_size=None)

# Show the plot
plt.show()


In [None]:
# Bar plot of the SHAP values computed above. show=False is passed so shap
# does not display the figure itself; the notebook renders it at cell end.
shap.plots.bar(shap_values, show=False)

In [None]:
# Force plot for the first row of X_test, rendered with matplotlib
first_explanation = shap_values[0]
first_features = X_test.iloc[0]
shap.force_plot(
    explainer.expected_value,
    first_explanation.values,
    first_features,
    matplotlib=True,
)

### Continuous Variables
Continuous features examined below: age, consult counts, imaging counts, ED volume, and hospital volume.

In [None]:
#import shap
#import matplotlib.pyplot as plt

# Extract feature names and SHAP values
#feature_names = ["Consult Count", "Age", "Imaging Count", "ED Volume", "Inpatient Volume","US Count"]
                 
#for feature in feature_names:
#    shap.plots.scatter(shap_values[:, feature])

In [None]:
import shap
import matplotlib.pyplot as plt

# Two-panel layout for SHAP scatter plots. `axes` has to be created here:
# shap.plots.scatter draws into the Axes object handed to it through `ax`.
fig, axes = plt.subplots(1, 2)
# show=False keeps shap from calling plt.show() after this panel, so the
# remaining panel can still be drawn into the same figure.
shap.plots.scatter(shap_values[:, "Consult Count"], ax=axes[0], show=False)