In [1]:
def View(df, rows=None, cols=None, width=None):
    """Render a DataFrame preview, loosely modelled on R's View().

    Pandas display options are overridden only while the frame is rendered
    and are restored when the function returns.

    Parameters
    ----------
    df : DataFrame to display.
    rows : value used for ``display.max_rows`` and passed to ``df.head``;
        with the default ``None`` the whole frame is shown.
    cols : value used for ``display.max_columns`` (``None`` means no limit).
    width : value used for ``display.max_colwidth`` (``None`` means no limit).

    Notes
    -----
    ``pd`` and ``display`` are looked up when the function is called, so
    pandas must be imported (and IPython's ``display`` available) by then.
    """
    # option_context takes a flat sequence of (name, value) pairs
    display_overrides = (
        "display.max_rows", rows,
        "display.max_columns", cols,
        "display.max_colwidth", width,
        "display.expand_frame_repr", False,  # keep wide frames on one block
    )

    with pd.option_context(*display_overrides):
        preview = df.head(rows)
        display(preview)

In [2]:
import pickle
import shap
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Input locations (absolute paths on the cluster filesystem)
fp_simple = '/gpfs/milgram/project/rtaylor/imc33/LOS/output/model_simple.pkl'
fp_val = '/gpfs/milgram/project/rtaylor/imc33/LOS/output/X_val_selected.csv'
lab_df = pd.read_csv('/gpfs/milgram/project/rtaylor/imc33/LOS/data/new_label_names.csv')

# Lookup from each original feature name (old_name) to its new label (new_name)
rename_dict = dict(zip(lab_df["old_name"], lab_df["new_name"]))

# Unpickle the fitted model.
# NOTE: pickle.load executes code stored in the file; only load trusted files.
with open(fp_simple, mode="rb") as f:
    model = pickle.load(f)

# Validation features, indexed by encounter id
X_test = pd.read_csv(fp_val, index_col='pat_enc_csn_id')

# Optional sanity checks (currently disabled): align the columns with the
# order the model was fitted on and confirm that they match.
#X_test = X_test[model.feature_names_in_]
#print("Feature order matches:", list(model.feature_names_in_) == list(X_test.columns))

In [3]:
# Rename feature columns using rename_dict while ensuring uniqueness
X_test = X_test.rename(columns=rename_dict)

# Several original features may map to the same label. Repeats get a
# " (n)" suffix. Each candidate is checked against every name already
# emitted, so a generated suffix can never collide with a column that
# already carries that exact name (e.g. an existing "X (2)").
counts = {}          # label -> number of times it has been requested so far
taken = set()        # every final column name emitted so far
new_columns = []
for col in X_test.columns:
    candidate = col
    while candidate in taken:
        counts[col] = counts.get(col, 1) + 1
        candidate = f"{col} ({counts[col]})"  # Append count to make unique
    counts.setdefault(col, 1)
    taken.add(candidate)
    new_columns.append(candidate)

X_test.columns = new_columns  # Update column names to be unique


In [4]:
# Build a tree explainer for the loaded model and explain every row of X_test
explainer = shap.TreeExplainer(model, feature_perturbation="tree_path_dependent")
shap_values = explainer(X_test)

# Label the explanation with the (renamed, de-duplicated) column names
shap_values.feature_names = list(X_test.columns)

# Per-feature importance: mean of |SHAP| taken over axis 0
mean_abs_shap = np.mean(np.abs(shap_values.values), axis=0)

# Importance table, most important feature first
importance_df = pd.DataFrame(
    {
        'Feature': X_test.columns,
        'Mean Absolute SHAP': mean_abs_shap,
    }
).sort_values(by='Mean Absolute SHAP', ascending=False)

importance_df

Unnamed: 0,Feature,Mean Absolute SHAP
3,Consult Count,0.429263
166,Not Ready for Discharge_First,0.246893
54,Discharged Home,0.115305
43,Imaging Count,0.106140
76,DRG Severe Respiratory Infection,0.101749
...,...,...
28,con_service_otolaryngology_ent_count,0.000758
17,con_service_hepatology_count,0.000742
26,con_service_orthopedics_count,0.000729
11,con_service_cardiothoracic_surgery_count,0.000341


In [5]:
# Work on an independent copy so importance_df itself is left untouched
df = importance_df.copy(deep=True)

In [6]:
# Keep features whose name mentions "count" or "volume", plus "age" only when
# it is not embedded in a longer word. A bare substring match on "age" also
# hits names such as "Managed" or "language". Matching is case-insensitive,
# so the [a-z] guards cover upper-case letters as well.
filtered_df = df[df['Feature'].str.contains(
    r'count|volume|(?<![a-z])age(?![a-z])', case=False, na=False, regex=True
)]


In [7]:
# Drop two features by exact name from the filtered table
is_excluded = filtered_df["Feature"].isin(["viz_insurance_Managed Medicare", "viz_language_Other"])
filtered_df = filtered_df.loc[~is_excluded]

In [None]:
# Persist the filtered importance table (row index omitted)
filtered_df.to_csv(path_or_buf="../output/numeric_features.csv", index=False)

In [8]:
# Persist the full importance table (row index omitted)
importance_df.to_csv(path_or_buf="../output/all_features.csv", index=False)

In [None]:
# Show the whole filtered table with no row, column or width limits
View(filtered_df, rows=None, cols=None, width=None)

In [None]:
# Generate the beeswarm plot on an explicitly sized figure
fig_beeswarm = plt.figure(figsize=(20, 20))

# plot_size=None leaves the current figure's size alone; with the default
# plot_size="auto" beeswarm resizes the figure and the figsize above is lost.
shap.plots.beeswarm(shap_values, max_display=20, show=False, plot_size=None)

# Show the plot
plt.show()

In [None]:
# Bar chart limited to 20 rows; show=False leaves rendering to the notebook
shap.plots.bar(shap_values, show=False, max_display=20)

In [None]:
plt.figure()

# Left panel: beeswarm limited to 10 rows
plt.subplot(1, 2, 1)
shap.plots.beeswarm(shap_values, max_display=10, show=False)

# Right panel: bar chart
plt.subplot(1, 2, 2)
shap.plots.bar(shap_values, show=False)

# Optional margin adjustment. left=1 / right=3 lie outside the usual 0-1
# range for these parameters; the values are kept exactly as chosen.
plt.subplots_adjust(left=1, right=3, bottom=0.1, top=0.9, wspace=0.5)

In [None]:
import numpy as np

# Total SHAP contribution per observation (sum across axis 1)
shap_sums = np.sum(shap_values.values, axis=1)

# Positional index of the observation with the largest total
max_index = shap_sums.argmax()

print(f"Index with highest positive prediction: {max_index}")


In [None]:
# Row to explain. The SHAP values and the feature values must come from the
# SAME row; previously the SHAP values of row 601 were paired with the
# feature values of row 0.
# NOTE(review): 601 is presumably the max_index printed by an earlier cell - confirm.
row_idx = 601
shap.force_plot(
    explainer.expected_value, shap_values[row_idx].values, X_test.iloc[row_idx, :], matplotlib=True
)

### Continuous Variables
Age, consult counts, imaging counts, ED volume, and hospital volume.

In [None]:
import shap
import matplotlib.pyplot as plt

# Features to inspect; one SHAP scatter plot is drawn per feature
feature_names = [
    "Consult Count",
    "Age",
    "Imaging Count",
    "ED Volume",
    "Inpatient Volume",
    "US Count",
]

for feature in feature_names:
    # Slice the explanation down to this single feature by name
    feature_shap = shap_values[:, feature]
    shap.plots.scatter(feature_shap)

In [None]:
import shap
import matplotlib.pyplot as plt

# Features to plot, one per panel
feature_names = ["Consult Count", "Age", "Imaging Count", "ED Volume", "US Count","Inpatient Volume"]

# Create a figure with 3 rows and 2 columns
fig, axes = plt.subplots(nrows=3, ncols=2, figsize=(14, 14))

# Flatten the axes array for easy iteration.
# show=False is required when drawing into existing axes: with the default
# show=True the figure is shown right after the first panel is drawn.
for ax, feature in zip(axes.flatten(), feature_names):
    shap.plots.scatter(shap_values[:, feature], ax=ax, show=False)

# Render the completed grid once
plt.show()
