In [1]:
#DATA PREPROCESSING

import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
os.environ["TOKENIZERS_PARALLELISM"] = "false"
import umap
import hdbscan
import gc
import torch
from transformers import LEDTokenizer, LEDModel
from bertopic import BERTopic

# Global config
PATH_X1 = "/Users/jingyi/Desktop/Trauma_LLM/all_patient/data_Hsp12.feather"
PATH_X2 = "/Users/jingyi/Desktop/Trauma_LLM/all_patient/indvd_metric.csv"
PATH_METRIC_DEF = "/Users/jingyi/Desktop/Trauma_LLM/metric_def.xlsx"

EMB_MODEL_NAME = "allenai/led-base-16384"
MAX_TOKENS = 16000
N_COMPONENTS_CLUST = 10   # dimension need to be changed
N_NEIGHBORS = 15

if torch.backends.mps.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")
print("Device:", DEVICE)

Device: mps


In [2]:
X1 = pd.read_feather(PATH_X1)
print("Original X1 shape:", X1.shape)

cols_to_drop = [
    'mpp_121', 'mpp_125', 'mpp_16', 'mpp_168', 'mpp_236', 'mpp_242', 'mpp_71',
    'proc_01_icd', 'proc_02_icd', 'proc_03_icd', 'proc_04_icd', 'proc_05_icd',
    'proc_06_icd', 'proc_07_icd', 'proc_08_icd', 'proc_09_icd', 'proc_10_icd',
    'proc_11_icd', 'proc_12_icd', 'proc_13_icd', 'proc_14_icd', 'proc_15_icd',
    'proc_16_icd', 'proc_17_icd', 'proc_18_icd', 'proc_19_icd', 'proc_20_icd',
    'proc_21_icd', 'proc_22_icd', 'proc_23_icd', 'proc_24_icd', 'proc_25_icd',
    'proc_26_icd', 'proc_27_icd', 'proc_28_icd', 'proc_29_icd', 'proc_30_icd',
    'proc_31_icd', 'proc_32_icd', 'proc_33_icd', 'proc_34_icd', 'proc_35_icd',
    'proc_36_icd', 'proc_37_icd', 'proc_38_icd', 'proc_39_icd', 'proc_40_icd',
    'proc_41_icd', 'proc_42_icd', 'proc_43_icd', 'proc_44_icd', 'proc_45_icd',
    'proc_46_icd', 'proc_47_icd', 'proc_48_icd', 'proc_49_icd', 'proc_50_icd',
    'proc_51_icd', 'proc_52_icd', 'proc_53_icd', 'proc_54_icd', 'proc_55_icd',
    'proc_56_icd', 'proc_57_icd', 'proc_58_icd', 'proc_59_icd', 'proc_60_icd',
    'proc_61_icd', 'proc_62_icd', 'proc_63_icd', 'proc_64_icd', 'proc_65_icd',
    'proc_66_icd', 'proc_67_icd', 'proc_68_icd', 'proc_69_icd', 'proc_70_icd',
    'proc_71_icd', 'proc_72_icd', 'proc_73_icd', 'proc_74_icd', 'proc_75_icd',
    'proc_76_icd', 'proc_77_icd', 'proc_78_icd', 'proc_79_icd', 'proc_80_icd',
    'proc_81_icd', 'proc_82_icd', 'proc_83_icd', 'proc_84_icd', 'fac_key', 'disp_tx', 'scene_tx', 'leave_tx',
    'gcs40eye_s',  'gcs40ver_s', 'gcs40mot_s',  'gcs40eye_r', 'gcs40ver_r','gcs40mot_r'
]
cols_to_drop += [f"ais_sev_{i:02d}" for i in range(1, 28)]
cols_to_drop += [f"icd9_{i:02d}" for i in range(1, 28)]

# predot_01 ... predot_27
cols_to_drop += [f"predot_{i:02d}" for i in range(1, 28)]

# proc_01 ... proc_84
cols_to_drop += [f"proc_{i:02d}" for i in range(1, 85)]

# ais_01 ... ais_27
cols_to_drop += [f"ais_{i:02d}" for i in range(1, 28)]

X1 = X1.drop(columns=cols_to_drop, errors="ignore")
print("X1 shape after dropping cols:", X1.shape)

# 1b. Read X2 csv
X2 = pd.read_csv(PATH_X2)
X2 = X2.drop(columns="QI", errors="ignore")
# Align X2 rows so that inc_key order matches X1
if "inc_key" not in X1.columns or "inc_key" not in X2.columns:
    raise ValueError("Both X1 and X2 must contain 'inc_key' column.")

X2 = X2.set_index("inc_key").reindex(X1["inc_key"]).reset_index()
print("Aligned X2 shape:", X2.shape)

# Assumes X1 and X2 are already loaded and aligned as in your existing code.
X2_features_only = X2.drop(columns=["inc_key"])
X4 = pd.concat([X1, X2_features_only], axis=1)
print("X4 shape:", X4.shape)

Original X1 shape: (103362, 1571)
X1 shape after dropping cols: (103362, 1278)
Aligned X2 shape: (103362, 116)
X4 shape: (103362, 1393)


In [3]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import normalize

# Load your data (Your existing code)
X1_emb = np.load("/Users/jingyi/Desktop/Trauma_LLM/all_patient/originaldata_LED_emb_103362.npy")
X2_emb = np.load("/Users/jingyi/Desktop/Trauma_LLM/all_patient/indvdmetric_LED_emb.npy")

X1_inc_key = pd.read_csv("/Users/jingyi/Desktop/Trauma_LLM/all_patient/originaldata_inc_key.csv")
X2_inc_key = pd.read_csv("/Users/jingyi/Desktop/Trauma_LLM/all_patient/indvdmetric_inc_key.csv")

print("Original shapes:")
print(X1_emb.shape, X1_inc_key.shape)
print(X2_emb.shape, X2_inc_key.shape)

# --- NORMALIZATION STEP ---

# L2 Normalization maps all vectors to the unit sphere
# This makes Euclidean distance equivalent to Cosine distance
print("\nNormalizing embeddings...")
X1_emb = normalize(X1_emb, norm='l2')
X2_emb = normalize(X2_emb, norm='l2')

print("Normalization complete.")

Original shapes:
(103362, 768) (103362, 1)
(103362, 768) (103362, 1)

Normalizing embeddings...
Normalization complete.


In [4]:
import numpy as np

data = np.load("X1_umap_hdbscan_outputs.npz")

X1_umap_d = data["X1_umap_d"]
X1_labels = data["X1_labels"]

print("Loaded X1_umap_d shape:", X1_umap_d.shape)
print("Loaded X1_labels shape:", X1_labels.shape)
print("Unique clusters (excluding -1):", set(X1_labels) - {-1})


Loaded X1_umap_d shape: (103362, 10)
Loaded X1_labels shape: (103362,)
Unique clusters (excluding -1): {np.int32(0), np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7)}


In [10]:
#iai loading
import os, interpretableai
os.environ["JULIA_PKG_USE_CLI_GIT"] = "true"
interpretableai.install_julia(version="1.12.2")
interpretableai.install_system_image(accept_license=True)

Downloading https://julialang-s3.julialang.org/bin/mac/aarch64/1.12/julia-1.12.2-macaarch64.tar.gz
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mJulia version info


Julia Version 1.12.2
Commit ca9b6662be4 (2025-11-20 16:25 UTC)
Build Info:
  Official https://julialang.org release
Platform Info:
  OS: macOS (arm64-apple-darwin24.0.0)
  uname: Darwin 25.2.0 Darwin Kernel Version 25.2.0: Tue Nov 18 21:09:41 PST 2025; root:xnu-12377.61.12~1/RELEASE_ARM64_T6031 arm64 arm
  CPU: Apple M3 Max: 
                 speed         user         nice          sys         idle          irq
       #1-14  2400 MHz     965048 s          0 s     717654 s   24298894 s          0 s
  Memory: 36.0 GB (666.671875 MB free)
  Uptime: 219853.0 sec
  Load Avg:  9.869140625  5.18798828125  3.7314453125
  WORD_SIZE: 64
  LLVM: libLLVM-18.1.7 (ORCJIT, apple-m3)
  GC: Built with stock GC
Threads: 1 default, 1 interactive, 1 GC (on 10 virtual cores)
Environment:
  JULIA_PKG_USE_CLI_GIT = true
  NVM_CD_FLAGS = -q
  TERM = xterm-color
  HOMEBREW_REPOSITORY = /opt/homebrew
  PATH = /Users/jingyi/.local/bin:/opt/anaconda3/bin:/opt/anaconda3/condabin:/Users/jingyi/.nvm/versions/node/v

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mJulia executable: /Users/jingyi/Library/Application Support/InterpretableAI/julia/1.12.2/julia-1.12.2/bin/julia
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTrying to import PyCall...
[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mPyCall is already installed and compatible with Python executable.
[36m[1m│ [22m[39m
[36m[1m│ [22m[39mPyCall:
[36m[1m│ [22m[39m    python: /Users/jingyi/.venv/qwen/bin/python
[36m[1m│ [22m[39m    libpython: /opt/anaconda3/lib/libpython3.13.dylib
[36m[1m│ [22m[39mPython:
[36m[1m│ [22m[39m    python: /Users/jingyi/.venv/qwen/bin/python
[36m[1m└ [22m[39m    libpython: 
Installed Julia to /Users/jingyi/Library/Application Support/InterpretableAI/julia/1.12.2/julia-1.12.2
Downloading https://iai-system-images.s3.amazonaws.com/macos_aarch64/julia1.12.2/v3.2.2/sys-macos_aarch64-julia1.12.2-iai3.2.2.zip
Installed IAI system image to /Users/jingyi/Library/Application Support/InterpretableAI/sysi

Installing artifacts for system image...


[32m[1m  Activating[22m[39m new project at `/var/folders/lw/8mtrh4rn7zx78y3kqh7h96740000gn/T/jl_cNLPhZ`
[32m[1m    Updating[22m[39m git-repo `https://github.com/InterpretableAI/IAISystemImages.jl`
[32m[1m    Updating[22m[39m registry at `~/.julia/registries/General.toml`
[32m[1m   Resolving[22m[39m package versions...
[32m[1m    Updating[22m[39m `/private/var/folders/lw/8mtrh4rn7zx78y3kqh7h96740000gn/T/jl_cNLPhZ/Project.toml`
  [90m[413e7391] [39m[92m+ IAISystemImages v0.1.0 `https://github.com/InterpretableAI/IAISystemImages.jl#master`[39m
[32m[1m    Updating[22m[39m `/private/var/folders/lw/8mtrh4rn7zx78y3kqh7h96740000gn/T/jl_cNLPhZ/Manifest.toml`
  [90m[413e7391] [39m[92m+ IAISystemImages v0.1.0 `https://github.com/InterpretableAI/IAISystemImages.jl#master`[39m
  [90m[dc30da40] [39m[92m+ SystemImageLoader v0.10.2[39m
  [90m[0dad84c5] [39m[92m+ ArgTools v1.1.2[39m
  [90m[56f22d72] [39m[92m+ Artifacts v1.11.0[39m
  [90m[2a0f44e3] [39m[92m

Installed artifacts for system image


True

In [2]:
  #one time only build Pycall
  import os, sys, subprocess
  from pathlib import Path

  JULIA = Path("/Users/jingyi/Library/Application Support/InterpretableAI/julia/1.12.2/julia-1.12.2/bin/julia")
  assert JULIA.exists(), JULIA

  # Rebuild PyCall to match this Python
  code = f'''
  import Pkg
  ENV["PYTHON"] = "{sys.executable}"
  ENV["PYTHONHOME"] = ""
  Pkg.build("PyCall")
  using PyCall
  println("PyCall.python=", PyCall.python)
  println("PyCall.libpython=", PyCall.libpython)
  '''
  subprocess.run([str(JULIA), "--startup-file=no", "-e", code], check=True)


[32m[1m    Building[22m[39m Conda ─→ `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/8f06b0cfa4c514c7b9546756dbae91fcfbc92dc9/build.log`
[32m[1m    Building[22m[39m PyCall → `~/.julia/scratchspaces/44cfe95a-1eb2-52ea-b672-e2afdf69b78f/9816a3826b0ebf49ab4926e2b18842ad8b5c8f04/build.log`


PyCall.python=/Users/jingyi/.venv/qwen/bin/python
PyCall.libpython=/opt/anaconda3/lib/libpython3.13.dylib


CompletedProcess(args=['/Users/jingyi/Library/Application Support/InterpretableAI/julia/1.12.2/julia-1.12.2/bin/julia', '--startup-file=no', '-e', '\nimport Pkg\nENV["PYTHON"] = "/Users/jingyi/.venv/qwen/bin/python"\nENV["PYTHONHOME"] = ""\nPkg.build("PyCall")\nusing PyCall\nprintln("PyCall.python=", PyCall.python)\nprintln("PyCall.libpython=", PyCall.libpython)\n'], returncode=0)

In [5]:
  cell = """import os
  from pathlib import Path

  # Clean env that can break PyCall
  os.environ.pop("PYTHONHOME", None)
  os.environ.pop("PYTHONPATH", None)

  # Tell interpretableai where Julia + sysimage are
  os.environ["IAI_JULIA"] = "/Users/jingyi/Library/Application Support/InterpretableAI/julia/1.12.2/julia-1.12.2/bin/julia"
  os.environ["IAI_SYSTEM_IMAGE"] = "/Users/jingyi/Library/Application Support/InterpretableAI/sysimage/v3.2.2/sys.dylib"

  # Important: let IAI start Julia, but disable compiled modules
  os.environ["IAI_DISABLE_COMPILED_MODULES"] = "1"

  # License
  os.environ["IAI_LICENSE_FILE"] = "/Users/jingyi/Desktop/Trauma_LLM/iai.lic"

  # Now import
  from interpretableai import iai
  print("IAI import OK")
  """
  from pathlib import Path
  Path("iai_import_cell.py").write_text(cell)
  print("Saved: iai_import_cell.py  (run in notebook with: %run iai_import_cell.py)")



Saved: iai_import_cell.py  (run in notebook with: %run iai_import_cell.py)


In [6]:
%run iai_import_cell.py



IAI import OK


In [3]:
  # Toy model
  import numpy as np
  import pandas as pd
  from interpretableai import iai

  # Synthetic binary classification dataset
  rng = np.random.default_rng(1)
  X = pd.DataFrame(
      rng.normal(size=(300, 4)),
      columns=["x1", "x2", "x3", "x4"],
  )
  y = ((X["x1"] + 0.5 * X["x2"] - 0.2 * X["x3"]) > 0).astype(int)

  (train_X, train_y), (test_X, test_y) = iai.split_data(
      "classification", X, y, seed=1
  )

  grid = iai.GridSearch(
      iai.OptimalTreeClassifier(random_seed=1),
      max_depth=range(1, 4),
  )
  grid.fit(train_X, train_y)

  print(grid.get_learner())
  print("AUC:", grid.score(test_X, test_y, criterion="auc"))




Fitted OptimalTreeClassifier:
  1) Split: x2 < 0.2299
    2) Split: x1 < 0.39
      3) Predict: 0 (96.47%), [82,3], 85 points, error 0.03529
      4) Predict: 1 (88.89%), [4,32], 36 points, error 0.1111
    5) Split: x1 < -0.4499
      6) Predict: 0 (92.59%), [25,2], 27 points, error 0.07407
      7) Predict: 1 (93.55%), [4,58], 62 points, error 0.06452
AUC: 0.9216027874564459


In [7]:
   # Clean data
   # --- Inputs ---
  codebook_path = "/Users/jingyi/Desktop/Trauma_LLM/Feature_codebook/column_profile.xlsx"
  sheet_name = "summary"   # the sheet with columns: "column" and "Type"
  # X1 is assumed to exist in your notebook

  # --- Load codebook ---
  codebook = pd.read_excel(codebook_path, sheet_name=sheet_name)

  # Normalize column names for matching
  codebook["column_norm"] = codebook["column"].astype(str).str.strip()
  codebook["Type_norm"] = codebook["Type"].astype(str).str.strip().str.lower()

  # Normalize X1 columns
  x1_cols = pd.Series(X1.columns.astype(str)).str.strip()
  x1_norm = x1_cols

  # Map X1 columns to Type
  type_map = dict(zip(codebook["column_norm"], codebook["Type_norm"]))
  x1_types = x1_norm.map(type_map)

  # Check missing
  missing = x1_cols[x1_types.isna()]

  # Group
  cat_types = {"categorical", "categorical (numeric-coded)"}
  num_types = {"numeric"}

  cat_cols = x1_cols[x1_types.isin(cat_types)]
  num_cols = x1_cols[x1_types.isin(num_types)]

  # Any columns not in cat/num?
  other = x1_cols[~x1_types.isin(cat_types.union(num_types))]

  # Results
  print("Total X1 cols:", len(x1_cols))
  print("Categorical cols:", len(cat_cols))
  print("Numerical cols:", len(num_cols))

  if len(missing) > 0:
      print("\nMissing in codebook (not found in summary 'column'):")
      print(list(missing))

  if len(other) > 0:
      print("\nFound in codebook but Type not in cat/num:")
      print(pd.DataFrame({"column": other, "Type": x1_types[~x1_types.isin(cat_types.union(num_types))]}))

Total X1 cols: 1278
Categorical cols: 1052
Numerical cols: 226


In [8]:
  # map ecode
  import pandas as pd
  import numpy as np

  matrix_path = "/Users/jingyi/Desktop/Trauma_LLM/Feature_codebook/ICD10CM_NonPoisoning_Cause_Matrix.xlsx"
  matrix = pd.read_excel(matrix_path)

  # exact column names in codebook
  code_col = "Ecode"
  mech_col = "Mechanism"
  intent_col = "Intent"
  trauma_col = "Trauma_Type"

  # choose RAW ICD-10 columns here
  # If you saved raw columns earlier, use them:
  # src_cols = {"e1_icd10": "e1_icd10_raw", "e2_icd10": "e2_icd10_raw"}
  # Otherwise, re-load X1 and use the original columns:
  src_cols = {"e1_icd10": "e1_icd10", "e2_icd10": "e2_icd10"}

  def norm_code(s):
      if pd.isna(s):
          return None
      return str(s).strip().upper()

  def key6(s):
      s = norm_code(s)
      if not s:
          return None
      return s[:6] if len(s) > 6 else s  # keep dot, just truncate

  # build mapping based on first 6 chars
  matrix["code6"] = matrix[code_col].map(key6)
  matrix[mech_col] = matrix[mech_col].astype(str).str.replace("\n", " ").str.strip()
  matrix[intent_col] = matrix[intent_col].astype(str).str.replace("\n", " ").str.strip()
  matrix[trauma_col] = matrix[trauma_col].astype(str).str.replace("\n", " ").str.strip()
  matrix["label"] = matrix[mech_col] + "," + matrix[intent_col] + "," + matrix[trauma_col]

  map6 = dict(zip(matrix["code6"], matrix["label"]))
  code6_set = set(matrix["code6"].dropna())

  unmatched = {}

  for target_col, raw_col in src_cols.items():
      raw = X1[raw_col].map(norm_code)
      raw6 = raw.map(key6)

      # replace with label
      X1[target_col] = raw6.map(map6)

      # unmatched codes (first 6 chars)
      missing = raw6[~raw6.isna() & ~raw6.isin(code6_set)].unique()
      unmatched[target_col] = sorted(missing)

  # unique counts after replacement
  print("Unique counts after replacement (first 6 chars):")
  for col in src_cols.keys():
      print(f"{col}: {X1[col].dropna().nunique()}")

  print("\nUnmatched codes (first 6 chars):")
  for col, miss in unmatched.items():
      print(f"{col} ({len(miss)}): {miss[:50]}{' ...' if len(miss)>50 else ''}")


Unique counts after replacement (first 6 chars):
e1_icd10: 70
e2_icd10: 60

Unmatched codes (first 6 chars):
e1_icd10 (20): ['.', 'NA', 'P90', 'T04.8X', 'T20.20', 'T20.40', 'T21.26', 'T24.60', 'T33.52', 'T33.53', 'T78.8X', 'V43.5X', 'V48.5', 'W06.89', 'W09.XX', 'W17.78', 'W18.XX', 'W19.30', 'W49.50', 'X95.0X']
e2_icd10 (11): ['713', '<N/A>', '<UNK>', 'NA', 'T14.8X', 'T33.53', 'W230XX', 'W31.XX', 'Y93.64', 'Y93.84', 'Y93.H1']


In [9]:
  matrix_path = "/Users/jingyi/Desktop/Trauma_LLM/Feature_codebook/ICD10CM_NonPoisoning_Cause_Matrix.xlsx"
  matrix = pd.read_excel(matrix_path)

  code_col   = "Ecode"
  mech_col   = "Mechanism"
  intent_col = "Intent"
  trauma_col = "Trauma_Type"

  matrix[mech_col]   = matrix[mech_col].astype(str).str.replace("\n"," ").str.strip()
  matrix[intent_col] = matrix[intent_col].astype(str).str.replace("\n"," ").str.strip()
  matrix[trauma_col] = matrix[trauma_col].astype(str).str.replace("\n"," ").str.strip()
  matrix["label"] = matrix[mech_col] + "," + matrix[intent_col] + "," + matrix[trauma_col]

  matrix["code3"] = (
      matrix[code_col]
      .astype(str).str.strip().str.upper()
      .str.replace(".", "", regex=False)
      .str[:3]
  )
  map3 = dict(zip(matrix["code3"], matrix["label"]))

  unmatched = {
      "e1_icd10": ['.', 'NA', 'P90', 'T04.8X', 'T20.20', 'T20.40', 'T21.26', 'T24.60',
                   'T33.52', 'T33.53', 'T78.8X', 'V43.5X', 'V48.5', 'W06.89', 'W09.XX',
                   'W17.78', 'W18.XX', 'W19.30', 'W49.50', 'X95.0X'],
      "e2_icd10": ['713', '<N/A>', '<UNK>', 'NA', 'T14.8X', 'T33.53', 'W230XX',
                   'W31.XX', 'Y93.64', 'Y93.84', 'Y93.H1']
  }

  for col, miss in unmatched.items():
      miss_set = {m.upper() for m in miss}

      raw_col = f"{col}_raw" if f"{col}_raw" in X1.columns else col
      raw = X1[raw_col].astype(str).str.strip().str.upper()

      needs_fix = raw.isin(miss_set)
      dot_mask = needs_fix & raw.str.contains(".", regex=False)

      mapped3 = raw[dot_mask].str.replace(".", "", regex=False).str[:3].map(map3)
      X1.loc[dot_mask, col] = mapped3

      # any remaining unmatched -> 'na'
      still_unmapped = needs_fix & (X1[col].isna() | (X1[col].astype(str).str.lower() == "nan"))
      X1.loc[still_unmapped, col] = "na"

  # --- final checks ---
  print("Unique counts after replacement:")
  for col in unmatched.keys():
      print(f"{col}: {X1[col].dropna().nunique()}")

  # ensure no raw codes remain (must be label or 'na')
  for col in unmatched.keys():
      bad = X1[col].dropna().astype(str).str.contains(r"^[A-Z]\d|^\d", regex=True)
      print(f"{col} raw-code-like entries left:", bad.sum())


Unique counts after replacement:
e1_icd10: 70
e2_icd10: 60
e1_icd10 raw-code-like entries left: 0
e2_icd10 raw-code-like entries left: 0


In [10]:
  def is_code_like(s):
      s = str(s).strip()
      if s.lower() == "na":
          return False
      # letters/digits/dots only, no spaces
      return bool(re.fullmatch(r"[A-Z0-9.]+", s))

  def has_no_letters(s):
      s = str(s).strip()
      if s.lower() == "na":
          return False
      # no alphabetic letters anywhere
      return not bool(re.search(r"[A-Za-z]", s))
  target_columns1 =['e1_icd10','e2_icd10']
  for col in target_columns1:
      if col not in X1.columns:
          continue
      ser = X1[col].dropna().astype(str).str.strip()

      code_like = ser[ser.map(is_code_like)]
      no_letters = ser[ser.map(has_no_letters)]

      print(f"{col}: code-like={len(code_like)}, no-letters-not-na={len(no_letters)}")

      # show a few examples if any
      if len(code_like) > 0:
          print("  code-like examples:", code_like.unique()[:10])
      if len(no_letters) > 0:
          print("  no-letters examples:", no_letters.unique()[:10])

e1_icd10: code-like=0, no-letters-not-na=0
e2_icd10: code-like=0, no-letters-not-na=0


In [11]:
  # ---- Procedure code matching ----
  target_columns = []
  for i in range(1, 85):
      target_columns.append(f'pr_{i:02d}_i10')
  for i in range(1, 13):
      target_columns.append(f'rpd{i:02d}_i10')
  for i in range(1, 13):
      target_columns.append(f'rpt{i:02d}_i10')

  # ---- load CCSR mapping ----
  file_path = '/Users/jingyi/Desktop/Trauma_LLM/Feature_codebook/PRCCSR_v2026-1/PRCCSR_v2026-1.csv'
  ccsr_df = pd.read_csv(file_path)

  cols_to_keep = ['ICD-10-PCS', 'PRCCSR', 'PRCCSR DESCRIPTION']
  ccsr_df = ccsr_df[cols_to_keep].copy()

  for col in cols_to_keep:
      if ccsr_df[col].dtype == 'object':
          ccsr_df[col] = ccsr_df[col].astype(str).str.replace("'", "").str.strip()

  # ---- mapping dict (code -> description) ----
  code_to_desc = dict(zip(ccsr_df['ICD-10-PCS'], ccsr_df['PRCCSR DESCRIPTION']))
  code_set = set(code_to_desc.keys())

  # ---- replace codes with description + collect unmatched ----
  unmatched = {}
  for col in target_columns:
      if col not in X1.columns:
          continue

      raw = X1[col].astype(str).str.strip()
      # treat empty-like as NaN
      raw = raw.replace({"": np.nan, "NA": np.nan, "N/A": np.nan, "<NA>": np.nan, "<N/A>": np.nan})

      # map to description
      X1[col] = raw.map(code_to_desc)

      # unmatched codes (present in data but not in mapping)
      missing = raw[raw.notna() & ~raw.isin(code_set)].unique()
      if len(missing) > 0:
          unmatched[col] = sorted(missing)
          X1.loc[raw.isin(missing), col] = "na"

  # ---- report ----
  print("Total target columns found in X1:", sum(c in X1.columns for c in target_columns))

  # unique matched categories (across all target columns)
  all_desc = pd.unique(pd.concat([X1[c] for c in target_columns if c in X1.columns]))
  all_desc = [d for d in all_desc if pd.notna(d)]
  print("Unique PRCCSR DESCRIPTION count:", len(all_desc))

  print("\nUnmatched codes by column:")
  for col, miss in unmatched.items():
      print(f"{col} ({len(miss)}): {miss[:50]}{' ...' if len(miss)>50 else ''}")
  # unique categories per column
  for col in target_columns:
      if col in X1.columns:
          uniq = X1[col].dropna().unique()
          print(f"{col}: {len(uniq)} unique PRCCSR DESCRIPTION")

Total target columns found in X1: 108
Unique PRCCSR DESCRIPTION count: 288

Unmatched codes by column:
pr_01_i10 (24): ['0', '0HSV04Z', '0WPB3ZZ', '0WSG04Z', '2W32X3Z`', '2W35X32', '317093', '318091', '5A19552', '8E0KXYZ', '8W28ZZZ', '<n/a>', '<unk>', 'B4W0ZZZ', 'BBW211Z', 'BE28ZZZ', 'BW04ZZZ', 'BW18ZZZ', 'BW278ZZZ', 'BW27ZZZ', 'BW50ZZZ', 'IIIIIIII', 'OHQ0XZZ', 'OHQOXZZ']
pr_02_i10 (12): ['0B518EZ', '0QFS04Z', '0SQH06Z', '0W9920Z', '0W99302', '243RX1Z', '317090', '8R20ZZZ', '<n/a>', 'B2464ZZ', 'BR28ZZZ', 'BW24CZZ']
pr_03_i10 (14): ['0QC1XZZ', '0W9920Z', '20320N1', '2W35X32', '317090', '318091', '318094', '3E03TBZ', '3E10X9Z', 'BR251ZZ', 'BR28ZZZ', 'BR38ZZZ', 'BW221ZZ', 'NW251ZZ']
pr_04_i10 (11): ['030T3BZ', '03HY3ZZ', '0NSRXAA', '0W9920Z', '2W35X32', '317090', '317093', '318091', '3E10X8S', 'B242ZZA', 'BR28ZZZ']
pr_05_i10 (9): ['02QA0XX', '0BN3ZZZ', '0HQ1ZZZ', '0JB08ZZ', '2W35X32', '317090', '317093', '3E0TBCZ', 'OQSB04Z']
pr_06_i10 (7): ['0W9920Z', '270346', '2W35X32', '318091', '<n/a

In [12]:
  def is_code_like(s):
      s = str(s).strip()
      if s.lower() == "na":
          return False
      # letters/digits/dots only, no spaces
      return bool(re.fullmatch(r"[A-Z0-9.]+", s))

  def has_no_letters(s):
      s = str(s).strip()
      if s.lower() == "na":
          return False
      # no alphabetic letters anywhere
      return not bool(re.search(r"[A-Za-z]", s))

  for col in target_columns:
      if col not in X1.columns:
          continue
      ser = X1[col].dropna().astype(str).str.strip()

      code_like = ser[ser.map(is_code_like)]
      no_letters = ser[ser.map(has_no_letters)]

      print(f"{col}: code-like={len(code_like)}, no-letters-not-na={len(no_letters)}")

      # show a few examples if any
      if len(code_like) > 0:
          print("  code-like examples:", code_like.unique()[:10])
      if len(no_letters) > 0:
          print("  no-letters examples:", no_letters.unique()[:10])

pr_01_i10: code-like=0, no-letters-not-na=0
pr_02_i10: code-like=0, no-letters-not-na=0
pr_03_i10: code-like=0, no-letters-not-na=0
pr_04_i10: code-like=0, no-letters-not-na=0
pr_05_i10: code-like=0, no-letters-not-na=0
pr_06_i10: code-like=0, no-letters-not-na=0
pr_07_i10: code-like=0, no-letters-not-na=0
pr_08_i10: code-like=0, no-letters-not-na=0
pr_09_i10: code-like=0, no-letters-not-na=0
pr_10_i10: code-like=0, no-letters-not-na=0
pr_11_i10: code-like=0, no-letters-not-na=0
pr_12_i10: code-like=0, no-letters-not-na=0
pr_13_i10: code-like=0, no-letters-not-na=0
pr_14_i10: code-like=0, no-letters-not-na=0
pr_15_i10: code-like=0, no-letters-not-na=0
pr_16_i10: code-like=0, no-letters-not-na=0
pr_17_i10: code-like=0, no-letters-not-na=0
pr_18_i10: code-like=0, no-letters-not-na=0
pr_19_i10: code-like=0, no-letters-not-na=0
pr_20_i10: code-like=0, no-letters-not-na=0
pr_21_i10: code-like=0, no-letters-not-na=0
pr_22_i10: code-like=0, no-letters-not-na=0
pr_23_i10: code-like=0, no-lette

In [13]:
  import pandas as pd
  import numpy as np

  diag_cols = [f'icd10_{i:02d}' for i in range(1, 28)]

  file_path = '/Users/jingyi/Desktop/Trauma_LLM/Feature_codebook/DXCCSR-v2026-1/DXCCSR_v2026-1.csv'
  ccsr_df = pd.read_csv(file_path)

  cols_to_keep = [
      'ICD-10-CM CODE',
      'Default CCSR CATEGORY IP',
      'Default CCSR CATEGORY DESCRIPTION IP'
  ]
  ccsr_df = ccsr_df[cols_to_keep].copy()

  # clean columns
  for col in cols_to_keep:
      if ccsr_df[col].dtype == 'object':
          ccsr_df[col] = ccsr_df[col].astype(str).str.replace("'", "").str.strip()

  # normalize for matching: remove dots + whitespace, uppercase
  def norm_code(s):
      if pd.isna(s):
          return None
      s = str(s).strip().upper()
      s = s.replace(".", "")
      s = "".join(s.split())  # remove all whitespace
      return s if s else None

  # build mapping on normalized codes
  ccsr_df["code_norm"] = ccsr_df["ICD-10-CM CODE"].map(norm_code)
  code_to_desc = dict(zip(ccsr_df["code_norm"], ccsr_df["Default CCSR CATEGORY DESCRIPTION IP"]))
  code_set = set(code_to_desc.keys())

  unmatched = {}
  unique_counts = {}

  for col in diag_cols:
      if col not in X1.columns:
          continue

      # keep original (with dots) for unmatched reporting
      raw_orig = X1[col].astype(str).str.strip()
      raw_orig = raw_orig.replace({"": np.nan, "NA": np.nan, "N/A": np.nan, "<NA>": np.nan, "<N/A>": np.nan})

      # normalized for matching
      raw_norm = raw_orig.map(norm_code)

      # map to description
      X1[col] = raw_norm.map(code_to_desc)

      # unmatched (report original form)
      miss_mask = raw_norm.notna() & ~raw_norm.isin(code_set)
      missing = raw_orig[miss_mask].dropna().unique()
      if len(missing) > 0:
          unmatched[col] = sorted(missing)

      # unique matched categories
      unique_counts[col] = X1[col].dropna().nunique()

  print("Unique matched categories per column:")
  for col, n in unique_counts.items():
      print(f"{col}: {n}")

  print("\nUnmatched codes by column (original with dot):")
  for col, miss in unmatched.items():
      print(f"{col} ({len(miss)}): {miss[:50]}{' ...' if len(miss)>50 else ''}")


Columns (15,17) have mixed types. Specify dtype option on import or set low_memory=False.



Unique matched categories per column:
icd10_01: 27
icd10_02: 27
icd10_03: 27
icd10_04: 27
icd10_05: 27
icd10_06: 26
icd10_07: 26
icd10_08: 26
icd10_09: 26
icd10_10: 27
icd10_11: 27
icd10_12: 26
icd10_13: 26
icd10_14: 26
icd10_15: 25
icd10_16: 23
icd10_17: 24
icd10_18: 24
icd10_19: 24
icd10_20: 22
icd10_21: 21
icd10_22: 22
icd10_23: 21
icd10_24: 22
icd10_25: 20
icd10_26: 20
icd10_27: 19

Unmatched codes by column (original with dot):
icd10_01 (2): ['6402', 'T31.9']
icd10_06 (1): ['4502']
icd10_07 (2): ['4502', '999999']


In [14]:
      # --- handle unmatched replacements ---
      # Special case: T31.9 -> fixed category
      special_label = "Burn and corrosion, initial encounter"
      is_t319 = raw_orig.str.upper().eq("T31.9") | raw_orig.map(norm_code).eq("T319")
      X1.loc[is_t319, col] = special_label

      # All other unmatched -> "na"
      other_unmatched = miss_mask & ~is_t319
      X1.loc[other_unmatched, col] = "na"

In [15]:
  for col in diag_cols:
      if col not in X1.columns:
          continue
      ser = X1[col].dropna().astype(str).str.strip()

      code_like = ser[ser.map(is_code_like)]
      no_letters = ser[ser.map(has_no_letters)]

      print(f"{col}: code-like={len(code_like)}, no-letters-not-na={len(no_letters)}")

      # show a few examples if any
      if len(code_like) > 0:
          print("  code-like examples:", code_like.unique()[:10])
      if len(no_letters) > 0:
          print("  no-letters examples:", no_letters.unique()[:10])

icd10_01: code-like=0, no-letters-not-na=0
icd10_02: code-like=0, no-letters-not-na=0
icd10_03: code-like=0, no-letters-not-na=0
icd10_04: code-like=0, no-letters-not-na=0
icd10_05: code-like=0, no-letters-not-na=0
icd10_06: code-like=0, no-letters-not-na=0
icd10_07: code-like=0, no-letters-not-na=0
icd10_08: code-like=0, no-letters-not-na=0
icd10_09: code-like=0, no-letters-not-na=0
icd10_10: code-like=0, no-letters-not-na=0
icd10_11: code-like=0, no-letters-not-na=0
icd10_12: code-like=0, no-letters-not-na=0
icd10_13: code-like=0, no-letters-not-na=0
icd10_14: code-like=0, no-letters-not-na=0
icd10_15: code-like=0, no-letters-not-na=0
icd10_16: code-like=0, no-letters-not-na=0
icd10_17: code-like=0, no-letters-not-na=0
icd10_18: code-like=0, no-letters-not-na=0
icd10_19: code-like=0, no-letters-not-na=0
icd10_20: code-like=0, no-letters-not-na=0
icd10_21: code-like=0, no-letters-not-na=0
icd10_22: code-like=0, no-letters-not-na=0
icd10_23: code-like=0, no-letters-not-na=0
icd10_24: c

In [16]:
  # check the work
  target_columns = []
  for i in range(1, 85):
      target_columns.append(f'pr_{i:02d}_i10')
  for i in range(1, 13):
      target_columns.append(f'rpd{i:02d}_i10')
  for i in range(1, 13):
      target_columns.append(f'rpt{i:02d}_i10')

  diag_cols = [f'icd10_{i:02d}' for i in range(1, 28)]

  # keep only columns that exist in X1
  cols_to_show = ["e1_icd10", "e2_icd10"] \
                 + [c for c in target_columns if c in X1.columns] \
                 + [c for c in diag_cols if c in X1.columns]

  # show first 10 rows
  display(X1[cols_to_show].head(10))

Unnamed: 0,e1_icd10,e2_icd10,pr_01_i10,pr_02_i10,pr_03_i10,pr_04_i10,pr_05_i10,pr_06_i10,pr_07_i10,pr_08_i10,...,icd10_18,icd10_19,icd10_20,icd10_21,icd10_22,icd10_23,icd10_24,icd10_25,icd10_26,icd10_27
0,"Fall,Unintentional,Blunt",,Ultrasonography,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Computerized tomography (CT) with contrast,Computerized tomography (CT) without contrast,,,,...,,,,,,,,,,
1,"MVT Occupant,Unintentional,Blunt",,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,,,,,,,...,,,,,,,,,,
2,"Fall,Unintentional,Blunt",,Plain radiography,Electrocardiogram (ECG),Computerized tomography (CT) without contrast,Plain radiography,Plain radiography,,,,...,,,,,,,,,,
3,"Fall,Unintentional,Blunt",,Plain radiography,Electrocardiogram (ECG),Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Plain radiography,Plain radiography,,...,,,,,,,,,,
4,"Fall,Unintentional,Blunt",,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Plain radiography,Plain radiography,Cardiac monitoring,,,...,,,,,,,,,,
5,"Fall,Unintentional,Blunt",,,,,,,,,,...,,,,,,,,,,
6,"Fall,Unintentional,Blunt",,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Plain radiography,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Cardiac monitoring,...,,,,,,,,,,
7,"Fall,Unintentional,Blunt",,Plain radiography,Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Plain radiography,Ultrasonography,Plain radiography,,,...,,,,,,,,,,
8,"Fall,Unintentional,Blunt",,Electrocardiogram (ECG),Computerized tomography (CT) without contrast,Computerized tomography (CT) without contrast,Plain radiography,Other cardiovascular system measurement and mo...,,,,...,,,,,,,,,,
9,"MVT Occupant,Unintentional,Blunt",,Plain radiography,Plain radiography,Plain radiography,Plain radiography,Plain radiography,Plain radiography,Plain radiography,Fluoroscopic angiography (excluding coronary),...,,,,,,,,,,


In [17]:
#MISSING RATE
import os
import pandas as pd
import numpy as np

# ---- inputs you already have ----
# df = your dataframe
# x1_cols = [...]
# cat_cols = [...]
# num_cols = [...]
df= X1
# ---- define missing tokens (case-insensitive) ----
missing_tokens = {"", "na", "n/a", "nan", "<n,a>", "none", "null"}

def _missing_mask(s: pd.Series) -> pd.Series:
    if pd.api.types.is_string_dtype(s) or s.dtype == "O":
        s_str = s.astype("string")
        norm = s_str.str.strip().str.lower()
        token_missing = norm.isin(missing_tokens)
        return s.isna() | token_missing
    return s.isna()

# choose which columns to analyze
cols = x1_cols 

# compute missing rate per column
missing_rate = {c: _missing_mask(df[c]).mean() for c in cols}

# optional: see which missing tokens actually appear in your data
tokens_found = (
    pd.concat([df[c] for c in cols], axis=0)
      .astype("string")
      .str.strip()
      .str.lower()
      .value_counts()
)
tokens_found = tokens_found[tokens_found.index.isin(missing_tokens)]
print("Missing tokens found:\n", tokens_found)


cat_set = set(cat_cols.astype(str).str.strip().tolist())
num_set = set(num_cols.astype(str).str.strip().tolist())
x1_list = x1_cols.astype(str).str.strip().tolist()

def _cat_label(col):
    if col in cat_set:
        return "cat"
    if col in num_set:
        return "num"
    return "other"
summary_df = pd.DataFrame({
    "feature": cols,
    "Cat": [ _cat_label(c) for c in cols ],
    "missing_rate": [ missing_rate[c] for c in cols ],
})

# save to Desktop
out_path = os.path.expanduser("~/Desktop/feature_missing_rates.csv")
summary_df.to_csv(out_path, index=False)
print("Saved:", out_path)

Missing tokens found:
 na    8283325
Name: count, dtype: Int64
Saved: /Users/jingyi/Desktop/feature_missing_rates.csv


In [17]:
labels = np.asarray(X1_labels)

# Remove noise (-1) from cluster set
unique_clusters = sorted(set(labels) - {-1})
print("Number of clusters (excluding -1):", len(unique_clusters))
print("Cluster IDs (first 20):", unique_clusters[:20])

Number of clusters (excluding -1): 8
Cluster IDs (first 20): [np.int32(0), np.int32(1), np.int32(2), np.int32(3), np.int32(4), np.int32(5), np.int32(6), np.int32(7)]


In [18]:
  # Decision Tree for X1
  #--------------------------
  # 0) Inputs
  # --------------------------
  MISSING_TOKENS = {
      "", " ", "  ", "\t", "\n", "\r",
      "na", "n/a", "nan", "null", "none", "nil",
      ".", "..", "...",
      "<unk>", "unk", "unknown", "missing", "nan", "na"
  }

  # X1, cat_cols, num_cols already defined
  # X1_labels loaded from npz

  # --------------------------
  # 1) Drop inc_key + align columns
  # --------------------------
  X = X1.drop(columns=["inc_key","C15_dt_mins"], errors="ignore").copy()

  cat_cols = [c for c in cat_cols if c in X.columns]
  num_cols = [c for c in num_cols if c in X.columns]

  # --------------------------
  # 2) Encode missingness + impute
  # --------------------------
  missing_indicator_cols = []

  for col in cat_cols + num_cols:
      s = X[col]

      # detect missing tokens (string-aware)
      s_str = s.astype(str).str.strip().str.lower()
      miss = s.isna() | s_str.isin(MISSING_TOKENS)

      # add indicator
      miss_col = f"{col}__missing"
      X[miss_col] = miss.astype(int)
      missing_indicator_cols.append(miss_col)

      # impute
      if col in cat_cols:
          X.loc[miss, col] = "MISSING"
          X[col] = X[col].astype(str)
      else:
          X[col] = pd.to_numeric(s, errors="coerce")
          #X.loc[miss, col] = np.nan
          miss2 = miss | X[col].isna()
          X.loc[miss2, col] = 0

  # update numeric columns to include missing indicators
  cat_cols = cat_cols + missing_indicator_cols


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use `newframe = frame.copy()`


DataFrame is highly fragmented.  This is usually the result of calling `frame.insert` many times, which has poor performance.  Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented fr

In [19]:
  # --------------------------
  # 3) Prepare labels (drop noise cluster -1 if present)
  # --------------------------
  y = X1_labels
  mask = y != -1
  X = X.loc[mask].reset_index(drop=True)
  y = y[mask]

In [20]:

  # --- fix feature types before training ---
  cat_cols = [c for c in cat_cols if c in X.columns]
  num_cols = [c for c in num_cols if c in X.columns]

  # remove any overlap
  overlap = set(cat_cols) & set(num_cols)
  if overlap:
      print("Removing from num_cols (categorical):", overlap)
      num_cols = [c for c in num_cols if c not in overlap]

  # force categorical dtype
  for c in cat_cols:
      X[c] = X[c].astype("category")

  # force numeric dtype
  for c in num_cols:
      X[c] = pd.to_numeric(X[c], errors="coerce")

In [22]:
  # --------------------------
  # 4) Train Optimal  (with raw feature dataset)
  # --------------------------
  (train_X, train_y), (test_X, test_y) = iai.split_data(
    "classification", X, y, seed=1)

  grid = iai.GridSearch(
      iai.OptimalTreeClassifier(random_seed=1),
      max_depth=5,
  )
  grid.fit(train_X, train_y)

  learner = grid.get_learner()
  print(learner)
  print("Test accuracy:", grid.score(test_X, test_y))

[33m[1m│ [22m[39m- e1_icd10
[33m[1m│ [22m[39m- MechanismType
[33m[1m│ [22m[39m- age_y
[33m[1m│ [22m[39m- county
[33m[1m│ [22m[39m- scene_prov
[33m[1m│ [22m[39m- eddispo
[33m[1m│ [22m[39m- prot_dev_1
[33m[1m│ [22m[39m- prot_dev_2
[33m[1m│ [22m[39m- rpd01_i10
[33m[1m│ [22m[39m- rpd02_i10
[33m[1m│ [22m[39m- rpd03_i10
[33m[1m│ [22m[39m- rpd04_i10
[33m[1m│ [22m[39m- rpd05_i10
[33m[1m│ [22m[39m- rpd06_i10
[33m[1m│ [22m[39m- rpd07_i10
[33m[1m│ [22m[39m- rpd08_i10
[33m[1m│ [22m[39m- rpd09_i10
[33m[1m│ [22m[39m- rpd10_i10
[33m[1m│ [22m[39m- rpd11_i10
[33m[1m│ [22m[39m- rpd12_i10
[33m[1m│ [22m[39m- rpt01_i10
[33m[1m│ [22m[39m- rpt02_i10
[33m[1m│ [22m[39m- rpt03_i10
[33m[1m│ [22m[39m- rpt04_i10
[33m[1m│ [22m[39m- rpt05_i10
[33m[1m│ [22m[39m- rpt06_i10
[33m[1m│ [22m[39m- rpt07_i10
[33m[1m│ [22m[39m- rpt08_i10
[33m[1m│ [22m[39m- rpt09_i10
[33m[1m│ [22m[39m- rpt10_i10
[33m[1m│ [

Fitted OptimalTreeClassifier:
  1) Split: PH_brady in [0.0,1.0] or is missing
    2) Split: pr_10_i10 in [Arterial oxygen saturation monitoring,Cholecystectomy,Colonoscopy and proctoscopy with biopsy,Common bile duct sphincterotomy and stenting,Diagnostic audiology,Electrophysiologic studies,MISSING,Measurement and monitoring, NEC,Nasal and sinus excision,Robotic-assisted procedures,Thoracentesis (diagnostic),Thyroidectomy]
      3) Split: iftx_prov in [6.0,7.0,MISSING]
        4) Split: pr_06_i10 in [Abdominal wall procedures, NEC,Abdominal wall repair (including hernia),Above knee and other proximal lower extremity amputation,Administration of nutritional and electrolytic substances,Arterial oxygen saturation monitoring,Artery, vein, and great vessel procedures, NEC,Arthrocentesis,Bone and joint biopsy,Bronchoscopy (therapeutic),Cardiac chest compression,Cardiac stress tests,Chest wall procedures, NEC,Colectomy,Control of bleeding (non-endoscopic),Coronary artery bypass grafts (CABG)

[33m[1m│ [22m[39m- pr_36_i10
[33m[1m│ [22m[39m- pr_37_i10
[33m[1m│ [22m[39m- pr_38_i10
[33m[1m│ [22m[39m- pr_39_i10
[33m[1m│ [22m[39m- pr_40_i10
[33m[1m│ [22m[39m- pr_41_i10
[33m[1m│ [22m[39m- pr_42_i10
[33m[1m│ [22m[39m- pr_43_i10
[33m[1m│ [22m[39m- pr_44_i10
[33m[1m│ [22m[39m- pr_45_i10
[33m[1m│ [22m[39m- pr_46_i10
[33m[1m│ [22m[39m- pr_47_i10
[33m[1m│ [22m[39m- pr_48_i10
[33m[1m│ [22m[39m- pr_49_i10
[33m[1m│ [22m[39m- pr_50_i10
[33m[1m│ [22m[39m- pr_51_i10
[33m[1m│ [22m[39m- pr_52_i10
[33m[1m│ [22m[39m- pr_53_i10
[33m[1m│ [22m[39m- pr_54_i10
[33m[1m│ [22m[39m- pr_55_i10
[33m[1m│ [22m[39m- pr_56_i10
[33m[1m│ [22m[39m- pr_57_i10
[33m[1m│ [22m[39m- pr_58_i10
[33m[1m│ [22m[39m- pr_59_i10
[33m[1m│ [22m[39m- pr_60_i10
[33m[1m│ [22m[39m- pr_61_i10
[33m[1m│ [22m[39m- pr_62_i10
[33m[1m│ [22m[39m- pr_63_i10
[33m[1m│ [22m[39m- pr_64_i10
[33m[1m│ [22m[39m- pr_65_i10
[33m[1m│ 

Test accuracy: 0.8575897320132865


In [23]:
  import json

  test_acc = grid.score(test_X, test_y)

  # save learner
  learner.write_json("learner_raw_feature_missing_col.json")

  # (optional) save grid search too
  grid.write_json("grid.json")

  # save metrics
  with open("metrics.json", "w") as f:
      json.dump({"test_accuracy": float(test_acc)}, f, indent=2)

  print("Saved: learner.json, grid.json (optional), metrics.json")

Saved: learner.json, grid.json (optional), metrics.json


In [24]:
  from IPython.display import IFrame, display

  html_path = "optimal_tree.html"
  learner.write_html(html_path)

  display(IFrame(src=html_path, width=1100, height=800))

In [111]:
  #reload
  import json
  from interpretableai.iaibase import read_json

  learner = read_json("learner.json")

  with open("metrics.json", "r") as f:
      metrics = json.load(f)

  print("Reloaded test_accuracy:", metrics["test_accuracy"])


Reloaded test_accuracy: 0.8583314521590506


In [25]:
  html_path = "/Users/jingyi/Desktop/Trauma_LLM/iai_X1/optimal_tree_missing.html"
  learner.write_html(html_path)
  print("Saved to:", html_path)

Saved to: /Users/jingyi/Desktop/Trauma_LLM/iai_X1/optimal_tree_missing.html


In [26]:
X["iftx_HR"]

0          98.0
1           0.0
2           0.0
3           0.0
4           0.0
          ...  
103357    124.0
103358      0.0
103359      0.0
103360     98.0
103361     98.0
Name: iftx_HR, Length: 103362, dtype: float64