In [7]:
import os
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from sklearn.preprocessing import StandardScaler
from sksurv.metrics import concordance_index_censored
from statsmodels.stats.multitest import multipletests
from lifelines import CoxPHFitter
from embed_surv_utils import run_grid_CoxPH_parallel, get_heldout_risk_scores_CoxPH, generate_survival_embedding_df
from statsmodels.stats.multitest import multipletests
from scipy import stats

# Seed every global RNG from a single constant. The stdlib `random` module and
# NumPy's legacy global generator keep independent state, so seeding only
# `random` leaves any `np.random` draw unseeded.
SEED = 42
random.seed(SEED)     # stdlib RNG
np.random.seed(SEED)  # NumPy global RNG

# Paths
# Every directory constant keeps its trailing slash; the derived paths are
# built from PROJ_PATH / DATA_PATH with os.path.join.
IO_PATH = '/data/gusev/USERS/mjsaleh/'
PROJ_PATH = '/data/gusev/USERS/jpconnor/clinical_text_project/'
FIGURE_PATH = os.path.join(PROJ_PATH, 'figures/model_metrics/')
DATA_PATH = os.path.join(PROJ_PATH, 'data/')
SURV_PATH = os.path.join(DATA_PATH, 'survival_data/')
RESULTS_PATH = os.path.join(SURV_PATH, 'results/')
NOTES_PATH = os.path.join(DATA_PATH, 'batched_datasets/VTE_data/processed_datasets/')
STAGE_PATH = '/data/gusev/PROFILE/CLINICAL/OncDRS/DERIVED_FROM_CLINICAL_TEXTS_2024_03/derived_files/cancer_stage/'
MARKER_PATH = os.path.join(DATA_PATH, 'biomarker_analysis/')

def _load_marker_csv(relative_path):
    """Read the CSV at `relative_path` (relative to MARKER_PATH) into a DataFrame."""
    return pd.read_csv(os.path.join(MARKER_PATH, relative_path))


# Frame consumed by the IPTW cells further down.
interaction_IO_df = _load_marker_csv('IPTW_IO_interaction_runs_df.csv')

# Pan-cancer IO predictive-marker results, with and without IPTW (per the file names).
results_df = _load_marker_csv('IPTW_runs/pan_cancer_IPTW_IO_predictive_markers.csv')
results_noiptw_df = _load_marker_csv('IPTW_runs/pan_cancer_noIPTW_IO_predictive_markers.csv')

In [15]:
# Distinct values of the `classifier` column in the IPTW results table.
results_df.loc[:, 'classifier'].unique()

array(['IO_specific_effect', 'predictive_IO_harm', 'no_signal',
       'predictive_IO_benefit', 'prognostic_nonIO'], dtype=object)

In [11]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from lifelines import KaplanMeierFitter, CoxPHFitter

# -----------------------------
# Stabilized, truncated IPTW weights plus their effective sample size.
#
# Assumed columns:
# PX_on_IO       : 0/1 treatment indicator
# tt_death       : time-to-death (>0)
# death          : 0/1 event indicator
# IO_prediction  : propensity score P(PX_on_IO = 1 | X)
# -----------------------------

# Work on a copy so re-running this cell always starts from the loaded frame.
df = interaction_IO_df.copy()

# Drop invalid rows.
# Comparisons against NaN evaluate to False, so rows with a missing tt_death
# or IO_prediction are already removed by the first three conditions.
# The indicators need an explicit check: a missing PX_on_IO fails `== 1` below
# and would silently receive the control-arm weight, and a missing death value
# would be carried into the survival fits that consume this frame.
df = df.loc[
    (df["tt_death"] > 0) &
    (df["IO_prediction"] > 0) &
    (df["IO_prediction"] < 1) &
    df["PX_on_IO"].notna() &
    df["death"].notna()
].copy()

# Stabilized IPTW: marginal treatment probability in the numerator,
# propensity of the treatment actually received in the denominator.
p_treat = df["PX_on_IO"].mean()
df["iptw"] = np.where(
    df["PX_on_IO"] == 1,
    p_treat / df["IO_prediction"],
    (1 - p_treat) / (1 - df["IO_prediction"])
)

# Truncate extreme weights (1st–99th percentile)
lo, hi = df["iptw"].quantile([0.01, 0.99])
df["iptw"] = df["iptw"].clip(lo, hi)

# Effective sample size (Kish): (sum of weights)^2 / sum of squared weights
ess = (df["iptw"].sum() ** 2) / (df["iptw"] ** 2).sum()
print(f"Effective sample size (IPTW): {ess:.1f}")

Effective sample size (IPTW): 113.8


In [None]:
# -----------------------------
# IPTW-adjusted Kaplan–Meier
# -----------------------------
# One fitter is reused for both arms: each arm is fitted and drawn onto the
# shared axes before the next fit overwrites the fitter's state.
kmf = KaplanMeierFitter()

fig, ax = plt.subplots(figsize=(7, 5))

for arm, arm_label in ((0, "No IO"), (1, "On IO")):
    arm_df = df.loc[df["PX_on_IO"] == arm]
    kmf.fit(
        arm_df["tt_death"],
        event_observed=arm_df["death"],
        weights=arm_df["iptw"],
        label=arm_label,
    )
    kmf.plot_survival_function(ax=ax, ci_show=False)

ax.set_title("IPTW-adjusted Kaplan–Meier: Overall Survival")
ax.set_xlabel("Time")
ax.set_ylabel("Survival probability")
fig.tight_layout()
plt.show()

# -----------------------------
# Weighted Cox model (robust SE)
# -----------------------------
# Only the duration, event, treatment and weight columns are passed in, so
# PX_on_IO is the sole covariate of the model.
cox_columns = ["tt_death", "death", "PX_on_IO", "iptw"]

cph = CoxPHFitter()
cph.fit(
    df[cox_columns],
    duration_col="tt_death",
    event_col="death",
    weights_col="iptw",
    robust=True,
)

cph.print_summary()