In [13]:
# === Imports ===
import os
import pickle
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sksurv.metrics import concordance_index_censored
from embed_surv_utils import run_grid_CoxPH_parallel, get_heldout_risk_scores_CoxPH

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# === Paths ===
# Input directory outside the project tree that holds the derived cancer-stage files.
STAGE_PATH = '/data/gusev/PROFILE/CLINICAL/OncDRS/DERIVED_FROM_CLINICAL_TEXTS_2024_03/derived_files/cancer_stage/'

# Project tree: every path below is derived from PROJ_PATH.
PROJ_PATH = '/data/gusev/USERS/jpconnor/clinical_text_project/'
FIGURE_PATH = os.path.join(PROJ_PATH, 'figures/model_metrics/')
DATA_PATH = os.path.join(PROJ_PATH, 'data/')
SURV_PATH = os.path.join(DATA_PATH, 'survival_data/')
RESULTS_PATH = os.path.join(SURV_PATH, 'results/')
# OUTPUT_PATH holds one sub-folder per event (it is listed later to enumerate events);
# HELD_OUT_PATH receives this notebook's per-event outputs.
OUTPUT_PATH = os.path.join(RESULTS_PATH, 'phecode_complete_model/')
HELD_OUT_PATH = os.path.join(RESULTS_PATH, 'phecode_single_modality_held_out_risk/')

# The held-out folder is the only directory this section creates.
os.makedirs(HELD_OUT_PATH, exist_ok=True)

# NOTE(review): presumably raises the joblib worker timeout for the parallel helpers - confirm.
os.environ.update({"JOBLIB_DEFAULT_WORKER_TIMEOUT": "600"})

# Load the time-decayed events table (one row per patient is assumed downstream - TODO confirm).
time_decayed_events_df = pd.read_csv(os.path.join(SURV_PATH, 'time-to-phecode/time_decayed_events_df.csv'))

# Load the MRN -> derived cancer stage mapping.
# The file is opened in a `with` block so the handle is closed even if unpickling fails.
# NOTE: pickle.load can execute arbitrary code; only load this file from a trusted location.
with open(os.path.join(STAGE_PATH, 'dfci_cancer_mrn_to_derived_cancer_stage.pkl'), 'rb') as stage_file:
    mrn_stage_dict = pickle.load(stage_file)

# One-hot encode the stage; drop_first=True leaves the first stage level as the reference.
mrn_stage_df = pd.get_dummies(
    pd.DataFrame({'DFCI_MRN': list(mrn_stage_dict.keys()),
                  'CANCER_STAGE': list(mrn_stage_dict.values())}),
    columns=['CANCER_STAGE'], drop_first=True)

# Load cancer types
# Cancer types with fewer rows than this in the cohort are pooled into a single 'OTHER' level.
MIN_PATIENTS_PER_CANCER_TYPE = 500

cancer_type_df = pd.read_csv(
    '/data/gusev/PROFILE/CLINICAL/robust_VTE_pred_project_2025_03_cohort/data/first_treatments_dfci_w_inferred_cancers.csv',
    usecols=['DFCI_MRN', 'med_genomics_merged_cancer_group']).rename(columns={'med_genomics_merged_cancer_group': 'CANCER_TYPE'})

# Restrict to patients present in the events table. The explicit .copy() makes the subset an
# independent frame, so the column assignment below is not a chained assignment on a slice
# (which raises SettingWithCopyWarning and is not guaranteed to take effect).
cancer_type_sub = cancer_type_df.loc[
    cancer_type_df['DFCI_MRN'].isin(time_decayed_events_df['DFCI_MRN'].unique())].copy()

cancer_type_counts = cancer_type_sub['CANCER_TYPE'].value_counts()
types_to_keep = cancer_type_counts[cancer_type_counts >= MIN_PATIENTS_PER_CANCER_TYPE].index.tolist()

# Anything not in types_to_keep (including missing values) becomes 'OTHER'.
cancer_type_sub['CANCER_TYPE'] = cancer_type_sub['CANCER_TYPE'].where(
    cancer_type_sub['CANCER_TYPE'].isin(types_to_keep), 'OTHER')
cancer_type_sub = pd.get_dummies(cancer_type_sub, columns=['CANCER_TYPE'], drop_first=True)

# Load genomics
# Identifier / pipeline bookkeeping columns that are removed before the mutation table
# is used as model input.
SOMATIC_METADATA_COLS = ['sample_id', 'cbio_sample_id', 'cbio_patient_id',
                         'onco_tree_code', 'briefcase', 'riker_pipeline_version',
                         'riker_run_version', 'CANCER_TYPE']

somatic_raw = pd.read_csv(os.path.join(DATA_PATH, 'PROFILE_2024_MUTATION_CARRIERS.csv'))
# Keep the first row per patient, drop the metadata, and one-hot encode the panel version.
somatic_df = pd.get_dummies(
    somatic_raw.drop_duplicates(subset='DFCI_MRN', keep='first').drop(columns=SOMATIC_METADATA_COLS),
    columns=['PANEL_VERSION'])

# Load PRS
idmap = pd.read_csv("/data/gusev/PROFILE/CLINICAL/PROFILE_2024_idmap.csv").rename(columns={'MRN' : 'DFCI_MRN'})
# Sample-id -> MRN lookup, with the sample id renamed to 'IID'.
sample_to_mrn = idmap[['cbio_sample_id', 'DFCI_MRN']].rename(columns={'cbio_sample_id' : 'IID'})
pgs_matrix = pd.read_csv('/data/gusev/USERS/mjsaleh/PRS_PGScatalog/pgs_matrix_with_avg.tsv', sep='\t')
# No join key is given, so merge() joins on whatever columns the two frames share
# (presumably just 'IID' - TODO confirm); then keep the first row per patient.
prs_df = pgs_matrix.merge(sample_to_mrn).drop_duplicates(subset='DFCI_MRN', keep='first')

# Load treatment data
# Read the lines-of-therapy table, normalise the key/date column names, parse the start date
# and order each patient's lines chronologically.
treatment_df = (
    pd.read_csv("/data/gusev/USERS/mjsaleh/profile_lines_of_rx/profile_rxlines.csv")
    .rename(columns={"MRN": "DFCI_MRN", "LOT_start_date": "treatment_start_date"})
    .assign(treatment_start_date=lambda lines: pd.to_datetime(lines["treatment_start_date"]))
    .sort_values(["DFCI_MRN", "treatment_start_date"])
)
# 1-based position of each line within its patient, following the sort order above.
treatment_df["treatment_line"] = treatment_df.groupby("DFCI_MRN").cumcount() + 1

# Strip ';' from the treatment-type string and split on whitespace, giving one row per token
# (the original row index is repeated for lines with several tokens).
treatments = treatment_df["Treatment_type"].str.replace(";", "", regex=False)
treatments = treatments.str.split().explode()

# Indicator column per token, collapsed back to one row per original line with max().
# NOTE(review): drop_first=True removes one token's indicator; because a line can carry
# several tokens this is not a plain reference-level drop - confirm this is intended.
dummies = (
    pd.get_dummies(treatments, prefix="PX_on", drop_first=True)
    .groupby(level=0)
    .max()
)
treatment_df = pd.concat([treatment_df, dummies], axis=1)

# define prediction column types
def _columns_matching(frame, keep):
    """List the column labels of `frame`, in frame order, for which `keep(label)` is true."""
    return [label for label in frame.columns if keep(label)]


somatic_mutation_cols = _columns_matching(somatic_df, lambda label: label != 'DFCI_MRN')
prs_cols = _columns_matching(prs_df, lambda label: 'PGS' in label)
treatment_cols = _columns_matching(treatment_df, lambda label: label.startswith("PX_on_"))
cancer_type_cols = _columns_matching(cancer_type_sub, lambda label: label.startswith('CANCER_TYPE'))
stage_cols = _columns_matching(mrn_stage_df, lambda label: label.startswith('CANCER_STAGE'))
embed_cols = _columns_matching(time_decayed_events_df,
                               lambda label: 'EMBEDDING' in label or '2015' in label)
base_vars = ['GENDER', 'AGE_AT_TREATMENTSTART']
continuous_vars = ['AGE_AT_TREATMENTSTART'] + embed_cols + prs_cols

# NOTE(review): the tuple is (columns, 1500); 1500 is presumably the number of PCA
# components expected by the helper's `pca_config` argument - confirm against embed_surv_utils.
prs_pca_dict = {'PGS' : (prs_cols, 1500)}

# Only each patient's first treatment line contributes treatment features.
first_line_treatments = treatment_df.loc[treatment_df['treatment_line'] == 1]

# Join every modality onto the events table by patient, in this order. merge() defaults to an
# inner join, so only patients present in all tables remain.
_modality_frames = (
    (somatic_df, somatic_mutation_cols),
    (prs_df, prs_cols),
    (first_line_treatments, treatment_cols),
    (cancer_type_sub, cancer_type_cols),
    (mrn_stage_df, stage_cols),
)
full_prediction_df = time_decayed_events_df
for _frame, _cols in _modality_frames:
    full_prediction_df = full_prediction_df.merge(_frame[['DFCI_MRN'] + _cols], on='DFCI_MRN')

# Find all time-to-event columns
# An event name is whatever follows the first underscore of a column starting with 'tt'.
# NOTE(review): the filter is 'tt', not 'tt_' - confirm no unrelated column shares that prefix.
events = [name.split('_', 1)[1]
          for name in time_decayed_events_df.columns
          if name.startswith('tt')]
tt_events = ["tt_" + event_name for event_name in events]

# CoxPH hyperparameters
l1_ratios = [0.5, 1.0]
alphas_to_test = np.logspace(-5, 0, 30)
max_iter = 1000

# Per-event frequency: column sum of the event columns divided by the number of rows.
events_data = time_decayed_events_df[events]
event_freq = events_data.sum(axis=0).div(len(events_data))
# Keep events whose frequency is at least 5%.
events_to_include = event_freq.loc[event_freq.ge(0.05)].index

# Alternative kept for reference: restrict to events without a folder under OUTPUT_PATH yet.
# final_events = list(set(events_to_include).difference(set(os.listdir(OUTPUT_PATH))))
final_events = events_to_include

# The events to process are the entries found under OUTPUT_PATH (this replaces the
# column-derived `events` list built earlier in the cell).
events = os.listdir(OUTPUT_PATH)
import functools


def _best_coxph_hyperparams(event, modality):
    """Return (l1_ratio, alpha) from the validation row with the highest 'mean_auc(t)'.

    Reads ``type_<modality>_val.csv`` from the event's folder under OUTPUT_PATH.
    Raises whatever ``pd.read_csv`` / column lookup raises if the file or columns are missing.
    """
    val_metrics = pd.read_csv(os.path.join(OUTPUT_PATH, event, f'type_{modality}_val.csv'))
    best_row = val_metrics.sort_values(by='mean_auc(t)', ascending=False).iloc[0]
    return best_row['l1_ratio'], best_row['alpha']


# event -> error summary for every event that was skipped, so failures can be inspected afterwards.
failed_events = {}

for event in tqdm(events):
    EVENT_PATH = os.path.join(HELD_OUT_PATH, event)
    os.makedirs(EVENT_PATH, exist_ok=True)

    # Keep only rows whose time-to-event is strictly positive.
    event_pred_df = full_prediction_df.loc[full_prediction_df[f'tt_{event}'] > 0].copy()

    try:
        # Best penalty settings per modality, taken from the earlier validation runs.
        somatic_l1_ratio, somatic_alpha = _best_coxph_hyperparams(event, 'somatic')
        prs_l1_ratio, prs_alpha = _best_coxph_hyperparams(event, 'prs')
        text_l1_ratio, text_alpha = _best_coxph_hyperparams(event, 'text')

        # Unpenalized models: stage and treatment features on top of the base + cancer-type covariates.
        stage_risk_predictions = get_heldout_risk_scores_CoxPH(
            event_pred_df, base_vars + cancer_type_cols + stage_cols, ['AGE_AT_TREATMENTSTART'], [],
            event_col=event, tstop_col=f'tt_{event}', penalized=False
        ).rename(columns={'risk_score' : 'stage_risk_score'})

        treatment_risk_predictions = get_heldout_risk_scores_CoxPH(
            event_pred_df, base_vars + cancer_type_cols + treatment_cols, ['AGE_AT_TREATMENTSTART'], [],
            event_col=event, tstop_col=f'tt_{event}', penalized=False
        ).rename(columns={'risk_score' : 'treatment_risk_score'})

        # Penalized models, each using the hyperparameters selected above.
        somatic_risk_predictions = get_heldout_risk_scores_CoxPH(
            event_pred_df, base_vars + cancer_type_cols, ['AGE_AT_TREATMENTSTART'], somatic_mutation_cols,
            event_col=event, tstop_col=f'tt_{event}', penalized=True,
            l1_ratio=somatic_l1_ratio, alpha=somatic_alpha
        ).rename(columns={'risk_score' : 'somatic_risk_score'})

        prs_risk_predictions = get_heldout_risk_scores_CoxPH(
            event_pred_df, base_vars + cancer_type_cols, ['AGE_AT_TREATMENTSTART'], [],
            pca_config=prs_pca_dict,
            event_col=event, tstop_col=f'tt_{event}', penalized=True,
            l1_ratio=prs_l1_ratio, alpha=prs_alpha
        ).rename(columns={'risk_score' : 'prs_risk_score'})

        text_risk_predictions = get_heldout_risk_scores_CoxPH(
            event_pred_df, base_vars + cancer_type_cols, ['AGE_AT_TREATMENTSTART'] + embed_cols, embed_cols,
            event_col=event, tstop_col=f'tt_{event}', penalized=True,
            l1_ratio=text_l1_ratio, alpha=text_alpha
        ).rename(columns={'risk_score' : 'text_risk_score'})

        # Inner-join the five per-modality score tables on patient and save them.
        complete_risk_predictions = functools.reduce(
            lambda left, right: pd.merge(left, right, on="DFCI_MRN"),
            [stage_risk_predictions, treatment_risk_predictions, somatic_risk_predictions,
             prs_risk_predictions, text_risk_predictions])
        complete_risk_predictions.to_csv(os.path.join(EVENT_PATH, 'held_out_risk_predictions.csv'), index=False)
    except Exception as exc:
        # Best-effort: one failing event must not stop the run, but the failure is recorded and
        # reported instead of being silently discarded. Catching Exception (not a bare except)
        # lets KeyboardInterrupt / SystemExit still stop the loop.
        failed_events[event] = f'{type(exc).__name__}: {exc}'
        tqdm.write(f'Skipping {event}: {failed_events[event]}')
        continue

    # A score column that is entirely NaN means that modality produced no usable predictions
    # for this event; flag it so the saved file is not mistaken for a complete result.
    empty_score_cols = [col for col in complete_risk_predictions.columns
                        if complete_risk_predictions[col].isna().all()]
    if empty_score_cols:
        tqdm.write(f'Warning [{event}]: all-NaN columns in saved predictions: {empty_score_cols}')

100%|██████████| 53/53 [03:05<00:00,  3.51s/it]


In [15]:
# Show the merged risk-score table left in the kernel by the loop above: it belongs to the most
# recent iteration that reached the merge step, not to an explicitly chosen event.
complete_risk_predictions

Unnamed: 0,DFCI_MRN,stage_risk_score,treatment_risk_score,somatic_risk_score,prs_risk_score,text_risk_score
0,103663,,,1.144351,0.0,0.963060
1,107014,,,-0.623163,0.0,-0.671033
2,107217,,,1.286928,0.0,2.011979
3,124902,,,0.538326,0.0,1.121317
4,125735,,,-0.751291,0.0,-0.659296
...,...,...,...,...,...,...
4898,1150732,,,-0.150184,0.0,-0.620990
4899,1152105,,,0.437459,0.0,0.241728
4900,1152526,,,-0.851810,0.0,-1.508803
4901,1152712,,,0.747425,0.0,0.443991


In [16]:
# List what is stored in the 'death' folder under HELD_OUT_PATH.
os.listdir(os.path.join(HELD_OUT_PATH, 'death'))

['held_out_risk_predictions.csv']

In [17]:
# Read the saved held-out risk predictions for the 'death' event back from disk.
death_predictions_csv = os.path.join(HELD_OUT_PATH, 'death/held_out_risk_predictions.csv')
held_out_death = pd.read_csv(death_predictions_csv)

In [18]:
# Display the reloaded 'death' predictions (one risk-score column per modality).
held_out_death

Unnamed: 0,DFCI_MRN,stage_risk_score,treatment_risk_score,somatic_risk_score,prs_risk_score,text_risk_score
0,103663,,,0.222335,0.0,0.624907
1,107014,,,-0.021524,0.0,0.598990
2,107217,,,0.443498,0.0,0.750736
3,124902,,,0.699940,0.0,0.421456
4,125735,,,-0.752659,0.0,-1.279567
...,...,...,...,...,...,...
4898,1150732,,,-0.425592,0.0,-0.396113
4899,1152105,,,-0.539143,0.0,-2.778417
4900,1152526,,,-0.875323,0.0,-1.393201
4901,1152712,,,0.247219,0.0,-0.850213


In [19]:
# Re-run only the unpenalized stage model to inspect its output in isolation.
# `event` and `event_pred_df` are whatever the loop cell left in the kernel (its last iteration),
# so this cell depends on that cell having been run first.
stage_feature_cols = base_vars + cancer_type_cols + stage_cols
stage_risk_predictions = get_heldout_risk_scores_CoxPH(
    event_pred_df, stage_feature_cols, ['AGE_AT_TREATMENTSTART'], [],
    event_col=event, tstop_col=f'tt_{event}', penalized=False,
).rename(columns={'risk_score' : 'stage_risk_score'})

In [20]:
# Display the stage-model scores produced by the previous cell.
stage_risk_predictions

Unnamed: 0,DFCI_MRN,stage_risk_score
0,103663,
1,107014,
2,107217,
3,124902,
4,125735,
...,...,...
4898,1150732,
4899,1152105,
4900,1152526,
4901,1152712,


Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: Matrix is singular.
Fold failure: 