In [2]:
import os
import time
import pickle
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 
from lifelines import CoxPHFitter
from sklearn.preprocessing import StandardScaler
from sksurv.metrics import concordance_index_censored
from statsmodels.stats.multitest import multipletests
from embed_surv_utils import run_grid_CoxPH_parallel, get_heldout_risk_scores_CoxPH, generate_survival_embedding_df

random.seed(42)  # pin Python's RNG so reruns are repeatable

# --- Filesystem layout -------------------------------------------------------
IO_PATH = '/data/gusev/USERS/mjsaleh/'
PROJ_PATH = '/data/gusev/USERS/jpconnor/clinical_text_project/'
FIGURE_PATH = os.path.join(PROJ_PATH, 'figures/model_metrics/')
DATA_PATH = os.path.join(PROJ_PATH, 'data/')
SURV_PATH = os.path.join(DATA_PATH, 'survival_data/')
RESULTS_PATH = os.path.join(SURV_PATH, 'results/')
NOTES_PATH = os.path.join(DATA_PATH, 'batched_datasets/VTE_data/processed_datasets/')
STAGE_PATH = '/data/gusev/PROFILE/CLINICAL/OncDRS/DERIVED_FROM_CLINICAL_TEXTS_2024_03/derived_files/cancer_stage/'

# --- Input tables ------------------------------------------------------------
# One row per patient: only the first record for each DFCI_MRN is kept.
biomarker_csv = os.path.join(SURV_PATH, 'IO_biomarker_discovery.csv')
biomarker_df = pd.read_csv(biomarker_csv).drop_duplicates(subset=['DFCI_MRN'], keep='first')

io_start_csv = os.path.join(IO_PATH, 'IO_START.csv')
irAE_df = pd.read_csv(io_start_csv, index_col=0).rename(columns={'MRN' : 'DFCI_MRN'})

# Analysis cohort: biomarker rows whose MRN also appears in the IO start table.
io_mrns = irAE_df['DFCI_MRN'].unique()
death_df = biomarker_df.loc[biomarker_df['DFCI_MRN'].isin(io_mrns)].copy()

# --- Covariate lists ---------------------------------------------------------
# Indicator columns are only used as adjusters if they are non-empty in the cohort.
cancer_type_vars = [c for c in death_df.columns
                    if c.startswith('CANCER_TYPE') and death_df[c].sum() > 0]
panel_version_vars = [c for c in death_df.columns
                      if c.startswith('PANEL_VERSION') and death_df[c].sum() > 0]
base_vars = ['GENDER', 'AGE_AT_TREATMENTSTART'] + cancer_type_vars + panel_version_vars

# Every remaining column (not an identifier, outcome or adjuster) is a candidate marker.
non_marker_cols = set(['DFCI_MRN', 'tt_death', 'death'] + base_vars)
genomics_cols = [c for c in biomarker_df.columns if c not in non_marker_cols]

In [5]:
# Patient counts per specific cancer type (the 'OTHER' buckets are left out), largest first.
specific_type_cols = [c for c in death_df.columns
                      if c.startswith('CANCER_TYPE_') and 'OTHER' not in c]
cancer_type_counts = death_df[specific_type_cols].sum(axis=0).sort_values(ascending=False)

# Only cancer types with at least 100 patients get their own stratified analysis.
cancer_types_to_test = [name for name, n_patients in cancer_type_counts.items() if n_patients >= 100]

MARKER_PATH = os.path.join(DATA_PATH, 'biomarker_analysis/text_risk_runs/')

In [None]:
# Per-cancer-type biomarker scan.
#
# For every cancer type in `cancer_types_to_test`, each candidate marker present in
# at least 1% of that type's patients is tested in two robust Cox PH models of
# death: one adjusted for `base_vars` only, and one that additionally adjusts for
# the text-derived 'IO_risk_score'. P-values are Benjamini-Hochberg corrected
# within each cancer type (separately for the two models) and markers significant
# in either model are written to MARKER_PATH as all_sig_<type>_hits.csv.

base_vars = ['GENDER', 'AGE_AT_TREATMENTSTART'] + \
            [col for col in death_df if col.startswith('PANEL_VERSION') and death_df[col].sum() > 0]

# Columns that must never be treated as candidate markers: identifiers, outcomes,
# adjusters, the cancer-type indicators (we stratify on those), and the text risk
# score itself. Without excluding 'IO_risk_score' it would be "tested" as a marker
# and the with-text model would select the same column twice.
non_marker_cols = set(
    ['DFCI_MRN', 'tt_death', 'death', 'IO_risk_score']
    + base_vars
    + [col for col in biomarker_df.columns if col.startswith('CANCER_TYPE')]
)
genomics_cols = [col for col in biomarker_df.columns if col not in non_marker_cols]


def _fit_marker_cox(frame, covariates, test_col, suffix):
    """Fit one robust Cox PH model and pull out the row for `test_col`.

    Parameters
    ----------
    frame : DataFrame holding 'tt_death', 'death' and every column in `covariates`.
    covariates : list of column names to enter the model (must include `test_col`).
    test_col : covariate whose summary row is returned.
    suffix : string appended to every summary column name (e.g. '_with_text_risk').

    Returns
    -------
    (row, c_index) : a one-row DataFrame (index reset) with suffixed column names,
    and the fitted model's concordance index.
    """
    cph = CoxPHFitter()
    cph.fit(frame[['tt_death', 'death'] + covariates],
            duration_col='tt_death', event_col='death', robust=True)
    summary = cph.summary.reset_index()
    row = summary.loc[summary['covariate'] == test_col].copy()
    row.columns = [col + suffix for col in row.columns]
    return row.reset_index(drop=True), cph.concordance_index_


# Make sure the export directory exists before any model fitting starts.
os.makedirs(MARKER_PATH, exist_ok=True)

for cancer_type in cancer_types_to_test:

    print(f'Starting {cancer_type}.')
    start_time = time.time()

    # Force a boolean mask so a 0/1-coded indicator column cannot be misread as row labels.
    # NOTE(review): assumes the indicator has no missing values - confirm against the CSV.
    type_df = death_df.loc[death_df[cancer_type].astype(bool)].copy()

    marker_dfs = []
    failed_markers = []  # (marker, error) pairs, reported after the loop
    markers_to_test = [m for m in genomics_cols if (type_df[m].sum()/len(type_df)) >= 0.01]
    for test_col in tqdm(markers_to_test):
        try:
            base_entry, base_c_index = _fit_marker_cox(
                type_df, base_vars + [test_col], test_col, '_without_text_risk')
            risk_entry, risk_c_index = _fit_marker_cox(
                type_df, base_vars + ['IO_risk_score', test_col], test_col, '_with_text_risk')
        except Exception as e:
            # Best effort: a marker whose model cannot be fit is skipped, but recorded.
            failed_markers.append((test_col, repr(e)))
            continue

        entry = pd.concat([base_entry, risk_entry], axis=1)
        entry.insert(0, 'covariate', test_col)
        entry.insert(1, 'c_index_without_text_risk', base_c_index)
        entry.insert(1, 'c_index_with_text_risk', risk_c_index)
        marker_dfs.append(entry)

    if failed_markers:
        first_marker, first_error = failed_markers[0]
        print(f'{cancer_type}: {len(failed_markers)} of {len(markers_to_test)} marker fits failed '
              f'and were skipped (first: {first_marker}: {first_error}).')

    if not marker_dfs:
        print(f'{cancer_type} skipped: no marker model could be fit. '
              f'Time elapsed = {(time.time() - start_time) / 60 : 0.2f}')
        continue

    try:
        type_IO_marker_df = pd.concat(marker_dfs, ignore_index=True)

        # Benjamini-Hochberg FDR at 5%, applied separately to each model's p-values.
        reject, pvals_corrected, _, _ = multipletests(type_IO_marker_df["p_without_text_risk"], alpha=0.05, method="fdr_bh")
        risk_reject, risk_pvals_corrected, _, _ = multipletests(type_IO_marker_df["p_with_text_risk"], alpha=0.05, method="fdr_bh")

        type_IO_marker_df['corrected_p_without_text_risk'] = pvals_corrected
        type_IO_marker_df['corrected_p_with_text_risk'] = risk_pvals_corrected

        type_IO_marker_df['significant_without_text_risk'] = reject
        type_IO_marker_df['significant_with_text_risk'] = risk_reject

        # Keep markers that are significant in at least one of the two models.
        all_sig_type_hits = type_IO_marker_df.loc[(type_IO_marker_df['significant_without_text_risk']) |
                                                  (type_IO_marker_df['significant_with_text_risk'])]

        # Build the label outside the f-string: nested same-type quotes only parse on Python >= 3.12.
        type_label = cancer_type.replace('CANCER_TYPE_', '').lower()
        all_sig_type_hits.to_csv(os.path.join(MARKER_PATH, f'all_sig_{type_label}_hits.csv'), index=False)

        print(f'{cancer_type} finished. Time elapsed = {(time.time() - start_time) / 60 : 0.2f}')

    except Exception as e:
        # Keep scanning the remaining cancer types, but do not report a failure as success.
        print(f'{cancer_type} FAILED during FDR correction/export: {e!r}. '
              f'Time elapsed = {(time.time() - start_time) / 60 : 0.2f}')

Starting CANCER_TYPE_LUNG.


  6%|▌         | 22/371 [00:05<01:30,  3.84it/s]

In [56]:
# Gather the per-cancer-type hit tables written by the scan into one frame.
# Only files matching all_sig_<type>_hits.csv are read, so stray entries in the
# directory (checkpoint folders, other exports) are ignored, and the listing is
# sorted so the row order is the same on every run.
HIT_PREFIX, HIT_SUFFIX = 'all_sig_', '_hits.csv'

marker_files = []
for filename in sorted(os.listdir(MARKER_PATH)):
    if not (filename.startswith(HIT_PREFIX) and filename.endswith(HIT_SUFFIX)):
        continue
    df = pd.read_csv(os.path.join(MARKER_PATH, filename))
    # The cancer type label is whatever sits between the fixed prefix and suffix.
    df['cancer_type'] = filename[len(HIT_PREFIX):-len(HIT_SUFFIX)]
    marker_files.append(df)

cols_to_keep = ['covariate', 'cancer_type', 'coef_without_text_risk', 'p_without_text_risk', 'corrected_p_without_text_risk', 
                'significant_without_text_risk', 'coef_with_text_risk', 'p_with_text_risk', 'corrected_p_with_text_risk', 'significant_with_text_risk']
# ignore_index gives every row a unique label instead of repeating each file's 0..n range.
full_marker_df = pd.concat(marker_files, ignore_index=True)[cols_to_keep]

# Change in the marker's Cox coefficient once the text risk score is adjusted for.
full_marker_df['delta_coef_with_text'] = full_marker_df['coef_with_text_risk'] - full_marker_df['coef_without_text_risk']

# Number of FDR-significant markers per cancer type under each model. Kept as the
# cell's final expression so the notebook renders it instead of discarding it.
full_marker_df.groupby(by='cancer_type')[['significant_without_text_risk', 'significant_with_text_risk']].sum().reset_index()

In [57]:
# Flag markers whose FDR significance flips when the text risk score is added.
sig_with_text = full_marker_df['significant_with_text_risk']
sig_without_text = full_marker_df['significant_without_text_risk']

# Significant only after adjusting for the text risk score.
full_marker_df['text_makes_sig'] = sig_with_text & ~sig_without_text

# Significant only before adjusting for the text risk score.
full_marker_df['text_makes_not_sig'] = ~sig_with_text & sig_without_text

In [59]:
# Mean and spread of the coefficient shift, per cancer type, for markers that become
# significant only with the text risk score. std is NaN where a type has a single such marker.
(
    full_marker_df[full_marker_df['text_makes_sig']]
    .groupby('cancer_type')['delta_coef_with_text']
    .agg(mean='mean', std='std')
    .reset_index()
)

Unnamed: 0,cancer_type,mean,std
0,bladder,-0.197213,0.380478
1,bowel,0.108156,0.305281
2,brain,0.122341,0.206573
3,breast,-0.438157,
4,cup,-0.613623,0.818656
5,head_and_neck,-0.053416,0.149836
6,kidney,0.055553,0.34903
7,lung,0.07978,
8,skin,0.36913,
9,stomach,0.055419,0.385749


In [60]:
# Mean and spread of the coefficient shift, per cancer type, for markers that lose
# significance once the text risk score is included in the model.
(
    full_marker_df[full_marker_df['text_makes_not_sig']]
    .groupby('cancer_type')['delta_coef_with_text']
    .agg(mean='mean', std='std')
    .reset_index()
)

Unnamed: 0,cancer_type,mean,std
0,bladder,-0.20255,0.251988
1,bowel,0.068727,0.190807
2,brain,-0.096665,0.244874
3,breast,-0.104834,0.243587
4,cup,-0.294969,0.577283
5,head_and_neck,0.044036,0.065165
6,kidney,-0.160207,0.379193
7,skin,-0.151572,0.392123
8,stomach,-0.342194,0.480784
