In [45]:
import os
import pandas as pd
import numpy as np

# Paths
PROJ_PATH = '/data/gusev/USERS/jpconnor/clinical_text_project/'
PROFILE_PATH = '/data/gusev/PROFILE/CLINICAL/'
DATA_PATH = os.path.join(PROJ_PATH, 'data/')
MARKER_PATH = os.path.join(DATA_PATH, 'biomarker_analysis/')
OncDRS_PATH = '/data/gusev/PROFILE/CLINICAL/OncDRS/ALL_2025_03/'
IPTW_RESULTS_PATH = os.path.join(MARKER_PATH, 'IPTW_runs/')
FIGURE_PATH = os.path.join(PROJ_PATH, 'figures/')
MARKER_FIG_PATH = os.path.join(FIGURE_PATH, 'biomarker_analysis/')
IPTW_FIG_PATH = os.path.join(MARKER_FIG_PATH, 'IPTW_figures/')
KM_FIG_PATH = os.path.join(IPTW_FIG_PATH, 'KM_curves/')

os.listdir('/data/gusev/PROFILE/CLINICAL/OncDRS/ALL_2025_03/')

interaction_IO_df = pd.read_csv(os.path.join(MARKER_PATH, 'IPTW_IO_interaction_runs_df.csv'))

required_vars = ['DFCI_MRN', 'tt_death', 'death']
panel_cols = [col for col in interaction_IO_df.columns if 'PANEL_VERSION' in col]
cancer_type_cols = [col for col in interaction_IO_df.columns if col.startswith('CANCER_TYPE_')]
biomarker_cols = [col for col in interaction_IO_df.columns if col not in (['GENDER', 'AGE_AT_TREATMENTSTART'] + required_vars + panel_cols + cancer_type_cols + ['PX_on_IO', 'IO_prediction'])]

In [48]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
from tqdm import tqdm
from statsmodels.stats.multitest import multipletests

def test_marker_associations(df, marker_cols, score_col="prognostic_score"):
    results = []

    for marker in tqdm(marker_cols):
        d = df[[marker, score_col]].dropna()

        # ensure binary 0/1
        if not set(d[marker].unique()).issubset({0,1}):
            continue

        X = sm.add_constant(d[marker])
        y = d[score_col]

        model = sm.OLS(y, X).fit()

        beta = model.params[marker]
        ci_low, ci_high = model.conf_int().loc[marker]
        pval = model.pvalues[marker]

        results.append({
            "marker": marker,
            "n": len(d),
            "beta_mean_difference": beta,
            "ci_low": ci_low,
            "ci_high": ci_high,
            "p_value": pval,
            "mean_marker_pos": d.loc[d[marker]==1, score_col].mean(),
            "mean_marker_neg": d.loc[d[marker]==0, score_col].mean()
        })

    results_df = pd.DataFrame(results)

    # multiple testing correction
    reject, p_adj, _, _ = multipletests(results_df["p_value"], method="fdr_bh")
    results_df["p_adj_fdr"] = p_adj
    results_df["significant_fdr"] = reject

    return results_df.sort_values("p_adj_fdr")

association_df = test_marker_associations(interaction_IO_df, biomarker_cols, score_col='IO_prediction')

100%|██████████| 529/529 [00:02<00:00, 248.79it/s]


In [52]:
association_df.loc[association_df['significant_fdr']].sort_values(by='beta_mean_difference')

Unnamed: 0,marker,n,beta_mean_difference,ci_low,ci_high,p_value,mean_marker_pos,mean_marker_neg,p_adj_fdr,significant_fdr
231,PIK3CA,4151,-0.059425,-0.087005,-0.031846,2.447526e-05,0.158487,0.217912,,True
117,TP53,4151,-0.041405,-0.060428,-0.022382,2.022375e-05,0.189724,0.231129,,True
265,FAT1,4151,0.053384,0.023136,0.083631,0.0005454119,0.257166,0.203782,,True
249,SETBP1,4151,0.056404,0.01662,0.096188,0.005468014,0.26269,0.206286,,True
399,KMT2D,4151,0.060832,0.033255,0.088408,1.562572e-05,0.262173,0.201341,,True
261,ASXL1,4151,0.061099,0.018056,0.104142,0.005410891,0.267673,0.206574,,True
402,COL7A1,4151,0.06299,0.031722,0.094259,7.962753e-05,0.266219,0.203229,,True
130,FLT4,4151,0.063997,0.022229,0.105765,0.002681204,0.270206,0.206209,,True
24,ALK,4151,0.067364,0.028653,0.106075,0.0006517987,0.272739,0.205374,,True
161,ATR,4151,0.067895,0.022696,0.113094,0.003247316,0.274462,0.206567,,True


In [None]:
full_marker_set_df = pd.read_csv(os.path.join(IPTW_RESULTS_PATH, 'compiled_IPTW_IO_marker_df.csv'))

In [7]:
lung_w_ALK = interaction_IO_df.loc[(interaction_IO_df['ALK']) & (interaction_IO_df['CANCER_TYPE_LUNG'])]
lung_w_B2M = interaction_IO_df.loc[(interaction_IO_df['B2M']) & (interaction_IO_df['CANCER_TYPE_LUNG'])]

In [8]:
lung_w_ALK

Unnamed: 0,DFCI_MRN,tt_death,death,GENDER,AGE_AT_TREATMENTSTART,RHPN2,CANCER_TYPE_HEAD_AND_NECK,SH2B3,GPC3,TLR4,...,EPCAM,IDH2,CDK5,ERCC1,KDM5A,GNB2L1,SMC3,DDR2,PX_on_IO,IO_prediction
50,269901,38.0,1,1.0,60,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0.323090
57,277029,2525.0,0,1.0,68,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,1,0,1,0.449380
61,283551,9.0,1,0.0,72,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0.005341
62,286228,513.0,1,1.0,63,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0.900327
87,321233,797.0,1,1.0,78,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,1,0.001867
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2874,757761,2920.0,0,1.0,70,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,1.0,0.0,0,0,0,0.015077
3346,843571,276.0,1,1.0,77,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0.954666
3438,862596,126.0,1,1.0,67,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0.661483
3593,902186,330.0,0,1.0,76,0.0,False,0,0,0.0,...,0.0,0,0.0,0.0,0.0,0.0,0,0,0,0.803477


In [10]:
spec_data = pd.read_csv(os.path.join(OncDRS_PATH, 'SOMATIC_SPECIMEN.csv'))
cnv_results = pd.read_csv(os.path.join(OncDRS_PATH, 'SOMATIC_CNV_RESULTS.csv'))
mut_results = pd.read_csv(os.path.join(OncDRS_PATH, 'SOMATIC_MUTATION_RESULTS.csv'))
sv_results = pd.read_csv(os.path.join(OncDRS_PATH, 'SOMATIC_SV_RESULTS.csv'))

In [39]:
mut_results.loc[(mut_results['DFCI_MRN'].isin(lung_w_ALK['DFCI_MRN'])) & 
                (mut_results['GENE'] == 'ALK')][['DFCI_MRN', 'GENE', 'VARIANT_TYPE']].drop_duplicates()

Unnamed: 0,DFCI_MRN,GENE,VARIANT_TYPE
2193,599567,ALK,SNP
7524,212064,ALK,SNP
11669,740459,ALK,SNP
16956,662854,ALK,SNP
17611,870339,ALK,SNP
...,...,...,...
573881,660635,ALK,SNP
577529,671006,ALK,SNP
601094,344011,ALK,SNP
615175,796398,ALK,SNP


In [32]:
cnv_results.loc[(cnv_results['DFCI_MRN'].isin(lung_w_ALK['DFCI_MRN'])) &
                (cnv_results['GENE'] == 'ALK'), ['DFCI_MRN', 'GENE', 'CNV_TYPE_DESCR']].drop_duplicates()

Unnamed: 0,DFCI_MRN,GENE,CNV_TYPE_DESCR
68587,842865,ALK,Low-level Gain
454959,757761,ALK,Low-level Gain
661943,715479,ALK,One-copy or Heterozygous Deletion
705172,344011,ALK,Low-level Gain
838173,906562,ALK,Low-level Gain
1489174,1011130,ALK,Low-level Gain
2063002,714977,ALK,Low-level Gain
2540415,843571,ALK,Low-level Gain


In [30]:
sv_results.loc[(sv_results['DFCI_MRN'].isin(lung_w_ALK['DFCI_MRN'])) & 
               ((sv_results['PARTNER1_HUGO_GENE_NM'] == 'ALK') | (sv_results['PARTNER2_HUGO_GENE_NM'] == 'ALK')),
               ['DFCI_MRN', 'SV_TYPE', 'PARTNER1_HUGO_GENE_NM', 'PARTNER2_HUGO_GENE_NM']].drop_duplicates()

Unnamed: 0,DFCI_MRN,SV_TYPE,PARTNER1_HUGO_GENE_NM,PARTNER2_HUGO_GENE_NM
330,662854,Rearrangement,ALK,EML4
331,662854,Rearrangement,EML4,ALK
338,662854,Rearrangement_inversion,ALK,EML4
7651,667190,Not Available from Source Panel,EML4,ALK
12406,715479,Not Available from Source Panel,ALK,EML4


In [40]:
mut_results.loc[(mut_results['DFCI_MRN'].isin(lung_w_B2M['DFCI_MRN'])) & 
                (mut_results['GENE'] == 'B2M')][['DFCI_MRN', 'GENE', 'VARIANT_TYPE']].drop_duplicates()

Unnamed: 0,DFCI_MRN,GENE,VARIANT_TYPE
23924,273358,B2M,SNP
39791,702590,B2M,SNP
99637,841987,B2M,DEL
117131,739016,B2M,SNP
123638,779691,B2M,INS
138417,767084,B2M,DEL
151851,788780,B2M,DEL
176395,569400,B2M,SNP
197587,702581,B2M,SNP
327902,774443,B2M,DEL


In [None]:
cnv_results.loc[(cnv_results['DFCI_MRN'].isin(lung_w_ALK['DFCI_MRN'])) &
                (cnv_results['GENE'] == 'ALK'), ['DFCI_MRN', 'GENE', 'CNV_TYPE_DESCR']].drop_duplicates()