In [29]:
import os 
import re
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from itertools import product
from tqdm import tqdm
import time

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored, integrated_brier_score

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import sem

import importlib.util
# Load the helper module `script_utils` directly from an absolute file path:
# build an import spec for the file, create an empty module object from that
# spec, then execute the file's code inside that module object.
# NOTE(review): `script_utils` is not referenced in the visible cells — confirm
# it is still needed before keeping this hard-coded path.
spec = importlib.util.spec_from_file_location('script_utils', '/PHShome/jpc91/clinical_text_project/python_scripts/script_utils.py') 
script_utils = importlib.util.module_from_spec(spec)	 
spec.loader.exec_module(script_utils)

import warnings
# Install a global "ignore" filter with no category/module restriction, so every
# warning raised after this point is suppressed.
# NOTE(review): this also hides deprecation and numerical warnings from the
# libraries imported above — consider narrowing the filter.
warnings.filterwarnings("ignore")

# Absolute directory roots; every other path below is derived from these two.
oncDRS_path = '/data/gusev/PROFILE/CLINICAL/OncDRS/'
genomic_data_path = f'{oncDRS_path}ALL_2024_05/'

data_path = '/data/gusev/USERS/jpconnor/clinical_text_project/data/'
surv_path = f'{data_path}survival_data/'
notes_path = f'{data_path}batched_datasets/VTE_data/processed_datasets/'

# Read the two per-variant tables; the first CSV column becomes the row index.
# NOTE(review): the `*_text_df` variable is loaded from the "base_profiling"
# file and the `*_no_text_df` variable from the "text_profiling" file — confirm
# this pairing of variable names and file names is intended.
delta_c_indices_text_df = pd.read_csv(
    f'{surv_path}genomics_delta_c_index_base_profiling.csv',
    index_col=0,
)
delta_c_indices_no_text_df = pd.read_csv(
    f'{surv_path}genomics_delta_c_index_text_profiling.csv',
    index_col=0,
)

In [30]:
# Preview the frame bound to `delta_c_indices_text_df`; as the cell's last
# expression it is rendered by the notebook (truncated for large frames).
delta_c_indices_text_df

Unnamed: 0,VAR,LUNG,UTERUS,BRAIN,LYMPHOMA,SKIN,BOWEL,PANCREAS,OTHER,CUP,...,SOFT_TISSUE,MYELOMA,LIVER,VULVA,TESTIS,KIDNEY,LEUKEMIA,BONE,ADRENAL_GLAND,PAN_CANCER
0,TP53_SNP,-0.000849,-0.001764,-0.002773,0.001418,-0.001390,0.000764,0.000550,0.001803,0.001837,...,-0.000414,0.004842,0.005369,0.000000,0.000000,-0.000539,-0.001405,0.012698,-0.029412,0.000127
1,MYC_LA,-0.000463,-0.000934,-0.000576,0.000591,0.000800,-0.000124,-0.000353,0.000081,-0.000270,...,0.001436,-0.000215,-0.004027,0.000000,0.021583,-0.001348,-0.000468,-0.003175,0.000000,-0.000343
2,RAD21_LA,-0.000516,-0.000664,-0.000547,0.000591,0.000918,0.000056,-0.000345,0.000131,-0.000351,...,0.001290,-0.000323,-0.006711,0.007692,0.021583,-0.001348,-0.000468,-0.003175,0.000000,-0.000347
3,EXT1_LA,-0.000687,-0.000498,-0.000684,0.000355,0.000957,-0.000009,-0.000715,0.000104,-0.000054,...,0.001485,-0.000323,-0.006711,0.007692,0.021583,-0.000809,-0.000702,-0.003175,0.000000,-0.000330
4,TP53_1DEL,-0.000134,0.000145,0.001035,0.001182,0.000511,-0.000366,0.001413,-0.000003,0.000837,...,-0.000097,0.000323,0.000000,0.000000,0.007194,-0.001348,-0.001171,0.000000,-0.029412,-0.000288
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,RELA_SNP,-0.000182,0.000166,-0.000107,0.000591,-0.000013,-0.000201,-0.000329,-0.000018,0.000041,...,0.000049,-0.000646,0.000000,0.000000,0.000000,0.000000,0.000234,0.000000,0.000000,-0.000034
1271,CXCR4_SNP,-0.000111,-0.000083,-0.000088,0.000946,-0.000406,0.000023,-0.000041,0.000020,0.000027,...,-0.000390,0.000215,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,-0.000020
1272,PRPF40B_SNP,-0.000652,0.000622,-0.002158,-0.004846,-0.000026,0.000144,0.000600,-0.000674,-0.001013,...,-0.000974,0.000430,0.000000,0.000000,0.007194,-0.003236,-0.003512,0.000000,0.000000,-0.000399
1273,TFE3_SNP,-0.000004,0.000041,-0.000166,-0.000118,-0.000354,0.000161,-0.000156,-0.000002,0.000162,...,-0.000024,0.000215,0.000000,0.000000,0.000000,0.000270,0.000000,0.000000,0.000000,-0.000012


In [31]:
# Preview the frame bound to `delta_c_indices_no_text_df`; as the cell's last
# expression it is rendered by the notebook (truncated for large frames).
delta_c_indices_no_text_df

Unnamed: 0,VAR,LUNG,UTERUS,BRAIN,LYMPHOMA,SKIN,BOWEL,PANCREAS,OTHER,CUP,...,SOFT_TISSUE,MYELOMA,LIVER,VULVA,TESTIS,KIDNEY,LEUKEMIA,BONE,ADRENAL_GLAND,PAN_CANCER
0,TP53_SNP,0.020904,0.025781,-0.037270,-0.003841,-0.002288,0.007704,0.021743,0.031845,0.017893,...,0.040996,0.035130,0.013423,0.038462,0.057554,0.026160,0.031140,0.088889,-0.264706,0.022553
1,MYC_LA,0.008793,0.019173,-0.010098,-0.011878,0.041849,0.008355,0.008023,0.000399,-0.006205,...,0.024698,0.005487,0.052349,0.084615,-0.021583,0.007956,0.002341,-0.006349,-0.088235,0.003692
2,RAD21_LA,0.007343,0.020002,-0.009131,-0.009278,0.037142,0.007148,0.009777,0.001071,-0.001182,...,0.020815,0.003766,0.046980,0.115385,0.021583,0.003910,0.005385,0.009524,-0.088235,0.002904
3,EXT1_LA,0.005887,0.022275,-0.006968,-0.010933,0.037607,0.005803,0.008302,0.001128,-0.001573,...,0.022543,0.003766,0.051007,0.107692,0.014388,0.002562,0.003980,0.006349,-0.088235,0.003355
4,TP53_1DEL,-0.001368,0.013591,0.012886,0.014242,0.004570,-0.001792,0.025325,0.003106,0.007974,...,0.021058,0.002905,-0.022819,0.038462,0.043165,0.018339,0.031374,0.057143,-0.294118,0.002227
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1270,RELA_SNP,0.000949,-0.000363,-0.000039,0.000118,0.000125,-0.001593,0.000440,0.000031,0.000993,...,0.000073,-0.001399,-0.005369,0.000000,0.000000,-0.001079,0.000468,0.000000,0.000000,0.000097
1271,CXCR4_SNP,0.003023,0.005187,-0.001997,-0.003605,0.008753,0.001264,-0.001614,-0.001080,-0.004146,...,-0.001290,0.000323,-0.040268,0.000000,0.000000,0.000809,0.002576,-0.003175,0.000000,0.000734
1272,PRPF40B_SNP,0.000394,0.000405,-0.000425,-0.000355,0.000079,-0.000005,-0.000361,-0.000313,-0.001040,...,0.000024,0.000215,0.000000,0.000000,0.000000,-0.000270,-0.006790,0.000000,0.000000,-0.000181
1273,TFE3_SNP,0.000236,-0.002459,-0.003989,-0.000473,0.002072,-0.001260,-0.000407,-0.000183,0.000270,...,0.000560,-0.001937,0.000000,0.023077,0.000000,0.000270,0.001405,-0.003175,0.000000,-0.000431


In [33]:
# Every column label except the variant-name column 'VAR', in original order.
_all_columns = delta_c_indices_text_df.columns
genomics_cols = _all_columns[_all_columns != 'VAR'].tolist()

In [34]:
# Order all rows of the text frame by their PAN_CANCER value, largest first,
# then show the variant names of the first ten rows.
top_hits_pan_cancer_w_text = delta_c_indices_text_df.sort_values(
    by='PAN_CANCER',
    ascending=False,
)
top_hits_pan_cancer_w_text.loc[:, 'VAR'].head(10)

222    CDKN2A_2DEL
247    CDKN2B_2DEL
459      MTAP_2DEL
424       NTHL1_LA
375        TSC2_LA
470        SLX4_LA
818        EGFR_HA
178       MAF_1DEL
504       TRAF7_LA
234     NOTCH1_SNP
Name: VAR, dtype: object

In [35]:
# Order all rows of the no-text frame by their PAN_CANCER value, largest first,
# then show the variant names of the first ten rows.
top_hits_pan_cancer_no_text = delta_c_indices_no_text_df.sort_values(
    by='PAN_CANCER',
    ascending=False,
)
top_hits_pan_cancer_no_text.loc[:, 'VAR'].head(10)

0           TP53_SNP
222      CDKN2A_2DEL
247      CDKN2B_2DEL
44          KRAS_SNP
73     PDCD1LG2_1DEL
68       CDKN2B_1DEL
67       CDKN2A_1DEL
77        CD274_1DEL
10           ETV1_LA
72         JAK2_1DEL
Name: VAR, dtype: object

In [39]:
# Element-wise difference (text frame minus no-text frame) over every column in
# `genomics_cols`, with the variant names re-attached as the first column.
#
# The subtraction pairs rows by index label, not by variant name. Verify first
# that each index label carries the same 'VAR' value in both frames; otherwise
# the result would silently combine unrelated variants or fill with NaN.
assert delta_c_indices_text_df['VAR'].equals(delta_c_indices_no_text_df['VAR']), (
    "'VAR' columns (or their row indexes) differ between the text and no-text "
    "frames; row-wise subtraction would be misaligned"
)

text_vs_no_text_df = delta_c_indices_text_df[genomics_cols] - delta_c_indices_no_text_df[genomics_cols]
# insert() mutates in place; this is safe to re-run because the frame is rebuilt
# by the assignment above every time the cell executes.
text_vs_no_text_df.insert(0, 'VAR', delta_c_indices_text_df['VAR'])

In [41]:
# Variant names of the ten rows with the smallest PAN_CANCER difference
# (ascending sort, so the most negative values come first).
(
    text_vs_no_text_df
    .sort_values(by='PAN_CANCER', ascending=True)
    .loc[:, 'VAR']
    .head(10)
)

0           TP53_SNP
222      CDKN2A_2DEL
247      CDKN2B_2DEL
44          KRAS_SNP
73     PDCD1LG2_1DEL
68       CDKN2B_1DEL
67       CDKN2A_1DEL
77        CD274_1DEL
10           ETV1_LA
72         JAK2_1DEL
Name: VAR, dtype: object

In [None]:
# Build a long table with one row per (cancer column, top variant): for each
# non-'VAR' column, take the `n_vars` variants with the largest value in that
# column and record the column name, the full variant name, and the variant
# name split into a gene part and a variant-type part.
#
# NOTE(review): the original cell read from `delta_c_indices_df`, which is not
# defined anywhere in the visible notebook (NameError on Restart & Run All).
# It is bound here to `delta_c_indices_text_df` — confirm that this is the
# intended frame (vs. `delta_c_indices_no_text_df` / `text_vs_no_text_df`).
hits_source_df = delta_c_indices_text_df
n_vars = 5

hits_data = []
for col in sorted(c for c in hits_source_df.columns if c != 'VAR'):
    top_vars = hits_source_df.sort_values(by=col, ascending=False).iloc[0:n_vars]['VAR']
    for var_hit in top_vars:
        # Split on the LAST underscore only, so each row always has exactly one
        # gene field and one variant-type field. Splitting on every underscore
        # produced extra fields for names containing more than one underscore,
        # which does not fit the four declared columns.
        gene, sep, var_type = var_hit.rpartition('_')
        if not sep:
            # No underscore at all: keep the whole name as the gene and leave
            # the variant type missing.
            gene, var_type = var_hit, None
        hits_data.append([col, var_hit, gene, var_type])

hits_df = pd.DataFrame(hits_data, columns=['CANCER', 'VAR', 'GENE', 'VAR_TYPE'])

In [None]:
# Frequency of each VAR_TYPE value in hits_df, most common first; show at most
# the first twenty entries.
hits_df.loc[:, 'VAR_TYPE'].value_counts().head(20)

In [None]:
# Print, for each non-'VAR' column in alphabetical order, the `n_vars` variant
# names with the largest value in that column.
#
# NOTE(review): the original cell read from `delta_c_indices_df`, which is not
# defined anywhere in the visible notebook (NameError on Restart & Run All).
# It is bound here to `delta_c_indices_text_df` — confirm that this is the
# intended frame (vs. `delta_c_indices_no_text_df` / `text_vs_no_text_df`).
top_hits_source_df = delta_c_indices_text_df
n_vars = 5

for col in sorted(c for c in top_hits_source_df.columns if c != 'VAR'):
    top_vars = top_hits_source_df.sort_values(by=col, ascending=False).iloc[0:n_vars]['VAR']
    print(f'Top {n_vars} Mutations for {col} Cancer:')
    for var in top_vars:
        print(var)
    # Blank line between cancer-type groups.
    print()