In [20]:
import os 
import re
import json
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
from itertools import product
from tqdm import tqdm
import time

from sksurv.linear_model import CoxPHSurvivalAnalysis, CoxnetSurvivalAnalysis
from sksurv.ensemble import RandomSurvivalForest, GradientBoostingSurvivalAnalysis

from sksurv.metrics import cumulative_dynamic_auc, concordance_index_censored, integrated_brier_score

from sklearn.model_selection import GridSearchCV, KFold, train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

from scipy.stats import sem

import importlib.util
spec = importlib.util.spec_from_file_location('script_utils', '/PHShome/jpc91/clinical_text_project/python_scripts/script_utils.py') 
script_utils = importlib.util.module_from_spec(spec)	 
spec.loader.exec_module(script_utils)

import warnings
warnings.filterwarnings("ignore")

oncDRS_path = '/data/gusev/PROFILE/CLINICAL/OncDRS/'
genomic_data_path = oncDRS_path + 'ALL_2024_05/'

data_path = '/data/gusev/USERS/jpconnor/clinical_text_project/data/'
surv_path = data_path + 'survival_data/'
notes_path = data_path + 'batched_datasets/VTE_data/processed_datasets/'

rel_risk_df = pd.read_csv(surv_path + 'genomics_risk_profiling.csv', index_col=0)

In [21]:
base_profiles = pd.read_csv(surv_path + 'genomics_delta_c_index_base_profiling.csv', index_col=0)
text_profiles = pd.read_csv(surv_path + 'genomics_delta_c_index_text_profiling.csv', index_col=0)

cancer_types = [col for col in base_profiles.columns if col != 'VAR']

In [25]:
base_profiles['GENE'] = base_profiles['VAR'].apply(lambda x : re.split('_', x)[0])
base_profiles['MUT_TYPE'] = base_profiles['VAR'].apply(lambda x : re.split('_', x)[1])

text_profiles['GENE'] = text_profiles['VAR'].apply(lambda x : re.split('_', x)[0])
text_profiles['MUT_TYPE'] = text_profiles['VAR'].apply(lambda x : re.split('_', x)[1])

In [46]:
var_list = []
delta_list = []
cancer_type_list = []
base_model_list = []
for cancer_type in cancer_types:
    base_deltas = base_profiles[cancer_type].values.tolist()
    text_deltas = text_profiles[cancer_type].values.tolist()
    
    var_list += (base_profiles['VAR'].values.tolist() + text_profiles['VAR'].values.tolist())
    delta_list += (base_deltas + text_deltas)
    cancer_type_list += [cancer_type for _ in range(len(base_deltas) + len(text_deltas))]
    base_model_list += (['no_text' for _ in range(len(base_deltas))] + ['text' for _ in range(len(text_deltas))])

hist_df = pd.DataFrame({'var' : var_list, 'delta_c_index' : delta_list, 'cancer_type' : cancer_type_list, 'base_model' : base_model_list})

In [48]:
hist_df['gene'] = hist_df['var'].apply(lambda x : re.split('_', x)[0])
hist_df['mut_type'] = hist_df['var'].apply(lambda x : re.split('_', x)[1])

In [50]:
mut_types = hist_df['mut_type'].unique().tolist()
for cancer_type in cancer_types:
    ct_df = hist_df.loc[hist_df['cancer_type'] == cancer_type]
    
    plt.figure(figsize=(8,6))
    sns.histplot(ct_df, x='delta_c_index', hue='base_model').set(title=f'{cancer_type} deltas')
    plt.tight_layout()
    plt.savefig(f'biomarker_figures/across_mutation_types/{cancer_type}_deltas_hist.png')
    plt.close()
    
    for mut_type in mut_types:
        ct_mt_df = ct_df.loc[ct_df['mut_type'] == mut_type]
        
        plt.figure(figsize=(8,6))
        sns.histplot(ct_mt_df, x='delta_c_index', hue='base_model').set(title=f'{cancer_type} deltas in {mut_type}')
        plt.tight_layout()
        plt.savefig(f'biomarker_figures/{mut_type}/{cancer_type}_{mut_type}_deltas_hist.png')
        plt.close()

In [57]:
just_hits_list = []
for cancer_type in cancer_types:
    for model in ['no_text', 'text']:
        spec_df = hist_df.loc[(hist_df['cancer_type'] == cancer_type) &
                              (hist_df['base_model'] == model)].sort_values(by='delta_c_index', ascending=False)
        just_hits_list += spec_df.iloc[0:10].values.tolist()
just_hits_df = pd.DataFrame(just_hits_list, columns=spec_df.columns)

In [59]:
var_hits_dict = {key : dict() for key in cancer_types}
gene_hits_dict = {key : dict() for key in cancer_types}
for cancer_type in cancer_types:
    ct_hits_df = just_hits_df.loc[just_hits_df['cancer_type'] == cancer_type]
    
    base_ct_hits = ct_hits_df.loc[ct_hits_df['base_model'] == 'no_text']
    text_ct_hits = ct_hits_df.loc[ct_hits_df['base_model'] == 'text']
    
    base_ct_vars = set(base_ct_hits['var'].tolist())
    text_ct_vars = set(text_ct_hits['var'].tolist())
    
    base_ct_genes = set(base_ct_hits['gene'].tolist())
    text_ct_genes = set(text_ct_hits['gene'].tolist())
    
    var_hits_dict[cancer_type]['overlapping'] = base_ct_vars.intersection(text_ct_vars)
    var_hits_dict[cancer_type]['only_base'] = base_ct_vars.difference(text_ct_vars)
    var_hits_dict[cancer_type]['only_text'] = text_ct_vars.difference(base_ct_vars)
    
    gene_hits_dict[cancer_type]['overlapping'] = base_ct_genes.intersection(text_ct_genes)
    gene_hits_dict[cancer_type]['only_base'] = base_ct_genes.difference(text_ct_genes)
    gene_hits_dict[cancer_type]['only_text'] = text_ct_genes.difference(base_ct_genes)

In [75]:
just_text_hits_df = just_hits_df.loc[just_hits_df['base_model'] == 'text']
just_base_hits_df = just_hits_df.loc[just_hits_df['base_model'] == 'no_text']



In [72]:
for key in sorted(var_hits_dict.keys()):
    print(f'{key} num_overlapping = {len(var_hits_dict[key]['overlapping'])}')
    print(f'{key} num_unique = {len(var_hits_dict[key]['only_base'])} \n')
    # print(var_hits_dict[key]['overlapping'])
    # print(var_hits_dict[key]['only_text'])
    # print(var_hits_dict[key]['only_base'])
    # print()

ADRENAL_GLAND num_overlapping = 0
ADRENAL_GLAND num_unique = 10 

AGGR_NHL num_overlapping = 0
AGGR_NHL num_unique = 10 

BILIARY_TRACT num_overlapping = 2
BILIARY_TRACT num_unique = 8 

BLADDER num_overlapping = 2
BLADDER num_unique = 8 

BONE num_overlapping = 6
BONE num_unique = 4 

BOWEL num_overlapping = 5
BOWEL num_unique = 5 

BRAIN num_overlapping = 6
BRAIN num_unique = 4 

BREAST num_overlapping = 1
BREAST num_unique = 9 

CUP num_overlapping = 4
CUP num_unique = 6 

HEAD_AND_NECK num_overlapping = 1
HEAD_AND_NECK num_unique = 9 

KIDNEY num_overlapping = 0
KIDNEY num_unique = 10 

LEUKEMIA num_overlapping = 2
LEUKEMIA num_unique = 8 

LIVER num_overlapping = 1
LIVER num_unique = 9 

LUNG num_overlapping = 6
LUNG num_unique = 4 

LYMPHOMA num_overlapping = 3
LYMPHOMA num_unique = 7 

MYELOMA num_overlapping = 5
MYELOMA num_unique = 5 

OTHER num_overlapping = 2
OTHER num_unique = 8 

OVARY num_overlapping = 2
OVARY num_unique = 8 

PANCREAS num_overlapping = 6
PANCREAS num_uni

In [71]:
for key in sorted(gene_hits_dict.keys()):
    print(f'{key} num_overlapping = {len(var_hits_dict[key]['overlapping'])}')
    print(f'{key} num_unique = {len(var_hits_dict[key]['only_base'])} \n')
    # print(var_hits_dict[key]['overlapping'])
    # print(var_hits_dict[key]['only_text'])
    # print(var_hits_dict[key]['only_base'])
    # print()

ADRENAL_GLAND num_overlapping = 0
ADRENAL_GLAND num_unique = 10 

AGGR_NHL num_overlapping = 0
AGGR_NHL num_unique = 10 

BILIARY_TRACT num_overlapping = 2
BILIARY_TRACT num_unique = 8 

BLADDER num_overlapping = 2
BLADDER num_unique = 8 

BONE num_overlapping = 6
BONE num_unique = 4 

BOWEL num_overlapping = 5
BOWEL num_unique = 5 

BRAIN num_overlapping = 6
BRAIN num_unique = 4 

BREAST num_overlapping = 1
BREAST num_unique = 9 

CUP num_overlapping = 4
CUP num_unique = 6 

HEAD_AND_NECK num_overlapping = 1
HEAD_AND_NECK num_unique = 9 

KIDNEY num_overlapping = 0
KIDNEY num_unique = 10 

LEUKEMIA num_overlapping = 2
LEUKEMIA num_unique = 8 

LIVER num_overlapping = 1
LIVER num_unique = 9 

LUNG num_overlapping = 6
LUNG num_unique = 4 

LYMPHOMA num_overlapping = 3
LYMPHOMA num_unique = 7 

MYELOMA num_overlapping = 5
MYELOMA num_unique = 5 

OTHER num_overlapping = 2
OTHER num_unique = 8 

OVARY num_overlapping = 2
OVARY num_unique = 8 

PANCREAS num_overlapping = 6
PANCREAS num_uni