In [21]:
import os
import re
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from tqdm import tqdm
from time import time

import warnings
warnings.filterwarnings("ignore")

# --- Project locations --------------------------------------------------------
# Paths are built by plain string concatenation, so every directory keeps its trailing '/'.
proj_path = '/data/gusev/USERS/jpconnor/clinical_text_project/'
fig_path = proj_path + 'figures/model_metrics/'
data_path = proj_path + 'data/'
surv_path = data_path + 'survival_data/'
notes_path = data_path + 'batched_datasets/VTE_data/processed_datasets/'
results_path = surv_path + 'results/'

# --- Events available for the stage-subset comparison ---------------------------
# Each event is expected to be a sub-directory of output_path (the loops below read CSVs from
# output_path + event + '/'). os.listdir returns entries in arbitrary order and also returns
# plain files and hidden folders (e.g. a Jupyter '.ipynb_checkpoints'), so keep only visible
# directories and sort them to get a reproducible row order in the resulting tables.
output_path = results_path + 'phecode_model_comps_final/'
events = sorted(entry for entry in os.listdir(output_path)
                if not entry.startswith('.') and os.path.isdir(os.path.join(output_path, entry)))

# --- Phecode lookup tables ------------------------------------------------------
# Both maps are keyed by the PHECODE column cast to str.
icd_to_phecode_map = pd.read_csv(data_path + 'code_data/icd_to_phecode_map.csv')
phecode_descr_map = dict(zip(icd_to_phecode_map['PHECODE'].astype(str), icd_to_phecode_map['PHECODE_DESCR']))
phecode_cat_descr_map = dict(zip(icd_to_phecode_map['PHECODE'].astype(str), icd_to_phecode_map['PHECODE_CAT_DESCR']))

# Events without a phecode entry use their own name as description.
phecode_descr_map.update({event : event for event in events if event not in phecode_descr_map})
# Category labels for the non-phecode events; any event whose name ends in 'M' is labelled 'Metastasis'.
phecode_cat_descr_map.update({'death' : 'Death', 'vte' : 'VTE'} | {met_event : 'Metastasis' for met_event in events if met_event.endswith('M')})

# Stage-subset cohort: build one row per event with the test C-index of every model variant.
# Each pair is (output column name, CSV file inside the event's directory).
baseline_sources = [('stage_mean_c_index', 'coxPH_stage_model_metrics.csv'),
                    ('type_mean_c_index', 'coxPH_type_model_metrics.csv'),
                    ('stage_type_mean_c_index', 'coxPH_stage_type_model_metrics.csv')]
embedding_sources = [('text_plus_type_mean_c_index', 'coxPH_decayed_embeddings_plus_type_test_metrics.csv'),
                     ('text_plus_stage_mean_c_index', 'coxPH_decayed_embeddings_plus_stage_test_metrics.csv'),
                     ('text_plus_stage_type_mean_c_index', 'coxPH_decayed_embeddings_plus_stage_type_test_metrics.csv')]

metric_data = []
for event in events:
    event_path = output_path + event + '/'

    # Load all six metric files first, in the same order as the output columns.
    baseline_frames = [pd.read_csv(event_path + file_name) for _, file_name in baseline_sources]
    embedding_frames = [pd.read_csv(event_path + file_name) for _, file_name in embedding_sources]

    # Baseline models: mean_c_index of the first row whose eval_data is 'test_data'.
    baseline_scores = [frame.loc[frame['eval_data'] == 'test_data'].iloc[0]['mean_c_index']
                       for frame in baseline_frames]
    # Embedding models: the largest mean_c_index found in the test-metrics file.
    # NOTE(review): if the rows of that file are different hyperparameter settings, taking the
    # maximum over test rows is an optimistic estimate -- confirm how the file is laid out.
    embedding_scores = [frame.sort_values(by='mean_c_index', ascending=False).iloc[0]['mean_c_index']
                        for frame in embedding_frames]

    metric_data.append([event, phecode_descr_map[event], *baseline_scores, *embedding_scores])

metrics_df = pd.DataFrame(
    metric_data,
    columns=['event', 'event_descr']
            + [column for column, _ in baseline_sources]
            + [column for column, _ in embedding_sources],
)

# Full-cohort comparison: one row per event with the test C-index of the base model and of the
# embeddings-plus-type model.
output_path = results_path + 'phecode_model_comps_full_cohort/'
# os.listdir gives arbitrary order and may include plain files / hidden folders; keep only
# visible event directories, sorted for a reproducible row order.
events = sorted(entry for entry in os.listdir(output_path)
                if not entry.startswith('.') and os.path.isdir(os.path.join(output_path, entry)))

metric_data = []
for event in events:
    event_path = output_path + event + '/'

    base_results = pd.read_csv(event_path + 'coxPH_base_model_metrics.csv')
    embed_test_results = pd.read_csv(event_path + 'coxPH_decayed_embeddings_plus_type_test_metrics.csv')

    # Base model: mean_c_index of the first row whose eval_data is 'test_data'.
    base_entry = base_results.loc[base_results['eval_data'] == 'test_data'].iloc[0]['mean_c_index']
    # Embedding model: the largest mean_c_index found in the test-metrics file.
    # NOTE(review): optimistic if the rows are different hyperparameter settings -- confirm.
    embed_entry = embed_test_results.sort_values(by='mean_c_index', ascending=False).iloc[0]['mean_c_index']

    # The name-as-description fallbacks in phecode_descr_map were built from the
    # 'phecode_model_comps_final/' listing only, so an event that exists only in this folder
    # would raise KeyError with a plain lookup; fall back to the event name here as well.
    description = phecode_descr_map.get(event, event)

    metric_data.append([event, description, base_entry, embed_entry])

full_cohort_metrics_df = pd.DataFrame(metric_data, columns=['event', 'event_descr', 'type_mean_c_index', 'text_plus_type_mean_c_index'])

# Keep only events that have a complete (NaN-free) row in BOTH tables, minus two phecodes that
# are excluded by hand. Written as (A & B) - C so the result does not hinge on operator
# precedence; sorted so the list is deterministic.
excluded_events = {'452.2', '452.0'}
complete_in_both = set(metrics_df.dropna()['event']) & set(full_cohort_metrics_df.dropna()['event'])
events_to_include = sorted(complete_in_both - excluded_events)

# .copy() makes the filtered frames independent objects, so the column assignments below are
# ordinary assignments rather than writes to a slice (SettingWithCopyWarning).
metrics_df = metrics_df.loc[metrics_df['event'].isin(events_to_include)].copy()
full_cohort_metrics_df = full_cohort_metrics_df.loc[full_cohort_metrics_df['event'].isin(events_to_include)].copy()

# Hand-written display labels; every other retained event keeps the description it already has
# in metrics_df.
rename_event_descr_map = {'death' : 'Death', 'vte' : 'VTE', '507.0' : 'Pleural effusion', '386.9' : 'Dizziness and giddiness', '280.1' : 'Iron deficiency anemias',
                          '288.2' : 'Elevated WBC', 'lungM' : 'Lung met.', 'liverM' : 'Liver met.', 'boneM' : 'Bone met.', 'nodeM' : 'Node met.'}
rename_event_descr_map = rename_event_descr_map | {
    event : event_descr
    for event, event_descr in zip(metrics_df['event'], metrics_df['event_descr'])
    if event not in rename_event_descr_map
}

metrics_df['event_descr'] = metrics_df['event'].map(rename_event_descr_map)
full_cohort_metrics_df['event_descr'] = full_cohort_metrics_df['event'].map(rename_event_descr_map)

# Persist both tables next to the model results.
metrics_df.to_csv(os.path.join(results_path, 'stage_subset_metrics.csv'), index=False)
full_cohort_metrics_df.to_csv(os.path.join(results_path, 'full_cohort_metrics.csv'), index=False)

In [29]:
import os
import re
import pickle
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime
from datetime import timedelta
from tqdm import tqdm
from time import time

import warnings
warnings.filterwarnings("ignore")

# --- Project locations --------------------------------------------------------
# Paths are built by plain string concatenation, so every directory keeps its trailing '/'.
proj_path = '/data/gusev/USERS/jpconnor/clinical_text_project/'
fig_path = proj_path + 'figures/model_metrics/'
data_path = proj_path + 'data/'
surv_path = data_path + 'survival_data/'
notes_path = data_path + 'batched_datasets/VTE_data/processed_datasets/'
results_path = surv_path + 'results/'

# --- Events available for the stage-subset comparison ---------------------------
# Each event is expected to be a sub-directory of output_path (the loops below read CSVs from
# output_path + event + '/'). os.listdir returns entries in arbitrary order and also returns
# plain files and hidden folders (e.g. a Jupyter '.ipynb_checkpoints'), so keep only visible
# directories and sort them to get a reproducible row order in the resulting tables.
output_path = results_path + 'phecode_model_comps_final/'
events = sorted(entry for entry in os.listdir(output_path)
                if not entry.startswith('.') and os.path.isdir(os.path.join(output_path, entry)))

# --- Phecode lookup tables ------------------------------------------------------
# Both maps are keyed by the PHECODE column cast to str.
icd_to_phecode_map = pd.read_csv(data_path + 'code_data/icd_to_phecode_map.csv')
phecode_descr_map = dict(zip(icd_to_phecode_map['PHECODE'].astype(str), icd_to_phecode_map['PHECODE_DESCR']))
phecode_cat_descr_map = dict(zip(icd_to_phecode_map['PHECODE'].astype(str), icd_to_phecode_map['PHECODE_CAT_DESCR']))

# Events without a phecode entry use their own name as description.
phecode_descr_map.update({event : event for event in events if event not in phecode_descr_map})
# Category labels for the non-phecode events; any event whose name ends in 'M' is labelled 'Metastasis'.
phecode_cat_descr_map.update({'death' : 'Death', 'vte' : 'VTE'} | {met_event : 'Metastasis' for met_event in events if met_event.endswith('M')})

# Stage-subset cohort, validation view: build one row per event with the cross-validation /
# validation C-index of every model variant.
# Each pair is (output column name, CSV file inside the event's directory).
baseline_sources = [('stage_mean_c_index', 'coxPH_stage_model_metrics.csv'),
                    ('type_mean_c_index', 'coxPH_type_model_metrics.csv'),
                    ('stage_type_mean_c_index', 'coxPH_stage_type_model_metrics.csv')]
embedding_sources = [('text_plus_type_mean_c_index', 'coxPH_decayed_embeddings_plus_type_val_metrics.csv'),
                     ('text_plus_stage_mean_c_index', 'coxPH_decayed_embeddings_plus_stage_val_metrics.csv'),
                     ('text_plus_stage_type_mean_c_index', 'coxPH_decayed_embeddings_plus_stage_type_val_metrics.csv')]

metric_data = []
for event in events:
    event_path = output_path + event + '/'

    # Load all six metric files first, in the same order as the output columns.
    baseline_frames = [pd.read_csv(event_path + file_name) for _, file_name in baseline_sources]
    embedding_frames = [pd.read_csv(event_path + file_name) for _, file_name in embedding_sources]

    # Baseline models: mean_c_index of the first row whose eval_data is 'cv_data'.
    baseline_scores = [frame.loc[frame['eval_data'] == 'cv_data'].iloc[0]['mean_c_index']
                       for frame in baseline_frames]
    # Embedding models: the largest mean_c_index found in the validation-metrics file.
    embedding_scores = [frame.sort_values(by='mean_c_index', ascending=False).iloc[0]['mean_c_index']
                        for frame in embedding_frames]

    metric_data.append([event, phecode_descr_map[event], *baseline_scores, *embedding_scores])

metrics_df = pd.DataFrame(
    metric_data,
    columns=['event', 'event_descr']
            + [column for column, _ in baseline_sources]
            + [column for column, _ in embedding_sources],
)

# Full-cohort comparison, validation view: one row per event with the cross-validation C-index
# of the base model and the best validation C-index of the embeddings-plus-type model.
output_path = results_path + 'phecode_model_comps_full_cohort/'
# os.listdir gives arbitrary order and may include plain files / hidden folders; keep only
# visible event directories, sorted for a reproducible row order.
events = sorted(entry for entry in os.listdir(output_path)
                if not entry.startswith('.') and os.path.isdir(os.path.join(output_path, entry)))

metric_data = []
for event in events:
    event_path = output_path + event + '/'

    base_results = pd.read_csv(event_path + 'coxPH_base_model_metrics.csv')
    # Despite the variable name, this holds the *validation* metrics file.
    embed_test_results = pd.read_csv(event_path + 'coxPH_decayed_embeddings_plus_type_val_metrics.csv')

    # Base model: mean_c_index of the first row whose eval_data is 'cv_data'.
    base_entry = base_results.loc[base_results['eval_data'] == 'cv_data'].iloc[0]['mean_c_index']
    # Embedding model: the largest mean_c_index found in the validation-metrics file.
    embed_entry = embed_test_results.sort_values(by='mean_c_index', ascending=False).iloc[0]['mean_c_index']

    # The name-as-description fallbacks in phecode_descr_map were built from the
    # 'phecode_model_comps_final/' listing only, so an event that exists only in this folder
    # would raise KeyError with a plain lookup; fall back to the event name here as well.
    description = phecode_descr_map.get(event, event)

    metric_data.append([event, description, base_entry, embed_entry])

full_cohort_metrics_df = pd.DataFrame(metric_data, columns=['event', 'event_descr', 'type_mean_c_index', 'text_plus_type_mean_c_index'])

# Keep only events that have a complete (NaN-free) row in BOTH tables, minus two phecodes that
# are excluded by hand. Written as (A & B) - C so the result does not hinge on operator
# precedence; sorted so the list is deterministic.
excluded_events = {'452.2', '452.0'}
complete_in_both = set(metrics_df.dropna()['event']) & set(full_cohort_metrics_df.dropna()['event'])
events_to_include = sorted(complete_in_both - excluded_events)

# .copy() makes the filtered frames independent objects, so the column assignments below are
# ordinary assignments rather than writes to a slice (SettingWithCopyWarning).
metrics_df = metrics_df.loc[metrics_df['event'].isin(events_to_include)].copy()
full_cohort_metrics_df = full_cohort_metrics_df.loc[full_cohort_metrics_df['event'].isin(events_to_include)].copy()

# Hand-written display labels; every other retained event keeps the description it already has
# in metrics_df.
rename_event_descr_map = {'death' : 'Death', 'vte' : 'VTE', '507.0' : 'Pleural effusion', '386.9' : 'Dizziness and giddiness', '280.1' : 'Iron deficiency anemias',
                          '288.2' : 'Elevated WBC', 'lungM' : 'Lung met.', 'liverM' : 'Liver met.', 'boneM' : 'Bone met.', 'nodeM' : 'Node met.'}
rename_event_descr_map = rename_event_descr_map | {
    event : event_descr
    for event, event_descr in zip(metrics_df['event'], metrics_df['event_descr'])
    if event not in rename_event_descr_map
}

metrics_df['event_descr'] = metrics_df['event'].map(rename_event_descr_map)
full_cohort_metrics_df['event_descr'] = full_cohort_metrics_df['event'].map(rename_event_descr_map)