In [13]:
import os
import warnings
import random
from tqdm import tqdm
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Paths
# Root directories: one for project data, one for generated figures.
DATA_PATH = '/data/gusev/USERS/jpconnor/data/clinical_text_embedding_project/'
FIGURE_PATH = '/data/gusev/USERS/jpconnor/figures/clinical_text_embedding_project/'

# Directories derived from the data root.
NOTES_PATH = os.path.join(
    DATA_PATH, 'batched_datasets/processed_datasets/')
MARKER_PATH = os.path.join(
    DATA_PATH, 'biomarker_analysis/')
IPTW_RESULTS_PATH = os.path.join(
    MARKER_PATH, 'IPTW_runs/')

# Directories derived from the figure root (each nested inside the previous).
MARKER_FIG_PATH = os.path.join(
    FIGURE_PATH, 'biomarker_analysis/')
IPTW_FIG_PATH = os.path.join(
    MARKER_FIG_PATH, 'IPTW_figures/')
KM_FIG_PATH = os.path.join(
    IPTW_FIG_PATH, 'KM_curves/')

# Compile, across cancer types, the markers flagged as significantly
# predictive whose marker-by-IO coefficient exceeds the cut-off in either
# direction, and store them in a single CSV.
cancer_types = ['pan_cancer', 'LUNG', 'SKIN']

# Cut-off applied to `beta_marker_IO`: values above +threshold are labelled
# 'harm', values below -threshold are labelled 'benefit'; anything in between
# is dropped.
IO_EFFECT_THRESHOLD = 0.5

# Parallel lists, one entry per retained (marker, cancer type) pair.
complete_marker_set = []
IO_effects = []
type_specificity = []
for cancer_type in cancer_types:

    marker_df = pd.read_csv(os.path.join(IPTW_RESULTS_PATH, f'{cancer_type}_IPTW_IO_predictive_markers.csv'))

    # Keep only rows whose `significant_predictive` flag is set, ordered by
    # the interaction coefficient (this fixes the order of the marker lists).
    IO_pred_marker_df = (marker_df
                         .query("significant_predictive")
                         .sort_values("beta_marker_IO", ascending=True))
    interaction_beta = IO_pred_marker_df['beta_marker_IO']
    markers_w_IO_harm = IO_pred_marker_df.loc[interaction_beta > IO_EFFECT_THRESHOLD, 'marker'].tolist()
    markers_w_IO_benefit = IO_pred_marker_df.loc[interaction_beta < -IO_EFFECT_THRESHOLD, 'marker'].tolist()

    # Harm markers are appended before benefit markers; the label and
    # cancer-type lists are extended in the same order so all three stay
    # aligned index-by-index.
    n_harm = len(markers_w_IO_harm)
    n_benefit = len(markers_w_IO_benefit)
    complete_marker_set += markers_w_IO_harm + markers_w_IO_benefit
    IO_effects += ['harm'] * n_harm + ['benefit'] * n_benefit
    type_specificity += [cancer_type] * (n_harm + n_benefit)

full_marker_set_df = pd.DataFrame({'marker' : complete_marker_set, 'IO_effect' : IO_effects, 'cancer_type_specificity' : type_specificity})

# Write the table sorted by cancer type, then effect label, then marker name.
(full_marker_set_df
 .sort_values(by=['cancer_type_specificity', 'IO_effect', 'marker'])
 .to_csv(os.path.join(IPTW_RESULTS_PATH, 'compiled_IPTW_IO_marker_df.csv'), index=False))

# Rebind the name to the file just written, so the in-memory frame is the
# sorted table exactly as pandas parses it back from disk.
full_marker_set_df = pd.read_csv(os.path.join(IPTW_RESULTS_PATH, 'compiled_IPTW_IO_marker_df.csv'))