In [5]:
import os
import numpy as np
import pandas as pd
import joblib
import sys
# Put the relative 'codes' directory on the module search path so the
# project's helper module (utils) can be imported. The path is relative,
# so it resolves against the working directory the kernel was started in.
sys.path.append('codes')
import utils
import importlib
# Re-execute utils so that edits made to the module on disk are picked up
# without restarting the kernel.
importlib.reload(utils)

<module 'utils' from '/Users/intaemoon/Documents/github/onconpc/codes/utils.py'>

In [6]:
# Processed CUP feature table; the RANDID column becomes the row index,
# so individual tumor samples can be looked up by their ID.
onconpc_processed_cups_df = pd.read_csv(
    'data/onconpc_processed_cups_data.csv',
    index_col='RANDID',
)

# Fully trained OncoNPC model.
# NOTE: joblib.load unpickles the file, which can execute arbitrary code;
# only load model artifacts that come from a trusted source.
xgb_onconpc = joblib.load('model/xgboost_OncoNPC_full')

# Cancer types to consider. The position of each name in this list is used
# later in the notebook to index the per-class SHAP values, so keep the
# order unchanged.
cancer_types_to_consider = [
    'Acute Myeloid Leukemia',
    'Bladder Urothelial Carcinoma',
    'Cholangiocarcinoma',
    'Colorectal Adenocarcinoma',
    'Diffuse Glioma',
    'Endometrial Carcinoma',
    'Esophagogastric Adenocarcinoma',
    'Gastrointestinal Neuroendocrine Tumors',
    'Gastrointestinal Stromal Tumor',
    'Head and Neck Squamous Cell Carcinoma',
    'Invasive Breast Carcinoma',
    'Melanoma',
    'Meningothelial Tumor',
    'Non-Hodgkin Lymphoma',
    'Non-Small Cell Lung Cancer',
    'Ovarian Epithelial Tumor',
    'Pancreatic Adenocarcinoma',
    'Pancreatic Neuroendocrine Tumor',
    'Pleural Mesothelioma',
    'Prostate Adenocarcinoma',
    'Renal Cell Carcinoma',
    'Well-Differentiated Thyroid Cancer',
]

# Predict primary sites of the CUP tumors with the loaded model.
cup_preds_df = utils.get_xgboost_cancer_type_preds(
    xgb_onconpc,
    onconpc_processed_cups_df,
    cancer_types_to_consider,
)



In [7]:
# SHAP values of the OncoNPC model over the processed CUP samples.
# Presumably one (samples x features) block per cancer type, since the
# result is later indexed by cancer-type position - TODO confirm in utils.
shaps_cup = utils.obtain_shap_values(
    xgb_onconpc,
    onconpc_processed_cups_df,
)

In [None]:
# Tumor sample (RANDID) whose prediction should be explained
query_randid = 'DFCI_963265'

# OncoNPC prediction for that sample: posterior value, predicted cancer type,
# and the position of that type in cancer_types_to_consider
pred_prob = cup_preds_df.at[query_randid, 'max_posterior']
pred_cancer = cup_preds_df.at[query_randid, 'cancer_type']
pred_cancer_idx = cancer_types_to_consider.index(pred_cancer)

# Feature values of the query sample
feature_sample_df = onconpc_processed_cups_df.loc[query_randid]

# SHAP values for the predicted cancer type, labelled with the same sample
# index and feature columns as the input table, then narrowed to the query row
shap_pred_cancer_df = pd.DataFrame(
    shaps_cup[pred_cancer_idx],
    index=onconpc_processed_cups_df.index,
    columns=onconpc_processed_cups_df.columns,
)
shap_pred_sample_df = shap_pred_cancer_df.loc[query_randid]

# Text summary of the prediction, passed to the plotting helper as sample_info
sample_info = (
    f'RANDID: {query_randid}\n'
    f'Prediction: {pred_cancer}\n'
    f'Prediction probability: {pred_prob:.3f}'
)

# Group the feature names, then generate the explanation plot
feature_group_to_features_dict = utils.partiton_feature_names_by_group(
    onconpc_processed_cups_df.columns
)
utils.get_individual_pred_interpretation(
    shap_pred_sample_df,
    feature_sample_df,
    feature_group_to_features_dict,
    sample_info=sample_info,
)