In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from engines.datasets.base_datasets import SurvivalGEDataset
from engines.hp_dict.base import HP_dict
from engines.models import cox_models

warnings.simplefilter(action='ignore', category=FutureWarning)

from engines.models import functions
plt.rcParams["svg.fonttype"] = "none"


In [2]:
### import some basic general arguments
from experiments.parsers import parse_arguments


## Data
The data used to train the network is described here. We use 
* **mutation profiles** : described by the presence/absence of the NPM1 mutation, the FLT3-ITD mutation and the IDH1-R132 mutation.
* **transcriptomic profile** : described by the gene expression of the protein-coding genes, reduced with selection- or projection-based dimensionality reduction (to up to 17 components).    
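* **cytogenetic profile** : described by multiple cytogenetic abnormalities or groups (13 in the feature list below; together with the mutations, age and sex this gives 18 clinical features).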
    
* **age** (described by age > 60 years boolean), **sex** (is patient female? bool)

In [3]:
## Clinical features
# Column names of the clinical covariates, grouped by kind. The three groups
# are kept as separate lists and then joined, in the order
# mutations -> cytogenetics -> age/sex, into a single numpy array of names.
mutations = [
    "NPM1 mutation",
    "FLT3-ITD mutation",
    "IDH1-R132 mutation",
]
age_sex = [
    "Sex_F",
    "Age_gt_60",
]
cytogenetics = [
    "MLL translocations (+MLL FISH positive) (Irrespective of additional cytogenetic abnormalities)",
    "Intermediate abnormal karyotype (except isolated trisomy/tetrasomy 8)",
    "Normal karyotype",
    "Complex (3 and more chromosomal abnormalities)",
    "Trisomy/tetrasomy 8 (isolated)",
    "Monosomy 5/ 5q-/Monosomy 7/ 7q- (less than 3 chromosomal abnormalities)",
    "NUP98-NSD1(normal karyotype)",
    "t(8;21)(q22;q22)/RUNX1-RUNX1T1 (Irrespective of additional cytogenetic abnormalities)",
    "inv(16)(p13.1q22)/t(16;16)(p13.1;q22)/CBFB-MYH11 (Irrespective of additional cytogenetic abnormalities)",
    "EVI1 rearrangements (+EVI1 FISH positive) (Irrespective of additional cytogenetic abnormalities)",
    "t(6;9)(p23;q34) (Irrespective of additional cytogenetic abnormalities)",
    "Monosomy17/del17p (less than 3 chromosomal abnormalities)",
    "Hyperdiploid numerical abnormalities only",
]
clinical_features = np.array(mutations + cytogenetics + age_sex)

In [4]:
SGE = SurvivalGEDataset()
# Build three datasets on the same cohort and clinical features; they differ
# only in the gene-expression input requested (none, "LSC17", "PCA").
# The generator is consumed in order, so the calls run in the listed sequence.
COHORT = "lgn_pronostic"
clin_factors, clin_factors_lsc17, clin_factors_pca = (
    SGE.new(COHORT, clinical_features, gene_expressions=ge_input)
    for ge_input in (None, "LSC17", "PCA")
)

Loading and assembling Gene Repertoire...
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


In [5]:
# Clone the dataset and check that the copy holds the same values.
clin_factors_copy = clin_factors.clone()
values_match = np.all(clin_factors.x == clin_factors_copy.x)
print( "Is the cloned copy equal in values ? : ", values_match)
# Split the original and the clone independently into 5 folds.
for dataset in (clin_factors, clin_factors_copy):
    dataset.split_train_test(5)


Is the cloned copy equal in values ? :  True


In [6]:
# Each split shuffles the data internally, so two clones end up with different
# folds: count how many sample ids the first training folds have in common.
train_index = clin_factors.folds[0].train.x.index
train_index_copy = clin_factors_copy.folds[0].train.x.index
n = train_index.isin(train_index_copy).sum()

print (f"The number of common samples between two shuffles: {n}")

The number of common samples between two shuffles: 190


## Example
### Parameters, preprocessing, splitting


In [7]:
# General hyper-parameters shared by the example below
HyperParams = HP_dict(wd=1e-3, nepochs=200, bootstr_n=1000, nfolds=5)

# Work on a clone so the source dataset is left untouched
data = clin_factors.clone()

# Preprocessing: keep only the columns whose variance exceeds 0.01
column_variance = data.x.var(0)
kept_columns = data.x.columns[np.where(column_variance > 0.01)]
data.x = data.x[kept_columns]

# Split into the configured number of folds
data.split_train_test(HyperParams.nfolds)

# Default model parameters for the "cphdnn_2l" model type on this dataset
params = HyperParams.generate_default(model_type="cphdnn_2l", data=data)



### Launch training

In [8]:
# The training/evaluation call is left disabled in this example cell;
# uncomment the line below to run it on the `data` and `params` prepared above.
# c_index_metrics, c_scores, surv_tbl, params= cox_models.evaluate(data, params, pca_n = None)
        

## Figures 
#### Data generation

In [9]:
# Clinical features plus the "LSC17" gene-expression input, built on the same cohort.
test1 = SGE.new(
    "lgn_pronostic",
    clinical_features,
    gene_expressions="LSC17",
)

Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


In [10]:
# Show the dataset's name; the cell output is used as the "input t." label in the results table.
test1.name

'clin. factors + LSC17'

In [14]:
# Set general parameters
HyperParams = HP_dict(wd = 1e-3, nepochs = 200,  bootstr_n = 1000, nfolds = 5)
CDS = SGE.get_data("lgn_pronostic")["CDS"]
cyt = pd.DataFrame(SGE.data["CF"]["Cytogenetic risk"])

# Ordinal encoding of the cytogenetic risk labels: 0 = favorable, 1 = intermediate,
# 2 = adverse. Missing labels (keyed by None) are assigned the intermediate level.
# The mapping is built once instead of once per row.
CYTO_RISK_LEVELS = {
    "favorable cytogenetics": 0,
    "Favorable": 0,
    "Low": 0,
    "intermediate cytogenetics": 1,
    "Intermediate/Normal": 1,
    "Standard": 1,
    "adverse cytogenetics": 2,
    "Poor": 2,
    None: 1,
}
# pd.isna also catches NaN / pd.NA, which would otherwise raise KeyError because
# only None is a key. Unknown (non-missing) labels still raise KeyError on purpose.
cyt_levels = [
    CYTO_RISK_LEVELS[None if pd.isna(level) else level]
    for level in cyt["Cytogenetic risk"]
]
cyt["pred_risk"] = cyt_levels

# Baseline: concordance index of the cytogenetic risk level alone, with bootstrap.
cyt_c_scores, cyt_metrics = functions.compute_cyto_risk_c_index(cyt["pred_risk"], CDS.y, gamma = 0.001, n = HyperParams.bootstr_n)
print("C index method 1: ", cyt_metrics)

# `results` is (re)initialised here with the baseline row, so re-running the cell
# does not accumulate rows from earlier runs.
# Row layout: (replicate, model type, input type, then the three values of the metrics tuple).
results = [(1, "c_index", "cytogenetics", cyt_metrics[0], cyt_metrics[1], cyt_metrics[2] )]

# Each entry pairs a source dataset with the model types evaluated on it.
# Two configurations are currently disabled (kept here for reference):
#   (clin_factors,       ("ridge_cph_lifelines_CF", "cphdnn_2l"))
#   (clin_factors_lsc17, ("ridge_cph_lifelines_CF_LSC17", "cphdnn_5l"))
experiment_grid = [
    (clin_factors_pca, ("ridge_cph_lifelines_CF_PCA", "cphdnn_5l")),
]

# Three replicates (repn = 1, 2, 3); every replicate re-clones and re-splits the data.
for repn in range(1, 4):
    for source_dataset, model_types in experiment_grid:
        for model_type in model_types:
            data = source_dataset.clone()
            # preprocess data: keep columns whose variance exceeds 0.01
            variance_mask = data.x.var(0) > 0.01
            data.x = data.x[data.x.columns[np.where(variance_mask)]]
            # splitting
            data.split_train_test(HyperParams.nfolds)
            # generate model parameters
            params = HyperParams.generate_default(model_type = model_type, data = data)
            # train and evaluate model
            c_index_metrics, c_scores, surv_tbl, params = cox_models.evaluate(data, params, pca_n = None)
            # append one result row per (replicate, model, input) combination
            c_first, c_second, c_third = c_index_metrics[0], c_index_metrics[1], c_index_metrics[2]
            results.append((repn, params["modeltype"], data.name, c_first, c_second, c_third))



Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


100%|██████████| 1000/1000 [00:00<00:00, 1183.71it/s]


C index method 1:  (0.631055645492546, 0.6151448522482905, 0.6473288713847344)


clin. factors; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.64it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 937.54it/s]


training c indices:  [0.68 0.69 0.69 0.71 0.67]
valid c indices (aggregated):  (0.6748774260008484, 0.6420470654947548, 0.7033702957804576)
Setting up stack... saving to GPU


clin. factors; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.89s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1008.65it/s]


training c indices:  [0.7  0.72 0.74 0.75 0.78]
valid c indices (aggregated):  (0.6978140340827361, 0.6704084235916228, 0.7247247728474593)


clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  8.28it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 973.76it/s]


training c indices:  [0.77 0.75 0.76 0.74 0.74]
valid c indices (aggregated):  (0.6671016121983845, 0.638828997173111, 0.6969944894963545)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:29<00:00,  5.97s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 999.21it/s]


training c indices:  [0.87 0.93 0.98 0.99 0.99]
valid c indices (aggregated):  (0.8091393752567002, 0.7883938894492141, 0.8299443234083757)


clin. factors; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.43it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 975.57it/s]


training c indices:  [0.7  0.7  0.68 0.68 0.69]
valid c indices (aggregated):  (0.6710152164091664, 0.6420142732647149, 0.700881565699973)
Setting up stack... saving to GPU


clin. factors; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.86s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1007.05it/s]


training c indices:  [0.7  0.72 0.74 0.76 0.77]
valid c indices (aggregated):  (0.705473855407937, 0.6781790377602634, 0.7336163625275601)


clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  8.32it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 968.57it/s]


training c indices:  [0.76 0.73 0.76 0.74 0.75]
valid c indices (aggregated):  (0.6962327840680644, 0.668049492293963, 0.7221190711581588)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:29<00:00,  5.96s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1008.01it/s]


training c indices:  [0.87 0.95 0.96 0.98 0.99]
valid c indices (aggregated):  (0.7949496696738594, 0.7705805841314887, 0.8180707571866663)


clin. factors; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.53it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 974.00it/s]


training c indices:  [0.69 0.68 0.68 0.69 0.69]
valid c indices (aggregated):  (0.6847668473390893, 0.6567317814949117, 0.7136378201178182)
Setting up stack... saving to GPU


clin. factors; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.85s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1008.01it/s]


training c indices:  [0.71 0.73 0.75 0.75 0.75]
valid c indices (aggregated):  (0.7055661007263107, 0.6763423212192262, 0.7340561686508416)


clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  8.41it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 972.16it/s]


training c indices:  [0.75 0.75 0.75 0.74 0.75]
valid c indices (aggregated):  (0.6932384262103204, 0.6659968040288606, 0.7213194041400658)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:29<00:00,  5.97s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1007.74it/s]

training c indices:  [0.86 0.91 0.97 0.96 0.97]
valid c indices (aggregated):  (0.7926200694067431, 0.7681967134690363, 0.8146126288596983)





In [15]:

# Tabulate the collected result rows; one column per field of each result tuple.
colnames = (
    "repn",
    "model t.",
    "input t.",
    "c.ind med",
    "c.ind 5%",
    "c.ind 95%",
)
resdf = pd.DataFrame(data = results, columns = colnames)

resdf



Unnamed: 0,repn,model t.,input t.,c.ind med,c.ind 5%,c.ind 95%
0,1,c_index,cytogenetics,0.631056,0.615145,0.647329
1,1,ridge_cph_lifelines,clin. factors,0.674877,0.642047,0.70337
2,1,CPHDNN,clin. factors,0.697814,0.670408,0.724725
3,1,ridge_cph_lifelines,clin. factors + LSC17,0.667102,0.638829,0.696994
4,1,CPHDNN,clin. factors + LSC17,0.809139,0.788394,0.829944
5,2,ridge_cph_lifelines,clin. factors,0.671015,0.642014,0.700882
6,2,CPHDNN,clin. factors,0.705474,0.678179,0.733616
7,2,ridge_cph_lifelines,clin. factors + LSC17,0.696233,0.668049,0.722119
8,2,CPHDNN,clin. factors + LSC17,0.79495,0.770581,0.818071
9,3,ridge_cph_lifelines,clin. factors,0.684767,0.656732,0.713638


In [61]:
def plot_cindex_panel(ax, panel_df, title, rep_offset):
    """Draw the concordance-index summary of one model type on `ax`.

    For every distinct value of the "input t." column, the replicates are drawn
    side by side: a vertical black line from "c.ind 5%" to "c.ind 95%" and a
    marker at "c.ind med", annotated with the value rounded to 3 decimals.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    panel_df : rows of the results table for a single model type; must hold the
        columns "input t.", "repn", "c.ind med", "c.ind 5%" and "c.ind 95%".
    title : panel title.
    rep_offset : horizontal shift applied per replicate number, so replicates of
        the same input type do not overlap.
    """
    for xmark, input_type in enumerate(np.unique(panel_df["input t."])):
        subset = panel_df[panel_df["input t."] == input_type]
        xaxis = np.array(xmark + (rep_offset * subset["repn"].values.astype(int)))
        # Only the scatter carries the label: labelling the vlines as well
        # would list every input type twice in the legend.
        ax.vlines(x = xaxis, ymin = subset["c.ind 5%"], ymax = subset["c.ind 95%"], linewidth = 4, color = "k")
        ax.scatter(x = xaxis, y = subset["c.ind med"], linewidth = 5, label = input_type)
        for (x_pos, value) in zip(xaxis, subset["c.ind med"]):
            ax.text(x_pos, value, str(round(value, 3)), fontsize = 20)
    # Axis decoration does not depend on the input type, so it is set once.
    ax.set_title(title)
    ax.set_xlabel("method + input")
    ax.set_ylabel("concordance index")
    ax.grid(visible = True, linestyle = "--")
    ax.set_ylim((0.5,0.9))
    ax.legend()


fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize = (22,10))
cph_ax = axes[0]
cphdnn_ax = axes[1]
title = "CPHDNN, CPH, Cyto risk with leucegene from varying input factors"

# Left panel: ridge CPH results; right panel: CPHDNN results.
# The per-replicate offsets are the ones originally used for each panel.
cph_df = resdf[(resdf["model t."] == "ridge_cph_lifelines")]
plot_cindex_panel(cph_ax, cph_df, title, rep_offset = 0.15)

cphdnn_df = resdf[(resdf["model t."] == "CPHDNN")]
plot_cindex_panel(cphdnn_ax, cphdnn_df, title, rep_offset = 0.2)

plt.tight_layout()
# Create the output directory if needed so a fresh checkout can save the figure.
FIG_PATH = "RES/V2/fig1.png"
os.makedirs(os.path.dirname(FIG_PATH), exist_ok = True)
plt.savefig(FIG_PATH)


#### **Figure** : using clinical factors with CPH / CPHDNN on the Leucegene cohort, compared to the cytogenetic-risk-only benchmark 