In [36]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from engines.datasets.base_datasets import SurvivalGEDataset
from engines.hp_dict.base import HP_dict
from engines.models import cox_models
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
from datetime import datetime
import re 
# Run identifier used to name the figures saved by this notebook.
# The previous value was the *sum of the digits* of the timestamp, which can only take
# on the order of a hundred distinct values, so figures from different runs
# overwrote each other. The timestamp itself (second resolution, YYYYMMDDHHMMSS) is
# used instead: still a plain int, unique per run, and small enough for a 64-bit integer.
stamp = datetime.now()
hash1 = int(stamp.strftime("%Y%m%d%H%M%S"))
from engines.models import functions
plt.rcParams["svg.fonttype"] = "none"


In [37]:
### import some basic general arguments
from experiments.parsers import parse_arguments


## Data
The data used to train the network is described here. We use 
* **mutation profile**: described by the presence/absence of the NPM1, FLT3-ITD and IDH1-R132 mutations.
* **transcriptomic profile**: described by the expression of the protein-coding genes after selection- or projection-based dimensionality reduction (up to 17 components).
* **cytogenetic profile**: described by 13 cytogenetic abnormalities or groups (18 clinical features in total once the 3 mutations, age and sex are added).
    
* **age** (boolean: is the patient older than 60 years?) and **sex** (boolean: is the patient female?)

In [38]:
# Names of the clinical input columns, grouped by family.

# Presence/absence of three recurrent mutations
mutations = [
    "NPM1 mutation",
    "FLT3-ITD mutation",
    "IDH1-R132 mutation",
]

# Sex and age indicator columns
age_sex = [
    "Sex_F",
    "Age_gt_60",
]

# Cytogenetic abnormalities / groups (13 columns)
cytogenetics = [
    'MLL translocations (+MLL FISH positive) (Irrespective of additional cytogenetic abnormalities)',
    'Intermediate abnormal karyotype (except isolated trisomy/tetrasomy 8)',
    'Normal karyotype',
    'Complex (3 and more chromosomal abnormalities)',
    'Trisomy/tetrasomy 8 (isolated)',
    'Monosomy 5/ 5q-/Monosomy 7/ 7q- (less than 3 chromosomal abnormalities)',
    'NUP98-NSD1(normal karyotype)',
    't(8;21)(q22;q22)/RUNX1-RUNX1T1 (Irrespective of additional cytogenetic abnormalities)',
    'inv(16)(p13.1q22)/t(16;16)(p13.1;q22)/CBFB-MYH11 (Irrespective of additional cytogenetic abnormalities)',
    'EVI1 rearrangements (+EVI1 FISH positive) (Irrespective of additional cytogenetic abnormalities)',
    't(6;9)(p23;q34) (Irrespective of additional cytogenetic abnormalities)',
    'Monosomy17/del17p (less than 3 chromosomal abnormalities)',
    'Hyperdiploid numerical abnormalities only',
]

# Full list of clinical columns, ordered mutations -> cytogenetics -> age/sex
clinical_features = np.concatenate((mutations, cytogenetics, age_sex))

In [39]:
# Build the dataset factory and load the "lgn_pronostic" cohort into it.
SGE = SurvivalGEDataset()
SGE.get_data("lgn_pronostic")
# Each SGE.new(...) call below takes the clinical feature names (or None for no clinical
# inputs) and a `gene_expressions` selector, and returns one dataset per input combination.
# NOTE(review): presumably each dataset exposes `.x` (inputs) and `.y` (target) — confirm in SurvivalGEDataset.
# Clinical factors alone; note that "None" here is the *string* "None", not the None object.
clin_factors = SGE.new(clinical_features, gene_expressions="None")
# Clinical factors combined with the LSC17 or PCA expression inputs
clin_factors_lsc17 = SGE.new(clinical_features, gene_expressions="LSC17")
clin_factors_pca = SGE.new(clinical_features, gene_expressions="PCA")

# Expression-only datasets (no clinical features)
pca_only = SGE.new(None, gene_expressions = "PCA")
lsc17_only = SGE.new(None, gene_expressions = "LSC17")
lsc17_pca =  SGE.new(None, gene_expressions = "LSC17+PCA")

# Clinical factors combined with both expression inputs
clin_factors_lsc17_pca = SGE.new(clinical_features, gene_expressions="LSC17+PCA")

Loading and assembling Gene Repertoire...
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


In [40]:
# Join the "CF_bin" table with the LSC17 input matrix on their row indices.
# NOTE(review): `data` is reassigned in the "Example" section before it is read by any
# visible cell — confirm whether this merge is still needed.
data = SGE.data["CF_bin"].merge(SGE.data["LSC17"].x, left_index = True, right_index = True)

In [41]:
# Clone the clinical-factors dataset and check that the clone holds the same input values.
clin_factors_copy = clin_factors.clone()
print( "Is the cloned copy equal in values ? : ", np.all(clin_factors.x == clin_factors_copy.x))
# Split the original and the clone independently, each with the same argument (5).
# NOTE(review): 5 is presumably the number of folds (HyperParams.nfolds is passed to the same method later) — confirm.
clin_factors.split_train_test(5)
clin_factors_copy.split_train_test(5)


Is the cloned copy equal in values ? :  True


In [42]:
# The original and its clone were split independently; compare the sample ids that
# ended up in the training part of the first fold of each to see how much they overlap.
train_idx = clin_factors.folds[0].train.x.index
train_idx_copy = clin_factors_copy.folds[0].train.x.index
# number of training samples of the original that are also training samples of the clone
n = train_idx.isin(train_idx_copy).sum()

print(f"The number of common samples between two shuffles: {n}")

The number of common samples between two shuffles: 192


## Example
### Parameters, preprocessing, splitting


In [43]:
# Hyper-parameters shared by the example run
HyperParams = HP_dict(wd = 1e-3, nepochs = 200,  bootstr_n = 1000, nfolds = 5)
# Work on a clone of the clinical-factors dataset
data = clin_factors.clone()
# Preprocessing: keep only the input columns whose variance is above 0.01
column_variance = data.x.var(0)
kept_columns = data.x.columns[np.where(column_variance > 0.01)[0]]
data.x = data.x[kept_columns]
# Build the train/test folds
data.split_train_test(HyperParams.nfolds)
# Default model parameters for the 2-layer CPHDNN on this dataset
params = HyperParams.generate_default(model_type = "cphdnn_2l", data = data)



### Launch training

In [44]:
# c_index_metrics, c_scores, surv_tbl, params= cox_models.evaluate(data, params, pca_n = None)
        

## Figures 
#### Data generation

In [45]:
# Set general parameters
HyperParams = HP_dict(wd = 1e-3, nepochs = 200,  bootstr_n = 1000, nfolds = 5)
CDS = SGE.get_data("lgn_pronostic")["CDS"]

# Ordinal encoding of the "Cytogenetic risk" labels: 0 = favorable, 1 = intermediate,
# 2 = adverse. Defined once instead of being rebuilt for every row.
# A missing label stored as None is treated as intermediate (1). Any label absent from
# this table raises KeyError, and that includes a missing value stored as NaN rather than None.
CYTO_RISK_LEVELS = {
    "favorable cytogenetics": 0,
    "Favorable": 0,
    "Low": 0,
    "intermediate cytogenetics": 1,
    "Intermediate/Normal": 1,
    "Standard": 1,
    None: 1,
    "adverse cytogenetics": 2,
    "Poor": 2,
}

cyt = pd.DataFrame(SGE.data["CF"]["Cytogenetic risk"])
cyt_levels = [CYTO_RISK_LEVELS[level] for level in cyt["Cytogenetic risk"]]
cyt["pred_risk"] = cyt_levels

# Benchmark: c-index of the cytogenetic risk level used directly as the predicted risk.
# NOTE(review): the printed tuple is stored below as (median, 5%, 95%) to match the column
# names used for the results table — confirm against functions.compute_cyto_risk_c_index.
cyt_c_scores, cyt_metrics = functions.compute_cyto_risk_c_index(cyt["pred_risk"], CDS.y, gamma = 0.001, n = HyperParams.bootstr_n)
print("C index method 1: ", cyt_metrics)

# First row of the results table; the evaluation loop appends to this list.
results = [(1, "c_index", "cytogenetics", cyt_metrics[0], cyt_metrics[1], cyt_metrics[2] )]


Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


100%|██████████| 1000/1000 [00:00<00:00, 1141.18it/s]

C index method 1:  (0.630862348080315, 0.6142266895401938, 0.6466764926184551)





In [46]:

def _keep_high_variance_columns(frame, cutoff):
    """Return `frame` restricted to the columns whose variance exceeds a threshold.

    Parameters
    ----------
    frame : pandas.DataFrame
        Input matrix; the variance is computed per column (`frame.var(0)`).
    cutoff : float or "median"
        Variance threshold. The string "median" uses the median of the column
        variances of `frame` itself, which keeps roughly the upper half of the columns.

    Returns
    -------
    pandas.DataFrame
        The selected columns of `frame`, in their original order.
    """
    col_var = frame.var(0)
    threshold = np.median(col_var) if isinstance(cutoff, str) else cutoff
    # Positional selection, as in the original per-dataset stanzas.
    return frame[frame.columns[np.where(col_var > threshold)[0]]]


# One entry per input combination, evaluated in this order at every repetition:
#   (source dataset, model types, variance cutoff or None, (PCA min_col, pca_n) or None)
# A cutoff of None means no column filtering; a PCA spec of None passes pca_params=None.
# NOTE(review): the `min_col` offsets are hard-coded while the number of columns that
# survive the variance filter depends on the data — verify them against what
# cox_models.evaluate expects.
# NOTE(review): the "*_CF_PCA" ridge model name is reused for inputs that also contain
# LSC17 or contain no clinical factors at all — confirm this is intended.
EXPERIMENTS = [
    (clin_factors,           ("ridge_cph_lifelines_CF", "cphdnn_2l"),       0.01,     None),
    (clin_factors_lsc17,     ("ridge_cph_lifelines_CF_LSC17", "cphdnn_5l"), 0.01,     None),
    (clin_factors_pca,       ("ridge_cph_lifelines_CF_PCA", "cphdnn_1l"),   0.01,     (16, 17)),
    (clin_factors_lsc17_pca, ("ridge_cph_lifelines_CF_PCA", "cphdnn_1l"),   0.01,     (33, 17)),
    (lsc17_pca,              ("ridge_cph_lifelines_CF_PCA", "cphdnn_1l"),   0.01,     (17, 17)),
    (pca_only,               ("ridge_cph_lifelines_CF_PCA", "cphdnn_2l"),   "median", (0, 30)),
    (lsc17_only,             ("ridge_cph_lifelines_LSC17", "cphdnn_1l"),    None,     None),
]

# Three repetitions (repn = 1, 2, 3); every repetition re-clones and re-splits each dataset.
# Rows are appended to `results`, so re-running this cell without re-running the cell
# that creates `results` accumulates duplicate rows.
for repn in range(1, 4):
    for source, model_types, var_cutoff, pca_spec in EXPERIMENTS:
        for model_type in model_types:
            data = source.clone()
            # preprocess data (remove low variance columns)
            if var_cutoff is not None:
                data.x = _keep_high_variance_columns(data.x, var_cutoff)
            # splitting
            data.split_train_test(HyperParams.nfolds)
            # generate model parameters
            params = HyperParams.generate_default(model_type = model_type, data = data)
            # PCA settings; "max_col" is the number of columns left after filtering
            if pca_spec is None:
                pca_params = None
            else:
                min_col, pca_n = pca_spec
                pca_params = {"min_col": min_col, "max_col": data.x.shape[1], "pca_n": pca_n}
            # train and evaluate model
            c_index_metrics, c_scores, surv_tbl, params = cox_models.evaluate(data, params, pca_params = pca_params)
            # append to results
            results.append((repn, params["modeltype"], data.name, c_index_metrics[0], c_index_metrics[1], c_index_metrics[2]))
    

clin. factors + None; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.60it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 959.03it/s]


training c indices:  [0.68 0.69 0.7  0.69 0.68]
valid c indices (aggregated):  (0.6748028695973676, 0.6434496978996835, 0.7042583288620067)
Setting up stack... saving to GPU


clin. factors + None; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.94s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 999.98it/s]


training c indices:  [0.71 0.74 0.75 0.74 0.78]
valid c indices (aggregated):  (0.6803227930246507, 0.6501063258009487, 0.7088933985913811)



>>> events = df['E'].astype(bool)
>>> print(df.loc[events, 'Complex (3 and more chromosomal abnormalities)'].var())
>>> print(df.loc[~events, 'Complex (3 and more chromosomal abnormalities)'].var())

A very low variance means that the column Complex (3 and more chromosomal abnormalities) completely determines whether a subject dies or not. See https://stats.stackexchange.com/questions/11109/how-to-deal-with-perfect-separation-in-logistic-regression.

clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  7.38it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 960.22it/s]


training c indices:  [0.73 0.75 0.75 0.75 0.75]
valid c indices (aggregated):  (0.6492366130836248, 0.6190712161970834, 0.6783373453774102)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:30<00:00,  6.03s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 990.73it/s]


training c indices:  [0.87 0.92 0.98 0.99 1.  ]
valid c indices (aggregated):  (0.7925010235186479, 0.7699655271473714, 0.8132219869079708)


clin. factors + PCA; ridge_cph_lifelines, INsize: 32: 100%|██████████| 5/5 [00:03<00:00,  1.53it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 918.95it/s]


training c indices:  [0.75 0.76 0.76 0.75 0.74]
valid c indices (aggregated):  (0.7074233380607305, 0.6773153785545448, 0.735326418547895)
Setting up stack... saving to GPU


clin. factors + PCA; CPHDNN, INsize: 32: 100%|██████████| 5/5 [00:32<00:00,  6.42s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 974.68it/s]


training c indices:  [0.69 0.74 0.78 0.79 0.82]
valid c indices (aggregated):  (0.6437353667745621, 0.6084348864994026, 0.6754872035689129)


clin. factors + LSC17+PCA; ridge_cph_lifelines, INsize: 49: 100%|██████████| 5/5 [00:04<00:00,  1.22it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 919.03it/s]


training c indices:  [0.77 0.78 0.79 0.76 0.76]
valid c indices (aggregated):  (0.6904113339351376, 0.6620151710319164, 0.7174334140435835)
Setting up stack... saving to GPU


clin. factors + LSC17+PCA; CPHDNN, INsize: 49: 100%|██████████| 5/5 [00:32<00:00,  6.40s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 964.77it/s]


training c indices:  [0.75 0.79 0.8  0.82 0.81]
valid c indices (aggregated):  (0.694421405427144, 0.6663519271741236, 0.7227176748230033)


LSC17+PCA; ridge_cph_lifelines, INsize: 33: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 926.28it/s]


training c indices:  [0.72 0.75 0.74 0.75 0.72]
valid c indices (aggregated):  (0.6691519096726407, 0.6403400421419748, 0.698454302721505)
Setting up stack... saving to GPU


LSC17+PCA; CPHDNN, INsize: 33: 100%|██████████| 5/5 [00:31<00:00,  6.39s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 960.60it/s]


training c indices:  [0.7  0.75 0.77 0.79 0.8 ]
valid c indices (aggregated):  (0.6377261553100676, 0.6066846725185685, 0.6682281685475666)


PCA; ridge_cph_lifelines, INsize: 30: 100%|██████████| 5/5 [00:02<00:00,  2.12it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 920.18it/s]


training c indices:  [0.72 0.75 0.73 0.73 0.73]
valid c indices (aggregated):  (0.6840200032155782, 0.6547707144732758, 0.7158149917192713)
Setting up stack... saving to GPU


PCA; CPHDNN, INsize: 30: 100%|██████████| 5/5 [00:31<00:00,  6.31s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 980.34it/s]


training c indices:  [0.88 0.87 0.88 0.86 0.88]
valid c indices (aggregated):  (0.49143681205295947, 0.4570392147565081, 0.5225884027385831)


LSC17; ridge_cph_lifelines, INsize: 17: 100%|██████████| 5/5 [00:00<00:00, 12.04it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 952.31it/s]


training c indices:  [0.69 0.69 0.7  0.71 0.7 ]
valid c indices (aggregated):  (0.665468149436251, 0.6346736762337786, 0.6975372618695094)
Setting up stack... saving to GPU


LSC17; CPHDNN, INsize: 17: 100%|██████████| 5/5 [00:29<00:00,  5.89s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 987.22it/s]


training c indices:  [0.66 0.69 0.7  0.71 0.73]
valid c indices (aggregated):  (0.6554891192997169, 0.6237483150394763, 0.68350787239703)


clin. factors + None; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.62it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 954.48it/s]


training c indices:  [0.69 0.69 0.69 0.69 0.67]
valid c indices (aggregated):  (0.6781287402895319, 0.648077835785477, 0.7080183330183331)
Setting up stack... saving to GPU


clin. factors + None; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.93s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 996.89it/s]


training c indices:  [0.7  0.73 0.75 0.76 0.75]
valid c indices (aggregated):  (0.7039646989455419, 0.6743261677407216, 0.7324398707670217)


clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  8.21it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 955.42it/s]


training c indices:  [0.75 0.76 0.75 0.75 0.73]
valid c indices (aggregated):  (0.7001538971537697, 0.6717998075072185, 0.7291625712600747)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:30<00:00,  6.04s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 995.14it/s]


training c indices:  [0.89 0.96 0.97 0.98 0.99]
valid c indices (aggregated):  (0.8153663754486302, 0.793214901941724, 0.8347634224074528)


clin. factors + PCA; ridge_cph_lifelines, INsize: 32: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 932.36it/s]


training c indices:  [0.76 0.74 0.74 0.76 0.76]
valid c indices (aggregated):  (0.7008306216734908, 0.6700800376647834, 0.7283827741904161)
Setting up stack... saving to GPU


clin. factors + PCA; CPHDNN, INsize: 32: 100%|██████████| 5/5 [00:32<00:00,  6.42s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 966.01it/s]


training c indices:  [0.71 0.76 0.79 0.78 0.82]
valid c indices (aggregated):  (0.6920020857204916, 0.6625139944260499, 0.7198236399556691)


clin. factors + LSC17+PCA; ridge_cph_lifelines, INsize: 49: 100%|██████████| 5/5 [00:03<00:00,  1.27it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 931.15it/s]


training c indices:  [0.77 0.78 0.77 0.77 0.78]
valid c indices (aggregated):  (0.6930856128053506, 0.6660107506239201, 0.7183891100299769)
Setting up stack... saving to GPU


clin. factors + LSC17+PCA; CPHDNN, INsize: 49: 100%|██████████| 5/5 [00:32<00:00,  6.40s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 978.28it/s]


training c indices:  [0.78 0.78 0.79 0.82 0.83]
valid c indices (aggregated):  (0.6776368426981205, 0.6481951459898977, 0.7058809597119917)


LSC17+PCA; ridge_cph_lifelines, INsize: 33: 100%|██████████| 5/5 [00:03<00:00,  1.60it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 923.60it/s]


training c indices:  [0.73 0.75 0.75 0.73 0.73]
valid c indices (aggregated):  (0.6634524850527055, 0.6348339938771151, 0.6922534011535935)
Setting up stack... saving to GPU


LSC17+PCA; CPHDNN, INsize: 33: 100%|██████████| 5/5 [00:32<00:00,  6.41s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 971.73it/s]


training c indices:  [0.67 0.75 0.78 0.79 0.77]
valid c indices (aggregated):  (0.6695967556975747, 0.637999419560801, 0.7013173997552372)


PCA; ridge_cph_lifelines, INsize: 30: 100%|██████████| 5/5 [00:02<00:00,  2.07it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 940.74it/s]


training c indices:  [0.73 0.73 0.74 0.75 0.72]
valid c indices (aggregated):  (0.6880884997769322, 0.6577147016011645, 0.7175509809606649)
Setting up stack... saving to GPU


PCA; CPHDNN, INsize: 30: 100%|██████████| 5/5 [00:31<00:00,  6.30s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 976.62it/s]


training c indices:  [0.87 0.9  0.88 0.85 0.84]
valid c indices (aggregated):  (0.48936920438959364, 0.45578409850147944, 0.5233106718602569)


LSC17; ridge_cph_lifelines, INsize: 17: 100%|██████████| 5/5 [00:00<00:00, 12.07it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 951.78it/s]


training c indices:  [0.68 0.72 0.68 0.71 0.7 ]
valid c indices (aggregated):  (0.6673949335411229, 0.6360181468373672, 0.6981355645297802)
Setting up stack... saving to GPU


LSC17; CPHDNN, INsize: 17: 100%|██████████| 5/5 [00:29<00:00,  5.89s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1002.08it/s]


training c indices:  [0.68 0.68 0.7  0.7  0.71]
valid c indices (aggregated):  (0.6571505024621256, 0.6247128750384807, 0.686539643515673)


clin. factors + None; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 11.62it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 966.69it/s]


training c indices:  [0.67 0.7  0.68 0.7  0.69]
valid c indices (aggregated):  (0.6797348360519492, 0.6488788034062904, 0.7107886036561649)
Setting up stack... saving to GPU


clin. factors + None; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:29<00:00,  5.93s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1005.04it/s]


training c indices:  [0.7  0.72 0.75 0.75 0.77]
valid c indices (aggregated):  (0.7058175951755304, 0.6781835428029405, 0.7337684394409938)


clin. factors + LSC17; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  8.13it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 963.50it/s]


training c indices:  [0.73 0.75 0.75 0.76 0.75]
valid c indices (aggregated):  (0.6907187431790366, 0.6619140150595093, 0.7189181185528849)
Setting up stack... saving to GPU


clin. factors + LSC17; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:29<00:00,  5.99s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1005.27it/s]


training c indices:  [0.88 0.93 0.98 0.98 0.99]
valid c indices (aggregated):  (0.8152170742836906, 0.7963973930400904, 0.836426116838488)


clin. factors + PCA; ridge_cph_lifelines, INsize: 32: 100%|██████████| 5/5 [00:03<00:00,  1.55it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 940.07it/s]


training c indices:  [0.76 0.75 0.74 0.76 0.75]
valid c indices (aggregated):  (0.6987473895669527, 0.670722309147381, 0.7270233196159122)
Setting up stack... saving to GPU


clin. factors + PCA; CPHDNN, INsize: 32: 100%|██████████| 5/5 [00:31<00:00,  6.34s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 963.74it/s]


training c indices:  [0.72 0.75 0.78 0.81 0.8 ]
valid c indices (aggregated):  (0.6857831529465213, 0.6546727959758357, 0.7122463945245662)


clin. factors + LSC17+PCA; ridge_cph_lifelines, INsize: 49: 100%|██████████| 5/5 [00:03<00:00,  1.29it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 951.90it/s]


training c indices:  [0.77 0.79 0.77 0.77 0.78]
valid c indices (aggregated):  (0.6842388529166887, 0.6537426468837076, 0.7126111755607115)
Setting up stack... saving to GPU


clin. factors + LSC17+PCA; CPHDNN, INsize: 49: 100%|██████████| 5/5 [00:31<00:00,  6.33s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 978.88it/s]


training c indices:  [0.76 0.79 0.8  0.8  0.82]
valid c indices (aggregated):  (0.6755221858525531, 0.6446400230846919, 0.7068821740410085)


LSC17+PCA; ridge_cph_lifelines, INsize: 33: 100%|██████████| 5/5 [00:03<00:00,  1.58it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 947.27it/s]


training c indices:  [0.74 0.74 0.75 0.73 0.73]
valid c indices (aggregated):  (0.6684971965984572, 0.636433800313399, 0.697002536883328)
Setting up stack... saving to GPU


LSC17+PCA; CPHDNN, INsize: 33: 100%|██████████| 5/5 [00:31<00:00,  6.32s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 973.60it/s]


training c indices:  [0.74 0.78 0.79 0.8  0.78]
valid c indices (aggregated):  (0.6805640360956604, 0.6470338282854512, 0.7123590695019266)


PCA; ridge_cph_lifelines, INsize: 30: 100%|██████████| 5/5 [00:02<00:00,  2.12it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 942.06it/s]


training c indices:  [0.74 0.73 0.73 0.72 0.74]
valid c indices (aggregated):  (0.6919732724888078, 0.6608894067385183, 0.7232686980609419)
Setting up stack... saving to GPU


PCA; CPHDNN, INsize: 30: 100%|██████████| 5/5 [00:31<00:00,  6.23s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 989.72it/s]


training c indices:  [0.88 0.92 0.91 0.85 0.86]
valid c indices (aggregated):  (0.5747742022192479, 0.5425496212484419, 0.6062437775565354)


LSC17; ridge_cph_lifelines, INsize: 17: 100%|██████████| 5/5 [00:00<00:00, 12.06it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 957.31it/s]


training c indices:  [0.7  0.7  0.7  0.72 0.69]
valid c indices (aggregated):  (0.6488985869661915, 0.6146511571195162, 0.6774405850091407)
Setting up stack... saving to GPU


LSC17; CPHDNN, INsize: 17: 100%|██████████| 5/5 [00:29<00:00,  5.81s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:00<00:00, 1000.57it/s]

training c indices:  [0.65 0.69 0.7  0.71 0.73]
valid c indices (aggregated):  (0.6352551939940491, 0.6034939121228163, 0.6704937948119387)





In [47]:

# Column labels, in the same order as the fields of each tuple stored in `results`
colnames = (
    "repn",
    "model t.",
    "input t.",
    "c.ind med",
    "c.ind 5%",
    "c.ind 95%",
)
# One row per evaluation (plus the cytogenetic-risk benchmark row)
resdf = pd.DataFrame(data = results, columns = colnames)

resdf



Unnamed: 0,repn,model t.,input t.,c.ind med,c.ind 5%,c.ind 95%
0,1,c_index,cytogenetics,0.630862,0.614227,0.646676
1,1,ridge_cph_lifelines,clin. factors + None,0.674803,0.64345,0.704258
2,1,CPHDNN,clin. factors + None,0.680323,0.650106,0.708893
3,1,ridge_cph_lifelines,clin. factors + LSC17,0.649237,0.619071,0.678337
4,1,CPHDNN,clin. factors + LSC17,0.792501,0.769966,0.813222
5,1,ridge_cph_lifelines,clin. factors + PCA,0.707423,0.677315,0.735326
6,1,CPHDNN,clin. factors + PCA,0.643735,0.608435,0.675487
7,1,ridge_cph_lifelines,clin. factors + LSC17+PCA,0.690411,0.662015,0.717433
8,1,CPHDNN,clin. factors + LSC17+PCA,0.694421,0.666352,0.722718
9,1,ridge_cph_lifelines,LSC17+PCA,0.669152,0.64034,0.698454


### Plotting... 

In [48]:
def plot_c_index_panel(ax, panel_df, xlabel, title, spacer):
    """Draw the c-index estimates of one model family on `ax`.

    For every distinct value of the "input t." column, one group of points is drawn:
    a dot at "c.ind med" and a vertical bar from "c.ind 5%" to "c.ind 95%" for each
    repetition, shifted horizontally by `spacer * repn` within the group.

    Parameters
    ----------
    ax : matplotlib Axes to draw on.
    panel_df : rows of the results table for a single model type.
    xlabel : x-axis label of the panel.
    title : panel title.
    spacer : horizontal distance between consecutive repetitions of the same input type.
    """
    input_types = np.unique(panel_df["input t."])
    for xmark, input_type in enumerate(input_types):
        subset = panel_df[panel_df["input t."] == input_type]
        xaxis = np.array(xmark + (spacer * subset["repn"].values.astype(int)))
        # The interval bars are left unlabeled: labeling both bars and dots made every
        # input type appear twice in the legend.
        ax.vlines(x = xaxis, ymin = subset["c.ind 5%"], ymax = subset["c.ind 95%"], linewidth = 4, color = "k")
        ax.scatter(x = xaxis, y = subset["c.ind med"], linewidth = 5, label = input_type)
        for (x_pos, value) in zip(xaxis, subset["c.ind med"]):
            ax.text(x_pos, value, str(round(value, 3)), fontsize = 20)
    # Axes decorations are set once, not on every pass of the loop.
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("concordance index")
    ax.grid(visible = True, linestyle = "--")
    ax.set_ylim((0.5,0.9))
    # Tick under the second repetition, i.e. the middle of a group of three.
    ax.set_xticks(np.arange(len(input_types)) + spacer * 2)
    ax.set_xticklabels(input_types, fontsize = 14)
    ax.legend()


fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize = (22,10))
cph_ax = axes[0]
cphdnn_ax = axes[1]
# NOTE(review): the title mentions "Cyto risk", but the cytogenetics benchmark row has
# model type "c_index" and is therefore selected by neither panel — confirm the intent.
title = "CPHDNN, CPH, Cyto risk with leucegene from varying input factors"
spacer = 0.2

# Left panel: ridge-penalised CPH (lifelines)
cph_df = resdf[(resdf["model t."] == "ridge_cph_lifelines")]
plot_c_index_panel(cph_ax, cph_df, "CPH (lifelines)", title, spacer)

# Right panel: CPHDNN
cphdnn_df = resdf[(resdf["model t."] == "CPHDNN")]
plot_c_index_panel(cphdnn_ax, cphdnn_df, "CPHDNN", title, spacer)

plt.tight_layout()

# The target directory must already exist; the file name is derived from the run stamp.
plt.savefig(f"RES/V2/fig_{str(hash1 * 9)}.png")


#### **Figure**: CPH / CPHDNN trained on clinical factors (Leucegene cohort), compared to the cytogenetic-risk-only benchmark