In [1]:
import os
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from engines.datasets.base_datasets import SurvivalGEDataset
from engines.hp_dict.base import HP_dict
from engines.models import cox_models

# The filter is installed before `functions` is imported so it is already
# active while that module loads.
warnings.simplefilter(action='ignore', category=FutureWarning)
from engines.models import functions

plt.rcParams["svg.fonttype"] = "none"


In [2]:
### import some basic general arguments
from experiments.parsers import parse_arguments


## Data
The data used to train the network is described here. We use 
* **mutation profiles** : described by the presence/absence of the NPM1 mutation, the FLT3-ITD mutation and the IDH1-R132 mutation.
* **transcriptomic profile** : described by the expression of the protein-coding genes after selection/projection-based dimensionality reduction (down to at most 17 components).
* **cytogenetic profile** : described by 13 cytogenetic abnormalities or groups.
    
* **age** (described by age > 60 years boolean), **sex** (is patient female? bool)

In [3]:
## Clinical FEATURES
# Column names of the clinical inputs, grouped by kind. The strings must match
# the dataset's column labels exactly, so they are reproduced verbatim.

# Binary mutation flags.
mutations = [
    "NPM1 mutation",
    "FLT3-ITD mutation",
    "IDH1-R132 mutation",
]

# Demographic flags.
age_sex = [
    "Sex_F",
    "Age_gt_60",
]

# Cytogenetic abnormalities / groups (13 entries).
cytogenetics = [
    "MLL translocations (+MLL FISH positive) (Irrespective of additional cytogenetic abnormalities)",
    "Intermediate abnormal karyotype (except isolated trisomy/tetrasomy 8)",
    "Normal karyotype",
    "Complex (3 and more chromosomal abnormalities)",
    "Trisomy/tetrasomy 8 (isolated)",
    "Monosomy 5/ 5q-/Monosomy 7/ 7q- (less than 3 chromosomal abnormalities)",
    "NUP98-NSD1(normal karyotype)",
    "t(8;21)(q22;q22)/RUNX1-RUNX1T1 (Irrespective of additional cytogenetic abnormalities)",
    "inv(16)(p13.1q22)/t(16;16)(p13.1;q22)/CBFB-MYH11 (Irrespective of additional cytogenetic abnormalities)",
    "EVI1 rearrangements (+EVI1 FISH positive) (Irrespective of additional cytogenetic abnormalities)",
    "t(6;9)(p23;q34) (Irrespective of additional cytogenetic abnormalities)",
    "Monosomy17/del17p (less than 3 chromosomal abnormalities)",
    "Hyperdiploid numerical abnormalities only",
]

# Full feature list as one flat array: mutations, then cytogenetics, then age/sex.
clinical_features = np.concatenate((mutations, cytogenetics, age_sex))

In [4]:
# Build the dataset factory, then derive two datasets for the same cohort:
# one with clinical features only and one that also requests the "LSC17"
# gene-expression input.
SGE = SurvivalGEDataset()
## Positional/keyword arguments of SGE.new: cohort ## input_types ## other params
## Per the original note, the returned object holds data{x: input_data, y: target};
## later cells read `.x` and call `.clone()` / `.split_train_test()` on it.
clin_factors = SGE.new("lgn_pronostic", clinical_features, gene_expressions=None)
clin_factors_lsc17 = SGE.new("lgn_pronostic", clinical_features, gene_expressions="LSC17")

Loading and assembling Gene Repertoire...
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


In [5]:
# Sanity check: a clone must carry exactly the same input values as its source.
clin_factors_copy = clin_factors.clone()
values_match = np.all(clin_factors.x == clin_factors_copy.x)
print( "Is the cloned copy equal in values ? : ", values_match)

# Split the original first, then the clone, each into 5 folds.
for dataset in (clin_factors, clin_factors_copy):
    dataset.split_train_test(5)


Is the cloned copy equal in values ? :  True


In [6]:
# Each call to split_train_test shuffles internally, so the first-fold training
# sets of the original and of its clone only partially overlap.
train_ids = clin_factors.folds[0].train.x.index
train_ids_copy = clin_factors_copy.folds[0].train.x.index
n = train_ids.isin(train_ids_copy).sum()
print (f"The number of common samples between two shuffles: {n}")

The number of common samples between two shuffles: 196


## Example
### Parameters, preprocessing, splitting


In [7]:
# General hyper-parameters shared by the example run.
HyperParams = HP_dict(
    wd=1e-3,
    nepochs=200,
    bootstr_n=1000,
    nfolds=5,
)

# Work on a clone so the reference dataset stays untouched.
data = clin_factors.clone()

# Preprocessing: keep only the columns whose variance exceeds 0.01.
feature_variance = data.x.var(0)
informative_columns = data.x.columns[np.where(feature_variance > 0.01)]
data.x = data.x[informative_columns]

# Cross-validation split.
data.split_train_test(HyperParams.nfolds)

# Default model parameters for the 2-layer CPHDNN on this dataset.
params = HyperParams.generate_default(model_type="cphdnn_2l", data=data)



### Launch training

In [8]:
# c_index_metrics, c_scores, surv_tbl, params= cox_models.evaluate(data, params, pca_n = None)
        

## Figures 
#### Data generation

In [63]:
# Figure 1 data generation.
# For each repetition: score the cytogenetic-risk baseline, train/evaluate the
# CPH and CPHDNN models on clinical factors (with and without LSC17), then save
# one summary figure of the resulting concordance indices.

# Set general parameters
HyperParams = HP_dict(wd = 1e-3, nepochs = 200,  bootstr_n = 1000, nfolds = 5)

COHORT = "lgn_pronostic"
FIGURE_DIR = "RES/V2"
# Columns whose variance is not strictly above this threshold are dropped.
MIN_FEATURE_VARIANCE = 0.01
# Ordinal score for each cytogenetic risk label (0 = favorable/low,
# 1 = intermediate/standard, 2 = adverse/poor). A missing label (None) is
# scored as intermediate.
CYTO_RISK_LEVELS = {
    "intermediate cytogenetics": 1,
    "Intermediate/Normal": 1,
    "adverse cytogenetics": 2,
    "favorable cytogenetics": 0,
    "Favorable": 0,
    "Standard": 1,
    "Low": 0,
    "Poor": 2,
    None: 1,
}
# One entry per input set: (source dataset, model types to fit, label suffix).
# Order matters: it fixes the left-to-right order of the columns in the figure.
EXPERIMENTS = [
    (clin_factors, ["ridge_cph_lifelines_CF", "cphdnn_2l"], " \n + clinical factors"),
    (clin_factors_lsc17, ["ridge_cph_lifelines_CF_LSC17", "cphdnn_5l"], " \n + clinical factors + LSC17"),
]

# Create the output folder before any training starts, so a missing directory
# cannot make savefig fail after a full round of model fitting.
os.makedirs(FIGURE_DIR, exist_ok=True)

for repn in range(1, 4):
    # --- Baseline: C-index of the cytogenetic risk classification alone ---
    CDS = SGE.get_data(COHORT)["CDS"]
    cyt = pd.DataFrame(SGE.data["CF"]["Cytogenetic risk"])
    cyt_levels = [CYTO_RISK_LEVELS[level] for level in cyt["Cytogenetic risk"]]
    cyt["pred_risk"] = cyt_levels
    cyt_c_scores, cyt_metrics = functions.compute_cyto_risk_c_index(cyt["pred_risk"], CDS.y, gamma = 0.001, n = HyperParams.bootstr_n)
    print("C index method 1: ", cyt_metrics)

    # --- Models: same pipeline for every (dataset, model type) pair ---
    results = [("cytogenetics", cyt_metrics)]
    for source_dataset, model_types, label_suffix in EXPERIMENTS:
        for model_type in model_types:
            # clone so every model gets its own preprocessing and fold split
            data = source_dataset.clone()
            # preprocess data (remove low variance columns)
            feature_variance = data.x.var(0)
            kept_columns = data.x.columns[np.where(feature_variance > MIN_FEATURE_VARIANCE)]
            data.x = data.x[kept_columns]
            # splitting
            data.split_train_test(HyperParams.nfolds)
            # generate model parameters
            params = HyperParams.generate_default(model_type = model_type, data = data)
            # train and evaluate model
            c_index_metrics, c_scores, surv_tbl, params = cox_models.evaluate(data, params, pca_n = None)
            # append to results
            results.append((model_type + label_suffix, c_index_metrics))

    # --- Figure: one column per method ---
    # Row 0 is drawn as the point, rows 1 and 2 as the ends of the vertical bar
    # (presumably the C-index estimate and its bootstrap interval bounds --
    # TODO confirm against cox_models.evaluate).
    resdf = pd.DataFrame(dict(results))
    ncols = resdf.shape[1]
    positions = np.arange(ncols)
    point_values = resdf.iloc[0, :]
    title = "CPHDNN, CPH, Cyto risk with leucegene from clinical factors input"

    fig, ax = plt.subplots(figsize = (15, 10))
    ax.vlines(positions, ymin = resdf.iloc[1, :], ymax = resdf.iloc[2, :], linewidth = 5)
    ax.scatter(positions, point_values, linewidth = 5, c = "k")
    for (i, value) in enumerate(point_values):
        ax.text(i, value, str(round(value, 3)), fontsize = 14)
    ax.set_xticks(positions)
    ax.set_xticklabels(resdf.columns, fontsize = 14)
    ax.set_title(title)
    ax.set_xlabel("method + input")
    ax.set_ylabel("concordance index")
    ax.grid(visible = True, linestyle = "--")
    # plt.show()
    fig.tight_layout()
    fig.savefig(f"{FIGURE_DIR}/fig1_rep{repn}.png")



Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


100%|██████████| 1000/1000 [00:00<00:00, 1130.38it/s]


C index method 1:  (0.6312368618165124, 0.6158697175441564, 0.6476188175030807)


data; ridge_cph_lifelines, INsize: 15: 100%|██████████| 5/5 [00:00<00:00, 10.57it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 931.65it/s]


training c indices:  [0.67 0.68 0.7  0.7  0.68]
valid c indices (aggregated):  (0.682697710713322, 0.6531606794764689, 0.7133300049925112)
Setting up stack... saving to GPU


data; CPHDNN, INsize: 15: 100%|██████████| 5/5 [00:32<00:00,  6.59s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 961.01it/s]


training c indices:  [0.69 0.73 0.75 0.76 0.76]
valid c indices (aggregated):  (0.683219051611274, 0.6525762059085999, 0.7115892515325531)


data; ridge_cph_lifelines, INsize: 31: 100%|██████████| 5/5 [00:00<00:00,  6.57it/s]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 903.00it/s]


training c indices:  [0.75 0.74 0.75 0.76 0.75]
valid c indices (aggregated):  (0.689034015879426, 0.6610994464592479, 0.7163411631219371)
Setting up stack... saving to GPU


data; CPHDNN, INsize: 31: 100%|██████████| 5/5 [00:33<00:00,  6.68s/it]
bootstraping 1000...: 100%|██████████| 1000/1000 [00:01<00:00, 963.30it/s]

training c indices:  [0.87 0.98 0.95 0.98 0.99]
valid c indices (aggregated):  (0.7136011915561205, 0.6838861308967072, 0.7433224755700326)





#### **Figure** : using clinical factors with CPH / CPHDNN on Leucegene, compared to the cytogenetic-risk-only benchmark

array(['0.63', '0.68', '0.71'], dtype='<U32')