In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from engines.datasets.base_datasets import SurvivalGEDataset
from engines.hp_dict.base import HP_dict
from engines.models import cox_models
import warnings
# Ignore every FutureWarning so the cell outputs below stay readable.
warnings.simplefilter(action='ignore', category=FutureWarning)

from engines.models import functions
# "none" makes the SVG backend write text as text (no glyph outlines), so
# labels stay editable; it relies on the fonts being installed on the viewer's
# machine (see the matplotlib rcParams documentation).
plt.rcParams["svg.fonttype"] = "none"


In [2]:
### import some basic general arguments
from experiments.parsers import parse_arguments


## Data
The data used to train the network is described here. We use:
* **mutation profile**: described by the presence/absence of the NPM1, FLT3-ITD and IDH1-R132 mutations.
* **transcriptomic profile**: described by the expression of the protein-coding genes after selection- or projection-based dimensionality reduction (up to 17 components).
* **cytogenetic profile**: described by 13 cytogenetic abnormalities or groups (listed in the next cell).
    
* **age** (boolean: is the patient older than 60 years?) and **sex** (boolean: is the patient female?)

In [3]:
## Clinical FEATURES
# Mutation indicator columns.
mutations = [
    "NPM1 mutation",
    "FLT3-ITD mutation",
    "IDH1-R132 mutation",
]

# Cytogenetic abnormality / group columns (labels reproduced verbatim).
cytogenetics = [
    'MLL translocations (+MLL FISH positive) (Irrespective of additional cytogenetic abnormalities)',
    'Intermediate abnormal karyotype (except isolated trisomy/tetrasomy 8)',
    'Normal karyotype',
    'Complex (3 and more chromosomal abnormalities)',
    'Trisomy/tetrasomy 8 (isolated)',
    'Monosomy 5/ 5q-/Monosomy 7/ 7q- (less than 3 chromosomal abnormalities)',
    'NUP98-NSD1(normal karyotype)',
    't(8;21)(q22;q22)/RUNX1-RUNX1T1 (Irrespective of additional cytogenetic abnormalities)',
    'inv(16)(p13.1q22)/t(16;16)(p13.1;q22)/CBFB-MYH11 (Irrespective of additional cytogenetic abnormalities)',
    'EVI1 rearrangements (+EVI1 FISH positive) (Irrespective of additional cytogenetic abnormalities)',
    't(6;9)(p23;q34) (Irrespective of additional cytogenetic abnormalities)',
    'Monosomy17/del17p (less than 3 chromosomal abnormalities)',
    'Hyperdiploid numerical abnormalities only',
]

# Sex and age indicator columns.
age_sex = ["Sex_F", "Age_gt_60"]

# Every clinical feature in one array, ordered: mutations, cytogenetics, age/sex.
clinical_features = np.concatenate((mutations, cytogenetics, age_sex))

In [17]:
# Dataset factory reused by the later cells of this notebook.
SGE = SurvivalGEDataset()

# SGE.new arguments: cohort, input features, then other params.
# Per the original note, a dataset holds x (input data) and y (target).
# The three calls differ only in the gene_expressions setting.
clin_factors = SGE.new(
    "lgn_pronostic",
    clinical_features,
    gene_expressions = "None",
)
clin_factors_lsc17 = SGE.new(
    "lgn_pronostic",
    clinical_features,
    gene_expressions = "LSC17",
)
clin_factors_pca = SGE.new(
    "lgn_pronostic",
    clinical_features,
    gene_expressions = "PCA",
)

Loading and assembling Gene Repertoire...
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)
Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


UnboundLocalError: local variable 'train_features' referenced before assignment

In [5]:
# Duplicate the dataset, then report whether the copy's inputs match the original's.
clin_factors_copy = clin_factors.clone()
values_match = np.all(clin_factors.x == clin_factors_copy.x)
print("Is the cloned copy equal in values ? : ", values_match)

# Split the original first, then the copy, each with the same argument (5).
for dataset in (clin_factors, clin_factors_copy):
    dataset.split_train_test(5)


Is the cloned copy equal in values ? :  True


In [6]:
# The original and its clone were split separately, so (per the original note)
# each one shuffled its data internally. Count how many sample labels of the
# first fold's training set appear in both.
original_train_index = clin_factors.folds[0].train.x.index
copy_train_index = clin_factors_copy.folds[0].train.x.index
n = original_train_index.isin(copy_train_index).sum()

print(f"The number of common samples between two shuffles: {n}")

The number of common samples between two shuffles: 188


## Example
### Parameters, preprocessing, splitting


In [7]:
# General hyper-parameters for the example run.
HyperParams = HP_dict(wd = 1e-3, nepochs = 200, bootstr_n = 1000, nfolds = 5)

# Work on a clone so clin_factors itself is left as loaded.
data = clin_factors.clone()

# Preprocessing: keep only the columns whose variance exceeds 0.01.
column_variance = data.x.var(0)
kept_columns = data.x.columns[np.where(column_variance > 0.01)]
data.x = data.x[kept_columns]

# Split using the configured number of folds.
data.split_train_test(HyperParams.nfolds)

# Default model parameters for the "cphdnn_2l" model type on this data.
params = HyperParams.generate_default(model_type = "cphdnn_2l", data = data)



### Launch training

In [8]:
# c_index_metrics, c_scores, surv_tbl, params= cox_models.evaluate(data, params, pca_n = None)
        

## Figures 
#### Data generation

In [13]:
# Scratch check: request one more dataset with gene_expressions="PCA"
# (same arguments as the clin_factors_pca call earlier in the notebook).
test1 = SGE.new("lgn_pronostic", clinical_features, gene_expressions="PCA")

Loading ClinF lgn_pronostic file ...
removed 414 genes with null expression across samples 
Now datataset hase shape (300, 19183)


UnboundLocalError: local variable 'train_features' referenced before assignment

In [10]:
# Show the name attribute of test1.
# NOTE(review): the recorded output mentions LSC17 although test1 is requested
# with gene_expressions="PCA" -- presumably a stale output from an earlier run;
# confirm after a clean Restart & Run All.
test1.name

'clin. factors + LSC17'

In [1]:
# Set general parameters
HyperParams = HP_dict(wd = 1e-3, nepochs = 200, bootstr_n = 1000, nfolds = 5)

# Fetch the cohort first, then read the cytogenetic risk column from the loader.
CDS = SGE.get_data("lgn_pronostic")["CDS"]
cyt = pd.DataFrame(SGE.data["CF"]["Cytogenetic risk"])

# Risk label -> ordinal score (0, 1 or 2). A None label is scored 1.
# A label missing from this table raises KeyError.
cyt_risk_scores = {
    "intermediate cytogenetics": 1,
    "Intermediate/Normal": 1,
    "adverse cytogenetics": 2,
    "favorable cytogenetics": 0,
    "Favorable": 0,
    "Standard": 1,
    "Low": 0,
    "Poor": 2,
    None: 1,
}
cyt_levels = [cyt_risk_scores[level] for level in cyt["Cytogenetic risk"]]
cyt["pred_risk"] = cyt_levels

# Bootstrapped c-index of the cytogenetic risk score against the CDS target.
cyt_c_scores, cyt_metrics = functions.compute_cyto_risk_c_index(
    cyt["pred_risk"],
    CDS.y,
    gamma = 0.001,
    n = HyperParams.bootstr_n,
)
print("C index method 1: ", cyt_metrics)

# First row of the results table: the cytogenetics-only benchmark.
results = [(1, "c_index", "cytogenetics", cyt_metrics[0], cyt_metrics[1], cyt_metrics[2])]


NameError: name 'HP_dict' is not defined

In [None]:

# Benchmark every (input set, model type) pair over three repetitions and
# append one row per run to `results`.
#
# These loops used to be spread over two cells. The second cell started with
# an indented `for` (an IndentationError when the cell is run on its own) and
# read `repn` without defining it; it was the tail of the `repn` loop. All
# experiments are now driven from a single loop in a single cell.

# (dataset to clone, model types to evaluate on it), in run order.
experiments = (
    (clin_factors, ("ridge_cph_lifelines_CF", "cphdnn_2l")),
    (clin_factors_lsc17, ("ridge_cph_lifelines_CF_LSC17", "cphdnn_5l")),
    (clin_factors_pca, ("ridge_cph_lifelines_CF_PCA", "cphdnn_5l")),
)

for repn in range(1, 4):
    for source_data, model_types in experiments:
        for model_type in model_types:
            # every run starts from its own clone of the source dataset
            data = source_data.clone()
            # preprocess data (remove low variance columns)
            data.x = data.x[data.x.columns[np.where(data.x.var(0) > 0.01)]]
            # splitting
            data.split_train_test(HyperParams.nfolds)
            # generate model parameters
            params = HyperParams.generate_default(model_type = model_type, data = data)
            # train and evaluate model
            c_index_metrics, c_scores, surv_tbl, params = cox_models.evaluate(data, params, pca_n = None)
            # append to results: (repetition, model type, input name, three c-index metrics)
            results.append((repn, params["modeltype"], data.name, c_index_metrics[0], c_index_metrics[1], c_index_metrics[2]))



In [None]:

# Column labels, in the same order as the fields of each tuple in `results`.
colnames = (
    "repn",
    "model t.",
    "input t.",
    "c.ind med",
    "c.ind 5%",
    "c.ind 95%",
)

# One row per evaluated run.
resdf = pd.DataFrame(data = results, columns = colnames)

resdf



In [None]:
def _plot_c_index_panel(ax, panel_df, xlabel, title, spacer = 0.2):
    """Draw the c-index summary of one model family on `ax`.

    One x slot is used per distinct "input t." value; inside a slot each row
    is shifted right by `spacer * repn`. Every row is drawn as a black
    vertical bar from "c.ind 5%" to "c.ind 95%", a marker at "c.ind med" and
    the median rounded to 3 decimals as text.

    Parameters
    ----------
    ax : matplotlib Axes
        Axes to draw on.
    panel_df : pandas.DataFrame
        Rows to plot; needs the columns "repn", "input t.", "c.ind med",
        "c.ind 5%" and "c.ind 95%".
    xlabel : str
        Label of the x axis.
    title : str
        Title of the panel.
    spacer : float
        Horizontal offset applied per repetition number.

    Returns
    -------
    The `ax` that was passed in.
    """
    input_types = np.unique(panel_df["input t."])
    for xmark, input_type in enumerate(input_types):
        rows = panel_df[panel_df["input t."] == input_type]
        xaxis = np.array(xmark + (spacer * rows["repn"].values.astype(int)))
        # Only the scatter carries the label: labelling the bars as well put
        # every input type in the legend twice.
        ax.vlines(x = xaxis, ymin = rows["c.ind 5%"], ymax = rows["c.ind 95%"], linewidth = 4, color = "k")
        ax.scatter(x = xaxis, y = rows["c.ind med"], linewidth = 5, label = input_type)
        for x_pos, value in zip(xaxis, rows["c.ind med"]):
            ax.text(x_pos, value, str(round(value, 3)), fontsize = 20)

    # Axis decoration does not depend on the data, so it is done once, outside
    # the loop (and therefore also when the panel has no rows).
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel("concordance index")
    ax.grid(visible = True, linestyle = "--")
    ax.set_ylim((0.5, 0.9))
    # Put each tick under the middle repetition (repn == 2) of its slot.
    ax.set_xticks(np.arange(len(input_types)) + spacer * 2)
    ax.set_xticklabels(input_types, fontsize = 14)
    ax.legend()
    return ax


fig, axes = plt.subplots(ncols = 2, nrows = 1, figsize = (22,10))
cph_ax = axes[0]
cphdnn_ax = axes[1]
title = "CPHDNN, CPH, Cyto risk with leucegene from varying input factors"
spacer = 0.2

# NOTE(review): both filters compare against the value stored from
# params["modeltype"]; confirm these are the exact strings it takes. The
# cytogenetics benchmark row (stored with model t. == "c_index") matches
# neither filter and is not drawn, although the title mentions cyto risk.
cph_df = resdf[(resdf["model t."] == "ridge_cph_lifelines")]
_plot_c_index_panel(cph_ax, cph_df, "CPH (lifelines)", title, spacer)

cphdnn_df = resdf[(resdf["model t."] == "CPHDNN")]
_plot_c_index_panel(cphdnn_ax, cphdnn_df, "CPHDNN", title, spacer)

plt.tight_layout()
plt.savefig("RES/V2/fig1.png")


#### **Figure**: concordance index of CPH / CPHDNN trained on clinical factors (Leucegene cohort), compared with the cytogenetic-risk-only benchmark