# ISMB figures notebook

In [1]:

include("engines/init.jl")
include("engines/data_processing.jl")
include("engines/deep_learning.jl")
include("engines/cross_validation.jl")
outpath, session_id = set_dirs() ;

[32m[1m  Activating[22m[39m project at `~/vae_cox`


In [None]:
BRCA = MLSurvDataset("Data/TCGA_BRCA_tpm_n1049_btypes_labels_surv.h5")

In [2]:
LAML = MLSurvDataset("Data/LGN_AML_tpm_n300_btypes_labels_surv.h5") 

MLSurvDataset(Float32[0.008600163 0.0 … 0.0 0.0; 0.033423774 0.00432137 … 0.0 0.029383799; … ; 0.4828736 0.0 … 0.0 0.15836251; 0.045322984 0.017033324 … 0.0 0.20682588], ["01H001", "02H003", "02H009", "02H017", "02H026", "02H033", "02H053", "02H066", "03H016", "03H022"  …  "13H186", "14H001", "14H007", "14H012", "14H015", "14H017", "14H019", "14H020", "14H023", "14H038"], ["TSPAN6", "TNMD", "DPM1", "SCYL3", "C1orf112", "FGR", "CFH", "FUCA2", "GCLC", "NFYA"  …  "AP003086.3", "AL109627.1", "AC084851.4", "AC024558.2", "AC108479.4", "AL512357.2", "AL138899.3", "AL669830.1", "AC091135.2", "AL357075.5"], ["lncRNA", "lncRNA", "protein_coding", "lncRNA", "protein_coding", "lncRNA", "protein_coding,retained_intron", "lncRNA", "protein_coding", "protein_coding"  …  "transcribed_processed_pseudogene", "lncRNA", "processed_pseudogene", "protein_coding", "unprocessed_pseudogene", "lncRNA", "protein_coding", "retained_intron", "unprocessed_pseudogene", "protein_coding"], ["Therapy-related myeloid ne

In [None]:
BRCA_pcoding = BRCA.biotypes .== "protein_coding"
println(sum(BRCA_pcoding))
println(size(BRCA.samples)[1])
println(mean(BRCA.surve .!= 0))


In [None]:
pcoding = [occursin("protein_coding", bt) for bt in LAML.biotypes]
println(size(LAML.genes[pcoding])[1])
println(mean(LAML.surve .!= 0))

In [None]:
# testing VAE-Cox
# set hyper parameters 
DATA = BRCA
keep = BRCA_pcoding
# split train test
folds = split_train_test(Matrix(DATA.data[:,keep]), DATA.survt, DATA.surve, DATA.samples;nfolds =5)


In [None]:
device!()

In [63]:
# testing VAE-Cox
# set hyper parameters 
DATA,nfolds, nepochs, dim_redux = LAML, 5, 1000, 125
keep = [occursin("protein_coding", bt) for bt in DATA.biotypes]
println(sum(keep))
println(size(DATA.samples)[1])
println(mean(DATA.surve .!= 0))
params_dict = Dict(
        ## run infos 
        "session_id" => session_id, "nfolds" =>5,  "modelid" => "$(bytes2hex(sha256("$(now())"))[1:Int(floor(end/3))])",
        "machine_id"=>strip(read(`hostname`, String)), "device" => "$(device())", "model_title"=>"AECPHDNN",
        ## data infos 
        "dataset" => "BRCA_data(norm=true)", "nsamples" => size(DATA.samples)[1],
        "nsamples_test" => Int(round(size(DATA.samples)[1] / nfolds)), "ngenes" => size(DATA.genes[keep])[1],
        "nsamples_train" => size(DATA.samples)[1] - Int(round(size(DATA.samples)[1] / nfolds)),
        ## optim infos 
        "nepochs" => nepochs, "ae_lr" =>1e-6, "cph_lr" => 1e-6, "ae_wd" => 1e-6, "cph_wd" => 1e-4,
        ## model infos
        "model_type"=> "aecphdnn", "dim_redux" => dim_redux, "ae_nb_hls" => 2, "ae_hl_size"=> 128,
        "enc_nb_hl" => 2, "enc_hl_size"=> 128,  "dec_nb_hl" => 2 , "dec_hl_size"=> 128,
        "nb_clinf" => 0, "cph_nb_hl" => 2, "cph_hl_size" => 64, 
        "insize" => size(DATA.genes[keep])[1],
        ## metrics
        "model_cv_complete" => false
    )
# split train test
folds = split_train_test(Matrix(DATA.data[:,keep]), DATA.survt, DATA.surve, DATA.samples;nfolds =5)
fold = folds[1]
# format input data  
train_x, train_y_t, train_y_e, NE_frac_tr, test_x, test_y_t, test_y_e, NE_frac_tst = format_train_test(fold)
# create model 
model = build_vaecox(params_dict)
# train model 
## gradient CPH            

# report learning curves
# test model
# report c-index

14996
300
0.7433333333333333


Dict{String, Any} with 3 entries:
  "ae"  => AE_model(Chain(Dense(14996 => 3040, leakyrelu), Dense(3040 => 616, l…
  "cph" => dnn(Chain(Dense(125 => 64, leakyrelu), Dense(64 => 1, σ; bias=false)…
  "enc" => Chain(Dense(14996 => 3040, leakyrelu), Dense(3040 => 616, leakyrelu)…

In [15]:
function VAE_COX_loss(VENC, CPH, X, Y_e, NE_frac;device = gpu)
    mu, log_sigma = VENC(X)
    z = mu + device(randn(Float32, size(log_sigma))) .* exp.(log_sigma)
    outs = vec(CPH(z))
    hazard_ratios = exp.(outs)
    log_risk = log.(cumsum(hazard_ratios))
    uncensored_likelihood = outs .- log_risk
    censored_likelihood = uncensored_likelihood .* Y_e'
    #neg_likelihood = - sum(censored_likelihood) / sum(e .== 1)
    neg_likelihood = - sum(censored_likelihood) * NE_frac
    return neg_likelihood
end 

VAE_COX_loss (generic function with 1 method)

In [39]:
function cox_nll_vec(mdl::Chain, X_, Y_e_, NE_frac)
    outs = vec(mdl(X_))
    #outs = vec(mdl.cphdnn(mdl.encoder(X_)))
    hazard_ratios = exp.(outs)
    log_risk = log.(cumsum(hazard_ratios))
    uncensored_likelihood = outs .- log_risk
    censored_likelihood = uncensored_likelihood .* Y_e_'
    #neg_likelihood = - sum(censored_likelihood) / sum(e .== 1)
    neg_likelihood = - sum(censored_likelihood) * NE_frac
    return neg_likelihood
end 

cox_nll_vec (generic function with 6 methods)

In [43]:
cphdnn = gpu(Chain(Dense(size(DATA.data[:,keep])[2], 125, leakyrelu), Dense(125, 100,leakyrelu),Dense(100, 1)))

Chain(
  Dense(14996 => 125, leakyrelu),       [90m# 1_874_625 parameters[39m
  Dense(125 => 100, leakyrelu),         [90m# 12_600 parameters[39m
  Dense(100 => 1),                      [90m# 101 parameters[39m
) [90m                  # Total: 6 arrays, [39m1_887_326 parameters, 856 bytes.

In [45]:
train_x, train_y_t, train_y_e, NE_frac_tr, test_x, test_y_t, test_y_e, NE_frac_tst = format_train_test(fold)
cphdnn(train_x)

1×240 CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}:
 0.804183  0.6966  0.642762  0.544045  …  0.874874  1.00074  0.824527

In [48]:
function validate_vaecox(DATA, params_dict;device = gpu)
    nfolds, nepochs, dim_redux = 5, 1000, 125
    keep = [occursin("protein_coding", bt) for bt in DATA.biotypes]
    # split train test
    folds = split_train_test(Matrix(DATA.data[:,keep]), DATA.survt, DATA.surve, DATA.samples;nfolds =5)
    fold = folds[1]

    venc = VariationalEncoder(size(DATA.data[:,keep])[2], 125, 600)
    vdec = Decoder(size(DATA.data[:,keep])[2], 125, 600)
    VAE_opt = Flux.ADAM(1e-4)

    cphdnn = device(Chain(Dense(size(DATA.data[:,keep])[2], 125, leakyrelu), Dense(125, 100,leakyrelu),Dense(100, 1)))
    cphdnn_opt = Flux.ADAM(1e-5)

    train_x, train_y_t, train_y_e, NE_frac_tr, test_x, test_y_t, test_y_e, NE_frac_tst = format_train_test(fold)
    for i in 1:nepochs
        ps1 = Flux.params(venc, vdec)
        gs1 = gradient(ps1) do
            VAE_lossf(venc, vdec, train_x)
        end 
        VAE_loss = VAE_lossf(venc, vdec, train_x)
        VAE_cor = round(my_cor(vec(train_x), vec(MyReconstruct(venc, vdec, train_x)[end])),digits = 3)
        VAE_test = round(my_cor(vec(test_x), vec(MyReconstruct(venc, vdec, test_x)[end])),digits = 3)

        ps2 = Flux.params(venc, cphdnn)
        gs2 = gradient(ps2) do 
            cox_nll_vec(cphdnn, train_x, train_y_e, NE_frac_tr)
            #VAE_COX_loss(venc, cphdnn, train_x, train_y_e, NE_frac_tr)
        end 
        CPH_loss = round(cox_nll_vec(cphdnn, train_x, train_y_e, NE_frac_tr), digits = 3) #round(VAE_COX_loss(venc, cphdnn, train_x, train_y_e, NE_frac_tr), digits = 3)
        #mu, log_sigma = venc(train_x)
        #z = mu + device(randn(Float32, size(log_sigma))) .* exp.(log_sigma)
        #OUTS_tr = vec(cphdnn(z))
        OUTS_tr = cphdnn(train_x)
        cind_tr, cdnt_tr, ddnt_tr, tied_tr  = concordance_index(train_y_t, train_y_e, OUTS_tr)

        #Flux.update!(VAE_opt, ps1, gs1)
        Flux.update!(cphdnn_opt, ps2, gs2)
        if i % 100 == 0 || i == 1
            println("$i TRAIN - VAE-loss-avg: $VAE_loss\tVAE-cor: $VAE_cor CPH-loss: $CPH_loss CPH-cind: $(round(cind_tr,digits=3))\tTEST - VAE-cor: $VAE_test")
        end
    end
end 

validate_vaecox (generic function with 1 method)

In [49]:
validate_vaecox(DATA, params_dict)

1 TRAIN - VAE-loss-avg: 19349.998	VAE-cor: 0.007 CPH-loss: 5.052 CPH-cind: 0.582	TEST - VAE-cor: 0.009


100 TRAIN - VAE-loss-avg: 19451.33	VAE-cor: 0.007 CPH-loss: 4.388 CPH-cind: 0.189	TEST - VAE-cor: 0.007


200 TRAIN - VAE-loss-avg: 19240.607	VAE-cor: 0.007 CPH-loss: 4.023 CPH-cind: 0.117	TEST - VAE-cor: 0.008


300 TRAIN - VAE-loss-avg: 19241.342	VAE-cor: 0.008 CPH-loss: 3.7 CPH-cind: 0.076	TEST - VAE-cor: 0.006


400 TRAIN - VAE-loss-avg: 19323.492	VAE-cor: 0.008 CPH-loss: 3.369 CPH-cind: 0.046	TEST - VAE-cor: 0.008


500 TRAIN - VAE-loss-avg: 19401.654	VAE-cor: 0.008 CPH-loss: 3.12 CPH-cind: 0.029	TEST - VAE-cor: 0.007


600 TRAIN - VAE-loss-avg: 19184.316	VAE-cor: 0.008 CPH-loss: 2.907 CPH-cind: 0.019	TEST - VAE-cor: 0.006


700 TRAIN - VAE-loss-avg: 19168.014	VAE-cor: 0.008 CPH-loss: 2.711 CPH-cind: 0.012	TEST - VAE-cor: 0.008


800 TRAIN - VAE-loss-avg: 19352.445	VAE-cor: 0.007 CPH-loss: 2.533 CPH-cind: 0.007	TEST - VAE-cor: 0.009


900 TRAIN - VAE-loss-avg: 19244.887	VAE-cor: 0.008 CPH-loss: 2.371 CPH-cind: 0.004	TEST - VAE-cor: 0.006


1000 TRAIN - VAE-loss-avg: 19375.945	VAE-cor: 0.007 CPH-loss: 2.225 CPH-cind: 0.002	TEST - VAE-cor: 0.007


In [67]:
for iter in 1:params_dict["nepochs"]
    ps1 = Flux.params(model["cph"].model, model["enc"])
    gs1 = gradient(ps1) do
        model["cph"].lossf(model["cph"],model["enc"], train_x, train_y_e, NE_frac_tr, params_dict["cph_wd"])
    end 
    ## gradient Auto-Encoder 
    ps2 = Flux.params(model["ae"].net)
    gs2 = gradient(ps2) do
        model["ae"].lossf(model["ae"], train_x, train_x, weight_decay = params_dict["ae_wd"])
    end
    Flux.update!(model["cph"].opt, ps1, gs1)
    #Flux.update!(model["ae"].opt, ps2, gs2)

    ######
    OUTS_tr = vec(model["cph"].model(model["enc"](train_x)))
    ae_loss = model["ae"].lossf(model["ae"], train_x, train_x, weight_decay = params_dict["ae_wd"])
    ae_cor =  round(my_cor(vec(train_x), vec(model["ae"].net(train_x))),digits = 3)
    cph_loss = model["cph"].lossf(model["cph"],model["enc"](train_x), train_y_e, NE_frac_tr, params_dict["cph_wd"])
    ae_loss_test = round(model["ae"].lossf(model["ae"], test_x, test_x, weight_decay = params_dict["ae_wd"]), digits = 3)
    ae_cor_test = round(my_cor(vec(test_x), vec(model["ae"].net(test_x))), digits= 3)
    cph_loss_test = round(model["cph"].lossf(model["cph"],model["enc"](test_x), test_y_e, NE_frac_tst, params_dict["cph_wd"]), digits= 3)
                    
    OUTS_tst =  vec(model["cph"].model(model["enc"](test_x)))
            
    cind_tr, cdnt_tr, ddnt_tr, tied_tr  = concordance_index(train_y_t, train_y_e, OUTS_tr)
    cind_test,cdnt_tst, ddnt_tst, tied_tst = concordance_index(test_y_t, test_y_e,OUTS_tst)
    if iter % 100 == 0  || iter == 1     
        println("FOLD $(fold["foldn"]) $iter\t TRAIN AE-loss $(round(ae_loss,digits =3)) \t AE-cor: $(round(ae_cor, digits = 3))\t cph-loss-avg: $(round(cph_loss / params_dict["nsamples_train"],digits =6)) \t cph-cind: $(round(cind_tr,digits =3))")
        println("\t\tTEST AE-loss $(round(ae_loss_test,digits =3)) \t AE-cor: $(round(ae_cor_test, digits = 3))\t cph-loss-avg: $(round(cph_loss_test / params_dict["nsamples_test"],digits =6)) \t cph-cind: $(round(cind_test,digits =3)) [$(Int(cdnt_tst)), $(Int(ddnt_tst)), $(Int(tied_tst))]")
    end
end 

FOLD 1 1	 TRAIN AE-loss 0.063 	 AE-cor: 0.954	 cph-loss-avg: 0.018939 	 cph-cind: 0.844
		TEST AE-loss 0.073 	 AE-cor: 0.946	 cph-loss-avg: 0.056767 	 cph-cind: 0.709 [1138, 466, 0]


FOLD 1 100	 TRAIN AE-loss 0.054 	 AE-cor: 0.961	 cph-loss-avg: 0.020494 	 cph-cind: 0.831
		TEST AE-loss 0.063 	 AE-cor: 0.952	 cph-loss-avg: 0.058883 	 cph-cind: 0.696 [1117, 487, 0]


FOLD 1 200	 TRAIN AE-loss 0.054 	 AE-cor: 0.961	 cph-loss-avg: 0.020517 	 cph-cind: 0.828
		TEST AE-loss 0.063 	 AE-cor: 0.953	 cph-loss-avg: 0.058967 	 cph-cind: 0.693 [1112, 492, 0]


FOLD 1 300	 TRAIN AE-loss 0.053 	 AE-cor: 0.961	 cph-loss-avg: 0.020525 	 cph-cind: 0.827
		TEST AE-loss 0.063 	 AE-cor: 0.953	 cph-loss-avg: 0.059 	 cph-cind: 0.691 [1109, 495, 0]


FOLD 1 400	 TRAIN AE-loss 0.053 	 AE-cor: 0.961	 cph-loss-avg: 0.020529 	 cph-cind: 0.826
		TEST AE-loss 0.063 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.69 [1106, 498, 0]


FOLD 1 500	 TRAIN AE-loss 0.053 	 AE-cor: 0.961	 cph-loss-avg: 0.020532 	 cph-cind: 0.825
		TEST AE-loss 0.063 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.686 [1101, 503, 0]


FOLD 1 600	 TRAIN AE-loss 0.053 	 AE-cor: 0.962	 cph-loss-avg: 0.020534 	 cph-cind: 0.824
		TEST AE-loss 0.062 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.685 [1098, 506, 0]


FOLD 1 700	 TRAIN AE-loss 0.053 	 AE-cor: 0.962	 cph-loss-avg: 0.020536 	 cph-cind: 0.823
		TEST AE-loss 0.062 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.684 [1097, 506, 2]


FOLD 1 800	 TRAIN AE-loss 0.053 	 AE-cor: 0.962	 cph-loss-avg: 0.020537 	 cph-cind: 0.823
		TEST AE-loss 0.062 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.682 [1094, 509, 2]


FOLD 1 900	 TRAIN AE-loss 0.052 	 AE-cor: 0.962	 cph-loss-avg: 0.020539 	 cph-cind: 0.822
		TEST AE-loss 0.062 	 AE-cor: 0.953	 cph-loss-avg: 0.059017 	 cph-cind: 0.68 [1090, 514, 0]


FOLD 1 1000	 TRAIN AE-loss 0.052 	 AE-cor: 0.962	 cph-loss-avg: 0.02054 	 cph-cind: 0.821
		TEST AE-loss 0.062 	 AE-cor: 0.954	 cph-loss-avg: 0.059033 	 cph-cind: 0.677 [1086, 518, 0]
