This notebook trains a variational autoencoder (VAE). The values you set in the arguments struct determine which variant is trained:

- with Pre-transformations
- with AIQN
- standard VAE

The results reported in the paper for the sim data were obtained with the following settings:

    data_string::String = "sim" 
    η::Float32 = 1e-3                                                                                                
    λ::Float32 = 0.01f0
    β::Float64 = 0.5                                                                                      
    batch_size::Int = 128                                                                                            
    epochs::Int = 500                                                                                                 
    seed::Int = 42                                                                                                  
    input_dim::Int = 21                                                                                               
    latent_dim::Int = 2                                                                                               
    hidden_dim::Int = 28                                                                                              
    verbose_freq::Int = 100                                                                                          
    hyperopt_flag::Bool = false       
    multimodal_encoder::Bool = true   

    pre_transformation::Bool =  true                                                                                 
    bimodality_score_threshold::Float32 = 0                                                                         
    
    scaling::Bool = true                                                                                             
    scaling_method::String = "scaling" 
                                                                                    
    AIQN::Bool = false                                                                                               
                                                                                 
    synthetic_data::Bool = false                                                                                     


Now for QVAE:
    AIQN = true

for standard VAE:
    pre_transformation = false




The results reported in the paper for the IST data were obtained with the following settings:

    data_string::String = "ist_randomization_data_smaller_no_west_no_south_aug5" 
    η::Float32 = 5e-4                                                                                                
    λ::Float32 = 0.01f0
    β::Float64 = 0.5                                                                                      
    batch_size::Int = 32                                                                                            
    epochs::Int = 1000                                                                                                 
    seed::Int = 42                                                                                                  
    input_dim::Int = 21                                                                                               
    latent_dim::Int = 2                                                                                               
    hidden_dim::Int = 28                                                                                              
    verbose_freq::Int = 100                                                                                          
    hyperopt_flag::Bool = false       
    multimodal_encoder::Bool = true   

    pre_transformation::Bool =  true                                                                                 
    bimodality_score_threshold::Float32 = 0                                                                         
    
    scaling::Bool = true                                                                                             
    scaling_method::String = "scaling" 
                                                                                    
    AIQN::Bool = false                                                                                               
                                                                                 
    synthetic_data::Bool = false                                                                                     


Now for QVAE:
    AIQN = true

for standard VAE:
    pre_transformation = false

In [1]:
# Step up one directory from where the notebook was started, then display the
# resulting working directory (the relative file checks in the next cell are
# resolved against it).
cd("../.") 
pwd()

"/Users/farhadyar/Documents/Project_PTVAE/progs/github_repo/LatentSubgroups"

In [2]:
using Pkg

# Activate the environment in the current directory, but only when both of its
# environment files are present there.
if all(isfile, ("Project.toml", "Manifest.toml"))
    Pkg.activate(".")
end

# Pkg.instantiate()
using IJulia
using Revise




[32m[1m  Activating[22m[39m environment at `~/Documents/Project_PTVAE/progs/github_repo/LatentSubgroups/Project.toml`


In [3]:
# Load the project sources through Revise's `includet`, one file at a time and
# in this exact order.
for source_file in (
    "../AIQN/AIQN.jl",
    "../src/structs.jl",
    "../src/report.jl",
    "../src/transformations.jl",
    "../src/VAE.jl",
    "../src/load_data.jl",
    "../src/evaluation/evaluation.jl",
    "../src/classification.jl",
    "../src/GLM.jl",
)
    includet(source_file)
end



## Loading Dataset

In [4]:
# Load the data `x`, the accompanying `dataTypeArray`, and the run configuration `args`.
x, dataTypeArray, args = load_dataset()

# Run the training cell below in 10-fold cross-validation mode.
args.cross_validation_flag = true
args.n_folds = 10

# Dataset-specific hyperparameters, selected by `args.data_string`.
# NOTE(review): some values below differ from the paper settings listed at the top
# of this notebook (sim: batch_size 32 vs 128; IST: η 1e-3 vs 5e-4, batch_size 128
# vs 32, hidden_dim 22 vs 28) — confirm which set is intended.
if args.data_string == "sim"

    args.β = 0.5
    args.η = 1e-3
    args.epochs = 500
    args.multimodal_encoder = true
    args.batch_size = 32
    args.latent_dim = 2
    args.hidden_dim = 28
    args.scaling_method = "scaling"
    args.IPW_sampling = false

# `elseif` (not `else`): with a bare `else` the `contains(...)` call was evaluated
# and discarded, so these IST settings were applied to every non-"sim" dataset.
elseif contains(args.data_string, "ist")
    args.β = 0.5
    args.η = 1e-3
    args.epochs = 1000
    args.multimodal_encoder = true
    args.batch_size = 128
    args.latent_dim = 2
    args.hidden_dim = 22
    args.scaling_method = "scaling"
end

# Variant selection: pre-transformations on, AIQN off.
args.pre_transformation = true

args.AIQN = false

false

## Preprocessing (transformations and scaling) &
## Training Variational Autoencoder

In [5]:
# Seed the global RNG before the data is split and the models are trained.
Random.seed!(11)

if args.cross_validation_flag

    # Hyperparameter optimisation is not available together with cross-validation.
    # Actually switch the flag off, as the message announces, so that every fold is
    # trained. Previously the flag stayed true, every fold skipped training and an
    # empty result list was saved.
    if args.hyperopt_flag
        println("cross_validation and hyper parameter optimization is not implemented!")
        println("hyperopt_flag is changed to false")
        args.hyperopt_flag = false
    end

    # One entry per fold, as returned by `trainVAE!`.
    reconstruction_train_val_sets = []

    cross_val_run_path = string(args.current_run_path, "/$(args.n_folds)fold_cross_validation")
    mkdir(cross_val_run_path)

    cross_val_sets = create_cross_validation_sets(x, args.n_folds)

    for fold = 1:args.n_folds

        train_set, val_set = cross_val_sets[fold]

        # Each fold gets its own output directory.
        # NOTE(review): `args.current_run_path` is overwritten here and still points
        # at the last fold's directory after the loop — confirm that later cells
        # expect this.
        args.current_run_path = string(cross_val_run_path, "/fold_$(fold)")

        mkdir(args.current_run_path)

        # write the train_set and val_set as csv in args.current_run_path
        writedlm(string(args.current_run_path, "/", "train.csv"), train_set, ',')
        writedlm(string(args.current_run_path, "/", "val.csv"), val_set, ',')

        # Preprocess the training split, then pass the returned `preprocess_ps` on
        # to the validation split.
        preprocess_ps = preprocess_params(input_dim = args.input_dim, pre_transformation_type = "quantile")
        preprocessed_data, preprocess_ps = preprocess!(args, preprocess_ps, train_set, dataTypeArray)
        preprocessed_data_val, preprocess_ps = preprocess_test_data(args, preprocess_ps, val_set, dataTypeArray)

        val_data = get_data(preprocessed_data_val, args.batch_size)

        model, training_data, reconstruction_train_val_set = trainVAE!(preprocessed_data, train_set, dataTypeArray, preprocess_ps, args; val_data = val_data)

        push!(reconstruction_train_val_sets, reconstruction_train_val_set)

        save_vae_results(val_data, preprocessed_data, val_set, model, preprocess_ps, args, [], true)
    end

    # Save the per-fold results collected above next to the fold directories.
    mkdir(string(cross_val_run_path, "/reconstruction_values"))

    @save string(cross_val_run_path, "/reconstruction_values/reconstruction_values") reconstruction_train_val_sets

else
    # No cross-validation: preprocess and train on the full dataset.
    preprocess_ps = preprocess_params(input_dim = args.input_dim)
    preprocessed_data, preprocess_ps = preprocess!(args, preprocess_ps, x, dataTypeArray)

    if args.hyperopt_flag
        trainVAE_hyperparams_opt!(preprocessed_data, x, dataTypeArray, preprocess_ps, args)
    else
        model, training_data, loss_array_vae = trainVAE!(preprocessed_data, x, dataTypeArray, preprocess_ps, args)
    end
end

Epoch 100: loss = 2.773278758823861
Epoch 200: loss = 2.9145162232575546
Epoch 300: loss = 2.601037079384966
Epoch 400: loss = 2.580703933500228
Epoch 500: loss = 2.3359099957898017
Epoch 100: loss = 2.846485695957449
Epoch 200: loss = 2.59240343464286
Epoch 300: loss = 2.4469880291882053
Epoch 400: loss = 2.531275340588725
Epoch 500: loss = 2.4596551211982165
Epoch 100: loss = 2.864328112147001
Epoch 200: loss = 2.5672353660600664
Epoch 300: loss = 2.4417963462688115
Epoch 400: loss = 2.5057129072398725
Epoch 500: loss = 2.5932919125913676
Epoch 100: loss = 2.7321363565540557
Epoch 200: loss = 2.533614436055833
Epoch 300: loss = 2.382207664820421
Epoch 400: loss = 2.403969955562805
Epoch 500: loss = 2.394177447097772
Epoch 100: loss = 2.7279418592079434
Epoch 200: loss = 2.445441486233846
Epoch 300: loss = 2.4201817574357105
Epoch 400: loss = 2.3171895708208488
Epoch 500: loss = 2.401005751915347
Epoch 100: loss = 2.719563659968318
Epoch 200: loss = 2.487140405006585
Epoch 300: loss =