# (0.1) Train multiple-imputation (MI) models
One option to resolve missingness for AFA is to use Multiple Imputation. 
This notebook trains MI models and saves the multiple imputed datasets. 

In [1]:
%load_ext autoreload
%autoreload 2


## Define paths

In [2]:
from afa.configurations.utils_ts import specify_default_paths_ts
# which dataset to work on 
dataset_name   = "miiv_test"

# name for of missingness scenario 
miss_scenario  = 'MCAR_1'

# automatically specify some path locations (change paths manually if needed) 
paths = specify_default_paths_ts(dataset_name = dataset_name , miss_scenario = miss_scenario) 

# name for ps_model 
mi_model_name  = 'mi_simple'

# new (where to save the model) 
mi_model_dir = paths['data_dir']  + 'mi_models' + '/' + mi_model_name + '/'


### Define model specifications

In [3]:
mi_model_params = {
    'name' : mi_model_name, 
    'directory' : mi_model_dir,
    'base_model_params' : {   'model_type': 'simple_imputer' }
    }

In [4]:
device = 'auto'  # 'cuda' or 'cpu'

# Config for dataset preparation (torch.DataSet class)
dataset_params = {
    'missingness_value': 'nan',             # Values given to missing values: int, float or 'nan'
    'missingness_rate': (0.3),              # The artificially created missingness for training (MCAR)
    'device': device  # 'cuda' or 'cpu'     # Device to train on
}

# Config for dataloader (torch.DataLoader class)
dataloader_params = {
    'batch_size': 100,                      # Number of samples per batch
    'shuffle': False,                       # Shuffle samples in batch?
    # 'prefetch_factor': 1, # increase for speed up, experimental
    # 'num_workers': 0,     # set higher for faster throughput, experimental
    'drop_last': True                       # Drop last batch, if it has a different batch size
}

# Config for trainer (pytorch_lightning.Trainer class)
trainer_params = {
    'max_epochs': 100,                      # number of epochs to train
    #'auto_lr_find': False,                  # Find best starting lr, experimental
    'fast_dev_run': False,                  # Fast dev run to test set up before commencing training
    'accelerator': device,                  # device to train on, should be the same as for dataset_params
    'devices': 1,                           # Number of devices to train on, leave it at one
    'profiler': None,                       # Pytorch profiler, 'simple', 'advanced', None
    'num_sanity_val_steps': 0,              # Number of sanity validation steps, for debugging
    'wandb_logger': False,                   # Wether to use wandb logger, else Tensorboard is used
    #'wandb_project_name': 'GPImputer Synthetic 2'   # Project name, in case wandb logging is used
}

# Config for gp_model (GPImputer class)
gp_params = {
    'model_type': 'gaussian_process',       # IMPORTANT: for BaseModelImputer_ts to choose the correct class
    'dataset_params' : dataset_params,
    'dataloader_params' : dataloader_params,
    'trainer_params' : trainer_params,
    'num_tasks': 49,  # number of tasks == number of features
    'num_kernels': 10,
    'data_mode': 'no_simulation',   # 'no_simulation' or 'simulation', with simulation a ground truth is expected to passed as well, ground truth = values for data that is missing in train dataloader, experimental -> leave it as no_simulation
    'ckpt_path': None #'best_model-v_recon_loss_target=1.10-epoch=142.ckpt',  # path to checkpoint of trained model, full path or relative to model directory    
}
    
# Config for mi_model from AFA module (MultipleImputationModel_ts class)
mi_model_params = {
    'name' : mi_model_name, 
    'directory' : mi_model_dir,
    'base_model_params' : gp_params
}


## Load dataset with missingness 
At first, we want to load the dataset 

Includes loading: 
- superfeature mapping
- problem
- afa_problem 
- missingness_model

In [5]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-08-07 14:43:18.645851: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-08-07 14:43:18.737386: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-08-07 14:43:18.737403: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-08-07 14:43:22.617555: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
data_loader = DataLoader_ts(     data_file                  = paths['data_file'],
                                 temporal_data_file         = paths['temporal_data_file'],
                                 superfeature_mapping_file  = paths['superfeature_mapping_file'],
                                 problem_file               = paths['problem_file'],
                                 afa_problem_files          = paths['afa_problem_files'], 
                                 miss_model_files           = paths['miss_model_files'], 
                                 folds_file                 = paths['folds_file'] )
dataset = data_loader.load() 

Padding sequences: 100%|██████████| 100/100 [00:00<00:00, 1973.54it/s]
Padding sequences: 100%|██████████| 100/100 [00:00<00:00, 2285.23it/s]
Padding sequences: 100%|██████████| 100/100 [00:00<00:00, 1665.85it/s]


## Define MI model

In [7]:
from afa.data_modelling.missingness.multiple_imputation.multiple_imputation_model_ts import MultipleImputationModel_ts

mi_model = MultipleImputationModel_ts(  name                         = mi_model_params['name'], 
                                        m_graph                      = dataset.miss_model.m_graph, 
                                        superfeature_mapping         = dataset.superfeature_mapping,
                                        target_superfeature_names    = dataset.afa_problem.target_superfeature_names,
                                        model_params                 = mi_model_params,
                                        directory                    = mi_model_params['directory'])


Global seed set to 42


{'batch_size': 128,
 'ckpt_path': None,
 'data_missingness': 0.6,
 'data_mode': 'no_simulation',
 'dataloader_params': {'batch_size': 100, 'drop_last': True, 'shuffle': False},
 'dataset_name': 'toydataset_50000',
 'dataset_params': {'device': 'auto',
                    'missingness_rate': 0.3,
                    'missingness_value': 'nan'},
 'directory': '../../../data/ts/miiv_test/MCAR_1/mi_models/mi_simple/',
 'lr': 0.01,
 'model_type': 'gaussian_process',
 'model_weights_save_path': './model_weights',
 'num_epochs': 10,
 'num_kernels': 10,
 'num_tasks': 49,
 'rank': 4,
 'sample_tp': 0.4,
 'sample_tp_interval': [0.3, 0.8],
 'task_names': ['Noise', 'Trend', 'Seasonality', 'Trend + Seasonality'],
 'trainer_params': {'accelerator': 'auto',
                    'devices': 1,
                    'fast_dev_run': False,
                    'max_epochs': 100,
                    'num_sanity_val_steps': 0,
                    'profiler': None,
                    'wandb_logger': False}}
Cre

GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


## Train MI model 

In [12]:
mi_model.fit(dataset, fold = 0, train_split = 'train', valid_split = 'val', fit_again = True)

Unloading trained model and reinstantiating new model and trainer...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name     | Type                       | Params
--------------------------------------------------------
0 | mae_loss | L1Loss                     | 0     
1 | mgp      | HadamardGP                 | 2.6 K 
2 | mll      | ExactMarginalLogLikelihood | 2.6 K 
--------------------------------------------------------
2.6 K     Trainable params
0         Non-trainable params
2.6 K     Total params
0.010     Total estimated model params size (MB)


Training: 0it [00:00, ?it/s]

`Trainer.fit` stopped: No training batches.


Fit finished.
{'batch_size': 128,
 'ckpt_path': None,
 'data_missingness': 0.6,
 'data_mode': 'no_simulation',
 'dataloader_params': {'batch_size': 100, 'drop_last': True, 'shuffle': False},
 'dataset_name': 'toydataset_50000',
 'dataset_params': {'device': 'auto',
                    'missingness_rate': 0.0,
                    'missingness_value': 'nan'},
 'directory': '../../../data/ts/miiv_test/MCAR_1/mi_models/mi_simple/',
 'lr': 0.01,
 'model_type': 'gaussian_process',
 'model_weights_save_path': './model_weights',
 'num_epochs': 10,
 'num_kernels': 10,
 'num_tasks': 49,
 'rank': 4,
 'sample_tp': 0.4,
 'sample_tp_interval': [0.3, 0.8],
 'task_names': ['Noise', 'Trend', 'Seasonality', 'Trend + Seasonality'],
 'trainer_params': {'accelerator': 'auto',
                    'devices': 1,
                    'fast_dev_run': False,
                    'max_epochs': 100,
                    'num_sanity_val_steps': 0,
                    'profiler': None,
                    'wandb_logger

## Create multiple imputed dataset

In [13]:
from afa.data_modelling.missingness.multiple_imputation.multiple_imputed_dataset_ts import MultipleImputedDataset_ts

In [14]:
mi_results = mi_model.predict(dataset, n_samples = 5)

# create an mi_dataset out of the generated imputations
mi_dataset = MultipleImputedDataset_ts(  dataset = dataset, model = mi_model, results = mi_results) 

Batch sampling:   0%|          | 0/100 [00:00<?, ?it/s]

## Evaluate imputation model on ground truth dataset

In [None]:
# TO DO

## Save MI dataset

In [12]:
mi_dataset.save( model_dir = mi_model_dir) 