# (0.1) Train multiple-imputation (MI) models
One option to resolve missingness for AFA is to use Multiple Imputation. 
This notebook trains MI models and saves the multiple imputed datasets. 

In [1]:
# Enable IPython's autoreload extension so that edits to imported modules are picked up live
%load_ext autoreload
%autoreload 2

## Define paths

In [10]:
import os

from afa.configurations.utils_ts import specify_default_paths_ts

# which dataset to work on
dataset_name   = "miiv"

# name of the missingness scenario
miss_scenario  = 'fully_observed'

# automatically specify some path locations (change paths manually if needed)
paths = specify_default_paths_ts(dataset_name = dataset_name , miss_scenario = miss_scenario)

# name of the mi_model
mi_model_name  = 'gaussian_process'

# where to save the model
mi_model_dir = paths['data_dir']  + 'mi_models' + '/' + mi_model_name + '/'

if miss_scenario == 'fully_observed':
    # The fully observed scenario reads its static and temporal parquet files from a
    # dedicated directory instead of the default locations. That directory is
    # machine-specific, so it can be overridden with the AFA_FULLY_OBSERVED_DIR
    # environment variable; when the variable is unset the original location is used.
    fully_observed_dir = os.environ.get(
        'AFA_FULLY_OBSERVED_DIR',
        '/home2/joshua.wendland/Documents/data/ts/miiv/fully_observed',
    )
    paths['data_file']          = os.path.join(fully_observed_dir, 'miiv_static.parquet')
    paths['temporal_data_file'] = os.path.join(fully_observed_dir, 'miiv_ts_wide.parquet')

    # no missingness model files are passed on in this scenario
    paths['miss_model_files'] = None

### Define model specifications

In [3]:
# Compute backend shared by the dataset and the trainer: 'cuda' or 'cpu'
device = 'cuda'

# Settings for dataset preparation (torch.DataSet class)
dataset_params = dict(
    missingness_value='nan',        # int, float or 'nan'
    missingness_rate=(0.1, 0.3),
    device=device,                  # 'cuda' or 'cpu'
)

# Settings for the dataloader (torch.DataLoader class)
# Optional entries that are currently left unset:
#   'prefetch_factor': 1   -> increase for speed up
#   'num_workers': 0       -> set higher for faster throughput
dataloader_params = dict(
    batch_size=50,
    shuffle=False,
    drop_last=True,
)

# Settings for the trainer (pytorch_lightning.Trainer class)
trainer_params = dict(
    max_epochs=10,                  # number of epochs to train
    auto_lr_find=False,
    fast_dev_run=True,
    accelerator=device,
    devices=1,
    profiler=None,                  # 'simple', 'advanced', None
    num_sanity_val_steps=1,
)

# Settings for the gp_model (GPImputer class); bundles the three configs above
gp_params = dict(
    model_type='gaussian_process',
    dataset_params=dataset_params,
    dataloader_params=dataloader_params,
    trainer_params=trainer_params,
    num_tasks=49,                   # number of tasks == number of features
    num_kernels=10,
    # 'no_simulation' or 'simulation'; with simulation a ground truth is expected to be
    # passed as well (ground truth = values for data that is missing in train dataloader)
    data_mode='no_simulation',
    # Optional, currently unset: path to the checkpoint of a trained model,
    # full path or relative to the model directory, e.g.
    # 'ckpt_path': 'best_model-v_recon_loss_target=1.10-epoch=142.ckpt'
)

# Settings for the mi_model from the AFA module (MultipleImputationModel_ts class)
mi_model_params = dict(
    name=mi_model_name,
    directory=mi_model_dir,
    base_model_params=gp_params,
)

print('Done!')

Done!


## Load dataset with missingness 
First, we load the dataset.

This includes loading:
- superfeature mapping
- problem
- afa_problem 
- missingness_model

In [4]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-04-05 12:10:26.501523: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-04-05 12:10:26.593242: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-04-05 12:10:26.593261: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-04-05 12:10:33.884767: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [11]:
# Every loader argument is named exactly like its entry in `paths`,
# so the keyword arguments are collected directly from there.
loader_inputs = {
    key: paths[key]
    for key in (
        'data_file',
        'temporal_data_file',
        'superfeature_mapping_file',
        'problem_file',
        'afa_problem_files',
        'miss_model_files',
        'folds_file',
    )
}
data_loader = DataLoader_ts(**loader_inputs)
dataset = data_loader.load(temporal_is_wide=True)

if miss_scenario == 'fully_observed':
    # No missingness model files were passed for this scenario; attach a placeholder
    # class whose `m_graph` is None so that `dataset.miss_model.m_graph` can still be read.
    dataset.miss_model = type('Dummy', (object,), {'m_graph': None})

Padding sequences: 100%|██████████| 67056/67056 [00:26<00:00, 2539.73it/s]
Padding sequences: 100%|██████████| 67056/67056 [00:32<00:00, 2061.38it/s]
Padding sequences: 100%|██████████| 67056/67056 [00:27<00:00, 2463.17it/s]


In [18]:
# Look up the 'train' split (no fold selected) and display the shape of its temporal features
train_data = dataset.get_data(fold=None, split='train')
train_data['temporal_feature'].shape

: 

: 

## Define MI model

In [16]:
from afa.data_modelling.missingness.multiple_imputation.multiple_imputation_model_ts import MultipleImputationModel_ts

# Collect the constructor arguments: name and directory come from `mi_model_params`,
# the remaining pieces are read from the loaded dataset.
mi_model_kwargs = dict(
    name=mi_model_params['name'],
    m_graph=dataset.miss_model.m_graph,
    superfeature_mapping=dataset.superfeature_mapping,
    target_superfeature_names=dataset.afa_problem.target_superfeature_names,
    model_params=mi_model_params,
    directory=mi_model_params['directory'],
)
mi_model = MultipleImputationModel_ts(**mi_model_kwargs)

Global seed set to 42


{'dataset_name': 'toydataset_50000', 'data_missingness': 0.6, 'num_kernels': 10, 'num_tasks': 49, 'rank': 4, 'data_mode': 'no_simulation', 'lr': 0.01, 'batch_size': 128, 'sample_tp': 0.4, 'sample_tp_interval': [0.3, 0.8], 'num_epochs': 10, 'model_weights_save_path': './model_weights', 'model_type': 'gaussian_process', 'dataset_params': {'missingness_value': 'nan', 'missingness_rate': (0.1, 0.3), 'device': 'cuda'}, 'dataloader_params': {'batch_size': 50, 'shuffle': False, 'drop_last': True}, 'trainer_params': {'max_epochs': 10, 'auto_lr_find': False, 'fast_dev_run': True, 'accelerator': 'cuda', 'devices': 1, 'profiler': None, 'num_sanity_val_steps': 1}, 'directory': '../../../data/ts/miiv/fully_observed/mi_models/gaussian_process/', 'mode': 'imputation', 'task_names': ['Noise', 'Trend', 'Seasonality', 'Trend + Seasonality']} 




GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
Running in `fast_dev_run` mode: will run the requested loop using 1 batch(es). Logging and checkpointing is suppressed.


## Train MI model 

In [17]:
# Fit the MI model on fold 0, using the 'train' split for training and the 'val' split for validation.
# NOTE(review): the saved output of this cell is a TypeError ("can't convert np.ndarray of type
# numpy.object_"), so the cells below were never executed -- the cause still needs to be resolved.
fit_options = dict(fold=0, train_split='train', valid_split='val', fit_again=False)
mi_model.fit(dataset, **fit_options)

TypeError: can't convert np.ndarray of type numpy.object_. The only supported types are: float64, float32, float16, complex64, complex128, int64, int32, int16, int8, uint8, and bool.

## Create multiple imputed dataset

In [None]:
from afa.data_modelling.missingness.multiple_imputation.multiple_imputed_dataset_ts import MultipleImputedDataset_ts

In [None]:
# Ask the MI model for predictions on the dataset, requesting 5 samples
n_imputations = 5
mi_results = mi_model.predict(dataset, n_samples=n_imputations)

# Build an mi_dataset from the dataset, the model and the generated imputations
mi_dataset = MultipleImputedDataset_ts(
    dataset=dataset,
    model=mi_model,
    results=mi_results,
)

## Evaluate imputation model on ground truth dataset

In [None]:
# TODO: evaluate the imputation model on the ground truth dataset (not implemented yet)

## Save MI dataset

In [None]:
# Save the MI dataset, passing the MI model directory as `model_dir`
mi_dataset.save(model_dir=mi_model_dir)