# (3.1) Create AFA dataset on multiple-imputed dataset
In this notebook we run the AFA agent on a multiple-imputed dataset and save the resulting AFA dataset.

Note: the AFA agent (and its predictor) must already be trained before running this notebook.

In [None]:
# Enable IPython's autoreload extension so that edits to imported project
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

## Define paths

Paths for data

In [None]:
from afa.configurations.utils_static import specify_default_paths_static
# Dataset to work on.
dataset_name = "synthetic_1"

# Name of the missingness scenario.
miss_scenario = 'MCAR_1'

# Derive the default path locations for this dataset / scenario combination
# (override individual entries of `paths` manually if needed).
paths = specify_default_paths_static(
    dataset_name=dataset_name,
    miss_scenario=miss_scenario,
)

Paths for model

In [None]:
# Names of the agent and of the predictor it relies on.
agent_name = 'DQN'
predictor_model_name = 'logistic_regression'

# Directories of the agent and the predictor model, both located below the
# dataset's data directory: <data_dir>afa_agents/<agent_name>/ and
# <data_dir>predictor_models/<predictor_model_name>/.
agent_dir = f"{paths['data_dir']}afa_agents/{agent_name}/"
predictor_model_dir = f"{paths['data_dir']}predictor_models/{predictor_model_name}/"

In [None]:
# Multiple-imputation model whose results are used as input:
# <data_dir>mi_models/<mi_model_name>/.
mi_model_name = 'mi_simple'
mi_model_dir = f"{paths['data_dir']}mi_models/{mi_model_name}/"

# The AFA dataset is named after the MI model it was created from.
afa_dataset_name = mi_model_name

## Load multiple imputed dataset 
First, we load the dataset.

Includes loading: 
- superfeature mapping
- problem
- afa_problem 
- missingness_model

The dataset has potentially multiple imputations for each missing value. 

In [None]:
from afa.data_modelling.datasets.data_loader.data_loader_static import DataLoader_static
from afa.data_modelling.missingness.multiple_imputation.multiple_imputed_data_loader.multiple_imputed_data_loader_static import MultipleImputedDataLoader_static

In [None]:
# Load the base dataset from the files listed in `paths`.
data_loader = DataLoader_static(
    data_file=paths['data_file'],
    superfeature_mapping_file=paths['superfeature_mapping_file'],
    problem_file=paths['problem_file'],
    afa_problem_files=paths['afa_problem_files'],
    folds_file=paths['folds_file'],
)
dataset = data_loader.load()

In [None]:
# Load the multiple-imputed dataset: the stored imputation results
# ('results.hkl' in the MI model directory) are attached to the base dataset.
augmented_data_file = mi_model_dir + 'results.hkl'

# No MI model parameters are passed to the loader.
mi_model_params = None

mi_data_loader = MultipleImputedDataLoader_static(
    augmented_data_file=augmented_data_file,
    dataset=dataset,
    model_params=mi_model_params,
)
mi_dataset = mi_data_loader.load()

In [None]:
# Predictor configuration (the predictor itself has to be pretrained).
# NOTE(review): a one-layer, one-unit 'ann' presumably realises the
# 'logistic_regression' named above -- confirm against the model code.
predictor_model_params = {
    'name': predictor_model_name,
    'directory': predictor_model_dir,
    'base_model_params': {
        'model_type': 'ImputeThenRegress',
        # first stage: impute missing values
        'imputer_params': {
            'model_type': 'simple_imputer',
            'mode': 'imputation',
        },
        # second stage: predict on the imputed data
        'predictor_params': {
            'model_type': 'ann',
            'mode': dataset.problem,
            'units': 1,
            'layers': 1,
            'learning_rate': 0.01,
            'batch_size': 128,
            'epochs': 100,
        },
    },
}

In [None]:
# Hyperparameters handed to the agent (see 'agent_params' in afa_agent_params).
# NOTE(review): log_interval (200) and eval_interval (500) are both larger than
# num_iterations (100); if these intervals are counted in iterations, no
# logging or evaluation would ever trigger -- confirm this is intended.
agent_model_params = dict(
    num_iterations=100,
    initial_collect_steps=50,
    collect_steps_per_iteration=1,
    replay_buffer_max_length=100_000,
    batch_size=64,
    learning_rate=1e-3,
    train_steps_per_iteration=1,
    log_interval=200,
    num_eval_episodes=50,
    eval_interval=500,
    verbose=False,
)

In [None]:
# Full AFA agent configuration: bundles the agent's identity and directory
# with the predictor and agent parameter dictionaries defined above.
afa_agent_params = {
    'name': agent_name,
    'agent_type': 'dqn_agent',
    'directory': agent_dir,
    'predictor_params': predictor_model_params,
    'agent_params': agent_model_params,
}

## Initialize Agent 

In [None]:
from afa.afa_models.afa_agents.utils_static import define_afa_agent_static

In [None]:
# Instantiate the AFA agent from its configuration dictionary.
afa_agent = define_afa_agent_static(
    name=afa_agent_params['name'],
    agent_type=afa_agent_params['agent_type'],
    afa_agent_params=afa_agent_params,
    agent_directory=afa_agent_params['directory'],
)

## Create afa dataset 

In [None]:
from afa.afa_datasets.afa_dataset_static import AFADataset_static 

In [None]:
### WITHOUT MEMORY TRACKING
# afa_results = afa_agent.predict(dataset,  n_samples = 1) 

## create an afa_dataset out of the generated afa_results
# afa_dataset = AFADataset_static(  dataset = dataset, model = afa_agent, results = afa_results) 

In [None]:
### WITH MEMORY TRACKING
import tracemalloc

# Trace Python memory allocations while the agent runs and the AFA dataset
# is built from its results.
tracemalloc.start()
try:
    # run agent
    afa_results = afa_agent.predict(mi_dataset, n_samples = 1)

    # create an afa_dataset out of the generated afa_results
    afa_dataset = AFADataset_static(dataset = mi_dataset, model = afa_agent, results = afa_results)

    # (current, peak) size in bytes of the memory blocks traced since start()
    current_memory, max_memory = tracemalloc.get_traced_memory()
finally:
    # Always stop tracing -- even if the agent run or the dataset creation
    # fails -- so that the tracing overhead does not leak into later cells.
    tracemalloc.stop()

# displaying the memory (bytes converted to MB)
print("Checking memory usage during afa dataset creation ... ")
print("current memory usage (after afa dataset creation): {} MB".format(round(current_memory/1000000))  )
print("maximum memory usage (during afa dataset creation): {} MB".format(round(max_memory/1000000))  )

In [None]:
# MI data 
# import numpy as np 
# # mi_data, mi_results = mi_dataset.get_augmented_data( ids = np.array([1,2]), n_samples = None)
# mi_data, mi_results = mi_dataset.get_augmented_data(  n_samples = 2)
# print("shape of features of mi data = {}".format(mi_data['feature'].shape))
# # AFA results with MI data
# print("shape of target_R of afa results= {}".format(afa_results['target_superR_hat'].shape))
# print("shape of ids of afa results = {}".format(afa_results['ids'].shape))
# # AFA results retrieved from AFA dataset
# afa_data, afa_data_results = afa_dataset.get_augmented_data(n_samples = 3)
# print("shape of feature from afa_dataset = {}".format(afa_data['feature'].shape))
# print("shape of ids of afa_dataset results = {}".format(afa_data_results['ids'].shape))
# AFA results retrieved from AFA dataset
# afa_data, afa_data_results = afa_dataset.get_augmented_data(n_samples = 3)
# print("shape of feature from afa_dataset = {}".format(afa_data['feature'].shape))
# print("shape of ids of afa_dataset results = {}".format(afa_data_results['ids'].shape))

### Run test for consistency check 

In [None]:
from afa.afa_datasets.testing.test_afa_dataset_static import test_afa_dataset_static
# Run the project's consistency check on the freshly created AFA dataset
# (done before the dataset is saved in the next cell).
test_afa_dataset_static(afa_dataset)

In [None]:
# Persist the AFA dataset in the agent's directory under `afa_dataset_name`.
afa_dataset.save(
    directory=agent_dir,
    afa_dataset_name=afa_dataset_name,
)

## Test loading afa dataset 

In [None]:
from afa.afa_datasets.afa_data_loader.afa_data_loader_static import AFADataLoader_static

In [None]:
# Reload the AFA dataset from disk to verify that loading works.
# NOTE(review): assumes save() wrote '<afa_dataset_name>_results.hkl' into
# agent_dir -- confirm against AFADataset_static.save.
# NOTE: `augmented_data_file` and `afa_dataset` are deliberately rebound here;
# the cells below operate on the reloaded dataset.
augmented_data_file = f"{agent_dir}{afa_dataset_name}_results.hkl"
afa_data_loader = AFADataLoader_static(
    augmented_data_file=augmented_data_file,
    dataset=mi_dataset,
    model_params=afa_agent_params,
)
afa_dataset = afa_data_loader.load()

In [None]:
# Fetch data from the reloaded AFA dataset without restricting fold, split
# or number of samples.
data = afa_dataset.get_data(fold=None, split=None, n_samples=None)
# Alternative (single fold / split):
# data = afa_dataset.get_data(fold = 0, split = 'val', n_samples = 1)

### Write report

In [None]:
# Write a markdown report about the AFA dataset next to the agent's files.
explanation_file = f"{agent_dir}{afa_dataset_name}_afa_dataset_report.md"
afa_dataset.explain(file=explanation_file, format='markdown')