# Train AFA agent
The AFA problem involves training an AFA agent that recommends which features to acquire. 
In this notebook we train and save such an AFA agent, together with the AFA dataset created from it.

## Define paths

In [1]:
# Dataset to work on.
dataset_name = "synthetic_1"

# Name of the missingness scenario.
miss_scenario = 'MCAR_1'

# Names of the agent and of the predictor model.
agent_name, predictor_model_name = 'DQN', 'logistic_regression'

In [2]:
# Input locations: every file lives below the dataset/scenario data directory.
data_dir = f"../../../data/ts/{dataset_name}/{miss_scenario}/"
data_file = f"{data_dir}{dataset_name}_{miss_scenario}_static.csv.gz"
temporal_data_file = f"{data_dir}{dataset_name}_{miss_scenario}_ts_eav.csv.gz"
superfeature_mapping_file = f"{data_dir}superfeatures.csv"
problem_file = f"{data_dir}problem/problem.yaml"
afa_problem_files = {
    'action_file': f"{data_dir}afa_problem/acquisition_actions.csv",
    'prediction_cost_file': f"{data_dir}afa_problem/prediction_cost.csv",
}
folds_file = f"{data_dir}folds/fold_list.hkl"

# Output locations (where the models are saved), one sub-directory per model name.
agent_dir = f"{data_dir}afa_agents/{agent_name}/"
predictor_model_dir = f"{data_dir}predictor_models/{predictor_model_name}/"

# Reporting.
explanation_file = f"{agent_dir}reports/model_report"

## Imports

In [3]:
%load_ext autoreload
%autoreload 2

import sys
import os

# Put the afa directory first on the module search path so its modules can be imported.
afa_root = os.path.abspath('../../afa')
sys.path.insert(0, afa_root)

## Load dataset with missingness 
First, we load the dataset.

This includes loading: 
- superfeature mapping
- problem
- afa_problem 
- missingness_model

In [4]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-02-13 13:28:26.278709: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-13 13:28:26.455813: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-13 13:28:26.455839: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-13 13:28:27.291501: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [5]:
# Gather every input file the loader takes, then build the loader and read the dataset.
loader_inputs = {
    'data_file': data_file,
    'temporal_data_file': temporal_data_file,
    'superfeature_mapping_file': superfeature_mapping_file,
    'problem_file': problem_file,
    'afa_problem_files': afa_problem_files,
    'folds_file': folds_file,
}
data_loader = DataLoader_ts(**loader_inputs)
dataset = data_loader.load()

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 709.86it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 932.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 822.97it/s]


In [6]:
# Both stages of the predictor use the same checkpoint directory.
predictor_checkpoint_dir = predictor_model_dir + 'checkpoints/'

# Imputer stage: a 'simple_imputer' in 'imputation' mode.
imputer_settings = {
    'base_model_type': 'simple_imputer',
    'mode': 'imputation',
    'checkpoint_dir': predictor_checkpoint_dir,
}

# Predictor stage: an 'ann' in 'classification' mode, configured with 1 layer and 1 unit.
predictor_settings = {
    'base_model_type': 'ann',
    'mode': 'classification',
    'units': 1,
    'layers': 1,
    'learning_rate': 0.01,
    'batch_size': 128,
    'epochs': 100,
    'checkpoint_dir': predictor_checkpoint_dir,
}

# Full predictor specification: the two stages combined under 'ImputeThenRegress'.
predictor_model_params = {
    'name': predictor_model_name,
    'base_model_params': {
        'base_model_type': 'ImputeThenRegress',
        'imputer_params': imputer_settings,
        'predictor_params': predictor_settings,
    },
}

In [7]:
# define agent
agent_model_params =  {                       
                    'num_iterations'              : 10000,
                    'initial_collect_steps'       : 50,
                    'collect_steps_per_iteration' : 1, 
                    'replay_buffer_max_length'    : 100000 ,
                    'batch_size'                  : 64,
                    'learning_rate'               : 1e-3,
                    'train_steps_per_iteration'   : 1, 
                    'log_interval'                : 200,
                    'num_eval_episodes'           : 50,
                    'eval_interval'               : 500, 
                    'verbose'                     : False,
                    'checkpoint_dir'              : agent_dir + 'checkpoints/' 
                      }               

In [8]:
# define afa_agent
afa_agent_params = {
    'agent_name' : agent_name,
    'agent_type' : 'dqn_agent',
    'predictor_params' : predictor_model_params, 
    'agent_params' : agent_model_params
}

## Initialize Agent 

In [9]:
from afa_models.afa_agents.utils_ts import define_afa_agent_ts

In [10]:
# Build the agent; name and type are read back from the same parameter dict
# that is passed in whole as afa_agent_params.
afa_agent = define_afa_agent_ts(
    agent_name=afa_agent_params['agent_name'],
    agent_type=afa_agent_params['agent_type'],
    afa_agent_params=afa_agent_params,
)

Loading already trained weights for SimpleImputer...
Loading already trained weights for ann...
Load DQN Agent policy... 


2023-02-13 13:28:32.429974: I tensorflow/compiler/xla/stream_executor/cuda/cuda_gpu_executor.cc:967] could not open file to read NUMA node: /sys/bus/pci/devices/0000:01:00.0/numa_node
Your kernel may have been built without NUMA support.
2023-02-13 13:28:32.430185: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-13 13:28:32.430283: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublas.so.11'; dlerror: libcublas.so.11: cannot open shared object file: No such file or directory
2023-02-13 13:28:32.430347: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcublasLt.so.11'; dlerror: libcublasLt.so.11: cannot open shared object file: No such file or directory
2023-02-13 13:28:32.430428: W tensorf

## Train agent

In [11]:
# Fit on fold 0, with the "train" split for training and "val" for validation.
# NOTE(review): fit_again=True presumably forces retraining even though saved
# weights were loaded when the agent was defined — confirm against afa_agent.fit.
fit_options = dict(fold=0, train_split="train", valid_split="val", fit_again=True)
afa_agent.fit(dataset, **fit_options)

Start training DQN agent
Agent was build .... 
Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.


Instructions for updating:
Use `tf.data.Dataset.counter(...)` instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Use `as_dataset(..., single_deterministic_pass=False) instead.


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
Lambda fuctions will be no more assumed to be used in the statement where they are used, or at least in the same block. https://github.com/tensorflow/tensorflow/issues/56089


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


Instructions for updating:
back_prop=False is deprecated. Consider using tf.stop_gradient instead.
Instead of:
results = tf.foldr(fn, elems, back_prop=False)
Use:
results = tf.nest.map_structure(tf.stop_gradient, tf.foldr(fn, elems))


step = %d, loss = %f 500 tf.Tensor(36.249916, shape=(), dtype=float32)
step = %d, loss = %f 1000 tf.Tensor(26.122139, shape=(), dtype=float32)
step = %d, loss = %f 1500 tf.Tensor(29.498661, shape=(), dtype=float32)
step = %d, loss = %f 2000 tf.Tensor(43.60396, shape=(), dtype=float32)




step = %d, loss = %f 2500 tf.Tensor(32.273056, shape=(), dtype=float32)




step = %d, loss = %f 3000 tf.Tensor(43.690884, shape=(), dtype=float32)
step = %d, loss = %f 3500 tf.Tensor(24.93745, shape=(), dtype=float32)
step = %d, loss = %f 4000 tf.Tensor(28.531643, shape=(), dtype=float32)
step = %d, loss = %f 4500 tf.Tensor(49.08983, shape=(), dtype=float32)
step = %d, loss = %f 5000 tf.Tensor(41.01908, shape=(), dtype=float32)
step = %d, loss = %f 5500 tf.Tensor(39.588512, shape=(), dtype=float32)
step = %d, loss = %f 6000 tf.Tensor(51.904404, shape=(), dtype=float32)
step = %d, loss = %f 6500 tf.Tensor(35.758343, shape=(), dtype=float32)
step = %d, loss = %f 7000 tf.Tensor(58.620575, shape=(), dtype=float32)
step = %d, loss = %f 7500 tf.Tensor(35.57255, shape=(), dtype=float32)
step = %d, loss = %f 8000 tf.Tensor(51.36346, shape=(), dtype=float32)
step = %d, loss = %f 8500 tf.Tensor(45.50667, shape=(), dtype=float32)
step = %d, loss = %f 9000 tf.Tensor(46.290047, shape=(), dtype=float32)
step = %d, loss = %f 9500 tf.Tensor(62.68849, shape=(), dtype=float32)



INFO:tensorflow:Assets written to: ../../../data/ts/synthetic_1/MCAR_1/afa_agents/DQN/checkpoints/DQNPolicy/assets


INFO:tensorflow:Assets written to: ../../../data/ts/synthetic_1/MCAR_1/afa_agents/DQN/checkpoints/DQNPolicy/assets


## Plot training 

In [12]:
# TensorBoard is pointed at the agent's checkpoint directory.
logdir = afa_agent_params['agent_params']['checkpoint_dir']
%load_ext tensorboard
# $logdir expands the Python variable above inside the magic's arguments.
%tensorboard --logdir=$logdir --port=3018

## Create afa dataset 

In [13]:
from afa_datasets.afa_dataset_ts import AFADataset_ts 

In [14]:
# Run the trained agent on the dataset with n_samples = 1.
# Alternative call kept for reference (explicit ids / fold / split arguments):
# afa_results = afa_agent.predict(dataset, ids = None, fold = 0, split = "val",  n_samples = 1)
afa_results = afa_agent.predict(dataset, n_samples=1)

# Combine the dataset, the agent and its results into an AFA dataset.
afa_dataset = AFADataset_ts(
    dataset=dataset,
    model=afa_agent,
    results=afa_results,
)



### Run test for consistency check 

In [22]:
from afa_datasets.testing.test_afa_dataset_ts import test_afa_dataset_ts
# Consistency check on the AFA dataset; the recorded output prints the costs from
# the AFA dataset, from Metrics and from AverageReturnMetric, then "Test passed".
test_afa_dataset_ts(afa_dataset)

From AFA dataset: 
cost:  31.82
cost_mc:  26.6
cost_acq:  5.22
From Metrics: 
cost:  31.82
cost_mc:  26.6
cost_acq:  5.22
From AverageReturnMetric: 
cost:  31.82
Test passed


In [23]:
# Save the AFA dataset, using the agent directory as model_dir.
afa_dataset.save(model_dir=agent_dir)

## Test loading afa dataset 

In [24]:
from afa_datasets.afa_data_loader.afa_data_loader_ts import AFADataLoader_ts

In [25]:
# Loader for the saved AFA dataset: it is given the results file in the agent
# directory, the base data loader and the AFA agent parameters.
augmented_data_file = agent_dir + 'results.hkl'
afa_data_loader = AFADataLoader_ts(
    augmented_data_file=augmented_data_file,
    data_loader=data_loader,
    model_params=afa_agent_params,
)

In [26]:
# NOTE(review): this rebinds afa_dataset, so the object built earlier in the
# notebook is replaced by the one returned from the loader.
afa_dataset = afa_data_loader.load() 

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 786.91it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 665.02it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 814.41it/s]


Loading already trained weights for SimpleImputer...
Loading already trained weights for ann...
Load DQN Agent policy... 


In [27]:
# Show which result fields the reloaded AFA dataset holds.
afa_dataset.results.keys()

dict_keys(['ids', 'superR_hat', 'label_pred', 'cost_acq', 'cost_mc', 'cost'])

In [28]:
# Fetch data with fold and split both passed as None and n_samples = 1
# (presumably None means "no restriction" — confirm against get_data).
data = afa_dataset.get_data(fold = None, split = None, n_samples = 1) 
# Alternative kept for reference: fold 0, 'val' split.
# data = afa_dataset.get_data(fold = 0, split = 'val', n_samples = 1) 

In [None]:
# NOTE(review): calls an underscore-prefixed (private by convention) method, and
# this cell has no execution count, i.e. it was not run in the saved notebook.
afa_dataset._compute_costs()