
# (0) Generate a synthetic static dataset

Generate a synthetic dataset with 
- features X
- temporal features X_ts
- label Y 

Note: data is stored in the folder "../../../data/ts/" + dataset relativ to this directory

In [1]:
%load_ext autoreload
%autoreload 2

### Define paths 

In [2]:
# which dataset to work on 
dataset_name   = "synthetic_2"

In [3]:
# data specifications 
data_dir            = "../../../data/ts/" + dataset_name + "/fully_observed/" 

## Data Generation

In [4]:
from afa.data_modelling.datasets.synthetic_data_generation.data_generator_ts import DataGenerator_ts

In [5]:
data_generator = DataGenerator_ts(
                        data_dir           = data_dir,  
                        dataset_name       = dataset_name , 
                        n_datapts          = 2000, 
                        test_size          = 0,
                        n_features_static  = 1,
                        n_features_ts      = 4, 
                        n_timepts          = 50,
                        table_style        = "EAV",
                        compression        = 'gzip',
                        n_classes          = 2)

In [6]:
df_static_train, df_ts_train, df_static_test, df_ts_test  = data_generator.generate_data()

## Load data
Check if data was saved correctly and can be loaded 

In [7]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-03-21 11:17:05.822448: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-21 11:17:05.910614: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-21 11:17:05.910630: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-21 11:17:20.856627: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [8]:
data_file          = data_dir + dataset_name + '_static.csv.gz' 
temporal_data_file = data_dir + dataset_name + '_ts_eav.csv.gz' 
superfeature_mapping_file = None

data_loader = DataLoader_ts( data_file                  = data_file,
                             temporal_data_file         = temporal_data_file,
                             superfeature_mapping_file  = superfeature_mapping_file )

dataset = data_loader.load() 

In [9]:
# visualize pandas dataframe for static data: 
dataset.temporal_data

variable,id,time,X0_ts,X1_ts,X2_ts,X3_ts,Y_ts
0,0,0,-0.504205,0.713320,0.543657,0.006903,0.0
1,0,1,-1.078681,0.547127,0.839031,-0.176253,0.0
2,0,2,0.446764,-0.208496,0.550235,-0.832454,0.0
3,0,3,1.347463,0.181738,0.630574,0.063319,0.0
4,0,4,0.027169,0.674259,-0.109590,-0.059911,0.0
...,...,...,...,...,...,...,...
99995,1999,45,-0.526578,-1.155559,0.323330,0.003595,1.0
99996,1999,46,0.687887,0.100261,1.642386,0.423817,0.0
99997,1999,47,0.782064,1.778468,-0.406455,-1.452343,0.0
99998,1999,48,-0.594470,0.817426,0.168355,-0.549674,0.0


## Define problem
Check if problem definition works, see more in the `preparation01` tutorial

In [10]:
from afa.data_modelling.problem.problem_ts import ProblemMaker_ts

In [11]:
# define problem specifications
label_name = 'Y_ts'
problem = 'online'
treatment = None
max_seq_len = 50

problem_maker = ProblemMaker_ts(    problem    = problem, 
                                    label      = [label_name], 
                                    treatment  = treatment, 
                                    max_seq_len = max_seq_len)

dataset  = problem_maker.fit_transform(dataset)

100%|██████████| 2000/2000 [00:01<00:00, 1674.22it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1929.33it/s]
100%|██████████| 2000/2000 [00:01<00:00, 1794.32it/s]


In [12]:
# check if features/superfeatures/labels etc have been correctly assigned 
dataset.feature_name

{'temporal': ['X0_ts', 'X1_ts', 'X2_ts', 'X3_ts'],
 'data': ['X0', 'Y'],
 'treatment': None,
 'label': ['Y_ts'],
 'super_data': ['Y', 'X0'],
 'super_temporal': ['X3_ts', 'X2_ts', 'X0_ts', 'X1_ts']}

## Explain the synthetic data generation process
Store information in reports folder in latex / markdown format

In [13]:
explanation_file = data_dir + 'synthetic_data_report'
data_generator.explain(file= explanation_file + '.md'  , format='markdown')
# data_generator.explain(file= explanation_file + '.tex'  , format='latex')

FileNotFoundError: [Errno 2] No such file or directory: 'pandoc'