# (0) Generate a synthetic time-series dataset

Generate a synthetic dataset with:
- static features X
- temporal features X_ts
- label Y

Note: the data is stored in the folder `../../../data/ts/<dataset_name>/fully_observed/`, relative to this directory.

In [1]:
# Reload imported modules automatically before each cell is executed,
# so edits to the project code take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Define paths 

In [2]:
# Name of the dataset to generate; also used as the sub-folder and file-name prefix below.
dataset_name = "synthetic_1"

In [3]:
# Directory (relative to this notebook) that holds the fully observed data of the chosen dataset.
data_dir = f"../../../data/ts/{dataset_name}/fully_observed/"

## Data Generation

In [4]:
from afa.data_modelling.datasets.synthetic_data_generation.data_generator_ts import DataGenerator_ts

In [5]:
# Configure the synthetic time-series generator.
data_generator = DataGenerator_ts(
    # where the generated files go and how they are named
    data_dir=data_dir,
    dataset_name=dataset_name,
    # dataset size
    n_datapts=100,
    test_size=0,  # presumably: no hold-out test split -- TODO confirm against DataGenerator_ts
    # dataset shape
    n_features_static=1,
    n_features_ts=4,
    n_timepts=5,
    n_classes=2,
    # on-disk format
    table_style="EAV",
    compression='gzip',
)

In [6]:
# Run the generator; it returns the static and temporal frames for the train and test splits.
(
    df_static_train,
    df_ts_train,
    df_static_test,
    df_ts_test,
) = data_generator.generate_data()

## Load data
Check that the data was saved correctly and can be loaded.

In [7]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-02-19 09:02:27.527285: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-19 09:02:27.945025: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-19 09:02:27.945041: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-19 09:02:28.974258: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [8]:
# Files to load back: gzip-compressed CSVs for the static table and the temporal (EAV) table.
data_file = f"{data_dir}{dataset_name}_static.csv.gz"
temporal_data_file = f"{data_dir}{dataset_name}_ts_eav.csv.gz"
superfeature_mapping_file = None  # no superfeature mapping file is supplied

data_loader = DataLoader_ts(
    data_file=data_file,
    temporal_data_file=temporal_data_file,
    superfeature_mapping_file=superfeature_mapping_file,
)

# Read the files into a dataset object.
dataset = data_loader.load()

In [9]:
# Display the pandas DataFrame holding the temporal (time-series) data:
dataset.temporal_data

variable,id,time,X0_ts,X1_ts,X2_ts,X3_ts,Y_ts
0,0,0,0.155886,0.514312,-1.998231,0.397823,0.0
1,0,1,0.619534,0.573113,-1.517186,-0.768024,1.0
2,0,2,-2.128794,0.511745,-0.106982,-0.215193,1.0
3,0,3,-2.573879,-0.061686,-0.016590,0.935965,1.0
4,0,4,1.746255,-0.176698,0.575739,0.893275,0.0
...,...,...,...,...,...,...,...
995,99,5,0.503500,-1.622758,-0.889544,-0.219066,1.0
996,99,6,-1.473449,-1.312907,-1.315638,-0.107722,1.0
997,99,7,1.201455,-0.818499,-1.196366,-0.159622,0.0
998,99,8,-2.164363,-0.383707,-0.993903,-0.119783,1.0


## Define problem
Check that the problem definition works; see the `preparation01` tutorial for more details.

In [10]:
from afa.data_modelling.problem.problem_ts import ProblemMaker_ts

In [11]:
# Problem specification
label_name = 'Y_ts'
problem = 'online'
treatment = None
max_seq_len = 10

problem_maker = ProblemMaker_ts(
    problem=problem,
    label=[label_name],
    treatment=treatment,
    max_seq_len=max_seq_len,
)

# NOTE: this rebinds `dataset`; re-running this cell on its own passes the
# already-transformed dataset back into fit_transform.
dataset = problem_maker.fit_transform(dataset)

100%|███████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 1139.94it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 975.06it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 924.11it/s]


In [12]:
# Check that features, superfeatures, labels, etc. have been assigned correctly:
dataset.feature_name

{'temporal': ['X0_ts', 'X1_ts', 'X2_ts', 'X3_ts'],
 'data': ['X0', 'X1', 'Y'],
 'treatment': None,
 'label': ['Y_ts'],
 'super_data': ['X0', 'Y', 'X1'],
 'super_temporal': ['X0_ts', 'X1_ts', 'X3_ts', 'X2_ts']}

## Explain the synthetic data generation process
Write a report describing the data generation process to the data directory, in Markdown or LaTeX format.

In [13]:
# Base path (without extension) of the report, placed next to the generated data.
explanation_file = data_dir + 'synthetic_data_report'

# Write the report as Markdown.
data_generator.explain(file=f"{explanation_file}.md", format='markdown')

# LaTeX alternative:
# data_generator.explain(file=f"{explanation_file}.tex", format='latex')