
# (0) Generate a synthetic time-series dataset

Generate a synthetic dataset with 
- features X
- temporal features X_ts
- label Y 

Note: data is stored in the folder "../../../data/ts/" + dataset_name + "/fully_observed/", relative to this directory.

In [1]:
# Reload imported modules automatically before each cell is executed,
# so edits to the project source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

### Define paths 

In [2]:
# Name of the dataset to work on; it is used to build the data folder and the file names.
dataset_name = "synthetic_1"

In [3]:
# Folder holding the fully observed version of the dataset (path is relative to this notebook).
data_dir = "".join(["../../../data/ts/", dataset_name, "/fully_observed/"])

## Data Generation

In [4]:
from afa.data_modelling.datasets.synthetic_data_generation.data_generator_ts import DataGenerator_ts

In [5]:
# Configure the synthetic generator: 100 data points, 1 static feature,
# 4 temporal features over 5 time points and 2 label classes.
# The temporal table uses the "EAV" style and gzip compression is requested.
# test_size=0 presumably means that no held-out test split is created (TODO confirm).
data_generator = DataGenerator_ts(
    data_dir=data_dir,
    dataset_name=dataset_name,
    n_datapts=100,
    test_size=0,
    n_features_static=1,
    n_features_ts=4,
    n_timepts=5,
    table_style="EAV",
    compression='gzip',
    n_classes=2,
)

In [6]:
# Run the generator; it returns the static and temporal tables for the train and test parts.
# Presumably the data is also written to `data_dir` here (it is loaded from there below) - TODO confirm.
(
    df_static_train,
    df_ts_train,
    df_static_test,
    df_ts_test,
) = data_generator.generate_data()

## Load data
Check if data was saved correctly and can be loaded 

In [5]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-03-02 13:49:03.941535: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-03-02 13:49:04.037439: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-03-02 13:49:04.037454: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-03-02 13:49:15.635760: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [6]:
# Input files: the static table and the temporal table, both gzip-compressed CSV files.
data_file = data_dir + dataset_name + '_static.csv.gz'
temporal_data_file = data_dir + dataset_name + '_ts_eav.csv.gz'
superfeature_mapping_file = None  # no superfeature mapping file is supplied

# Build the loader from the file locations above.
data_loader = DataLoader_ts(
    data_file=data_file,
    temporal_data_file=temporal_data_file,
    superfeature_mapping_file=superfeature_mapping_file,
)

# Read the files into a dataset object.
dataset = data_loader.load()

In [7]:
# Display the pandas dataframe holding the temporal data (one row per id and time point).
dataset.temporal_data

variable,id,time,X0_ts,X1_ts,X2_ts,X3_ts,Y_ts
0,0,0,0.429527,0.962485,-0.910083,-0.304410,0.0
1,0,1,-0.266579,1.761697,-1.155467,0.498474,0.0
2,0,2,-0.412245,-0.651917,-0.357704,-0.374634,0.0
3,0,3,-1.080314,-0.472513,1.791534,0.606205,0.0
4,0,4,-2.335514,-0.189049,-0.197982,-0.462585,1.0
...,...,...,...,...,...,...,...
495,99,0,1.647702,-1.489371,-1.644126,0.350373,1.0
496,99,1,-1.721988,-0.364009,-0.585702,-0.533847,1.0
497,99,2,-1.537833,-0.418374,-0.577308,-0.355349,1.0
498,99,3,-1.910697,0.269552,-1.482798,-0.595961,1.0


## Define problem
Check if problem definition works, see more in the `preparation01` tutorial

In [8]:
from afa.data_modelling.problem.problem_ts import ProblemMaker_ts

In [9]:
# Problem specification
label_name = 'Y_ts'   # column used as the label
problem = 'online'
treatment = None      # no treatment variable is specified
max_seq_len = 10      # presumably sequences are padded up to this length - TODO confirm

problem_maker = ProblemMaker_ts(
    problem=problem,
    label=[label_name],
    treatment=treatment,
    max_seq_len=max_seq_len,
)

# NOTE: `dataset` is rebound to the transformed dataset, so this cell should be
# run only once after loading the data.
dataset = problem_maker.fit_transform(dataset)

Padding sequence: 100%|██████████| 100/100 [00:00<00:00, 898.54it/s]
Padding sequence: 100%|██████████| 100/100 [00:00<00:00, 876.62it/s]
Padding sequence: 100%|██████████| 100/100 [00:00<00:00, 921.75it/s]


In [10]:
# Check that features / superfeatures / labels etc. have been assigned correctly.
dataset.feature_name

{'temporal': ['X0_ts', 'X1_ts', 'X2_ts', 'X3_ts'],
 'data': ['X0', 'Y'],
 'treatment': None,
 'label': ['Y_ts'],
 'super_data': ['X0', 'Y'],
 'super_temporal': ['X3_ts', 'X2_ts', 'X0_ts', 'X1_ts']}

## Explain the synthetic data generation process
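Store a description of the generation process in LaTeX / Markdown format. The report is written to `data_dir`, next to the generated data. This step needs the `data_generator` object created in the "Data Generation" section, so that section must have been run in the current kernel.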

In [11]:
# Write the explanation of the generation process as a markdown file in `data_dir`.
# `data_generator` must exist in the current kernel (it is created in the "Data Generation"
# section); otherwise this cell fails with a NameError.
explanation_file = data_dir + 'synthetic_data_report'
data_generator.explain(file=explanation_file + '.md', format='markdown')
# LaTeX alternative:
# data_generator.explain(file=explanation_file + '.tex', format='latex')

NameError: name 'data_generator' is not defined