# Generate a synthetic time-series dataset

Generate a synthetic dataset with 
- features X
- temporal features X_ts
- label Y (static or temporal) 

Note: data is stored in the folder "../../../data/ts/" + dataset_name + "/fully_observed/", relative to this directory

### Define paths 

In [1]:
# Name of the dataset this notebook works on; it is used below to build
# the data directory and the file names.
dataset_name = "synthetic_1"

In [2]:
# Data locations: everything lives under the "fully_observed" folder of the
# selected dataset (the trailing slash is kept so file names can be appended).
data_dir = f"../../../data/ts/{dataset_name}/fully_observed/"
# Path of a plain `.csv` file named after the dataset.
data_file = f"{data_dir}{dataset_name}.csv"

# Reporting: base path (without extension) of the explanation files.
explanation_file = f"{data_dir}reports/synthetic_data"

### Imports

In [3]:
%load_ext autoreload
%autoreload 2

import sys
import os

# navigate to afa directory 
sys.path.insert(0, os.path.abspath('../../afa'))

from afa.data_modelling.datasets.synthetic_data_generation.data_generator_ts import DataGenerator_ts

## Data Generation

In [4]:
# Settings for the synthetic time-series generator, collected in one place
# and then unpacked as keyword arguments.
generator_settings = dict(
    # where to store the data and under which name
    data_dir=data_dir,
    dataset_name=dataset_name,
    # dataset size
    n_datapts=100,
    test_size=0,
    # number of static / time-series features and time points
    n_features_static=2,
    n_features_ts=4,
    n_timepts=10,
    # storage layout of the generated tables
    table_style="EAV",
    compression='gzip',
    # label specification
    n_classes=2,
)

data_generator = DataGenerator_ts(**generator_settings)

In [5]:
# Run the generator and unpack its four results:
# static and time-series parts, each for the train and the test split.
(
    df_static_train,
    df_ts_train,
    df_static_test,
    df_ts_test,
) = data_generator.generate_data()

## Load data
Check if data was saved correctly and can be loaded 

In [6]:
from afa.data_modelling.datasets.data_loader.data_loader_ts import DataLoader_ts

2023-02-05 20:27:47.258788: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-02-05 20:27:47.410337: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-02-05 20:27:47.410355: I tensorflow/compiler/xla/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.
2023-02-05 20:27:48.167130: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-

In [7]:
# Gzip-compressed CSV files to read back: the static table and the
# EAV-style time-series table of the selected dataset.
data_file = f"{data_dir}{dataset_name}_static.csv.gz"
temporal_data_file = f"{data_dir}{dataset_name}_ts_eav.csv.gz"

# No superfeature mapping file is supplied in this run
# (alternative kept for reference: data_dir + 'superfeaturs.csv').
superfeature_mapping_file = None

data_loader = DataLoader_ts(
    data_file=data_file,
    temporal_data_file=temporal_data_file,
    superfeature_mapping_file=superfeature_mapping_file,
)

# Load both files into a single dataset object.
dataset = data_loader.load()

In [8]:
# visualize pandas dataframe for static data:
# NOTE(review): this cell displays `dataset.temporal_data`, so the table shown
# below is the time-series table (columns id, time, X*_ts, Y_ts), not the
# static one. Presumably a static-data attribute of `dataset` was intended
# here -- confirm the attribute name against DataLoader_ts and update.
dataset.temporal_data

variable,id,time,X0_ts,X1_ts,X2_ts,X3_ts,Y_ts
0,0,0,0.040214,0.736119,0.903988,-0.271101,0.0
1,0,1,-0.492273,-0.268303,-0.921176,-0.812881,1.0
2,0,2,0.309823,-0.386123,1.315453,-0.241455,0.0
3,0,3,1.871679,0.845866,1.473744,0.437033,0.0
4,0,4,1.224608,0.097955,0.098419,-0.635451,0.0
...,...,...,...,...,...,...,...
995,99,5,0.346083,0.991062,-0.432751,1.140907,0.0
996,99,6,-1.823515,0.312444,0.004809,0.671610,0.0
997,99,7,0.035619,-0.076120,-0.245551,0.130265,0.0
998,99,8,0.850422,1.200439,0.028020,-1.400394,0.0


In [9]:
# visualize pandas dataframe for time-series data:
# (the bare trailing expression is what makes the notebook render the table;
# the rendered columns are id, time, X0_ts..X3_ts and Y_ts)
dataset.temporal_data

variable,id,time,X0_ts,X1_ts,X2_ts,X3_ts,Y_ts
0,0,0,0.040214,0.736119,0.903988,-0.271101,0.0
1,0,1,-0.492273,-0.268303,-0.921176,-0.812881,1.0
2,0,2,0.309823,-0.386123,1.315453,-0.241455,0.0
3,0,3,1.871679,0.845866,1.473744,0.437033,0.0
4,0,4,1.224608,0.097955,0.098419,-0.635451,0.0
...,...,...,...,...,...,...,...
995,99,5,0.346083,0.991062,-0.432751,1.140907,0.0
996,99,6,-1.823515,0.312444,0.004809,0.671610,0.0
997,99,7,0.035619,-0.076120,-0.245551,0.130265,0.0
998,99,8,0.850422,1.200439,0.028020,-1.400394,0.0


## Define problem
Check if problem definition works 

In [10]:
from afa.data_modelling.problem.problem_ts import ProblemMaker_ts

In [11]:
# Problem specification: which column is the label, which problem type to
# set up, no treatment, and the maximum sequence length.
problem = 'online'
label_name = 'Y_ts'
treatment = None
max_seq_len = 10

problem_maker = ProblemMaker_ts(
    problem=problem,
    label=[label_name],  # the label is passed as a list of column names
    treatment=treatment,
    max_seq_len=max_seq_len,
)

# Apply the problem definition; the result replaces the loaded `dataset`.
dataset = problem_maker.fit_transform(dataset)

100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 894.23it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 798.38it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 723.57it/s]


In [12]:
# check if features/superfeatures/labels etc have been correctly assigned
# (the bare trailing expression makes the notebook print the mapping, whose
# keys in the output below are 'temporal', 'data', 'treatment', 'label',
# 'super_data' and 'super_temporal')
dataset.feature_name

{'temporal': ['X0_ts', 'X1_ts', 'X2_ts', 'X3_ts'],
 'data': ['X0', 'X1', 'Y'],
 'treatment': None,
 'label': ['Y_ts'],
 'super_data': ['Y', 'X0', 'X1'],
 'super_temporal': ['X1_ts', 'X3_ts', 'X2_ts', 'X0_ts']}

## Explain the synthetic data generation process
Store the explanation in the `reports` folder in both Markdown (`.md`) and LaTeX (`.tex`) formats

In [13]:
# Write the explanation of the generation process twice: once as Markdown
# and once as LaTeX, both sharing the same base path.
markdown_report = explanation_file + '.md'
latex_report = explanation_file + '.tex'

data_generator.explain(file=markdown_report, format='markdown')
data_generator.explain(file=latex_report, format='latex')