# Generate many synthetic datasets with observable attribute changes

All generated datasets have:
- 5 relevant attributes (attributes with a change)
- 5 irrelevant attributes (attributes without a change)
- primary change points at 9 locations (1/10, 2/10, ..., 9/10 of the log; depends on the base dataset)
- standard_deviation_offset_explain_change_point = 0

Dataset 1: sudden_3_attribute_values
- 3 attribute values each
- only sudden drift

Dataset 2: recurring_3_attribute_values
- 3 attribute values each
- only recurring drift

Dataset 3: sudden_10_attribute_values
- 10 attribute values each
- only sudden drift

In [1]:
import helper
import uuid
import os
import numpy as np

In [2]:
# Base event logs that receive synthetic attributes. Every entry holds the
# 'path' of the source XES file, its 'size', and the 'change_points' returned
# by helper.get_change_points_maardji_et_al_2013 for that size.
#
# Only the cf10k log is active. Previously listed cb logs, kept for reference
# (to re-enable one, add an entry with its 'path', 'size' and 'change_points'):
#   data\synthetic\maardji et al 2013_xes\logs\cb\cb2.5k.xes  (size 2500)
#   data\synthetic\maardji et al 2013_xes\logs\cb\cb5k.xes    (size 5000)
#   data\synthetic\maardji et al 2013_xes\logs\cb\cb7.5k.xes  (size 7500)
input_datasets = [
    dict(
        path='data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf10k.xes',
        size=10000,
        change_points=helper.get_change_points_maardji_et_al_2013(10000),
    ),
]


In [3]:
# Settings shared by all generated configurations.

# Number of synthetic attributes per log: attributes with a change (relevant)
# and attributes without a change (irrelevant).
count_relevant_attributes = count_irrelevant_attributes = 5

# Number of logs generated per input dataset and configuration.
generations_per_dataset = 100

# Root output folder. Each generated log is written below it to
# '<configuration_name>/<size>/<old_file_name>_<UUID>.xes'.
output_folder = 'data\\synthetic\\attribute_drift\\'

## Generate Dataset 1: sudden_3_attribute_values

In [4]:
# Configuration for dataset 1: sudden drift, 3 values per attribute.
# configuration_name is also used as the output subfolder name.
configuration_name = 'sudden_3_attribute_values'
number_attribute_values = 3
type_of_drift, type_of_change = 'sudden', 'mixed'

In [5]:
# Create `generations_per_dataset` variants of every input log using the
# currently active configuration. Each variant gets a fresh UUID in its file
# name, so repeated runs do not overwrite earlier results.
for source_log in input_datasets:
    print(f'Now working on dataset {source_log}')

    # File name of the source log without its last extension.
    source_name = os.path.basename(source_log['path']).rpartition('.')[0]
    size_folder = str(source_log['size'])

    for run in range(1, generations_per_dataset + 1):
        print(f'{run} of {generations_per_dataset} for current dataset')
        target_path = os.path.join(
            output_folder,
            configuration_name,
            size_folder,
            f'{source_name}_{uuid.uuid4()}.xes',
        )
        helper.add_synthetic_attributes(
            source_log['path'],
            target_path,
            source_log['change_points'],
            count_relevant_attributes,
            count_irrelevant_attributes,
            number_attribute_values,
            type_of_drift,
            type_of_change,
        )

Now working on dataset {'path': 'data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf10k.xes', 'size': 10000, 'change_points': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]}
1 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (35013.642822265625 msec.)

2 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (34671.000732421875 msec.)

3 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (31146.470703125 msec.)

4 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (30504.00341796875 msec.)

5 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing lo

## Generate Dataset 2: recurring_3_attribute_values

In [6]:
# Configuration for dataset 2: recurring drift, 3 values per attribute.
# configuration_name is also used as the output subfolder name.
configuration_name = 'recurring_3_attribute_values'
number_attribute_values = 3
# NOTE(review): the drift type is spelled 'reoccurring' while the configuration
# name says 'recurring' - presumably this is the spelling that
# helper.add_synthetic_attributes expects; confirm against the helper.
type_of_drift, type_of_change = 'reoccurring', 'mixed'

In [7]:
# Create `generations_per_dataset` variants of every input log using the
# currently active configuration. Each variant gets a fresh UUID in its file
# name, so repeated runs do not overwrite earlier results.
for source_log in input_datasets:
    print(f'Now working on dataset {source_log}')

    # File name of the source log without its last extension.
    source_name = os.path.basename(source_log['path']).rpartition('.')[0]
    size_folder = str(source_log['size'])

    for run in range(1, generations_per_dataset + 1):
        print(f'{run} of {generations_per_dataset} for current dataset')
        target_path = os.path.join(
            output_folder,
            configuration_name,
            size_folder,
            f'{source_name}_{uuid.uuid4()}.xes',
        )
        helper.add_synthetic_attributes(
            source_log['path'],
            target_path,
            source_log['change_points'],
            count_relevant_attributes,
            count_irrelevant_attributes,
            number_attribute_values,
            type_of_drift,
            type_of_change,
        )

Now working on dataset {'path': 'data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf10k.xes', 'size': 10000, 'change_points': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]}
1 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (46167.35205078125 msec.)

2 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (39716.798828125 msec.)

3 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (38893.25634765625 msec.)

4 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (42727.355712890625 msec.)

5 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log

## Generate Dataset 3: sudden_10_attribute_values

In [4]:
# Configuration for dataset 3: sudden drift, 10 values per attribute.
# configuration_name is also used as the output subfolder name.
configuration_name = 'sudden_10_attribute_values'
number_attribute_values = 10
type_of_drift, type_of_change = 'sudden', 'mixed'

In [5]:
# Create `generations_per_dataset` variants of every input log using the
# currently active configuration. Each variant gets a fresh UUID in its file
# name, so repeated runs do not overwrite earlier results.
for source_log in input_datasets:
    print(f'Now working on dataset {source_log}')

    # File name of the source log without its last extension.
    source_name = os.path.basename(source_log['path']).rpartition('.')[0]
    size_folder = str(source_log['size'])

    for run in range(1, generations_per_dataset + 1):
        print(f'{run} of {generations_per_dataset} for current dataset')
        target_path = os.path.join(
            output_folder,
            configuration_name,
            size_folder,
            f'{source_name}_{uuid.uuid4()}.xes',
        )
        helper.add_synthetic_attributes(
            source_log['path'],
            target_path,
            source_log['change_points'],
            count_relevant_attributes,
            count_irrelevant_attributes,
            number_attribute_values,
            type_of_drift,
            type_of_change,
        )

Now working on dataset {'path': 'data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf10k.xes', 'size': 10000, 'change_points': [1000, 2000, 3000, 4000, 5000, 6000, 7000, 8000, 9000]}
1 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (31967.1025390625 msec.)

2 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (31384.522705078125 msec.)

3 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (31859.544921875 msec.)

4 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (30376.674072265625 msec.)

5 of 100 for current dataset
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log