In [1]:
import helper
import numpy as np
import os
from concept_drift import generate_attributes

# Generate Synthetic Attribute Data with different settings
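The intended setup is to generate 10 datasets with the same settings; the number actually generated is controlled by `count_event_logs` in the configuration cell below.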

Out of the 10 change points, the first 5 are always explainable with attributes 1, 2, 3, 4 and 5. The last 5 are not explainable.

In [2]:
# --- source event log selection ---
# how many event logs are sampled and enriched with generated attributes
count_event_logs = 1
# size criterion for the source event logs; also part of the output file name
event_log_size = 2500

# --- generated attributes ---
# number of categorical values handed to the distribution generators
attribute_value_count = 3
# attributes that drift; attribute k is tied to the k-th change point of the log
relevant_attributes = 5
# attributes generated from a single distribution, i.e. without drift
irrelevant_attributes = 5

In [3]:
# Randomly pick `count_event_logs` event logs of the desired size from the
# 'maardji et al 2013' dataset, requested with has_generated_attributes=False.
suitable_dataset= helper.get_datasets_by_criteria(size=event_log_size, 
                                             has_generated_attributes=False, 
                                             dataset='maardji et al 2013')
# NOTE(review): np.random.choice samples with replacement by default, so the same
# log can be drawn more than once when count_event_logs > 1, and no seed is set,
# so the selection differs between runs -- confirm that both are intended.
datasets = np.random.choice(list(suitable_dataset.keys()), count_event_logs)
# display the selected event log paths as the cell output
datasets

array(['data\\synthetic\\maardji et al 2013_xes\\logs\\cf\\cf2.5k.xes'],
      dtype='<U58')

## Scenario 1: New material, new vendor...

Data is categorical. One new category is added which is sampled at 20% (`new_value_probability = 0.2`). All other categories follow randomly drawn distributions.

In [4]:
def _get_distribution(attribute_value_count):
    """Get a single distribution for a given number of values.
    
    Args:
        number_values: How many different values the distribution should have.
    
    Returns:
        A probability distribution.
    """
    # use the Dirichlet distribution
    distribution = np.random.dirichlet(np.ones(attribute_value_count)).tolist()
    return distribution

In [5]:
# Probability given to the newly introduced value in the drifted distribution of the
# 'new_value' change type. For that change type the Hellinger distance between baseline
# and drifted distribution is sqrt(1 - sqrt(1 - p)), so the "> 0.3" acceptance check in
# _get_drifted_distributions can only be passed when p is greater than about 0.172.
new_value_probability = 0.2

from scipy import stats

def _get_drifted_distributions(attribute_value_count, change_type=None):
    """Get two distributions, the baseline distribution and the drifted distribution.
    
    The change_type determines in which regard both are different.
    
    The two distributions are guaranteed to be significantly different at 10 observations.
    
    Args:
        attribute_value_count: How many attribute values there are.
        change_type: 'new_value', 'overproportional_gain' or 'independent_new'.
    
    Returns:
        (baseline_distribution, drifted_distribution) tuple
    """
    if attribute_value_count < 2: raise Exception('Must generate at least 2 attribute values.')
    
    if change_type is None:
        change_type = np.random.choice(['new_value', 'overproportional_gain', 'independent_new'])
    
    # get the baseline distribution
    baseline_distribution = None
    if change_type == 'new_value':
        baseline_distribution = _get_distribution(attribute_value_count - 1)
        # add a 0% probability item
        baseline_distribution.append(0)
    else:
        baseline_distribution = _get_distribution(attribute_value_count)
        
    drifted_distribution_found = False
    drifted_distribution = None
    
    while not drifted_distribution_found:
        if change_type == 'new_value':
            drifted_distribution = baseline_distribution.copy()
            drifted_distribution = list(np.array(drifted_distribution) * (1 - new_value_probability))
            drifted_distribution[-1] = new_value_probability
        else:
            drifted_distribution = _get_distribution(attribute_value_count)
        
        hellinger_distance = np.sqrt(np.sum((np.sqrt(baseline_distribution) - np.sqrt(drifted_distribution)) ** 2)) / np.sqrt(2)
        
        if hellinger_distance > 0.3:
            drifted_distribution_found = True

    return baseline_distribution, drifted_distribution

In [6]:
out_path = 'data/synthetic/generated/new_attribute_values/'

# Make sure the target directory exists before any event log is written to it.
# NOTE(review): harmless if helper.opyenxes_write_xes already creates it -- not visible here.
os.makedirs(out_path, exist_ok=True)

# For every selected source log: add drifting and non-drifting attributes, write the
# enriched log to out_path and register it in the data dictionary.
for i, dataset in enumerate(datasets):
    print(f'Generating dataset {i}')
    dataset_info = helper.get_data_information(dataset)

    # each relevant attribute is tied to one change point, so fail early with a clear
    # message instead of an IndexError in the middle of the generation
    if len(dataset_info['change_points']) < relevant_attributes:
        raise ValueError(
            f'{dataset} has {len(dataset_info["change_points"])} change points, '
            f'but {relevant_attributes} relevant attributes were requested.')

    opyenxes_log = helper.opyenxes_read_xes(dataset_info['file_path'])
    ag = generate_attributes.AttributeGenerator(opyenxes_log, dataset_info['change_points'])
    
    # generate drifted attributes
    for attribute_index in range(relevant_attributes):
        attribute_name = f'relevant_attribute_{attribute_index + 1}'
        
        # get the distributions before and after the drift
        base_distribution, drifted_distribution = _get_drifted_distributions(attribute_value_count)

        # change point to explain: the k-th attribute drifts at the k-th change point
        explain_change_point = dataset_info['change_points'][attribute_index]
        ag.generate_drifting_categorical_attribute(attribute_name,
                                          base_distribution,
                                          drifted_distribution=drifted_distribution,
                                          explain_change_point=explain_change_point,
                                          drift_type = 'sudden')
    
    # generate attributes that did not drift
    for attribute_index in range(irrelevant_attributes):
        attribute_name = f'irrelevant_attribute_{attribute_index + 1}'
        distribution = _get_distribution(attribute_value_count)
        ag.generate_categorical_attribute(attribute_name, distribution)
    
    # prepare to write results
    file_name = f'{event_log_size}_sudden_{relevant_attributes}_{i:02d}.xes'
    out_file_path = os.path.join(out_path, file_name)
    out_file_path = os.path.normpath(out_file_path)
    
    # write event log to file
    helper.opyenxes_write_xes(ag.opyenxes_log, out_file_path)
    
    # save the data information under the path of the generated file
    # NOTE(review): dataset_info is modified in place; if helper.get_data_information
    # returns a shared object this also alters the source entry -- confirm.
    dataset_info['file_path'] = out_file_path
    dataset_info['file_name'] = file_name
    dataset_info['has_generated_attributes'] = True
    dataset_info['change_point_explanations'] = ag.change_point_explanations
    helper.update_data_dictionary({out_file_path: dataset_info})

Generating dataset 0
Importance: DEBUG
Message: Start serializing log to XES.XML

Importance: DEBUG
Message: finished serializing log (7858.00048828125 msec.)

