# DataSynthesizer

This notebook is used to create synthetic data with DataSynthesizer. It will first read and preprocess the CMAPPS data, after which it generate synthetic data which will then be analysed compared to the original data. This notebook is used for the results section on DataSynthesizer.

In [1]:
%%capture
# install DataSynthesizer (cannot be included in conda)
!pip install DataSynthesizer

In [3]:
import pandas as pd
import numpy as np
import DataSynthesizer
import os

# Read & preprocess data

We read the data from the CMAPPS folder. We remove the last two columns as these solely contain N/A values and then we rename the columns with their respective names as defined in "readme.txt".
We then store the data as a comma-separated-value (csv) file instead of a text file with tabs, as DataSynthesizer works with csv files.

In [3]:
data = pd.read_csv('CMAPSS/train_FD001.txt', sep=" ", header=None)

# drop last two columns with N/A values
data = data.iloc[:, :-2]

# rename columns according to readme.txt
col_names = ["unit_nr", "timecycle", "ops_set1", "ops_set2", "ops_set3"]
for i in range(1,22):
    col_names.append(f"sens_{i}")
data.columns = col_names
data.to_csv('CMAPSS/train_FD001_pre.csv', index=False)

data_length = len(data)

# Compute Remaining Useful Life (RUL) for each index (engine)
def add_remaining_useful_life(df):
    # Get the total number of cycles for each unit
    grouped_by_unit = df.groupby(by="unit_nr")
    max_cycle = grouped_by_unit["timecycle"].max()
    
    # Merge the max cycle back into the original frame
    result_frame = df.merge(max_cycle.to_frame(name='max_cycle'), left_on='unit_nr', right_index=True)
    
    # Calculate remaining useful life for each row
    remaining_useful_life = result_frame["max_cycle"] - result_frame["timecycle"]
    result_frame["RUL"] = remaining_useful_life
    
    # drop max_cycle as it's no longer needed
    result_frame = result_frame.drop("max_cycle", axis=1)
    return result_frame

data = add_remaining_useful_life(data)

data.to_csv('CMAPSS/train_FD001_pre.csv', index=False)

data_length = len(data)

# display data
data
data.columns

Index(['unit_nr', 'timecycle', 'ops_set1', 'ops_set2', 'ops_set3', 'sens_1',
       'sens_2', 'sens_3', 'sens_4', 'sens_5', 'sens_6', 'sens_7', 'sens_8',
       'sens_9', 'sens_10', 'sens_11', 'sens_12', 'sens_13', 'sens_14',
       'sens_15', 'sens_16', 'sens_17', 'sens_18', 'sens_19', 'sens_20',
       'sens_21', 'RUL'],
      dtype='object')

## Create synthetic data

In [4]:
"""
Creating the synthetic data using the Git page from DataSynthesizer
https://github.com/DataResponsibly/DataSynthesizer/blob/master/notebooks/DataSynthesizer__correlated_attribute_mode.ipynb

NOTE: First create the description file (e.g. via terminal command touch)
before running the code, does not write the .json file itself.
"""

from DataSynthesizer.DataDescriber import DataDescriber
from DataSynthesizer.DataGenerator import DataGenerator
from DataSynthesizer.lib.utils import read_json_file, display_bayesian_network

def create_data(data_length):
    # input dataset
    input_data = 'CMAPSS/train_FD001_pre.csv'

    # location of two output files
    mode = 'correlated_attribute_mode'
    description_file = f'./CMAPSS/Synthetic/description_FD001.json'
    synthetic_data = f'./CMAPSS/Synthetic/DataSyn_FD001.csv'

    # An attribute is categorical if its domain size is less than this threshold.
    threshold_value = 42

    # A parameter in Differential Privacy. It roughly means that removing a row in the input dataset will not 
    # change the probability of getting the same output more than a multiplicative difference of exp(epsilon).
    # Increase epsilon value to reduce the injected noises. Set epsilon=0 to turn off differential privacy.
    epsilon = 0

    # The maximum number of parents in Bayesian network, i.e., the maximum number of incoming edges.
    degree_of_bayesian_network = 2

    # Number of tuples generated in synthetic dataset.
    num_tuples_to_generate = data_length

    describer = DataDescriber(category_threshold=threshold_value)
    describer.describe_dataset_in_correlated_attribute_mode(dataset_file=input_data, 
                                                            epsilon=epsilon, 
                                                            k=degree_of_bayesian_network)
    describer.save_dataset_description_to_file(description_file)

    # Generate data set
    generator = DataGenerator()
    generator.generate_dataset_in_correlated_attribute_mode(num_tuples_to_generate, description_file)
    generator.save_synthetic_data(synthetic_data)

create_data(data_length)

Adding ROOT sens_8
Adding attribute sens_13
Adding attribute sens_12
Adding attribute sens_11
Adding attribute sens_4
Adding attribute sens_7
Adding attribute RUL
Adding attribute timecycle
Adding attribute unit_nr
Adding attribute sens_14
Adding attribute sens_9
Adding attribute sens_15
Adding attribute sens_21
Adding attribute sens_20
Adding attribute sens_2
Adding attribute sens_17
Adding attribute sens_3
Adding attribute ops_set1
Adding attribute ops_set2
Adding attribute sens_6
Adding attribute ops_set3
Adding attribute sens_1
Adding attribute sens_5
Adding attribute sens_10
Adding attribute sens_16
Adding attribute sens_18
Adding attribute sens_19


  for parents_instance, stats_sub in stats.groupby(parents):


# Read & output synthetic data

In [4]:
syn_data = pd.read_csv('./CMAPSS/Synthetic/DataSyn_FD001.csv')
syn_data[:10]

Unnamed: 0,unit_nr,timecycle,ops_set1,ops_set2,ops_set3,sens_1,sens_2,sens_3,sens_4,sens_5,...,sens_13,sens_14,sens_15,sens_16,sens_17,sens_18,sens_19,sens_20,sens_21,RUL
0,84.0,287.0,-0.001034,0.0001,100.0,518.67,643.26513,1599.606655,1411.97443,14.62,...,2388.088113,8117.58724,8.469115,0.03,394,2388,100.0,38.597677,23.12108,9.0
1,30.0,56.0,-0.001488,-0.0002,100.0,518.67,642.581595,1591.055015,1410.289857,14.62,...,2388.118976,8140.887048,8.428558,0.03,394,2388,100.0,38.97426,23.312177,91.0
2,58.0,76.0,-0.002117,-0.0005,100.0,518.67,642.857685,1588.830978,1407.430623,14.62,...,2388.055948,8136.848609,8.414437,0.03,392,2388,100.0,39.023074,23.406597,107.0
3,86.0,124.0,0.000826,-0.0002,100.0,518.67,642.759005,1592.064501,1400.055742,14.62,...,2388.102871,8136.752208,8.4249,0.03,391,2388,100.0,39.09641,23.363743,155.0
4,23.0,68.0,0.000781,-0.0002,100.0,518.67,643.010252,1592.346665,1405.832447,14.62,...,2388.139336,8122.153896,8.448167,0.03,394,2388,100.0,38.951928,23.359406,144.0
5,3.0,139.0,-0.001209,0.0005,100.0,518.67,642.902253,1590.943306,1410.928967,14.62,...,2388.093062,8157.838336,8.465383,0.03,395,2388,100.0,38.875838,23.205776,40.0
6,40.0,12.0,-0.000246,0.0003,100.0,518.67,642.774293,1589.059653,1410.157601,14.62,...,2388.092668,8134.53929,8.434034,0.03,394,2388,100.0,38.717423,23.303179,164.0
7,87.0,112.0,0.002275,-0.0004,100.0,518.67,642.748289,1594.684203,1415.362561,14.62,...,2388.17755,8144.855908,8.490564,0.03,392,2388,100.0,38.724596,23.286717,41.0
8,92.0,131.0,0.000846,0.0002,100.0,518.67,643.34599,1596.921029,1421.989821,14.62,...,2388.307165,8132.613902,8.514121,0.03,395,2388,100.0,38.449765,23.006137,14.0
9,43.0,130.0,-0.000654,0.0001,100.0,518.67,642.203954,1594.720532,1401.061698,14.62,...,2388.024091,8167.011131,8.424992,0.03,392,2388,100.0,38.907018,23.329811,80.0


What we see directly is that the synthetic data has a different ordering for the unit numbers and timecylces. Where these are sequential for the original data, we see that the synthetic data just samples them from a distribution. In general, this would work for values as the mean and standard deviation will probably be similar. However, in this case the unit numbers and timecycles are relevant and should be sequential as well.

In [6]:
data[:10]

Unnamed: 0,unit_nr,timecycle,ops_set1,ops_set2,ops_set3,sens_1,sens_2,sens_3,sens_4,sens_5,...,sens_13,sens_14,sens_15,sens_16,sens_17,sens_18,sens_19,sens_20,sens_21,RUL
0,1,1,-0.0007,-0.0004,100.0,518.67,641.82,1589.7,1400.6,14.62,...,2388.02,8138.62,8.4195,0.03,392,2388,100.0,39.06,23.419,191
1,1,2,0.0019,-0.0003,100.0,518.67,642.15,1591.82,1403.14,14.62,...,2388.07,8131.49,8.4318,0.03,392,2388,100.0,39.0,23.4236,190
2,1,3,-0.0043,0.0003,100.0,518.67,642.35,1587.99,1404.2,14.62,...,2388.03,8133.23,8.4178,0.03,390,2388,100.0,38.95,23.3442,189
3,1,4,0.0007,0.0,100.0,518.67,642.35,1582.79,1401.87,14.62,...,2388.08,8133.83,8.3682,0.03,392,2388,100.0,38.88,23.3739,188
4,1,5,-0.0019,-0.0002,100.0,518.67,642.37,1582.85,1406.22,14.62,...,2388.04,8133.8,8.4294,0.03,393,2388,100.0,38.9,23.4044,187
5,1,6,-0.0043,-0.0001,100.0,518.67,642.1,1584.47,1398.37,14.62,...,2388.03,8132.85,8.4108,0.03,391,2388,100.0,38.98,23.3669,186
6,1,7,0.001,0.0001,100.0,518.67,642.48,1592.32,1397.77,14.62,...,2388.03,8132.32,8.3974,0.03,392,2388,100.0,39.1,23.3774,185
7,1,8,-0.0034,0.0003,100.0,518.67,642.56,1582.96,1400.97,14.62,...,2388.03,8131.07,8.4076,0.03,391,2388,100.0,38.97,23.3106,184
8,1,9,0.0008,0.0001,100.0,518.67,642.12,1590.98,1394.8,14.62,...,2388.05,8125.69,8.3728,0.03,392,2388,100.0,39.05,23.4066,183
9,1,10,-0.0033,0.0001,100.0,518.67,641.71,1591.24,1400.46,14.62,...,2388.06,8129.38,8.4286,0.03,393,2388,100.0,38.95,23.4694,182


Compare statistics

In [7]:
# select data with unit-nr 1.0
for i in range(1,6): 
    df_syn = syn_data.loc[syn_data['unit_nr'] == i]
    df_cmapps = data.loc[data['unit_nr'] == i]
    
    print(f"Number of data points for unit-nr {i} for CMAPPS: {len(df_cmapps)}. For synthetic data: {len(df_syn)}")
    
print("\nThe number of unique values per unit number:")

for i in range(1,6): 
    df_syn = syn_data.loc[syn_data['unit_nr'] == i].drop_duplicates(subset='timecycle')
    df_cmapps = data.loc[data['unit_nr'] == i].drop_duplicates(subset='timecycle')
    
    
    print(f"Number of unique values in timestamp for unit_nr {i} for CMAPPS: {len(df_cmapps)}. For synthetic data: {len(df_syn)}")

    
df_syn

Number of data points for unit-nr 1 for CMAPPS: 192. For synthetic data: 112
Number of data points for unit-nr 2 for CMAPPS: 287. For synthetic data: 222
Number of data points for unit-nr 3 for CMAPPS: 179. For synthetic data: 221
Number of data points for unit-nr 4 for CMAPPS: 189. For synthetic data: 213
Number of data points for unit-nr 5 for CMAPPS: 269. For synthetic data: 237

The number of unique values per unit number:
Number of unique values in timestamp for unit_nr 1 for CMAPPS: 192. For synthetic data: 93
Number of unique values in timestamp for unit_nr 2 for CMAPPS: 287. For synthetic data: 148
Number of unique values in timestamp for unit_nr 3 for CMAPPS: 179. For synthetic data: 150
Number of unique values in timestamp for unit_nr 4 for CMAPPS: 189. For synthetic data: 147
Number of unique values in timestamp for unit_nr 5 for CMAPPS: 269. For synthetic data: 149


Unnamed: 0,unit_nr,timecycle,ops_set1,ops_set2,ops_set3,sens_1,sens_2,sens_3,sens_4,sens_5,...,sens_13,sens_14,sens_15,sens_16,sens_17,sens_18,sens_19,sens_20,sens_21,RUL
50,5.0,156.0,-0.001650,0.0003,100.0,518.67,642.928175,1595.363411,1426.131399,14.62,...,2388.122332,8182.197006,8.477892,0.03,395,2388,100.0,38.682065,23.142507,34.0
224,5.0,18.0,-0.003383,0.0001,100.0,518.67,641.958729,1580.828596,1394.473234,14.62,...,2387.952528,8131.610359,8.423730,0.03,393,2388,100.0,38.864591,23.534890,176.0
506,5.0,155.0,0.000376,0.0001,100.0,518.67,643.081848,1592.362820,1410.543574,14.62,...,2388.199600,8125.081054,8.463548,0.03,395,2388,100.0,38.718511,23.235610,33.0
539,5.0,186.0,-0.001628,0.0001,100.0,518.67,643.657678,1598.884655,1436.940018,14.62,...,2388.203262,8142.315680,8.540793,0.03,396,2388,100.0,38.451200,23.212395,9.0
601,5.0,60.0,0.000123,0.0003,100.0,518.67,642.509111,1582.655538,1393.131613,14.62,...,2388.008541,8133.081539,8.414524,0.03,393,2388,100.0,38.757324,23.369591,213.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19608,5.0,122.0,-0.002061,0.0000,100.0,518.67,642.984059,1585.634355,1411.337050,14.62,...,2388.077764,8136.593751,8.435961,0.03,394,2388,100.0,39.157110,23.233116,56.0
19866,5.0,44.0,0.000704,0.0002,100.0,518.67,642.360386,1589.128904,1408.383519,14.62,...,2388.122278,8138.493994,8.453126,0.03,390,2388,100.0,38.738625,23.270831,143.0
20234,5.0,147.0,-0.000676,-0.0002,100.0,518.67,643.774365,1605.527962,1417.459321,14.62,...,2388.245193,8139.620077,8.480807,0.03,394,2388,100.0,38.637894,23.141283,19.0
20347,5.0,107.0,-0.004005,0.0005,100.0,518.67,642.733561,1596.690115,1410.341086,14.62,...,2388.068406,8131.988379,8.431865,0.03,393,2388,100.0,38.847533,23.392598,166.0


# Export exploratory data analysis to csv files

In [5]:
syn_description = syn_data.describe()
syn_description
syn_description.T.to_csv("./CMAPSS/Synthetic/synthetic_data_FD001_description.csv", float_format="{:.5f}".format)

In [7]:
data.describe()
syn_description.T.to_csv("./CMAPSS/CMAPPS_FD001_description.csv", float_format="{:.5f}".format)

NameError: name 'data' is not defined

In [6]:
syn_description.T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_nr,20631.0,51.938442,28.99465,1.0,27.0,53.0,78.0,100.0
timecycle,20631.0,108.743978,68.81619,1.0,52.5,103.0,156.0,360.0
ops_set1,20631.0,-8e-06,0.002216008,-0.008633,-0.001507,6.2e-05,0.001509,0.008515
ops_set2,20631.0,1e-06,0.0002922696,-0.0006,-0.0002,0.0,0.0002,0.0006
ops_set3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sens_1,20631.0,518.67,0.0,518.67,518.67,518.67,518.67,518.67
sens_2,20631.0,642.685566,0.4997734,641.213443,642.32686,642.655013,643.007799,644.520224
sens_3,20631.0,1590.633424,6.22064,1571.298178,1586.305192,1590.193372,1594.619861,1616.857611
sens_4,20631.0,1408.969796,9.133706,1383.445159,1402.314634,1408.097828,1414.702369,1438.512194
sens_5,20631.0,14.62,1.7764e-15,14.62,14.62,14.62,14.62,14.62


In [9]:
data = pd.read_csv('CMAPSS/train_FD001_pre.csv')
data.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
unit_nr,20631.0,51.506568,29.22763,1.0,26.0,52.0,77.0,100.0
timecycle,20631.0,108.807862,68.88099,1.0,52.0,104.0,156.0,362.0
ops_set1,20631.0,-9e-06,0.002187313,-0.0087,-0.0015,0.0,0.0015,0.0087
ops_set2,20631.0,2e-06,0.0002930621,-0.0006,-0.0002,0.0,0.0003,0.0006
ops_set3,20631.0,100.0,0.0,100.0,100.0,100.0,100.0,100.0
sens_1,20631.0,518.67,0.0,518.67,518.67,518.67,518.67,518.67
sens_2,20631.0,642.680934,0.5000533,641.21,642.325,642.64,643.0,644.53
sens_3,20631.0,1590.523119,6.13115,1571.04,1586.26,1590.1,1594.38,1616.91
sens_4,20631.0,1408.933782,9.000605,1382.25,1402.36,1408.04,1414.555,1441.49
sens_5,20631.0,14.62,1.7764e-15,14.62,14.62,14.62,14.62,14.62
