# Generating synthetic data with a trained CTGAN synthesizer

In [84]:
%%capture

# Load libraries for generation

import datetime
import os
import pandas as pd
from sdv.single_table import CTGANSynthesizer
from sdv.datasets.local import load_csvs
from sdv.metadata import SingleTableMetadata

In [12]:
# Ignore warnings

In [13]:
# Timestamp captured once so that every artifact of this run (output file
# name, log entry) carries the same unique identifier.
current_timestamp = datetime.datetime.now().strftime("%Y%m%d_%H%M%S")

# Fallbacks used when a prompt is left blank or answered with 0. They mirror
# the defaults applied in the sampling call (5000 rows, batch size 32).
DEFAULT_GENERATE_ROWS = 5000
DEFAULT_BATCH_SIZE = 32


def _prompt_int_with_default(prompt, default):
    """Prompt for a non-negative integer, substituting ``default`` for blank or 0.

    Resolving the default here (rather than at sampling time) keeps the value
    written to the log and embedded in the output file name identical to the
    value actually used for sampling.

    Args:
        prompt: Text passed to ``input``.
        default: Value returned when the answer is empty/whitespace or 0.

    Returns:
        The integer entered by the user, or ``default``.

    Raises:
        ValueError: If the answer is not an integer or is negative.
    """
    answer = input(prompt).strip()
    if not answer:
        return default
    value = int(answer)  # non-numeric text still raises ValueError
    if value < 0:
        raise ValueError(f"Expected a non-negative integer, got {value}")
    return value or default


# Set sampling parameters
generate_rows = _prompt_int_with_default(
    "Enter the number of rows of synthetic data to generate: ",
    DEFAULT_GENERATE_ROWS,
)
batch_size = _prompt_int_with_default(
    "Enter the batch size for sampling synthetic data: ",
    DEFAULT_BATCH_SIZE,
)

# Load existing model
model_folder = 'model'
# Example answer: ctgan_filtered_new_data_ep1_mdl1
# NOTE(review): the name is lower-cased, so on a case-sensitive filesystem the
# model file itself must have a lower-case name -- confirm this is intended.
model_name = input("Enter the model file name (without extension): ").strip().lower()
if model_name.endswith('.pkl'):
    # Tolerate the extension being typed anyway; otherwise the path below
    # would end in ".pkl.pkl".
    model_name = model_name[:-len('.pkl')]
model_path = os.path.join(model_folder, model_name + '.pkl')
if not os.path.isfile(model_path):
    raise FileNotFoundError(f"Model file not found: {model_path}")
# NOTE(review): the .pkl extension suggests pickle-based deserialization,
# which can execute arbitrary code -- only load model files you trust.
synthesizer = CTGANSynthesizer.load(filepath=model_path)

In [None]:
%%capture

# Table schema for the synthetic data, kept here so evaluations can be run.
# One datetime column, followed by the categorical columns and then the
# numerical ones; insertion order of the keys is preserved.
metadata_dict = {
    "columns": {
        "Timestamp": {
            "sdtype": "datetime",
            "datetime_format": "%Y-%m-%d %H:%M:%S",
        },
        # Each column gets its own spec dict (no shared objects between keys).
        **{
            column: {"sdtype": "categorical"}
            for column in (
                "Source.IP",
                "Source.Port",
                "Destination.IP",
                "Destination.Port",
                "Protocol",
            )
        },
        **{
            column: {"sdtype": "numerical"}
            for column in (
                "Flow.Duration",
                "Total.Fwd.Packets",
                "Total.Backward.Packets",
                "Total.Length.of.Fwd.Packets",
                "Total.Length.of.Bwd.Packets",
            )
        },
    },
    "METADATA_SPEC_VERSION": "SINGLE_TABLE_V1",
}
metadata = SingleTableMetadata.load_from_dict(metadata_dict)

# Sampling/ Generation

Saves the synthetic data as a CSV file in the `output` folder.
File name: `synthetic_data_{model_name}_n{rows}_b{batch_size}_{timestamp}.csv`

In [None]:
%%capture
# Build the CSV file name from the run parameters. Only the part of the
# model name before its first '.' is kept.
output_name = f"synthetic_data_{model_name.partition('.')[0]}_n{generate_rows}_b{batch_size}_{current_timestamp}.csv"

# Ensure the destination folder exists (no error if it is already there),
# then resolve the full path of the output file.
output_folder = 'output'
os.makedirs(output_folder, exist_ok=True)
output_path = os.path.join(output_folder, output_name)

In [58]:
%%capture
# Ensure the log folder exists, then append one entry describing this
# generation run to logs/information.txt (append mode keeps earlier entries).
log_folder = 'logs'
os.makedirs(log_folder, exist_ok=True)
training_log_file_name = os.path.join(log_folder, 'information.txt')
with open(training_log_file_name, 'a') as log:
    log.writelines([
        f"--- Synthetic data generation started at: {current_timestamp}\n",
        f"\tModel name: {model_name}\n",
        f"\tModel path: {model_path}\n",
        f"\tGenerating {generate_rows} rows\n",
        f"\tBatch size: {batch_size}\n",
        f"\tOutput file path: {output_path}\n",
    ])

In [60]:
# Show where the run information was appended.
print(f"Final log file path: {training_log_file_name}")

Final log file path: logs\information.txt


In [None]:
# Sample from the loaded synthesizer, passing output_path as the output file.
# A falsy (0) row count falls back to 5000 rows and a falsy (0) batch size
# falls back to 32.
synthetic_data = synthesizer.sample(
    num_rows=generate_rows or 5000,
    batch_size=batch_size or 32,
    output_file_path=output_path,
)

# Report where the generated CSV was saved.
print(output_path)