# This Jupyter notebook generates synthetic data using the Synthetic Data Vault (SDV).

1. Generate the data using only the bare minimum of hyperparameters
    a. use the epoch and batch size hyperparameters for the GAN-based models
2. The sample size equals the size of the original dataset if the original has fewer than 2000 rows
3. Start sampling at sizes of 2000, 5000, 1000 and the maximum


In [1]:
# Import necessary libraries 

import warnings
warnings.filterwarnings('ignore') # ignore any warnings 

import pandas as pd
from sdv.tabular import GaussianCopula
from sdv.tabular import CTGAN
from sdv.tabular import CopulaGAN
from sdv.tabular import TVAE
from util.dataloader import *

In [12]:
def generate_gaussian(in_file):
    """Fit a GaussianCopula model and write a synthetic sample to ``out_path1``.

    Reads the module-level ``real_df`` (training data), ``SAMPLE_SIZE``
    (number of rows to sample) and ``out_path1`` (destination CSV), so those
    must be set before calling.

    Args:
        in_file: Dataset name; used only in the timing message.
    """
    started_at = time.strftime("%H:%M:%S", time.localtime())

    synthesizer = GaussianCopula()
    synthesizer.fit(real_df)
    synthesizer.sample(SAMPLE_SIZE).to_csv(out_path1)

    finished_at = time.strftime("%H:%M:%S", time.localtime())

    # Only the ' is : {:.5f}' fragment is formatted; the dataset name is
    # concatenated verbatim.
    elapsed_text = ' is : {:.5f}'.format(elapsedtime(started_at, finished_at))
    print('Total processing time for gaussian_synthesizer ' + in_file + elapsed_text + " minutes")

In [13]:
def generate_ctgan(in_file):
    """Fit a CTGAN model and write a synthetic sample to ``out_path2``.

    Reads the module-level ``real_df`` (training data), ``EPOCH`` and
    ``BATCH_SIZE`` (training hyperparameters), ``SAMPLE_SIZE`` (number of rows
    to sample) and ``out_path2`` (destination CSV).

    Args:
        in_file: Dataset name; used only in the timing message.
    """
    started_at = time.strftime("%H:%M:%S", time.localtime())

    synthesizer = CTGAN(epochs=EPOCH, batch_size=BATCH_SIZE)
    synthesizer.fit(real_df)
    synthesizer.sample(SAMPLE_SIZE).to_csv(out_path2)

    finished_at = time.strftime("%H:%M:%S", time.localtime())

    # Only the ' is : {:.5f}' fragment is formatted; the dataset name is
    # concatenated verbatim.
    elapsed_text = ' is : {:.5f}'.format(elapsedtime(started_at, finished_at))
    print('Total processing time for ctgan_synthesizer ' + in_file + elapsed_text + " minutes")

In [14]:
def generate_copula_gan(in_file):
    """Fit a CopulaGAN model and write a synthetic sample to ``out_path3``.

    Reads the module-level ``real_df`` (training data), ``EPOCH`` and
    ``BATCH_SIZE`` (training hyperparameters), ``SAMPLE_SIZE`` (number of rows
    to sample) and ``out_path3`` (destination CSV).

    Args:
        in_file: Dataset name; used only in the timing message.
    """
    start_time = time.strftime("%H:%M:%S", time.localtime())
    copula_gan_model = CopulaGAN(epochs=EPOCH,
                                 batch_size=BATCH_SIZE,
                                 generator_dim=(128, 128, 128),
                                 discriminator_dim=(128, 128, 128))
    copula_gan_model.fit(real_df)
    cgan_synthetic_df = copula_gan_model.sample(SAMPLE_SIZE)
    cgan_synthetic_df.to_csv(out_path3)
    end_time = time.strftime("%H:%M:%S", time.localtime())
    # A stray no-op expression statement ('{:.5f}'.format) that used to sit
    # here has been removed; it had no effect on the output.
    print('Total processing time for cgan_synthesizer '+in_file+ ' is : {:.5f}'.format(elapsedtime(start_time, end_time))+ " minutes")

In [15]:
def generate_tvae(in_file):
    """Fit a TVAE model and write a synthetic sample to ``out_path4``.

    Reads the module-level ``real_df`` (training data), ``EPOCH`` and
    ``BATCH_SIZE`` (training hyperparameters), ``SAMPLE_SIZE`` (number of rows
    to sample) and ``out_path4`` (destination CSV).

    Args:
        in_file: Dataset name; used only in the timing message.
    """
    started_at = time.strftime("%H:%M:%S", time.localtime())

    synthesizer = TVAE(epochs=EPOCH, batch_size=BATCH_SIZE)
    synthesizer.fit(real_df)
    synthesizer.sample(SAMPLE_SIZE).to_csv(out_path4)

    finished_at = time.strftime("%H:%M:%S", time.localtime())

    # Only the ' is : {:.5f}' fragment is formatted; the dataset name is
    # concatenated verbatim.
    elapsed_text = ' is : {:.5f}'.format(elapsedtime(started_at, finished_at))
    print('Total processing time for tvae_synthesizer ' + in_file + elapsed_text + " minutes")

In [16]:
def concat_census_data():
    """Combine the Adult Census train and test files into ``census.csv``.

    Reads ``adult.csv`` and ``test.csv``, stacks them row-wise and writes the
    result next to the inputs. The file is written without the row index so
    that reloading it with ``pd.read_csv`` does not produce an extra
    ``Unnamed: 0`` column that would be modelled as a feature.
    """
    path1 = '../Real Datasets/Adult Census Income Dataset/adult.csv'
    path2 = '../Real Datasets/Adult Census Income Dataset/test.csv'
    real1 = pd.read_csv(path1)
    real2 = pd.read_csv(path2)

    # NOTE(review): read_csv has already consumed the header line, so this
    # slice drops the first *data* row of test.csv. Kept as-is; confirm that
    # test.csv really carries a non-data first record.
    real2_no_header = real2[1:]

    # pd.concat replaces DataFrame.append, which is deprecated and was removed
    # in pandas 2.0; the row-wise result is the same.
    real = pd.concat([real1, real2_no_header])
    real.to_csv('../Real Datasets/Adult Census Income Dataset/census.csv', index=False)

def concat_anomaly_data():
    """Combine the anomaly-detection train/test/submission files into ``anomaly.csv``.

    ``Test.csv`` and ``Sample_submission.csv`` are joined column-wise, and the
    result is stacked under ``Train.csv``. The file is written without the row
    index so that reloading it with ``pd.read_csv`` does not produce an extra
    ``Unnamed: 0`` column that would be modelled as a feature.
    """
    path1 = '../Real Datasets/Anomaly Detection/Participants_Data_WH18/Train.csv'
    path2 = '../Real Datasets/Anomaly Detection/Participants_Data_WH18/Test.csv'
    path3 = '../Real Datasets/Anomaly Detection/Participants_Data_WH18/Sample_submission.csv'

    # Read the three files in the package separately.
    real1 = pd.read_csv(path1)
    real2 = pd.read_csv(path2)
    real3 = pd.read_csv(path3)

    # Attach the sample-submission columns to the test rows (aligned on the
    # default row index).
    test_df = pd.concat([real2, real3], axis=1)

    # Stack the widened test set under the training set. pd.concat replaces
    # DataFrame.append, which is deprecated and was removed in pandas 2.0.
    real = pd.concat([real1, test_df])
    real.to_csv('../Real Datasets/Anomaly Detection/Participants_Data_WH18/anomaly.csv', index=False)

# Cancer data file has some of the values as '?' which is to be cleaned. Else, the generator fails
def get_cervical_cancer_data():
    """Clean the cervical-cancer risk-factor file and write ``cervical_cancer.csv``.

    Every ``'?'`` cell is replaced with ``0`` and the two
    "time since diagnosis" columns are dropped. The file is written without the
    row index so that reloading it with ``pd.read_csv`` does not produce an
    extra ``Unnamed: 0`` column that would be modelled as a feature.
    """
    path = '../Real Datasets/Cervical Cancer/risk_factors_cervical_cancer.csv'
    raw = pd.read_csv(path)

    # Build the cleaned frame without mutating the loaded one in place.
    cleaned = (
        raw
        .replace('?', 0)
        .drop(columns=['STDs: Time since first diagnosis',
                       'STDs: Time since last diagnosis'])
    )
    cleaned.to_csv('../Real Datasets/Cervical Cancer/cervical_cancer.csv', index=False)

In [18]:
# Datasets to synthesize. Each name is loaded from '<PATH><name>.csv'.
in_file = ['smoke_detection',
           'diabetes',
           'cerebral_stroke',
           'hr_analysis',
           'cervical_cancer',
           'census',
           'malware_detection',
           # 'anomaly',  # disabled: still to be tried on Azure; generation takes
           #             # a long time because of the very high number of features
           'Titanic',
           'insurance',
           'House_Rent_Dataset',
           'Card_Application_Data',
           'creditcard']

EPOCH = 10
BATCH_SIZE = 50
PATH = '../Real Datasets/'
SAMPLE_SIZE = 2000

# Requested upper bound on the number of rows to sample. SAMPLE_SIZE is
# re-derived from this for every dataset; previously it was overwritten in the
# loop and never restored, so one small dataset shrank the sample size of every
# dataset processed after it.
MAX_SAMPLE_SIZE = SAMPLE_SIZE

for dataset in in_file:
    # Default location; some datasets live in (and are prepared into) their
    # own sub-directory.
    PATH = '../Real Datasets/'
    if dataset == 'census':
        # Concatenate the train/test files before loading.
        concat_census_data()
        PATH = '../Real Datasets/Adult Census Income Dataset/'
    elif dataset == 'cervical_cancer':
        # Remove the special characters in the dataset before loading.
        get_cervical_cancer_data()
        PATH = '../Real Datasets/Cervical Cancer/'
    elif dataset == 'anomaly':
        concat_anomaly_data()
        PATH = '../Real Datasets/Anomaly Detection/Participants_Data_WH18/'
    elif dataset == 'malware_detection':
        PATH = '../Real Datasets/Malware Detection/'
    elif dataset == 'hr_analysis':
        PATH = '../Real Datasets/HR Analysis/'

    real_df = pd.read_csv(PATH + dataset + '.csv')

    # Never sample more rows than the real dataset has, but start from the
    # requested cap again for each dataset.
    SAMPLE_SIZE = min(MAX_SAMPLE_SIZE, len(real_df))

    # The generate_* functions read real_df, SAMPLE_SIZE and out_path1..4 from
    # module scope, so they must be set before the calls below.
    out_path1 = '../Synthetic Datasets/sdv_gauss/sdv_gauss_' + dataset + '.csv'
    out_path2 = '../Synthetic Datasets/sdv_ctgan/sdv_ctgan_' + dataset + '.csv'
    out_path3 = '../Synthetic Datasets/sdv_copula/sdv_copula_' + dataset + '.csv'
    out_path4 = '../Synthetic Datasets/sdv_tvae/sdv_tvae_' + dataset + '.csv'

    generate_gaussian(dataset)
    generate_ctgan(dataset)
    generate_copula_gan(dataset)
    generate_tvae(dataset)

Total processing time for gaussian_synthesizer Titanic is : 0.05000 minutes
Total processing time for ctgan_synthesizer Titanic is : 1.40000 minutes
Total processing time for cgan_synthesizer Titanic is : 0.93333 minutes
Total processing time for tvae_synthesizer Titanic is : 0.16667 minutes
Total processing time for gaussian_synthesizer insurance is : 0.03333 minutes
Total processing time for ctgan_synthesizer insurance is : 0.13333 minutes
Total processing time for cgan_synthesizer insurance is : 0.13333 minutes
Total processing time for tvae_synthesizer insurance is : 0.06667 minutes
Total processing time for gaussian_synthesizer House_Rent_Dataset is : 0.11667 minutes
Total processing time for ctgan_synthesizer House_Rent_Dataset is : 16.93333 minutes
Total processing time for cgan_synthesizer House_Rent_Dataset is : 10.56667 minutes
Total processing time for tvae_synthesizer House_Rent_Dataset is : 0.51667 minutes
Total processing time for gaussian_synthesizer Card_Application_Dat

KeyboardInterrupt: 

# Observation
The Gaussian generator did not generate any data for outliers. When the dataset is large and the minority class is small, it tends to ignore the minority class completely. In the Titanic dataset, Age is an example of a column with outliers.