# Methods for Generating Fake Data Using CTGAN

In [1]:
"""
Dependencies
- SDV
- SDMetrics
- TableEvaluator
"""

'\nDependencies\n- SDV\n- SDMetrics\n- TableEvaluator\n'

In [2]:
from sdv.metadata import SingleTableMetadata
from sdv.single_table import CTGANSynthesizer
from sdmetrics.reports.single_table import QualityReport
from table_evaluator import TableEvaluator
    
def make_metadata(real_data):
    """
    Build the SDV single-table metadata that describes ``real_data``.

    Parameters
    ----------
    real_data : DataFrame
        Real data as a Pandas DataFrame; handed to
        ``SingleTableMetadata.detect_from_dataframe``.

    Returns
    -------
    SingleTableMetadata
        The metadata object after detection has been run on ``real_data``.
    """
    detected_metadata = SingleTableMetadata()
    # The result of the detection call is not used; the object itself is returned.
    detected_metadata.detect_from_dataframe(real_data)
    return detected_metadata

def make_fake_ctgan(metadata, epoch, real_data, fake_nums, verbose=True):
    """
    Train a CTGAN synthesizer on the real data and sample synthetic rows.

    Parameters
    ----------
    metadata : object
        SDV metadata describing ``real_data``.
    epoch : int
        Number of training epochs (forwarded as ``epochs``).
    real_data : DataFrame
        Real data as a Pandas DataFrame, used to fit the synthesizer.
    fake_nums : int
        Number of synthetic rows to sample.
    verbose : bool
        Forwarded to the synthesizer as ``verbose``. Default ``True``.

    Returns
    -------
    DataFrame
        Whatever ``CTGANSynthesizer.sample`` returns for ``fake_nums`` rows.
    """
    # Gather the synthesizer configuration in one place; rounding enforcement
    # is switched off explicitly.
    synthesizer_options = {
        "metadata": metadata,
        "enforce_rounding": False,
        "epochs": epoch,
        "verbose": verbose,
    }
    synthesizer = CTGANSynthesizer(**synthesizer_options)

    # Fit on the real data, then draw the requested number of rows.
    synthesizer.fit(real_data)
    return synthesizer.sample(fake_nums)

def fake_data_quality(real_data, fake_data, metadata):
    """
    Generate an SDMetrics quality report comparing fake data with real data.

    Parameters
    ----------
    real_data : DataFrame
        Real data as a Pandas DataFrame.
    fake_data : DataFrame
        Fake (synthetic) data as a Pandas DataFrame.
    metadata : object
        SDV metadata object; it is converted with ``to_dict()`` before
        being handed to the report.

    Returns
    -------
    None
        Nothing is returned; the report object is created, generated and
        then discarded.
    """
    quality_report = QualityReport()
    # The report is given the dictionary form of the metadata.
    metadata_dict = metadata.to_dict()
    quality_report.generate(
        real_data=real_data,
        synthetic_data=fake_data,
        metadata=metadata_dict,
    )

def fake_data_table_evaluator(real_data, fake_data, target_col):
    """
    Evaluate fake data against real data with Table Evaluator.

    Parameters
    ----------
    real_data : DataFrame
        Real data as a Pandas DataFrame.
    fake_data : DataFrame
        Fake (synthetic) data as a Pandas DataFrame.
    target_col : str
        Name of the class column, forwarded to ``evaluate``.

    Returns
    -------
    None
        Nothing is returned; the result of ``evaluate`` is discarded.
    """
    # Build the evaluator and run it in one expression so no local name
    # is needed (and the builtin ``eval`` is not shadowed).
    TableEvaluator(real=real_data, fake=fake_data).evaluate(target_col=target_col)


def export_data(fake_data, fake_data_path):
    """
    Write the fake data to a CSV file without the index column.

    Parameters
    ----------
    fake_data : DataFrame
        Fake data as a Pandas DataFrame.
    fake_data_path : str
        Destination path, including the file extension.

    Returns
    -------
    str
        The literal message ``"Success!"`` once the file has been written.
    """
    success_message = "Success!"
    # index=False keeps the DataFrame index out of the exported file.
    fake_data.to_csv(fake_data_path, index=False)
    return success_message
    

# Check Generator Methods

In [3]:
import pandas as pd

In [4]:
# Load the real dataset (path is relative to the notebook) and preview the first rows
real_data = pd.read_csv('../anxiety_class_encoded.csv')
real_data.head()

Unnamed: 0,P1,P2,P3,P4,P5,P6,P7,P8,P9,P10,P11,P12,P13,P14,CLASS
0,3,2,0,3,3,2,0,1,2,1,2,1,2,1,1
1,3,2,0,2,2,3,0,2,2,1,3,1,3,1,1
2,1,2,0,2,1,2,0,1,1,1,2,1,1,1,1
3,2,1,0,2,0,1,0,2,1,1,0,1,1,1,1
4,2,3,2,2,3,2,2,1,1,0,2,0,3,1,2


In [5]:
# Create the SDV metadata object from the real data
meta = make_metadata(real_data)

In [6]:
# Create fake data via CTGAN: train for 500 epochs, then sample 120 fake rows
fake_data = make_fake_ctgan(meta, 500, real_data, 120)

Epoch 1, Loss G:  0.0643,Loss D:  0.0028
Epoch 2, Loss G:  0.0770,Loss D: -0.0363
Epoch 3, Loss G:  0.0771,Loss D: -0.0802
Epoch 4, Loss G:  0.0663,Loss D: -0.0695
Epoch 5, Loss G:  0.0759,Loss D: -0.1288
Epoch 6, Loss G:  0.0556,Loss D: -0.0995
Epoch 7, Loss G:  0.0578,Loss D: -0.1446
Epoch 8, Loss G:  0.0523,Loss D: -0.1775
Epoch 9, Loss G:  0.0427,Loss D: -0.1805
Epoch 10, Loss G:  0.0145,Loss D: -0.2177
Epoch 11, Loss G: -0.0362,Loss D: -0.2186
Epoch 12, Loss G: -0.0220,Loss D: -0.2454
Epoch 13, Loss G: -0.0678,Loss D: -0.2582
Epoch 14, Loss G: -0.0699,Loss D: -0.3264
Epoch 15, Loss G: -0.1310,Loss D: -0.4417
Epoch 16, Loss G: -0.2032,Loss D: -0.3842
Epoch 17, Loss G: -0.2359,Loss D: -0.4780
Epoch 18, Loss G: -0.2997,Loss D: -0.4018
Epoch 19, Loss G: -0.4252,Loss D: -0.4708
Epoch 20, Loss G: -0.4488,Loss D: -0.5603
Epoch 21, Loss G: -0.5128,Loss D: -0.6153
Epoch 22, Loss G: -0.5364,Loss D: -0.6499
Epoch 23, Loss G: -0.6515,Loss D: -0.6779
Epoch 24, Loss G: -0.6772,Loss D: -0.6872
E

In [7]:
# Run the SDMetrics quality report comparing the fake data with the real data
fake_data_quality(real_data, fake_data, meta)

Creating report:   0%|          | 0/4 [00:00<?, ?it/s]

Creating report: 100%|██████████| 4/4 [00:01<00:00,  3.76it/s]


Overall Quality Score: 86.88%

Properties:
Column Shapes: 86.11%
Column Pair Trends: 87.65%





In [8]:
# Run Table Evaluator on real vs. fake data, using 'CLASS' as the target column
fake_data_table_evaluator(real_data, fake_data, 'CLASS')


Classifier F1-scores and their Jaccard similarities::
                             f1_real  f1_fake  jaccard_similarity
index                                                            
DecisionTreeClassifier_fake   0.1250   0.2083              0.0435
DecisionTreeClassifier_real   0.9583   0.2917              0.2000
LogisticRegression_fake       0.1667   0.1250              0.1429
LogisticRegression_real       0.9583   0.0833              0.0667
MLPClassifier_fake            0.1250   0.2083              0.0435
MLPClassifier_real            0.9583   0.0833              0.0435
RandomForestClassifier_fake   0.1250   0.1667              0.1163
RandomForestClassifier_real   1.0000   0.0417              0.0213

Privacy results:
                                          result
Duplicate rows between sets (real/fake)  (93, 0)
nearest neighbor mean                     3.2383
nearest neighbor std                      0.5325

Miscellaneous results:
                                  Result
Column

In [9]:
# Save the fake data as CSV (no index column) so it can be reloaded later
export_data(fake_data, 'anx_ep500_1x_test.csv')

'Success!'

# Methods for NN Training and Evaluation

In [10]:
"""
Dependencies
- Numpy
- Pandas
- Tensorflow
- Scikit-learn
"""

'\nDependencies\n- Numpy\n- Pandas\n- Tensorflow\n- Scikit-learn\n'

In [11]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

def combine_real_fake_data(real_data_path, fake_data_path):
    """
    Load the real and fake CSV files and stack them into one DataFrame.

    Parameters
    ----------
    real_data_path : str
        Path to the real data CSV.
    fake_data_path : str
        Path to the fake data CSV.

    Returns
    -------
    DataFrame
        Real rows followed by fake rows, with a fresh 0..n-1 index.
    """
    # Load both sources
    real_rows = pd.read_csv(real_data_path)
    fake_rows = pd.read_csv(fake_data_path)

    # ignore_index=True renumbers the rows; without it each file keeps its
    # own 0-based labels and the combined frame ends up with duplicate
    # index values.
    return pd.concat([real_rows, fake_rows], ignore_index=True)

def data_preprocessing(combine_data, n_features=14):
    """
    Split the combined data into a feature matrix and one-hot labels.

    Parameters
    ----------
    combine_data : DataFrame
        Combined real and fake data. The first ``n_features`` columns are
        taken as features and the last column as the class label.
    n_features : int
        Number of leading columns used as features. Default=14

    Returns
    -------
    X : ndarray
        Feature values taken from the first ``n_features`` columns.
    y_encode : ndarray
        One-hot encoded labels as a dense 2-D array.
    """
    # Select features (leading columns) and labels (last column)
    X = combine_data.iloc[:, :n_features].values
    labels = combine_data.iloc[:, -1].values

    # The encoder is fed a 2-D column, so reshape the 1-D labels to (n, 1)
    label_column = labels.reshape(-1, 1)

    # One-hot encode, then convert the encoder output to a dense ndarray
    encoder = OneHotEncoder()
    y_encode = encoder.fit_transform(label_column).toarray()

    return X, y_encode

def split_data(X, y, test_size=0.2, random_state=0, stratify=None):
    """
    Split features and labels into train and test sets (80:20 by default).

    Thin wrapper around ``train_test_split``. Stratified sampling is only
    applied when ``stratify`` is given.

    Parameters
    ----------
    X : ndarray
        Feature vectors.
    y : ndarray
        Labels.
    test_size : float
        Test size ratio. Default=0.2
    random_state : int
        Seed forwarded to ``train_test_split``. Default=0
    stratify : array-like
        Forwarded to ``train_test_split`` as ``stratify``. Default=None

    Returns
    -------
    X_train : ndarray
        X features for training
    X_test : ndarray
        X features for testing
    y_train : ndarray
        y labels for training
    y_test : ndarray
        y labels for testing
    """
    features_train, features_test, labels_train, labels_test = train_test_split(
        X,
        y,
        test_size=test_size,
        random_state=random_state,
        stratify=stratify,
    )
    return features_train, features_test, labels_train, labels_test

def eval_nn(X_train, y_train, X_test, y_test, epochs, dense_1_activation='relu', dense_2_activation='relu', output_activation='softmax', optimizer='adam', loss='categorical_crossentropy', metrics='accuracy', n_features=14, n_classes=6, hidden_units=28):
    """
    Build, train and evaluate a small fully-connected network.

    The network has two hidden Dense layers of ``hidden_units`` units and an
    output Dense layer of ``n_classes`` units. The layer sizes default to
    the values that used to be hard-coded (14 inputs, 28 hidden, 6 outputs).

    Parameters
    ----------
    X_train : ndarray
        Training features.
    y_train : ndarray
        Training labels.
    X_test : ndarray
        Test features; also passed to ``fit`` as validation data.
    y_test : ndarray
        Test labels; also passed to ``fit`` as validation data.
    epochs : int
        Number of training epochs.
    dense_1_activation : str
        Activation of the first hidden layer. Default='relu'
    dense_2_activation : str
        Activation of the second hidden layer. Default='relu'
    output_activation : str
        Activation of the output layer. Default='softmax'
    optimizer : str
        Optimizer passed to ``compile``. Default='adam'
    loss : str
        Loss passed to ``compile``. Default='categorical_crossentropy'
    metrics : str
        Single metric; it is wrapped in a list for ``compile``.
        Default='accuracy'
    n_features : int
        Input dimension of the first layer. Default=14
    n_classes : int
        Number of units in the output layer. Default=6
    hidden_units : int
        Number of units in each hidden layer. Default=28

    Returns
    -------
    None
        Prints ``Train: <acc>, Test: <acc>`` and returns nothing.
    """
    # Build ANN
    ann = Sequential()
    ann.add(Dense(hidden_units, activation=dense_1_activation, input_dim=n_features))
    ann.add(Dense(hidden_units, activation=dense_2_activation))
    ann.add(Dense(n_classes, activation=output_activation))
    ann.compile(optimizer=optimizer, loss=loss, metrics=[metrics])

    # Fitting
    # NOTE(review): the test split doubles as validation data here, so the
    # test accuracy printed below is not measured on an untouched hold-out.
    ann.fit(X_train, y_train, epochs=epochs, validation_data=(X_test, y_test), verbose=0)

    # Evaluate: ``evaluate`` is unpacked as (loss, metric); only the metric is reported
    _, train_acc = ann.evaluate(X_train, y_train, verbose=0)
    _, test_acc = ann.evaluate(X_test, y_test, verbose=0)
    print('Train: %.3f, Test: %.3f' % (train_acc, test_acc))

# Check NN Methods

In [12]:
# Combine the real dataset and the exported fake dataset into one DataFrame
combine = combine_real_fake_data('../anxiety_class_encoded.csv', 'anx_ep500_1x_test.csv')

In [13]:
# Sanity check: row count should equal real rows + fake rows
combine.shape

(240, 15)

In [14]:
# Preprocessing: split into features X and one-hot encoded labels y
X, y = data_preprocessing(combine)

In [15]:
# Feature matrix: one row per sample, one column per selected feature
X.shape

(240, 14)

In [16]:
# One-hot labels: one row per sample, one column per encoded class
y.shape

(240, 6)

In [17]:
# Split data 80/20 with a fixed seed, stratifying on the one-hot labels
X_train, X_test, y_train, y_test = split_data(X, y, test_size=0.2, random_state=42, stratify=y)

In [18]:
# Training features after the split
X_train.shape

(192, 14)

In [19]:
# Training labels; the row count should match X_train
y_train.shape

(192, 6)

In [20]:
# Train the network for 500 epochs and print train/test accuracy.
# NOTE(review): the recorded run shows a large train/test gap (0.990 vs 0.438),
# which suggests overfitting — worth confirming on a re-run.
eval_nn(X_train, y_train, X_test, y_test, epochs=500)

Train: 0.990, Test: 0.438
