In [None]:
import copy
import math
import os
import random
import statistics as st

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

from data import *
from module import *
from utils import *

In [None]:
# Dataset configuration: file locations and the role of every column.
# Columns are referred to by integer position throughout this notebook.
train_dataset_file = './data/diabetes_train.csv'  # real table read into df_train
synthesized_file = './data/diabetes_cgtgan.csv'   # where the generated table is written

num_columns = 9

# Prediction target and task type.
target_col = 0  # Outcome
problem_type = 'Classification'
classification_col = [target_col]
regression_col = []

# Continuous features occupy positions 1..8:
#   1 Pregnancies, 2 Glucose, 3 Blood Pressure, 4 Skin Thickness,
#   5 Insulin, 6 BMI, 7 Diabetes, 8 Age
continuous_column_names = list(range(1, num_columns))

# This dataset declares no mixed-type and no integer-only columns.
mixed_column_names = []
mixed = {}
integer_columns_names = []

# The target is the only categorical column.
categorical_column_names = [0]  # Outcome

# The same column groups with the target removed (fresh list objects).
continuous_columns_wo_target = [col for col in continuous_column_names if col != target_col]
category_columns_wo_target = [col for col in categorical_column_names if col != target_col]

# Identifiers handed to train_concat as `classifiers_utility`
# (presumably logistic regression / decision tree / random forest -- confirm in utils).
classifiers_for_utility = ['lr', 'dt', 'rf']

In [None]:
# Load the real training table and relabel its columns 0..n-1, because every
# column list in this notebook refers to columns by integer position.
df_train = pd.read_csv(train_dataset_file)
df_train.columns = list(range(df_train.shape[1]))

# Fixed seed for the hold-out draw. The notebook's global seeding only happens
# in a later cell, so without this the validation rows change on every run.
split_random_state = 3

# Draw 20% of the rows as a validation frame. For classification the draw is
# made per target class (groupby + sample), i.e. 20% of each class.
if problem_type == 'Classification':
    df_val = df_train.groupby(target_col).sample(frac=0.2, random_state=split_random_state)
else:
    df_val = df_train.sample(frac=0.2, random_state=split_random_state)

# NOTE(review): the sampled rows are not removed from df_train, so df_val is a
# subset of the training rows. Kept as-is to preserve the amount of data the
# generator sees -- confirm whether a disjoint hold-out was intended.
df_val = df_val.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)

In [None]:
# Hyperparameters for model construction and training.

# Batch size and noise width handed to train_concat / the generator.
batch_size = 256
rand_dim = 100

# Per-node embedding width; cond_dim is one embedding per table column.
embed_dim = 64
cond_dim = num_columns * embed_dim

# Shared by every Adam optimizer declared in this notebook.
learning_rate = 1e-4
num_updates = 5000

# Passed straight to train_concat (names suggest critic steps per update and a
# gradient-penalty weight -- confirm against train_concat).
num_critic_iters = 5
lambda_gp = 10

# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Preprocessing pipeline:
#   min-max normalisation -> Preprocessor -> CategoricalEmbedding -> MergeDataset -> tensor
df_train_max_min_norm, max_list, min_list = df_min_max_norm(
    df_train, continuous_column_names
)

# One entry of 5 per continuous column.
num_mode_list = [5] * len(continuous_column_names)

preprocessor = Preprocessor(
    data=df_train_max_min_norm,
    continuous_columns=continuous_column_names,
    mixed=mixed,
    categorical_columns=categorical_column_names,
    skew_columns=[],
    integer_columns=integer_columns_names,
    num_mode_list=num_mode_list,
)
# Fit on (and transform) the frame the preprocessor was constructed with.
preprocessed_data = preprocessor.fit_transform(preprocessor.data)

# Embed using the metadata produced by the fitted preprocessor.
embedding = CategoricalEmbedding(metadata=preprocessor.metadata)
embedding_data = embedding.embed(preprocessed_data)

# Merge the embedded data into a single array.
merge = MergeDataset(embedding_data, embedding.metadata)
merge_data = merge.merge()

# float32 training tensor on the selected device.
merge_tensor = torch.from_numpy(merge_data).to(device=device, dtype=torch.float32)

In [None]:
# Reproducibility settings, applied before the modules below are declared.
# Note: the validation draw and the preprocessing cell have already run by the
# time this seeding executes.
seed = 3
deterministic = True

# Seed each RNG source in turn: Python, NumPy, torch (CPU) and all CUDA devices.
for _seed_rng in (
    random.seed,
    np.random.seed,
    torch.manual_seed,
    torch.cuda.manual_seed_all,
):
    _seed_rng(seed)

# Switch cuDNN to deterministic mode and disable its benchmark autotuner.
if deterministic:
    torch.backends.cudnn.benchmark = False
    torch.backends.cudnn.deterministic = True

# Sampler built over the min-max normalised training frame.
datasampler = DataSampler(
    data=df_train_max_min_norm,
    metadata=embedding.metadata,
)

# Projection module; its input width is the number of columns of the merged tensor.
projection = Projection_virtual_node(
    input_dim=merge_tensor.size(1),
    metadata=merge.metadata,
    emb_dim=embed_dim,
    device=device,
)
projection.to(device)

# Output layout derived from the projection's metadata (passed to train_concat).
output_info_list = output_info_gather(projection.metadata)

# Collect the 'n' field of every metadata entry whose type is 'category'
# (presumably the number of classes per categorical column -- confirm in module).
cat_class_num_list = [
    detail['n']
    for detail in projection.metadata['details']
    if detail['type'] == 'category'
]

# Generator: its noise input is widened by one embedding, and the graph has one
# node more than the table has columns.
generator = NodewiseGenerator(
    metadata=preprocessor.metadata,
    rand_dim=rand_dim + embed_dim,
    num_nodes=num_columns + 1,
    embed_dim=embed_dim,
    device=device,
    cat_class_num_list=cat_class_num_list,
)
generator.to(device)

# Critic: GCN layers over num_columns + 1 nodes, scoring a representation of
# width cond_dim + embed_dim.
# NOTE(review): the trailing 0 is GCN's fourth positional argument (meaning not
# visible here -- possibly a dropout rate; confirm in module).
critic_layers = GCN(embed_dim, num_columns + 1, device, 0)
critic = Critic(
    layers=critic_layers,
    dim_representation=cond_dim + embed_dim,
    device=device,
)
critic.to(device)

# Auxiliary classifier built from the projection's metadata.
classifier = Classifier(
    metadata=projection.metadata,
    embed_dim=embed_dim,
    num_nodes=num_columns + 1,
    num_mode_list=num_mode_list,
)
classifier.to(device)

# Learnable adjacency over the num_columns + 1 graph nodes, stored as the weight
# matrix of a square linear layer so it can be trained with its own optimizer.
adj = nn.Linear(in_features=num_columns + 1, out_features=num_columns + 1)

# Initialise the weight to zero on the diagonal and 1/sqrt(n) everywhere else
# (n = number of nodes). The bias keeps nn.Linear's default initialisation.
# Copy in place under no_grad instead of rebinding `.data`, and use the
# explicitly imported `math` rather than relying on a star import to provide it.
with torch.no_grad():
    adj.weight.copy_(
        (torch.ones_like(adj.weight) - torch.eye(adj.in_features))
        / math.sqrt(adj.in_features)
    )
adj.to(device)

def _adam_for(module):
    """Return an Adam optimizer over `module`'s parameters (shared lr, weight_decay=1e-7)."""
    return optim.Adam(module.parameters(), lr=learning_rate, weight_decay=1e-7)


def _exp_decay_for(optimizer):
    """Return an ExponentialLR scheduler (gamma=0.9) driving `optimizer`."""
    return optim.lr_scheduler.ExponentialLR(optimizer=optimizer, gamma=0.9)


# One optimizer per trainable component.
optimizer_c = _adam_for(critic)
optimizer_g = _adam_for(generator)
optimizer_p = _adam_for(projection)
optimizer_cl = _adam_for(classifier)
optimizer_adj = _adam_for(adj)

# One matching learning-rate scheduler per optimizer.
scheduler_c = _exp_decay_for(optimizer_c)
scheduler_g = _exp_decay_for(optimizer_g)
scheduler_p = _exp_decay_for(optimizer_p)
scheduler_cl = _exp_decay_for(optimizer_cl)
scheduler_adj = _exp_decay_for(optimizer_adj)

# Run training. train_concat returns seven values; the module names are rebound
# to whatever it hands back, and df_fake is the synthesized table.
(
    list_real_vs_fake,
    projection,
    generator,
    critic,
    classifier,
    adj,
    df_fake,
) = train_concat(
    # data and modules
    train_dataset=merge_tensor,
    generator=generator,
    critic=critic,
    classifier=classifier,
    projection=projection,
    datasampler=datasampler,
    preprocessor=preprocessor,
    # sizes
    batch_size=batch_size,
    rand_dim=rand_dim,
    total_embed_dim=preprocessor.metadata["total_embed_dim"],
    # optimizers
    optimizer_c=optimizer_c,
    optimizer_g=optimizer_g,
    optimizer_p=optimizer_p,
    optimizer_cl=optimizer_cl,
    optimizer_adj=optimizer_adj,
    # learning-rate schedulers
    scheduler_c=scheduler_c,
    scheduler_g=scheduler_g,
    scheduler_p=scheduler_p,
    scheduler_cl=scheduler_cl,
    scheduler_adj=scheduler_adj,
    # training schedule
    num_updates=num_updates,
    num_critic_iters=num_critic_iters,
    lambda_gp=lambda_gp,
    device=device,
    # column layout
    output_info_list=output_info_list,
    continuous_column_names=continuous_column_names,
    categorical_column_names=categorical_column_names,
    target_col=target_col,
    node_classification_col=classification_col,
    node_regression_col=regression_col,
    adj=adj,
    problem_type=problem_type,
    # real data frames and the min/max lists from df_min_max_norm
    df_train=df_train,
    df_test=df_val,
    max_list=max_list,
    min_list=min_list,
    continuous_columns_wo_target=continuous_columns_wo_target,
    category_columns_wo_target=category_columns_wo_target,
    classifiers_utility=classifiers_for_utility,
)

# Write the synthesized table to disk without the index column.
df_fake.to_csv(synthesized_file, index=False)
