In [1]:
import os
import platform
# Operating System
OS = platform.system()                                                             # returns 'Windows', 'Linux', etc

# Libraries Installation Section

Installation of all required libraries: SDGym

In [2]:
os.system('pip install gdown')
os.system('pip install sdgym')
os.system('pip install pandas')

0

# All Imports

In [3]:
import timeit
import numpy as np
import pandas as pd
from sdv.demo import load_tabular_demo
from sdv.tabular import GaussianCopula, CTGAN
from sdv.evaluation import evaluate

# All Globals

In [4]:
benchmark = False
#benchmark = True
gaussian_copula_synth_model = False
ctgan_synth_model = True
#dataset = 'satgpa'
dataset = 'acs'
model_names = []

# All Settings

In [5]:
start_global_time = timeit.default_timer()
pd.set_option('display.max_columns', 500) 
pd.set_option('display.max_rows', 500) 

# All Functions Definitions

In [6]:
def explore_data(data): 
  print("\nHead of Data: \n", data.head())
  print("\nTail of Data: \n", data.tail())
  print("\nShape of Data: ", data.shape)
  print("\nInformation about Data: \n")
  try: 
    data.info()
  except: 
    pass
  print("\nTypes of Data attributes: \n")
  try: 
    data.dtypes
  except: 
    pass
  print("\nSummary of all numerical fields in the dataset: \n")
  try: 
    data.describe(include = [np.number])
  except: 
    pass
  print("\nSummary of all categorical fields in the dataset: \n")
  try: 
    data.describe(include = ['O'])
  except: 
    pass
  print("\nLoop Through Each Column and Check for nulls: \n")
  try: 
    for i in range(len(data.columns)):
        print(data.columns[i] + ": " + str(data[data.columns[i]].isna().sum()))
  except: 
    pass

# Data Download - ACS and SatGPA

In [7]:
if benchmark == True: 
  data = load_tabular_demo('student_placements')
  n_to_generate = data.shape[0]
else: 
  if dataset is 'satgpa':
    if not os.path.exists("./satgpa.csv"):
      os.system('gdown --id "1NNVF1LhBDkW_KKp5_QW8cAiQDFatzWMy" --output "./satgpa.csv"')
      data = pd.read_csv('./satgpa.csv')
      data = data.drop(['sat_sum'], axis=1)
      data.to_csv('satgpa_no_sum.csv', sep=',')
      n_to_generate = data.shape[0]
  elif dataset is 'acs':
    if not os.path.exists("./acs_dataset.zip"):
      os.system('gdown --id "1mKZfDieGBJP-cS-R7_i3zVKVawXThfUc" --output "./acs_dataset.zip"')
      if OS == "Linux":
          os.system('unzip -o -n "./acs_dataset.zip" -d "./"')      
      #data = pd.read_csv('./acs_dataset.csv')
      #n_to_generate = data.shape[0]

      data = pd.read_csv('./acs_dataset.csv', nrows = 4000)
      n_to_generate = 4000

# Exploratory Analysis

In [8]:
explore_data(data)


Head of Data: 
    sex  sat_v  sat_m  hs_gpa  fy_gpa
0    1     65     62    3.40    3.18
1    2     58     64    4.00    3.33
2    2     56     60    3.75    3.25
3    1     42     53    3.75    2.42
4    1     55     52    4.00    2.63

Tail of Data: 
      sex  sat_v  sat_m  hs_gpa  fy_gpa
995    2     50     50     3.7    2.19
996    1     54     54     3.3    1.50
997    1     56     58     3.5    3.17
998    1     55     65     2.3    1.94
999    1     49     44     2.7    2.38

Shape of Data:  (1000, 5)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     1000 non-null   int64  
 1   sat_v   1000 non-null   int64  
 2   sat_m   1000 non-null   int64  
 3   hs_gpa  1000 non-null   float64
 4   fy_gpa  1000 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 39.2 KB

Types of Data attributes: 


Summary of 

# Synthetic Data Generation via Gaussian Copula Method 

In mathematical terms, a copula is a distribution over the unit cube [0,1]d which is constructed from a multivariate normal distribution over Rd by using the probability integral transform. Intuitively, a copula is a mathematical function that allows us to describe the joint distribution of multiple random variables by analyzing the dependencies between their marginal distributions.

In [9]:
if gaussian_copula_synth_model == True:
  model = GaussianCopula()
  model.fit(data)
  model_names.append(dataset+'_gaussian_copula.pkl')
  model.save(model_names[-1])

# Synthetic Data Generation via Conditional GAN 

Modeling the probability distribution of rows in tabular data and generating realistic synthetic data is a non-trivial task. Tabular data usually contains a mix of discrete and continuous columns. Continuous columns may have multiple modes whereas discrete columns are sometimes imbalanced making the modeling difficult. Existing statistical and deep neural network models fail to properly model this type of data. We design TGAN, which uses a conditional generative adversarial network to address these challenges. To aid in a fair and thorough comparison, we design a benchmark with 7 simulated and 8 real datasets and several Bayesian network baselines. TGAN outperforms Bayesian methods on most of the real datasets whereas other deep learning methods could not.

In [10]:
if ctgan_synth_model == True:
  model = CTGAN(
    epochs=500,
    batch_size=256,
    generator_dim=(256, 256, 256),
    discriminator_dim=(256, 256, 256)
  )
  model.fit(data)
  model_names.append(dataset+'_ctgan.pkl')
  model.save(model_names[-1])

  random_state=random_state).fit(X).labels_


# Model Loading and Preparation

In [12]:
model_file = []
model_to_load = []
if gaussian_copula_synth_model == True:
  model_file.append(model_names[0])
  model_to_load.append(("GaussianCopula", GaussianCopula))
if ctgan_synth_model == True:
  model_file.append(model_names[-1])
  model_to_load.append(("CTGAN", CTGAN))

loaded_model = []
for mf,ml in zip(model_file, model_to_load): 
  loaded_model.append((ml[0], ml[1].load(mf)))

# Synthetic Data Generation

In [13]:
synthetic_data = []
for lm in loaded_model: 
  synthetic_data.append((lm[0], lm[1].sample(n_to_generate)))

# Synthetic Data Exploratory Analysis

In [14]:
scored_and_synth_data = []
for sd in synthetic_data:
  try:
    print("\nMethod: ",sd[0])
    explore_data(sd[1])
    score = evaluate(sd[1], data)
    print("\n\nScore: ", score)
    scored_and_synth_data.append((sd[0], sd[1], score))  
  except:
    print("Error")



Method:  CTGAN

Head of Data: 
    sex  sat_v  sat_m  hs_gpa  fy_gpa
0    1     52     47    3.95    2.31
1    1     50     52    2.07    3.32
2    1     40     49    3.26    2.83
3    1     57     34    3.65    2.70
4    2     51     52    3.23    1.52

Tail of Data: 
      sex  sat_v  sat_m  hs_gpa  fy_gpa
995    1     59     48    2.45    2.77
996    1     55     49    3.54    2.46
997    1     57     53    2.41    2.78
998    2     35     49    2.93    0.69
999    1     55     41    4.06    2.80

Shape of Data:  (1000, 5)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   sex     1000 non-null   int64  
 1   sat_v   1000 non-null   int64  
 2   sat_m   1000 non-null   int64  
 3   hs_gpa  1000 non-null   float64
 4   fy_gpa  1000 non-null   float64
dtypes: float64(2), int64(3)
memory usage: 39.2 KB

Types of Data attributes

OSError: ignored



Score:  0.5406277945882195


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isna(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isna(synthetic_data)] = 0.0


In [15]:
total_time = timeit.default_timer() - start_global_time

In [16]:
for sas in scored_and_synth_data:
  sas[1].to_csv(dataset+'_synth_data_generated_by_method_'+sas[0].lower()+'total_time_'+str(round(total_time,2))+'_score_'+str(round(sas[2],3))+'.csv', sep=',')

In [17]:
print("Global Exectution Time: ", total_time)

Global Exectution Time:  828.276375455
