In [1]:
import os
import platform
# Name of the host operating system, as reported by platform.system()
OS = platform.system()  # returns 'Windows', 'Linux', etc

# Libraries Installation Section

Installation of all required libraries: gdown, SDGym and pandas

In [None]:
# Install the notebook's dependencies into the environment of the running
# kernel. Invoking pip through sys.executable guarantees the packages land in
# the interpreter that will import them, which a bare `pip` found on PATH does
# not. Imports are local to this cell because it runs before the imports cell.
import subprocess
import sys

for package in ('gdown', 'sdgym', 'pandas'):
  completed = subprocess.run([sys.executable, '-m', 'pip', 'install', package])
  if completed.returncode != 0:
    # Stay best-effort (later cells may still work), but make the failure visible.
    print(f"WARNING: 'pip install {package}' exited with status {completed.returncode}")

# All Imports

In [None]:
import timeit
import numpy as np
import pandas as pd
from sdv.demo import load_tabular_demo
from sdv.tabular import GaussianCopula, CTGAN
from sdv.evaluation import evaluate

# All Globals

In [None]:
# When True, the data-loading cell uses the 'student_placements' table from
# load_tabular_demo instead of the dataset selected below.
benchmark = False
#benchmark = True
# Switches that decide which synthesizers are trained, reloaded and scored.
gaussian_copula_synth_model = True
ctgan_synth_model = True
# Dataset used when benchmark is False; the data-loading cell handles
# 'satgpa' and 'acs'.
#dataset = 'satgpa'
dataset = 'acs'

# All Settings

In [None]:
# Reference timestamp; the last cell subtracts it to report the total run time.
start_global_time = timeit.default_timer()
# Raise both pandas display caps to 500 so printed frames are not truncated.
for display_option in ('display.max_columns', 'display.max_rows'):
  pd.set_option(display_option, 500)

# All Functions Definitions

In [None]:
def _print_section(title, producer):
  """Print ``title`` and then the value returned by ``producer()``.

  A failing ``producer`` is reported inline rather than swallowed, so one
  unavailable summary does not stop the remaining sections.
  """
  print(title)
  try:
    result = producer()
  except Exception as exc:
    print(f"  (unavailable: {exc!r})")
    return
  # DataFrame.info() prints by itself and returns None; nothing more to show.
  if result is not None:
    print(result)


def explore_data(data):
  """Print a quick exploratory summary of a pandas DataFrame.

  Sections printed, in order: head, tail, shape, ``info()``, dtypes, the
  ``describe()`` summary of numeric columns, the ``describe()`` summary of
  object columns, and the number of missing values per column.

  Parameters:
    data: the pandas DataFrame to summarise.

  Returns:
    None. Everything is written to standard output.
  """
  print("\nHead of Data: \n", data.head())
  print("\nTail of Data: \n", data.tail())
  print("\nShape of Data: ", data.shape)
  _print_section("\nInformation about Data: \n", lambda: data.info())
  # Inside a function a bare expression is never displayed, so the results
  # below have to be printed explicitly.
  _print_section("\nTypes of Data attributes: \n", lambda: data.dtypes)
  _print_section(
    "\nSummary of all numerical fields in the dataset: \n",
    lambda: data.describe(include=[np.number]),
  )
  # describe() raises when no column matches the requested dtype; the helper
  # reports that instead of aborting the summary.
  _print_section(
    "\nSummary of all categorical fields in the dataset: \n",
    lambda: data.describe(include=['O']),
  )
  print("\nLoop Through Each Column and Check for nulls: \n")
  for column in data.columns:
    try:
      # f-string formatting also works for non-string column labels.
      print(f"{column}: {data[column].isna().sum()}")
    except Exception as exc:
      print(f"{column}: (unavailable: {exc!r})")

# Data Download - ACS and SatGPA

In [7]:
# Load the real table into `data` and decide how many synthetic rows to
# generate (`n_to_generate`).
#
# Each CSV is downloaded only when it is missing, but it is always read
# afterwards, so `data` is defined on every run, including re-runs where the
# file is already cached locally.
if benchmark:
  data = load_tabular_demo('student_placements')
  n_to_generate = data.shape[0]
elif dataset == 'satgpa':  # `==` compares values; `is` on a str literal tests identity
  satgpa_path = "./satgpa.csv"
  if not os.path.exists(satgpa_path):
    os.system('gdown --id "1NNVF1LhBDkW_KKp5_QW8cAiQDFatzWMy" --output "./satgpa.csv"')
  data = pd.read_csv(satgpa_path)
  n_to_generate = data.shape[0]
elif dataset == 'acs':
  # Read from the same path the download writes to.
  acs_path = "./acs_dataset.csv"
  acs_row_limit = 200  # only the first 200 rows of the ACS file are used
  if not os.path.exists(acs_path):
    os.system('gdown --id "1mKZfDieGBJP-cS-R7_i3zVKVawXThfUc" --output "./acs_dataset.csv"')
  data = pd.read_csv(acs_path, nrows=acs_row_limit)
  n_to_generate = acs_row_limit
else:
  # Fail here with a clear message instead of a NameError on `data` later on.
  raise ValueError(f"Unknown dataset {dataset!r}; expected 'satgpa' or 'acs'")

# Exploratory Analysis

In [8]:
# Print the exploratory summary of the real table held in `data`.
explore_data(data)


Head of Data: 
    sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
0    1     65     62      127    3.40    3.18
1    2     58     64      122    4.00    3.33
2    2     56     60      116    3.75    3.25
3    1     42     53       95    3.75    2.42
4    1     55     52      107    4.00    2.63

Tail of Data: 
      sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
995    2     50     50      100     3.7    2.19
996    1     54     54      108     3.3    1.50
997    1     56     58      114     3.5    3.17
998    1     55     65      120     2.3    1.94
999    1     49     44       93     2.7    2.38

Shape of Data:  (1000, 6)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   sex      1000 non-null   int64  
 1   sat_v    1000 non-null   int64  
 2   sat_m    1000 non-null   int64  
 3   sat_sum  1000 non-null   int64  
 4   hs_gpa   1

# Synthetic Data Generation via Gaussian Copula Method 

In mathematical terms, a copula is a distribution over the unit cube $[0,1]^d$ which is constructed from a multivariate normal distribution over $\mathbb{R}^d$ by using the probability integral transform. Intuitively, a copula is a mathematical function that allows us to describe the joint distribution of multiple random variables by analyzing the dependencies between their marginal distributions.

In [9]:
if gaussian_copula_synth_model:
  # Fit a Gaussian copula synthesizer on the real table and save it to
  # 'gaussian_copula.pkl', the file the model-loading cell reads back.
  model = GaussianCopula()
  model.fit(data)
  model.save('gaussian_copula.pkl')

  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.
  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu


# Synthetic Data Generation via Conditional GAN 

Modeling the probability distribution of rows in tabular data and generating realistic synthetic data is a non-trivial task. Tabular data usually contains a mix of discrete and continuous columns. Continuous columns may have multiple modes whereas discrete columns are sometimes imbalanced making the modeling difficult. Existing statistical and deep neural network models fail to properly model this type of data. We design CTGAN, which uses a conditional generative adversarial network to address these challenges. To aid in a fair and thorough comparison, we design a benchmark with 7 simulated and 8 real datasets and several Bayesian network baselines. CTGAN outperforms Bayesian methods on most of the real datasets whereas other deep learning methods could not.

In [10]:
if ctgan_synth_model:
  # Train a CTGAN synthesizer on the real table and save it to 'ctgan.pkl',
  # the file the model-loading cell reads back.
  model = CTGAN(
    epochs=500,
    batch_size=100,
    generator_dim=(256, 256, 256),
    discriminator_dim=(256, 256, 256)
  )
  model.fit(data)
  model.save('ctgan.pkl')

  random_state=random_state).fit(X).labels_


# Model Loading and Preparation

In [11]:
# Collect, for every enabled synthesizer, the file it was saved to and a
# (label, class) pair; both lists are filled in the same order.
model_file = []
model_to_load = []
if gaussian_copula_synth_model:
  model_file.append('gaussian_copula.pkl')
  model_to_load.append(("GaussianCopula", GaussianCopula))
if ctgan_synth_model:
  model_file.append('ctgan.pkl')
  model_to_load.append(("CTGAN", CTGAN))

# Restore each model from disk as a (label, fitted model) pair.
# NOTE(review): the models are saved as .pkl files, so load() presumably
# unpickles them; only load files produced by this notebook.
loaded_model = [
  (label, model_class.load(path))
  for path, (label, model_class) in zip(model_file, model_to_load)
]

# Synthetic Data Generation

In [12]:
# Draw n_to_generate synthetic rows from every restored model, keeping the
# method label next to its sample.
synthetic_data = [
  (method_name, fitted_model.sample(n_to_generate))
  for method_name, fitted_model in loaded_model
]

# Synthetic Data Exploratory Analysis

In [13]:
# Explore and score every synthetic table against the real data. Tables that
# are processed successfully are kept as (method, synthetic table, score).
scored_and_synth_data = []
for method_name, synthetic_table in synthetic_data:
  print("\nMethod: ", method_name)
  try:
    explore_data(synthetic_table)
    score = evaluate(synthetic_table, data)
    print("\n\nScore: ", score)
    scored_and_synth_data.append((method_name, synthetic_table, score))
  except Exception as exc:
    # Carry on with the next method, but report which one failed and why.
    print(f"Error while evaluating {method_name}: {exc!r}")



Method:  GaussianCopula

Head of Data: 
    sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
0    1     51     69      120    3.60    3.69
1    2     61     55      118    3.90    3.17
2    1     60     64      123    3.99    3.42
3    2     42     50       93    3.67    2.63
4    1     48     70      120    3.61    2.59

Tail of Data: 
      sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
995    1     55     63      118    3.52    2.80
996    2     38     49       85    2.68    3.68
997    2     37     36       73    3.77    1.59
998    2     46     59      105    3.50    2.59
999    2     53     40       94    3.02    1.19

Shape of Data:  (1000, 6)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   sex      1000 non-null   int64  
 1   sat_v    1000 non-null   int64  
 2   sat_m    1000 non-null   int64  
 3   sat_sum  1000 non-null 

OSError: ignored

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isna(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isna(synthetic_data)] = 0.0




Score:  0.7882952606135906

Method:  CTGAN

Head of Data: 
    sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
0    1     48     42      120    3.56    3.53
1    1     60     70      124    4.04    3.26
2    1     46     43      110    2.45    2.41
3    1     38     62       78    2.65    2.10
4    1     61     52      127    3.94    3.41

Tail of Data: 
      sex  sat_v  sat_m  sat_sum  hs_gpa  fy_gpa
995    1     41     66      115    3.09    3.10
996    2     39     50      114    3.34    3.10
997    1     33     46       90    2.30    0.84
998    2     52     66       84    2.42    2.70
999    1     65     71      124    2.96    3.95

Shape of Data:  (1000, 6)

Information about Data: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 6 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   sex      1000 non-null   int64  
 1   sat_v    1000 non-null   int64  
 2   sat_m    1000 non-null   int64  
 3   sat

OSError: ignored



Score:  0.5267581222887103


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isna(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isna(synthetic_data)] = 0.0


In [14]:
# Write each scored synthetic table to disk; the file name records the
# dataset, the method and the score rounded to 3 decimals.
# Note: the files are tab-separated even though they carry a .csv extension.
for method_name, synthetic_table, score in scored_and_synth_data:
  output_name = (
    f"{dataset}_synth_data_generated_by_method_{method_name.lower()}"
    f"_score_{round(score, 3)!s}.csv"
  )
  synthetic_table.to_csv(output_name, sep='\t')

In [15]:
# Seconds elapsed since start_global_time was taken in the settings cell.
print("Global Execution Time: ", timeit.default_timer() - start_global_time)

Global Exectution Time:  106.49826682599996
