In [1]:
import os
import platform
import subprocess
import sys
# Operating System
# Name of the host OS. In the cells of this notebook it is only referenced by the
# disabled ACS unzip step of the data-download section.
OS = platform.system()                                                             # returns 'Windows', 'Linux', etc

# Libraries Installation Section

Installation of all required libraries: gdown, SDGym, and pandas.

In [2]:
# Install with the interpreter that runs this kernel (`sys.executable -m pip`) so the
# packages land in the environment the notebook imports from; a bare `pip` found on
# PATH may belong to a different Python installation.
# Each package is installed on its own so that one failure does not block the others,
# and a failure is reported instead of being silently discarded.
for package in ('gdown', 'sdgym', 'pandas'):
  exit_code = subprocess.call([sys.executable, '-m', 'pip', 'install', package])
  if exit_code != 0:
    print("pip install failed for " + package + " (exit code " + str(exit_code) + ")")

# Show the exit status of the last install as the cell result (0 means success).
exit_code

0

# All Imports

In [3]:
import timeit
import numpy as np
import pandas as pd
from sdv.demo import load_tabular_demo
from sdv.tabular import GaussianCopula, CTGAN
from sdv.evaluation import evaluate

# All Globals

In [4]:
# Data source switch: True loads the 'student_placements' demo table;
# False takes the satgpa.csv download path instead.
benchmark = True
# Enables fitting/saving and later loading of the GaussianCopula model.
gaussian_copula_synth_model = True
# Enables fitting/saving and later loading of the CTGAN model.
ctgan_synth_model = True

# All Settings

In [5]:
# Reference timestamp; the last cell subtracts it from the current timer value
# to report the total run time of the notebook.
start_global_time = timeit.default_timer()
# Raise the pandas display limits to 500 columns and 500 rows.
pd.set_option('display.max_columns', 500) 
pd.set_option('display.max_rows', 500) 

# All Functions Definitions

In [6]:
def explore_data(data):
  """Print a plain-text exploratory summary of a DataFrame.

  Printed in order: head, tail, shape, ``data.info()``, the column dtypes,
  a ``describe()`` summary of the numeric columns, a ``describe()`` summary
  of the object-dtype columns, and the number of nulls in every column.

  Parameters
  ----------
  data : pandas.DataFrame
      Table to summarise. It is only read, never modified.

  Returns
  -------
  None
      All results are written to standard output.
  """
  print("\nHead of Data: \n", data.head())
  print("\nTail of Data: \n", data.tail())
  print("\nShape of Data: ", data.shape)
  print("\nInformation about Data: \n")
  data.info()  # info() writes its report to stdout itself

  # A bare expression is only auto-displayed when it is the last statement of a
  # notebook cell; inside a function its value is discarded. Everything below is
  # therefore printed explicitly.
  print("\nTypes of Data attributes: \n")
  print(data.dtypes)

  print("\nSummary of all numerical fields in the dataset: \n")
  numeric_part = data.select_dtypes(include=[np.number])
  if numeric_part.shape[1] > 0:
    print(numeric_part.describe())
  else:
    # describe() cannot summarise an empty column selection, so say so instead.
    print("(no numerical columns)")

  print("\nSummary of all categorical fields in the dataset: \n")
  object_part = data.select_dtypes(include=['O'])
  if object_part.shape[1] > 0:
    print(object_part.describe())
  else:
    print("(no categorical columns)")

  print("\nLoop Through Each Column and Check for nulls: \n")
  # str() keeps this working for non-string column labels (e.g. integers).
  for column, null_count in data.isna().sum().items():
    print(str(column) + ": " + str(null_count))

# Data Download - ACS and SatGPA

In [7]:
if benchmark:
  # Benchmark mode: use the 'student_placements' demo table.
  data = load_tabular_demo('student_placements')
else:
  # Custom mode: download the SAT/GPA CSV once, then load it so that `data`
  # is defined for the following cells in this mode as well.
  if not os.path.exists("./satgpa.csv"):
      os.system('gdown --id "1NNVF1LhBDkW_KKp5_QW8cAiQDFatzWMy" --output "./satgpa.csv"')
  # NOTE(review): assumes a comma-separated file with a header row - confirm
  # against the downloaded file.
  data = pd.read_csv("./satgpa.csv")

  # Disabled: download and extraction of the ACS dataset.
  # if not os.path.exists("./acs_dataset.zip"):
  #     os.system('gdown --id "1mKZfDieGBJP-cS-R7_i3zVKVawXThfUc" --output "./acs_dataset.zip"')
  #     if OS == "Linux":
  #         os.system('unzip -o -n "./acs_dataset.zip" -d "./"')

# Generate as many synthetic rows as there are real rows, whichever source was used.
n_to_generate = data.shape[0]


# Exploratory Analysis

In [8]:
# Print the exploratory summary of the real table that the models are fitted on.
explore_data(data)


Head of Data: 
    student_id gender  second_perc  high_perc high_spec  degree_perc  \
0       17264      M        67.00      91.00  Commerce        58.00   
1       17265      M        79.33      78.33   Science        77.48   
2       17266      M        65.00      68.00      Arts        64.00   
3       17267      M        56.00      52.00   Science        52.00   
4       17268      M        85.80      73.60  Commerce        73.30   

  degree_type  work_experience  experience_years  employability_perc mba_spec  \
0    Sci&Tech            False                 0                55.0   Mkt&HR   
1    Sci&Tech             True                 1                86.5  Mkt&Fin   
2   Comm&Mgmt            False                 0                75.0  Mkt&Fin   
3    Sci&Tech            False                 0                66.0   Mkt&HR   
4   Comm&Mgmt            False                 0                96.8  Mkt&Fin   

   mba_perc   salary  placed start_date   end_date  duration  
0     

# Synthetic Data Generation via Gaussian Copula Method 

In mathematical terms, a Gaussian copula is a distribution over the unit cube $[0,1]^d$ that is constructed from a multivariate normal distribution over $\mathbb{R}^d$ by using the probability integral transform. Intuitively, a copula is a mathematical function that allows us to describe the joint distribution of multiple random variables by analyzing the dependencies between their marginal distributions.

In [9]:
if gaussian_copula_synth_model:
  # Fit a GaussianCopula model on the real table and persist it to disk;
  # the model-loading cell reads it back from 'gaussian_copula.pkl'.
  model = GaussianCopula()
  model.fit(data)
  model.save('gaussian_copula.pkl')

  return c**2 / (c**2 - n**2)
  Lhat = muhat - Shat*mu
  sk = 2*(b-a)*np.sqrt(a + b + 1) / (a + b + 2) / np.sqrt(a*b)
  improvement from the last ten iterations.


# Synthetic Data Generation via Conditional GAN 

Modeling the probability distribution of rows in tabular data and generating realistic synthetic data is a non-trivial task. Tabular data usually contains a mix of discrete and continuous columns. Continuous columns may have multiple modes, whereas discrete columns are sometimes imbalanced, making the modeling difficult. Existing statistical and deep neural network models fail to properly model this type of data. CTGAN uses a conditional generative adversarial network to address these challenges. To aid in a fair and thorough comparison, its authors designed a benchmark with 7 simulated and 8 real datasets and several Bayesian network baselines. CTGAN outperforms Bayesian methods on most of the real datasets, whereas other deep learning methods could not.

In [10]:
if ctgan_synth_model:
  # Fit a CTGAN model on the real table and persist it to disk;
  # the model-loading cell reads it back from 'ctgan.pkl'.
  # The hyper-parameters below are fixed for this notebook.
  model = CTGAN(
    epochs=500,
    batch_size=100,
    generator_dim=(256, 256, 256),
    discriminator_dim=(256, 256, 256)
  )
  model.fit(data)
  model.save('ctgan.pkl')

  random_state=random_state).fit(X).labels_
  random_state=random_state).fit(X).labels_


# Model Loading and Preparation

In [11]:
# Parallel lists: model_file[i] is the saved-model path for model_to_load[i],
# which is a (display name, model class) pair. Only enabled methods are listed.
model_file = []
model_to_load = []
if gaussian_copula_synth_model:
  model_file.append('gaussian_copula.pkl')
  model_to_load.append(("GaussianCopula", GaussianCopula))
if ctgan_synth_model:
  model_file.append('ctgan.pkl')
  model_to_load.append(("CTGAN", CTGAN))

# Reload every enabled model from disk as (display name, fitted model).
# NOTE(review): the file names suggest pickled models - only load files this
# notebook wrote itself.
loaded_model = [
  (method_name, model_class.load(path))
  for path, (method_name, model_class) in zip(model_file, model_to_load)
]

# Synthetic Data Generation

In [12]:
# Draw n_to_generate rows from every loaded model and keep each result next to
# the name of the method that produced it: (method name, synthetic table).
synthetic_data = []
for method_name, fitted_model in loaded_model:
  sampled_rows = fitted_model.sample(n_to_generate)
  synthetic_data.append((method_name, sampled_rows))

# Synthetic Data Exploratory Analysis

In [13]:
# For every method: print an exploratory summary of its synthetic table, score the
# table against the real data, and collect (method name, synthetic table, score).
scored_and_synth_data = []
for method_name, synth_table in synthetic_data:
  try:
    print("\nMethod: ", method_name)
    explore_data(synth_table)
    score = evaluate(synth_table, data)
    print("\n\nScore: ", score)
    scored_and_synth_data.append((method_name, synth_table, score))
  except Exception as exc:
    # Best effort: a failure for one method must not stop the others, but report
    # which method failed and why. `Exception` (not a bare except) lets
    # KeyboardInterrupt and SystemExit propagate.
    print("Error while evaluating " + str(method_name) + ": " + repr(exc))



Method:  GaussianCopula

Head of Data: 
    student_id gender  second_perc  high_perc high_spec  degree_perc  \
0       17268      M        85.90      78.01   Science        71.04   
1       17280      M        74.76      68.88  Commerce        66.15   
2       17386      F        68.41      59.60   Science        77.89   
3       17449      M        54.57      63.91  Commerce        64.89   
4       17288      F        56.99      82.10  Commerce        72.47   

  degree_type  work_experience  experience_years  employability_perc mba_spec  \
0    Sci&Tech            False                 1               70.58  Mkt&Fin   
1   Comm&Mgmt            False                 1               88.65  Mkt&Fin   
2    Sci&Tech            False                 1               74.76  Mkt&Fin   
3   Comm&Mgmt            False                 0               62.10  Mkt&Fin   
4   Comm&Mgmt            False                 0               57.44   Mkt&HR   

   mba_perc   salary  placed start_date   en

OSError: ignored

OSError: ignored

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isna(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isna(synthetic_data)] = 0.0




Score:  0.5528270888508091

Method:  CTGAN

Head of Data: 
    student_id gender  second_perc  high_perc high_spec  degree_perc  \
0       17363      M        55.35      60.60   Science        50.00   
1       17264      M        89.40      64.65   Science        81.11   
2       17284      F        89.40      66.67   Science        53.16   
3       17264      M        89.40      51.33  Commerce        51.89   
4       17284      M        89.40      71.37  Commerce        91.00   

  degree_type  work_experience  experience_years  employability_perc mba_spec  \
0   Comm&Mgmt             True                 2               63.89   Mkt&HR   
1   Comm&Mgmt             True                 0               98.00   Mkt&HR   
2   Comm&Mgmt             True                 1               98.00   Mkt&HR   
3   Comm&Mgmt             True                 1               86.35   Mkt&HR   
4   Comm&Mgmt             True                 1               94.89  Mkt&Fin   

   mba_perc   salary  pl

OSError: ignored

OSError: ignored

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  real_data[pd.isna(real_data)] = 0.0
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._where(-key, value, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  synthetic_data[pd.isna(synthetic_data)] = 0.0




Score:  0.4836552920982443


In [19]:
# Write every scored synthetic table to its own tab-separated file; the file name
# carries the lower-cased method name and the score rounded to three decimals.
for method_name, synth_table, score in scored_and_synth_data:
  csv_name = ''.join([
    'synth_data_generated_by_method_',
    method_name.lower(),
    '_score_',
    str(round(score, 3)),
    '.csv',
  ])
  synth_table.to_csv(csv_name, sep='\t')

In [15]:
# Seconds elapsed since start_global_time was recorded in the settings cell.
print("Global Exectution Time: ", timeit.default_timer() - start_global_time)

Global Exectution Time:  60.22274517299991
