<a href="https://colab.research.google.com/github/harnalashok/general/blob/master/synthetic_tabular_data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Last amended: 2nd July, 2021
# Myfolder: lubuntu_healthcare VM
# Ref: https://sdv.dev/
#      https://github.com/sdv-dev/CTGAN
#
# Objective: Generate Synthetic data using CTGAN


We generate 1 lakh (100,000) synthetic samples modelled on the Big Mart sales data. <br>My OneDrive folder: D:\data\OneDrive\Documents\big_mart_sales_problem

## Install CTGAN Synthetic Data Generator

In [None]:
# Install the CTGAN package (project page: https://github.com/sdv-dev/CTGAN)
# NOTE(review): the version is not pinned — confirm the installed release
# still provides the class imported below.
!pip install ctgan

## Call libraries

In [3]:
# Newer ctgan releases export the synthesizer class as `CTGAN` instead of
# `CTGANSynthesizer`. Because the install is not version-pinned, accept
# either name and keep the old alias so the rest of the notebook is unchanged.
try:
    from ctgan import CTGANSynthesizer
except ImportError:
    from ctgan import CTGAN as CTGANSynthesizer

In [4]:
import numpy as np
import pandas as pd
import os
import time

In [5]:
# Make each cell display the value of every top-level expression,
# not only the last one
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

## Read data

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

In [6]:
# Switch to the Drive folder holding the data, so the relative file names
# used for reading and saving below resolve there
path = "/content/drive/MyDrive/Colab_data_files/big_data_mart"
os.chdir(path)

In [None]:
# Load the original dataset from the current working directory and preview it
df = pd.read_csv("bigdatamart.csv")
df.head()

### Nulls must be filled in

In [None]:
# Count missing values in each column
df.isnull().sum()

In [None]:
# Begin filling up: first look at the category frequencies of Outlet_Size
df['Outlet_Size'].value_counts()

In [10]:
# Replace missing Outlet_Size values with the category 'Medium'
# NOTE(review): presumably the most frequent category in the counts above — confirm
df['Outlet_Size']= df['Outlet_Size'].fillna(value = 'Medium')

In [None]:
# Median of Item_Weight (pandas skips missing values when computing it)
df['Item_Weight'].median()

In [12]:
# Fill missing Item_Weight with the column median, computed from the data
# instead of a hard-coded constant so the cell stays correct if the CSV changes.
# Filling with the median leaves the median unchanged, so re-running is safe.
df['Item_Weight'] = df['Item_Weight'].fillna(value = df['Item_Weight'].median())

In [None]:
# Check again: re-count missing values per column after the fills
df.isnull().sum()

## Process for data generation

In [13]:
# Print all column names as a plain list
print(list(df.columns))

['Item_Identifier', 'Item_Weight', 'Item_Fat_Content', 'Item_Visibility', 'Item_Type', 'Item_MRP', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type', 'Item_Outlet_Sales']


In [None]:
# Discrete columns: names handed to ctgan.fit() as the discrete (categorical) features
discrete_columns =['Item_Identifier','Item_Fat_Content','Item_Type', 'Outlet_Identifier', 'Outlet_Establishment_Year', 'Outlet_Size', 'Outlet_Location_Type', 'Outlet_Type']
# Remaining columns, regarded as continuous
cont = ['Item_Weight','Item_Visibility','Item_MRP', 'Item_Outlet_Sales']

# Match totals: enforce (rather than eyeball) that the two lists together
# account for every column of df, with nothing missing, extra or repeated.
assert len(discrete_columns) + len(cont) == len(df.columns), "discrete + continuous count differs from number of columns"
assert set(discrete_columns) | set(cont) == set(df.columns), "column lists do not match df.columns"
len(discrete_columns)
len(cont)
len(df.columns)


### Start generating

In [None]:
# Fit the synthesizer on the cleaned data and time the run.
# Training takes a while. Some warning messages may appear during fitting;
# per the original author's note they can be ignored.
start = time.time()
ctgan = CTGANSynthesizer(epochs=10)
ctgan.fit(df, discrete_columns)   # second argument: names of the discrete columns
end = time.time()
(end-start)/60                    # elapsed wall-clock time, in minutes

In [16]:
# Get synthetic data now.
# Be warned: a large datasize requires lots of RAM and the session may crash.
# If it does, generate in smaller steps and then stack the pieces.
datasize = 100000
syn_samples = ctgan.sample(datasize)

In [None]:
# Look at the first few generated rows
syn_samples.head()

In [18]:
# Save generated data as CSV in the current working directory,
# without writing the row index
syn_samples.to_csv("bigdatamart_synthetic.csv",index = False)

In [None]:
# Show the working directory and list the Drive folder
# to confirm the CSV file is in gdrive
! pwd
! ls -la '/content/drive/My Drive/Colab_data_files/big_data_mart'

## Set some column values back to NULL

In [50]:
# Function to choose the rows whose values will later be set to NaN
def generateNaN(df, num_rows_2_setNULL, random_state=None):
  """Pick distinct random row positions of ``df``.

  Despite its name, this function neither modifies ``df`` nor produces NaN
  values. It only returns the positions of the rows that the caller may
  then blank out.

  Parameters
  ----------
  df : object with a ``shape`` attribute (e.g. a pandas DataFrame)
      Only ``df.shape[0]`` (the number of rows) is read.
  num_rows_2_setNULL : int
      How many distinct row positions to draw.
  random_state : int or None, optional
      Seed for a private ``np.random.RandomState``, making the draw
      reproducible. With the default ``None`` the global NumPy random
      state is used, exactly as before.

  Returns
  -------
  numpy.ndarray
      ``num_rows_2_setNULL`` unique integers in ``[0, df.shape[0])``.
      These are positions (suitable for ``.iloc``), not index labels.

  Raises
  ------
  ValueError
      If more positions are requested than ``df`` has rows, because
      sampling is done without replacement.
  """
  rng = np.random if random_state is None else np.random.RandomState(random_state)
  return rng.choice(df.shape[0], num_rows_2_setNULL, replace=False)

In [53]:
# Draw 10 random row positions from the synthetic data and display them
howMany = 10
r = generateNaN(syn_samples, howMany )
r

array([55924, 78145, 54268, 59787, 41776, 32620, 86350, 83465, 96514,
       12232])

In [48]:
# Set 'Outlet_Size' to NaN in the chosen rows.
# `r` holds row *positions* (drawn from range(number of rows)), so index
# positionally with .iloc. .loc would interpret them as index labels, which
# is only equivalent when the frame has a default 0..n-1 index.
syn_samples.iloc[r, syn_samples.columns.get_loc('Outlet_Size')] = np.nan

In [54]:
# Recheck: count missing values per column in the synthetic data
syn_samples.isnull().sum()

Item_Identifier               0
Item_Weight                   0
Item_Fat_Content              0
Item_Visibility               0
Item_Type                     0
Item_MRP                      0
Outlet_Identifier             0
Outlet_Establishment_Year     0
Outlet_Size                  10
Outlet_Location_Type          0
Outlet_Type                   0
Item_Outlet_Sales             0
dtype: int64

In [55]:
# Save generated data again, now including the injected NaNs.
# Same file name as the earlier save, so that file is overwritten.
syn_samples.to_csv("bigdatamart_synthetic.csv",index = False)

In [56]:
# Show the working directory and list the Drive folder
# to confirm the updated CSV file is in gdrive
! pwd
! ls -la '/content/drive/My Drive/Colab_data_files/big_data_mart'

/content/drive/My Drive/Colab_data_files/big_data_mart
total 14981
-rw------- 1 root root   869537 Jul  2 09:07 bigdatamart.csv
-rw------- 1 root root 14465731 Jul  2 09:45 bigdatamart_synthetic.csv
drwx------ 2 root root     4096 Jul  2 09:07 .ipynb_checkpoints


In [None]:
####################### I am done ############