Load necessary modules

In [1]:
import numpy as np
import pandas as pd
from sdv.lite import TabularPreset as TP
from sdv import Metadata

SDV uses a weird metadata system to be able to model relational data. I suppose that it is highly useful for working with synthetic data in a production setting, but it is mainly an irritation for this example. We also load the data into a data frame as a separate step. This seems like double the effort.

In [3]:
meta = Metadata()
meta.add_table(name="clltim",
               data='data/cll_synth.csv')
               
clltim = pd.read_csv('data/cll_synth.csv')
clltim.describe()

Unnamed: 0,Patient_id,Binet_Stage_A,Binet_Stage_B,Binet_Stage_C,IGHV_unmut_Mutated,IGHV_unmut_Unmutated,IGHV_unmut_NA,ECOG_0,ECOG_1,ECOG_2,...,INR_mean,INR_max,INR_sd,NA+_min,NA+_max,NA+_sd,NA+_mean,INFEC_interval_mean,infec_90_days_mean,INFEC_after_diag
count,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,3000.0,...,3000.0,3000.0,3000.0,2669.0,2669.0,2669.0,2669.0,3000.0,3000.0,3000.0
mean,1500.5,0.854333,0.116333,0.029333,0.450667,0.331667,0.217667,0.782667,0.176,0.025667,...,1.136646,1.19011,0.05378933,139.31366,152.855185,5.906128,146.04201,0.245,3.426,0.314333
std,866.169729,0.352831,0.320678,0.168767,0.497643,0.47089,0.412728,0.4125,0.380884,0.158165,...,0.399444,0.495875,0.1991131,11.872978,14.912916,6.744991,10.79918,2.744963,12.910722,0.464327
min,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.246136,0.144209,8.57e-50,83.417708,113.297819,0.0,113.297819,0.0,0.0,0.0
25%,750.75,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,0.855809,0.824274,7.06e-10,133.050884,141.652636,0.0,138.77607,0.0,0.0,0.0
50%,1500.5,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,...,1.087242,1.126298,1.55e-05,137.813969,148.621803,3.311764,143.323587,0.0,0.0,0.0
75%,2250.25,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,...,1.37589,1.485011,0.007854461,144.324539,163.319389,10.119194,152.76463,0.0,0.0,1.0
max,3000.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,3.497131,4.026163,2.774942,208.357356,211.256674,38.979306,208.357356,114.0,156.0,1.0


Before anything else we will try using one of the TabularPresets loaded as `TP`  

In [4]:
fml = TP(name='FAST_ML',
         metadata=meta.get_table_meta('clltim'))

So far so good, now it is time to fit a model using the TabularPreset `fml` 

In [5]:
fml.fit(clltim)

  for column, column_data in data.iteritems():


Easy peasy lemon squeezy. Lets generate some samples

In [6]:
fml.sample(10)

Unnamed: 0,Patient_id,Binet_Stage_A,Binet_Stage_B,Binet_Stage_C,IGHV_unmut_Mutated,IGHV_unmut_Unmutated,IGHV_unmut_NA,ECOG_0,ECOG_1,ECOG_2,...,INR_mean,INR_max,INR_sd,NA+_min,NA+_max,NA+_sd,NA+_mean,INFEC_interval_mean,infec_90_days_mean,INFEC_after_diag
0,1564,1,0,0,0,0,1,0,0,0,...,1.271823,1.406492,8.57e-50,137.1917,164.535641,13.341147,151.802831,0,11,0
1,1837,1,0,0,0,1,0,1,0,0,...,0.834,0.776977,0.05351581,,176.90183,,163.123336,0,0,0
2,469,1,0,0,0,1,0,0,0,0,...,0.901627,0.585938,0.03641376,145.971547,159.916061,6.028226,153.271321,0,5,0
3,935,1,0,0,1,0,0,0,1,0,...,1.395946,1.286805,0.02014153,,144.387953,,140.615399,3,0,1
4,2189,1,0,0,0,0,1,0,1,0,...,0.785447,0.902272,0.1657832,154.21938,,,158.424905,1,28,1
5,1506,1,0,0,0,1,0,1,0,0,...,0.915938,0.934134,0.2263182,143.656185,,4.582707,149.956129,0,2,0
6,770,1,0,0,0,0,1,1,0,0,...,0.246136,0.144209,8.57e-50,128.283537,159.723838,13.476621,144.740409,0,6,1
7,1,1,0,0,0,1,0,1,0,0,...,1.792619,1.648986,0.1685006,144.490563,170.901373,,157.860203,1,10,0
8,827,1,0,0,0,1,0,1,0,0,...,1.352963,1.37522,0.5206771,143.019293,158.618944,7.982769,153.915688,1,11,1
9,1,0,0,0,0,0,0,1,0,0,...,1.137187,1.186354,8.57e-50,133.562082,173.547127,16.420656,154.133956,0,0,0


Okay cool, but notice that some onehot encoded features have multiple "hot" values in a single row. We should introduce some constraints.

In [2]:
from sdv.constraints import OneHotEncoding

In [7]:
binet_ohe   =     OneHotEncoding(['Binet_Stage_A', 'Binet_Stage_B', 'Binet_Stage_C'])
ighv_ohe    =     OneHotEncoding(clltim.columns[4:7])
ecog_ohe    =     OneHotEncoding(clltim.columns[7:14])
famcll_ohe  =     OneHotEncoding(clltim.columns[14:17])
beta2_ohe   =     OneHotEncoding(clltim.columns[17:20])
cd38_ohe    =     OneHotEncoding(clltim.columns[20:23])
zap70_ohe   =     OneHotEncoding(clltim.columns[23:26])
gender_ohe  =     OneHotEncoding(clltim.columns[26:28])
del13_ohe   =     OneHotEncoding(clltim.columns[28:31])
tri12_ohe   =     OneHotEncoding(clltim.columns[31:34])
del11_ohe   =     OneHotEncoding(clltim.columns[34:37])
fml_constraints = [binet_ohe, ighv_ohe, ecog_ohe, famcll_ohe, beta2_ohe, cd38_ohe, zap70_ohe,
               gender_ohe, del13_ohe, tri12_ohe, del11_ohe]


Inspect the constraints

In [8]:
for c in fml_constraints:
    print(c.to_dict())

{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': ['Binet_Stage_A', 'Binet_Stage_B', 'Binet_Stage_C']}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['IGHV_unmut_Mutated', 'IGHV_unmut_Unmutated', 'IGHV_unmut_NA'], dtype='object')}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['ECOG_0', 'ECOG_1', 'ECOG_2', 'ECOG_3', 'ECOG_4', 'ECOG_5', 'ECOG_NA'], dtype='object')}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['FamCLL_Yes', 'FamCLL_No', 'FamCLL_NA'], dtype='object')}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['Beta2m_Yes', 'Beta2m_No', 'Beta2m_NA'], dtype='object')}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['CD38_Yes', 'CD38_No', 'CD38_NA'], dtype='object')}
{'constraint': 'sdv.constraints.tabular.OneHotEncoding', 'column_names': Index(['ZAP70_Yes', 'ZAP70_No', 'ZAP70_NA'], dtype='object')}
{

Let's sample again, this time using the constraints

In [9]:
fml_constrained = TP("FAST_ML", metadata=meta.get_table_meta('clltim'), constraints=fml_constraints)
fml_constrained.fit(clltim)

In [1]:
fml_constrained.sample(10)

NameError: name 'fml_constrained' is not defined

We impose certain constraints on continuous values, we can use several other constraints

In [11]:
import sdv.constraints as constraints
import re

In [12]:
for c in map(lambda x: constraints.ScalarInequality(x, '>=', 0), clltim.columns[39:-3]):
    if c not in fml_constraints:
        fml_constraints.append(c)


In [16]:
len(fml_constraints)

141

In [13]:
fml_highly_constrained = TP(name='FAST_ML', metadata=meta.get_table_meta('clltim'), constraints=fml_constraints)
fml_highly_constrained.fit(clltim)

In [17]:
fml_highly_constrained.sample(10, max_tries_per_batch=1000)

Sampling rows:  10%|███████████████▎                                                                                                                                         | 1/10 [38:04<5:42:36, 2284.01s/it]


Unnamed: 0,Patient_id,Binet_Stage_A,Binet_Stage_B,Binet_Stage_C,IGHV_unmut_Mutated,IGHV_unmut_Unmutated,IGHV_unmut_NA,ECOG_0,ECOG_1,ECOG_2,...,INR_mean,INR_max,INR_sd,NA+_min,NA+_max,NA+_sd,NA+_mean,INFEC_interval_mean,infec_90_days_mean,INFEC_after_diag
0,1665,1,0,0,1,0,0,1,0,0,...,0.841381,0.930088,0.030767,121.159132,133.496404,8.346455,129.092974,0,0,0


ALAS! The number of constraints is very high, so the time it takes to generate a sample is also very long. In fact, it may not succeed within the allowed number of tries (1000 per row). Remember, we chose the `FAST_ML` option, so other models may be even slower at generating constrained samples. Then again, other models may fit the data better and be better at complying to the constraints. 

In [None]:
clltim.columns.to_list()

In [None]:
from sdv.tabular import CTGAN
fml_ohe_constraints = [binet_ohe, ighv_ohe, ecog_ohe, famcll_ohe, beta2_ohe, cd38_ohe, zap70_ohe,
               gender_ohe, del13_ohe, tri12_ohe, del11_ohe]

no_constrain_GAN = CTGAN(table_metadata=meta.get_table_meta('clltim'))
no_constrain_GAN.fit(clltim)