In [1]:
import numpy as np
import pandas as pd
import sdv
import matplotlib.pyplot as plt

In [2]:
sdv.__version__

'1.0.0b1'

In [3]:
real_data=pd.read_csv('german_credit_data.csv')

# Gaussian Copula

In [4]:
from sdv.metadata import SingleTableMetadata #this is a function that creates metadata, tries to capture the datatypes 
                                              
metadata = SingleTableMetadata()#creating an empty data table
metadata.detect_from_csv(filepath='german_credit_data.csv')

In [5]:
metadata #checking metadata

{
    "columns": {
        "checking_status": {
            "sdtype": "categorical"
        },
        "duration": {
            "sdtype": "numerical"
        },
        "credit_history": {
            "sdtype": "categorical"
        },
        "purpose": {
            "sdtype": "categorical"
        },
        "credit_amount": {
            "sdtype": "numerical"
        },
        "savings_status": {
            "sdtype": "categorical"
        },
        "employment": {
            "sdtype": "categorical"
        },
        "installment_commitment": {
            "sdtype": "numerical"
        },
        "personal_status": {
            "sdtype": "categorical"
        },
        "other_parties": {
            "sdtype": "categorical"
        },
        "residence_since": {
            "sdtype": "numerical"
        },
        "property_magnitude": {
            "sdtype": "categorical"
        },
        "age": {
            "sdtype": "numerical"
        },
        "other_payment_plans": 

In [6]:
metadata.validate() #validating for our synthesizer

In [7]:
from sdv.single_table import GaussianCopulaSynthesizer
synthesizer2 = GaussianCopulaSynthesizer(
    metadata, # required
    enforce_min_max_values=True,
    enforce_rounding=True,
    numerical_distributions = {
    'num_dependents': 'truncnorm',
    'existing_credits' : 'truncnorm',
         'housing' : 'gamma',
    'purpose': 'truncnorm',
        'savings_status': 'truncnorm'
    },
    default_distribution='beta'
)

In [8]:
synthesizer2.get_parameters()

{'enforce_min_max_values': True,
 'enforce_rounding': True,
 'numerical_distributions': {'num_dependents': 'truncnorm',
  'existing_credits': 'truncnorm',
  'housing': 'gamma',
  'purpose': 'truncnorm',
  'savings_status': 'truncnorm'},
 'default_distribution': 'beta'}

In [9]:
synthesizer2.fit(real_data)
new_data2=synthesizer2.sample(30000)

In [42]:
#used for transforming datatypes to cont. var
transformers = synthesizer2.get_transformers()
transformers

{'checking_status': LabelEncoder(add_noise=True),
 'duration': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'credit_history': LabelEncoder(add_noise=True),
 'purpose': LabelEncoder(add_noise=True),
 'credit_amount': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'savings_status': LabelEncoder(add_noise=True),
 'employment': LabelEncoder(add_noise=True),
 'installment_commitment': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'personal_status': LabelEncoder(add_noise=True),
 'other_parties': LabelEncoder(add_noise=True),
 'residence_since': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'property_magnitude': LabelEncoder(add_noise=True),
 'age': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'other_payment_plans': LabelEncoder(add_noise=True),
 'housing': LabelEncoder(add_noise=True),
 'existing_credits': FloatFormatter(learn_rounding_scheme=True, enf

In [43]:
processed_data = synthesizer2.preprocess(real_data)
processed_data



Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0.135999,6.0,0.135999,0.135999,1169.0,0.135999,0.135999,4.0,0.135999,0.135999,...,0.135999,67.0,0.135999,0.135999,2.0,0.135999,1.0,0.135999,0.135999,0.135999
1,1.435761,48.0,1.435761,0.435761,5951.0,1.435761,1.435761,2.0,1.435761,0.435761,...,0.435761,22.0,0.435761,0.435761,1.0,0.435761,1.0,1.435761,0.435761,1.435761
2,2.433343,12.0,0.433343,1.433343,2096.0,1.433343,2.433343,2.0,0.433343,0.433343,...,0.433343,49.0,0.433343,0.433343,1.0,1.433343,2.0,1.433343,0.433343,0.433343
3,0.913815,42.0,1.913815,2.913815,7882.0,1.913815,2.913815,2.0,0.913815,1.913815,...,1.913815,45.0,0.913815,1.913815,1.0,0.913815,2.0,1.913815,0.913815,0.913815
4,0.990847,24.0,2.990847,3.990847,4870.0,1.990847,1.990847,3.0,0.990847,0.990847,...,2.990847,53.0,0.990847,1.990847,2.0,0.990847,2.0,1.990847,0.990847,1.990847
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,2.434825,12.0,1.434825,2.434825,1736.0,1.434825,2.434825,3.0,1.434825,0.434825,...,0.434825,31.0,0.434825,0.434825,1.0,1.434825,1.0,1.434825,0.434825,0.434825
996,0.290422,30.0,1.290422,4.290422,3857.0,1.290422,1.290422,4.0,2.290422,0.290422,...,1.290422,40.0,0.290422,0.290422,1.0,2.290422,1.0,0.290422,0.290422,0.290422
997,2.731553,12.0,1.731553,0.731553,804.0,1.731553,0.731553,4.0,0.731553,0.731553,...,3.731553,38.0,0.731553,0.731553,1.0,0.731553,1.0,1.731553,0.731553,0.731553
998,0.927332,45.0,1.927332,0.927332,1845.0,1.927332,1.927332,4.0,0.927332,0.927332,...,2.927332,23.0,0.927332,1.927332,1.0,0.927332,1.0,0.927332,0.927332,1.927332


In [44]:
synthesizer2.fit_processed_data(processed_data)

In [45]:
synthesizer2.get_transformers()

{'checking_status': LabelEncoder(add_noise=True),
 'duration': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'credit_history': LabelEncoder(add_noise=True),
 'purpose': LabelEncoder(add_noise=True),
 'credit_amount': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'savings_status': LabelEncoder(add_noise=True),
 'employment': LabelEncoder(add_noise=True),
 'installment_commitment': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'personal_status': LabelEncoder(add_noise=True),
 'other_parties': LabelEncoder(add_noise=True),
 'residence_since': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'property_magnitude': LabelEncoder(add_noise=True),
 'age': FloatFormatter(learn_rounding_scheme=True, enforce_min_max_values=True),
 'other_payment_plans': LabelEncoder(add_noise=True),
 'housing': LabelEncoder(add_noise=True),
 'existing_credits': FloatFormatter(learn_rounding_scheme=True, enf

In [46]:
new_data= synthesizer2.sample(30000)
new_data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,12,existing paid,repairs,5200,no known savings,4<=X<7,2,male single,none,...,life insurance,37,none,own,1,unskilled resident,1,none,yes,good
1,0<=X<200,49,critical/other existing credit,new car,8065,<100,>=7,4,male single,none,...,car,38,none,for free,1,skilled,2,yes,yes,bad
2,0<=X<200,43,no credits/all paid,other,1933,no known savings,4<=X<7,4,female div/dep/mar,none,...,no known property,32,none,own,2,skilled,1,none,yes,good
3,<0,29,critical/other existing credit,radio/tv,1434,no known savings,>=7,3,male div/sep,none,...,life insurance,41,none,own,1,skilled,1,none,yes,good
4,no checking,14,existing paid,new car,3598,500<=X<1000,unemployed,2,male div/sep,none,...,real estate,38,bank,for free,2,skilled,1,none,yes,good


In [47]:

from sdv.evaluation.single_table import evaluate_quality

quality_report1 = evaluate_quality(
    real_data,
    new_data,
    metadata
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.20s/it]



Overall Quality Score: 85.7%

Properties:
Column Shapes: 89.26%
Column Pair Trends: 82.15%


## CTGAN 

In [48]:
from sdv.single_table import CTGANSynthesizer

synthesizergan = CTGANSynthesizer(metadata, # required
    enforce_rounding=False,
    epochs=500,
    verbose=True)


In [49]:
synthesizergan.fit(real_data)
synthetic_dataCTGAN2 = synthesizergan.sample(num_rows=30000)

Epoch 1, Loss G:  1.3322,Loss D: -0.0127
Epoch 2, Loss G:  1.2964,Loss D: -0.0490
Epoch 3, Loss G:  1.3477,Loss D: -0.1445
Epoch 4, Loss G:  1.3091,Loss D: -0.1784
Epoch 5, Loss G:  1.2910,Loss D: -0.2555
Epoch 6, Loss G:  1.2279,Loss D: -0.4052
Epoch 7, Loss G:  1.1432,Loss D: -0.3808
Epoch 8, Loss G:  1.0210,Loss D: -0.5624
Epoch 9, Loss G:  0.9873,Loss D: -0.5773
Epoch 10, Loss G:  0.7744,Loss D: -0.5989
Epoch 11, Loss G:  0.6777,Loss D: -0.5594
Epoch 12, Loss G:  0.5199,Loss D: -0.4566
Epoch 13, Loss G:  0.2029,Loss D: -0.3634
Epoch 14, Loss G:  0.0663,Loss D: -0.0391
Epoch 15, Loss G:  0.1249,Loss D:  0.0199
Epoch 16, Loss G:  0.0432,Loss D:  0.0343
Epoch 17, Loss G: -0.0087,Loss D:  0.2160
Epoch 18, Loss G: -0.0465,Loss D:  0.2520
Epoch 19, Loss G: -0.0035,Loss D:  0.2109
Epoch 20, Loss G: -0.1609,Loss D:  0.1481
Epoch 21, Loss G:  0.0047,Loss D:  0.1295
Epoch 22, Loss G:  0.0176,Loss D:  0.1696
Epoch 23, Loss G:  0.1556,Loss D:  0.1438
Epoch 24, Loss G:  0.0837,Loss D:  0.2291
E

Epoch 195, Loss G: -1.1435,Loss D:  0.0585
Epoch 196, Loss G: -1.2279,Loss D: -0.0460
Epoch 197, Loss G: -1.3075,Loss D: -0.0263
Epoch 198, Loss G: -1.3578,Loss D:  0.0925
Epoch 199, Loss G: -1.3566,Loss D:  0.0540
Epoch 200, Loss G: -1.2634,Loss D: -0.0340
Epoch 201, Loss G: -1.1995,Loss D: -0.1297
Epoch 202, Loss G: -1.2437,Loss D: -0.0395
Epoch 203, Loss G: -1.3764,Loss D:  0.0296
Epoch 204, Loss G: -1.3248,Loss D: -0.0146
Epoch 205, Loss G: -1.4377,Loss D:  0.1129
Epoch 206, Loss G: -1.4917,Loss D:  0.0340
Epoch 207, Loss G: -1.4617,Loss D: -0.0550
Epoch 208, Loss G: -1.4481,Loss D:  0.0782
Epoch 209, Loss G: -1.3531,Loss D: -0.0272
Epoch 210, Loss G: -1.3999,Loss D: -0.0002
Epoch 211, Loss G: -1.3858,Loss D:  0.0910
Epoch 212, Loss G: -1.5320,Loss D:  0.1159
Epoch 213, Loss G: -1.4612,Loss D:  0.2538
Epoch 214, Loss G: -1.4787,Loss D:  0.0280
Epoch 215, Loss G: -1.5433,Loss D:  0.0779
Epoch 216, Loss G: -1.3965,Loss D:  0.0705
Epoch 217, Loss G: -1.4848,Loss D: -0.1595
Epoch 218, 

Epoch 386, Loss G: -2.1119,Loss D:  0.1229
Epoch 387, Loss G: -2.2787,Loss D: -0.1548
Epoch 388, Loss G: -2.2802,Loss D: -0.0520
Epoch 389, Loss G: -2.2738,Loss D: -0.0687
Epoch 390, Loss G: -2.2843,Loss D: -0.0551
Epoch 391, Loss G: -2.2248,Loss D: -0.0737
Epoch 392, Loss G: -2.2540,Loss D: -0.0970
Epoch 393, Loss G: -2.2691,Loss D: -0.0071
Epoch 394, Loss G: -2.3218,Loss D: -0.1097
Epoch 395, Loss G: -2.2019,Loss D:  0.0271
Epoch 396, Loss G: -1.9620,Loss D: -0.2360
Epoch 397, Loss G: -2.0699,Loss D: -0.2424
Epoch 398, Loss G: -2.1590,Loss D: -0.1974
Epoch 399, Loss G: -2.0549,Loss D: -0.1062
Epoch 400, Loss G: -2.0901,Loss D:  0.0156
Epoch 401, Loss G: -2.0827,Loss D:  0.0143
Epoch 402, Loss G: -2.2401,Loss D:  0.0625
Epoch 403, Loss G: -2.2373,Loss D:  0.0821
Epoch 404, Loss G: -2.3676,Loss D:  0.1857
Epoch 405, Loss G: -2.3696,Loss D:  0.2963
Epoch 406, Loss G: -2.4250,Loss D:  0.2865
Epoch 407, Loss G: -2.3670,Loss D:  0.0564
Epoch 408, Loss G: -2.5316,Loss D:  0.3543
Epoch 409, 

In [50]:
synthetic_dataCTGAN2.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,47,existing paid,repairs,250,100<=X<500,4<=X<7,4,female div/dep/mar,none,...,car,31,none,own,2,unemp/unskilled non res,2,none,yes,bad
1,<0,24,existing paid,education,922,<100,1<=X<4,2,male single,none,...,car,73,stores,rent,1,high qualif/self emp/mgmt,1,none,yes,bad
2,<0,17,delayed previously,furniture/equipment,283,<100,<1,4,male mar/wid,none,...,car,28,none,rent,1,unskilled resident,1,yes,yes,good
3,>=200,46,existing paid,radio/tv,1264,100<=X<500,>=7,4,female div/dep/mar,none,...,life insurance,19,none,rent,2,unskilled resident,1,none,yes,good
4,<0,18,existing paid,furniture/equipment,250,<100,>=7,4,female div/dep/mar,none,...,life insurance,19,none,rent,1,high qualif/self emp/mgmt,1,none,yes,good


In [51]:
processed_datagan = synthesizergan.preprocess(real_data)
processed_datagan



Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,6.0,critical/other existing credit,radio/tv,1169.0,no known savings,>=7,4.0,male single,none,...,real estate,67.0,none,own,2.0,skilled,1.0,yes,yes,good
1,0<=X<200,48.0,existing paid,radio/tv,5951.0,<100,1<=X<4,2.0,female div/dep/mar,none,...,real estate,22.0,none,own,1.0,skilled,1.0,none,yes,bad
2,no checking,12.0,critical/other existing credit,education,2096.0,<100,4<=X<7,2.0,male single,none,...,real estate,49.0,none,own,1.0,unskilled resident,2.0,none,yes,good
3,<0,42.0,existing paid,furniture/equipment,7882.0,<100,4<=X<7,2.0,male single,guarantor,...,life insurance,45.0,none,for free,1.0,skilled,2.0,none,yes,good
4,<0,24.0,delayed previously,new car,4870.0,<100,1<=X<4,3.0,male single,none,...,no known property,53.0,none,for free,2.0,skilled,2.0,none,yes,bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,no checking,12.0,existing paid,furniture/equipment,1736.0,<100,4<=X<7,3.0,female div/dep/mar,none,...,real estate,31.0,none,own,1.0,unskilled resident,1.0,none,yes,good
996,<0,30.0,existing paid,used car,3857.0,<100,1<=X<4,4.0,male div/sep,none,...,life insurance,40.0,none,own,1.0,high qualif/self emp/mgmt,1.0,yes,yes,good
997,no checking,12.0,existing paid,radio/tv,804.0,<100,>=7,4.0,male single,none,...,car,38.0,none,own,1.0,skilled,1.0,none,yes,good
998,<0,45.0,existing paid,radio/tv,1845.0,<100,1<=X<4,4.0,male single,none,...,no known property,23.0,none,for free,1.0,skilled,1.0,yes,yes,bad


In [52]:
synthesizergan.fit_processed_data(processed_datagan)

Epoch 1, Loss G:  1.3885,Loss D: -0.0355
Epoch 2, Loss G:  1.3830,Loss D: -0.0500
Epoch 3, Loss G:  1.3876,Loss D: -0.1130
Epoch 4, Loss G:  1.4189,Loss D: -0.1727
Epoch 5, Loss G:  1.3836,Loss D: -0.2727
Epoch 6, Loss G:  1.3413,Loss D: -0.3117
Epoch 7, Loss G:  1.2220,Loss D: -0.3709
Epoch 8, Loss G:  1.2083,Loss D: -0.4493
Epoch 9, Loss G:  1.1080,Loss D: -0.4920
Epoch 10, Loss G:  1.0752,Loss D: -0.3648
Epoch 11, Loss G:  0.8564,Loss D: -0.3746
Epoch 12, Loss G:  0.8079,Loss D: -0.3033
Epoch 13, Loss G:  0.6320,Loss D:  0.0683
Epoch 14, Loss G:  0.6442,Loss D: -0.0468
Epoch 15, Loss G:  0.4012,Loss D: -0.0434
Epoch 16, Loss G:  0.3678,Loss D:  0.0162
Epoch 17, Loss G:  0.3886,Loss D:  0.0078
Epoch 18, Loss G:  0.3238,Loss D:  0.2275
Epoch 19, Loss G:  0.4628,Loss D:  0.1786
Epoch 20, Loss G:  0.5868,Loss D:  0.1647
Epoch 21, Loss G:  0.5387,Loss D:  0.1263
Epoch 22, Loss G:  0.5730,Loss D:  0.2560
Epoch 23, Loss G:  0.6856,Loss D:  0.0379
Epoch 24, Loss G:  0.7840,Loss D:  0.1169
E

Epoch 195, Loss G: -1.3022,Loss D:  0.1076
Epoch 196, Loss G: -1.0685,Loss D:  0.0405
Epoch 197, Loss G: -0.9873,Loss D:  0.1249
Epoch 198, Loss G: -1.0055,Loss D:  0.1124
Epoch 199, Loss G: -1.0860,Loss D: -0.0009
Epoch 200, Loss G: -1.0719,Loss D: -0.0886
Epoch 201, Loss G: -1.2170,Loss D:  0.0771
Epoch 202, Loss G: -1.1712,Loss D:  0.0754
Epoch 203, Loss G: -1.1575,Loss D:  0.0194
Epoch 204, Loss G: -0.9996,Loss D:  0.0405
Epoch 205, Loss G: -1.0299,Loss D: -0.1172
Epoch 206, Loss G: -1.1094,Loss D: -0.0384
Epoch 207, Loss G: -1.1884,Loss D:  0.0077
Epoch 208, Loss G: -1.1527,Loss D: -0.0479
Epoch 209, Loss G: -1.3258,Loss D:  0.0723
Epoch 210, Loss G: -1.2371,Loss D:  0.0312
Epoch 211, Loss G: -1.3524,Loss D:  0.0303
Epoch 212, Loss G: -1.2832,Loss D:  0.1275
Epoch 213, Loss G: -1.4141,Loss D:  0.0750
Epoch 214, Loss G: -1.3604,Loss D:  0.0557
Epoch 215, Loss G: -1.3773,Loss D: -0.0384
Epoch 216, Loss G: -1.3252,Loss D:  0.1179
Epoch 217, Loss G: -1.4228,Loss D:  0.0498
Epoch 218, 

Epoch 386, Loss G: -2.4248,Loss D: -0.0210
Epoch 387, Loss G: -2.4266,Loss D:  0.0278
Epoch 388, Loss G: -2.4233,Loss D:  0.2154
Epoch 389, Loss G: -2.5469,Loss D: -0.0001
Epoch 390, Loss G: -2.5716,Loss D:  0.1166
Epoch 391, Loss G: -2.5009,Loss D:  0.1670
Epoch 392, Loss G: -2.3928,Loss D:  0.1006
Epoch 393, Loss G: -2.4382,Loss D:  0.2294
Epoch 394, Loss G: -2.4663,Loss D: -0.1477
Epoch 395, Loss G: -2.3839,Loss D:  0.0656
Epoch 396, Loss G: -2.2712,Loss D:  0.2315
Epoch 397, Loss G: -2.1635,Loss D: -0.1123
Epoch 398, Loss G: -2.1470,Loss D: -0.0517
Epoch 399, Loss G: -2.1460,Loss D: -0.1864
Epoch 400, Loss G: -2.1682,Loss D: -0.0568
Epoch 401, Loss G: -2.1689,Loss D: -0.2760
Epoch 402, Loss G: -2.0756,Loss D: -0.0483
Epoch 403, Loss G: -2.1018,Loss D:  0.0799
Epoch 404, Loss G: -2.1926,Loss D: -0.0505
Epoch 405, Loss G: -2.3896,Loss D:  0.2211
Epoch 406, Loss G: -2.2208,Loss D:  0.0954
Epoch 407, Loss G: -2.3277,Loss D:  0.0277
Epoch 408, Loss G: -2.2746,Loss D: -0.0769
Epoch 409, 

In [53]:
gan_data= synthesizer2.sample(30000)
gan_data.head()

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,<0,7,existing paid,new car,1500,no known savings,4<=X<7,3,male single,none,...,real estate,28,bank,own,2,skilled,1,yes,yes,good
1,no checking,9,delayed previously,new car,3815,>=1000,>=7,3,male single,none,...,real estate,56,none,own,2,skilled,1,yes,yes,good
2,no checking,24,existing paid,radio/tv,5764,500<=X<1000,<1,1,male single,none,...,life insurance,30,none,own,2,unskilled resident,1,yes,yes,good
3,no checking,23,existing paid,business,4967,<100,1<=X<4,3,male single,none,...,no known property,31,none,own,2,unskilled resident,1,yes,yes,bad
4,<0,28,existing paid,radio/tv,1409,no known savings,>=7,4,male single,none,...,real estate,45,none,rent,1,unskilled resident,1,yes,yes,bad


In [54]:
from sdv.evaluation.single_table import evaluate_quality

quality_report1 = evaluate_quality(
    real_data,
    gan_data,
    metadata
)

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.21it/s]



Overall Quality Score: 85.72%

Properties:
Column Shapes: 89.26%
Column Pair Trends: 82.17%


In [56]:
import sdv
from sdv.evaluation.single_table import evaluate_quality
from rdt import HyperTransformer
from rdt.transformers.categorical import LabelEncoder

def hybrid(real_data,df1,df2,metadata):
    print("First dataset quality score:")
    quality_report1 = evaluate_quality(
    real_data,
    df1,
    metadata)
    print("Second dataset quality score:")
    quality_report2 = evaluate_quality(
    real_data,
    df2,
    metadata)
    tab1=quality_report1.get_details('Column Shapes')
    tab2=quality_report2.get_details('Column Shapes')
    print("--------------First dataset report:-------------")
    print(tab1)
    print("--------------Second dataset report:------------")
    print(tab2)
    print("------------- Combined Quality score------------")
    tab1['Quality Score_1']=tab1['Quality Score']
    tab1['Quality Score_2']=tab2['Quality Score']
    tab1=tab1.drop('Quality Score',axis=1)
    print(tab1)
    #conversion of data
    
    ht = HyperTransformer()
    ht.detect_initial_config(data=df1)
    ht.remove_transformers_by_sdtype(sdtype='numerical')
    
    #Transformaer update(only for those variable that are not of good quality)
    col=df1.columns
    col1 =[i for i in col if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] < tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]]  
    # do not transform the credit_card or age columns
    ht.remove_transformers(column_names=col1)
    # do not transform any categorical columns in the dataset
    ht.remove_transformers_by_sdtype(sdtype='numerical')
    config = ht.get_config()
    

    print("Name of columns for which transformation have  been used:")
    col=[i for i in col if i not in col1]
    print(col)
    for feature in col:
        if config["transformers"][feature].__class__.__name__ == 'FrequencyEncoder':
            config["transformers"][feature] = LabelEncoder()

    print(config)
    ht.set_config(config)
    
    config = ht.get_config()
    print("Details of transformation22")
    print(config)
    #merging datasets 
    
    
    ht.fit(df2)
    transformed_df2 = ht.transform(df2)
    ht.fit(df1)
    transformed_df1 = ht.transform(df1)
    
    dff2=transformed_df2
    dff1=transformed_df1
    for i in col:
        if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] > tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]:
            dff2=dff2.sort_values(by=[i])
            dff1=dff1.sort_values(by=[i])
            dff2[i]=dff1[i]
    #reversing conversion
    
    reversed_dff = ht.reverse_transform(dff2)
    
    
    print("Hybrid quality score")
    quality_report4 = evaluate_quality(
    real_data,
    reversed_dff,
    metadata) 
    
    #qualityscore
    tab3=quality_report4.get_details('Column Shapes')
    print("Hybrid dataset report:")
    print(tab3)
    print(" Combined Quality score")
    tab1['Quality Score_3']=tab3['Quality Score']
    
    
    return tab1
hybrid(real_data,new_data,gan_data,metadata)
    
    
    
    
    
    
    

First dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.25it/s]



Overall Quality Score: 85.7%

Properties:
Column Shapes: 89.26%
Column Pair Trends: 82.15%
Second dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.30it/s]



Overall Quality Score: 85.72%

Properties:
Column Shapes: 89.26%
Column Pair Trends: 82.17%
--------------First dataset report:-------------
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.909833
1            credit_amount  KSComplement       0.936433
2   installment_commitment  KSComplement       0.950000
3          residence_since  KSComplement       0.928500
4                      age  KSComplement       0.977533
5         existing_credits  KSComplement       0.889767
6           num_dependents  KSComplement       0.933767
7          checking_status  TVComplement       0.859400
8           credit_history  TVComplement       0.811000
9                  purpose  TVComplement       0.803933
10          savings_status  TVComplement       0.650533
11              employment  TVComplement       0.848267
12         personal_status  TVComplement       0.912400
13           other_parties  TVComplement       0.884900
14      property_m

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.17it/s]



Overall Quality Score: 85.74%

Properties:
Column Shapes: 89.4%
Column Pair Trends: 82.08%
Hybrid dataset report:
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.909833
1            credit_amount  KSComplement       0.936433
2   installment_commitment  KSComplement       0.952033
3          residence_since  KSComplement       0.931400
4                      age  KSComplement       0.978367
5         existing_credits  KSComplement       0.889767
6           num_dependents  KSComplement       0.939967
7          checking_status  TVComplement       0.865200
8           credit_history  TVComplement       0.814600
9                  purpose  TVComplement       0.807667
10          savings_status  TVComplement       0.650533
11              employment  TVComplement       0.848267
12         personal_status  TVComplement       0.912500
13           other_parties  TVComplement       0.889367
14      property_magnitude  TVComplement     

Unnamed: 0,Column,Metric,Quality Score_1,Quality Score_2,Quality Score_3
0,duration,KSComplement,0.909833,0.900367,0.909833
1,credit_amount,KSComplement,0.936433,0.9351,0.936433
2,installment_commitment,KSComplement,0.95,0.952033,0.952033
3,residence_since,KSComplement,0.9285,0.9314,0.9314
4,age,KSComplement,0.977533,0.978367,0.978367
5,existing_credits,KSComplement,0.889767,0.889767,0.889767
6,num_dependents,KSComplement,0.933767,0.939967,0.939967
7,checking_status,TVComplement,0.8594,0.8652,0.8652
8,credit_history,TVComplement,0.811,0.8146,0.8146
9,purpose,TVComplement,0.803933,0.807667,0.807667


In [57]:
new_data2=pd.read_csv('new_data2.csv')
synthetic_dataCTGAN=pd.read_csv('synthetic_dataCTGAN.csv')

In [58]:
new_data2= new_data2.drop(new_data2.columns[[0]],axis=1)
new_data2
synthetic_dataCTGAN=synthetic_dataCTGAN.drop(synthetic_dataCTGAN.columns[[0]],axis=1)

In [59]:
new_data2

Unnamed: 0,checking_status,duration,credit_history,purpose,credit_amount,savings_status,employment,installment_commitment,personal_status,other_parties,...,property_magnitude,age,other_payment_plans,housing,existing_credits,job,num_dependents,own_telephone,foreign_worker,class
0,0<=X<200,10,critical/other existing credit,education,1789,<100,4<=X<7,4,male div/sep,none,...,car,42,bank,own,1,high qualif/self emp/mgmt,1,none,no,bad
1,0<=X<200,12,existing paid,used car,1739,>=1000,<1,1,female div/dep/mar,none,...,real estate,30,none,own,1,unskilled resident,2,none,yes,good
2,>=200,11,critical/other existing credit,furniture/equipment,568,<100,unemployed,3,female div/dep/mar,none,...,real estate,30,none,own,2,skilled,1,yes,yes,bad
3,<0,8,existing paid,furniture/equipment,1230,500<=X<1000,unemployed,4,male div/sep,guarantor,...,car,55,none,for free,1,high qualif/self emp/mgmt,1,none,no,bad
4,0<=X<200,8,critical/other existing credit,furniture/equipment,2817,no known savings,4<=X<7,1,female div/dep/mar,none,...,car,42,none,own,2,skilled,1,yes,yes,good
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29995,0<=X<200,21,critical/other existing credit,furniture/equipment,1388,<100,unemployed,4,male single,none,...,no known property,49,none,own,2,skilled,1,yes,yes,good
29996,<0,13,delayed previously,business,1863,no known savings,1<=X<4,4,male single,none,...,car,32,none,own,2,skilled,2,yes,yes,bad
29997,<0,6,existing paid,radio/tv,1828,no known savings,4<=X<7,1,male single,none,...,real estate,31,bank,own,1,skilled,1,none,yes,good
29998,<0,7,delayed previously,new car,3604,500<=X<1000,>=7,1,male single,none,...,no known property,24,none,for free,1,skilled,2,none,yes,good


In [60]:
hybrid(real_data,new_data2,synthetic_dataCTGAN,metadata)

First dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.09it/s]



Overall Quality Score: 85.47%

Properties:
Column Shapes: 89.03%
Column Pair Trends: 81.9%
Second dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.31it/s]



Overall Quality Score: 87.74%

Properties:
Column Shapes: 89.88%
Column Pair Trends: 85.59%
--------------First dataset report:-------------
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.900233
1            credit_amount  KSComplement       0.939000
2   installment_commitment  KSComplement       0.951000
3          residence_since  KSComplement       0.927133
4                      age  KSComplement       0.977800
5         existing_credits  KSComplement       0.883900
6           num_dependents  KSComplement       0.934433
7          checking_status  TVComplement       0.859067
8           credit_history  TVComplement       0.812833
9                  purpose  TVComplement       0.803333
10          savings_status  TVComplement       0.646467
11              employment  TVComplement       0.823167
12         personal_status  TVComplement       0.914600
13           other_parties  TVComplement       0.883467
14      property_m

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.32it/s]



Overall Quality Score: 91.59%

Properties:
Column Shapes: 93.99%
Column Pair Trends: 89.18%
Hybrid dataset report:
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.900233
1            credit_amount  KSComplement       0.939000
2   installment_commitment  KSComplement       0.951000
3          residence_since  KSComplement       0.927200
4                      age  KSComplement       0.977800
5         existing_credits  KSComplement       0.883900
6           num_dependents  KSComplement       0.934433
7          checking_status  TVComplement       0.874200
8           credit_history  TVComplement       0.946100
9                  purpose  TVComplement       0.902133
10          savings_status  TVComplement       0.968933
11              employment  TVComplement       0.936400
12         personal_status  TVComplement       0.937733
13           other_parties  TVComplement       0.939400
14      property_magnitude  TVComplement    

Unnamed: 0,Column,Metric,Quality Score_1,Quality Score_2,Quality Score_3
0,duration,KSComplement,0.900233,0.8666,0.900233
1,credit_amount,KSComplement,0.939,0.592633,0.939
2,installment_commitment,KSComplement,0.951,0.896733,0.951
3,residence_since,KSComplement,0.927133,0.9272,0.9272
4,age,KSComplement,0.9778,0.8707,0.9778
5,existing_credits,KSComplement,0.8839,0.779533,0.8839
6,num_dependents,KSComplement,0.934433,0.879267,0.934433
7,checking_status,TVComplement,0.859067,0.8742,0.8742
8,credit_history,TVComplement,0.812833,0.9461,0.9461
9,purpose,TVComplement,0.803333,0.902133,0.902133


In [61]:
hybrid(real_data,new_data2,synthetic_dataCTGAN2,metadata)

First dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.10it/s]



Overall Quality Score: 85.47%

Properties:
Column Shapes: 89.03%
Column Pair Trends: 81.9%
Second dataset quality score:


Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.00it/s]



Overall Quality Score: 81.8%

Properties:
Column Shapes: 84.55%
Column Pair Trends: 79.06%
--------------First dataset report:-------------
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.900233
1            credit_amount  KSComplement       0.939000
2   installment_commitment  KSComplement       0.951000
3          residence_since  KSComplement       0.927133
4                      age  KSComplement       0.977800
5         existing_credits  KSComplement       0.883900
6           num_dependents  KSComplement       0.934433
7          checking_status  TVComplement       0.859067
8           credit_history  TVComplement       0.812833
9                  purpose  TVComplement       0.803333
10          savings_status  TVComplement       0.646467
11              employment  TVComplement       0.823167
12         personal_status  TVComplement       0.914600
13           other_parties  TVComplement       0.883467
14      property_ma

Creating report: 100%|███████████████████████████████████████████████████████████████████| 4/4 [00:04<00:00,  1.16s/it]



Overall Quality Score: 89.29%

Properties:
Column Shapes: 92.07%
Column Pair Trends: 86.5%
Hybrid dataset report:
                    Column        Metric  Quality Score
0                 duration  KSComplement       0.900233
1            credit_amount  KSComplement       0.939000
2   installment_commitment  KSComplement       0.951000
3          residence_since  KSComplement       0.946767
4                      age  KSComplement       0.977800
5         existing_credits  KSComplement       0.883900
6           num_dependents  KSComplement       0.934433
7          checking_status  TVComplement       0.859067
8           credit_history  TVComplement       0.861900
9                  purpose  TVComplement       0.870800
10          savings_status  TVComplement       0.887967
11              employment  TVComplement       0.885000
12         personal_status  TVComplement       0.914600
13           other_parties  TVComplement       0.941833
14      property_magnitude  TVComplement     

Unnamed: 0,Column,Metric,Quality Score_1,Quality Score_2,Quality Score_3
0,duration,KSComplement,0.900233,0.758667,0.900233
1,credit_amount,KSComplement,0.939,0.473667,0.939
2,installment_commitment,KSComplement,0.951,0.880767,0.951
3,residence_since,KSComplement,0.927133,0.946767,0.946767
4,age,KSComplement,0.9778,0.6236,0.9778
5,existing_credits,KSComplement,0.8839,0.829533,0.8839
6,num_dependents,KSComplement,0.934433,0.930967,0.934433
7,checking_status,TVComplement,0.859067,0.819433,0.859067
8,credit_history,TVComplement,0.812833,0.8619,0.8619
9,purpose,TVComplement,0.803333,0.8708,0.8708


In [None]:
#removing columns:
ht = HyperTransformer()
ht.detect_initial_config(data=real_data)
col = real_data.columns
col =[i for i in col if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] < tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]]  
print(col)
# do not transform the credit_card or age columns
ht.remove_transformers(column_names=col)
# do not transform any categorical columns in the dataset
ht.remove_transformers_by_sdtype(sdtype='numerical')
config = ht.get_config()
config

In [None]:
print(col)

In [None]:
tab1

In [None]:
from rdt.transformers.categorical import LabelEncoder
ht.update_transformers(column_name_to_transformer={
        "checking_status": LabelEncoder(),
        "duration": None,
        "credit_history":LabelEncoder(),
        "purpose": None,
        "credit_amount": None,
        "savings_status": LabelEncoder(),
        "employment": LabelEncoder(),
        "installment_commitment": None,
        "personal_status": LabelEncoder(),
        "other_parties": LabelEncoder(),
        "residence_since": None,
        "property_magnitude": LabelEncoder(),
        "age": None,
        "other_payment_plans": None,
        "housing": None,
        "existing_credits": None,
        "job": LabelEncoder(),
        "num_dependents": None,
        "own_telephone": LabelEncoder(),
        "foreign_worker": LabelEncoder(),
        "class": None
    })

In [None]:
    col1 =[i for i in col if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] < tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]]  
    # do not transform the credit_card or age columns
    ht.remove_transformers(column_names=col1)
    # do not transform any categorical columns in the dataset
    ht.remove_transformers_by_sdtype(sdtype='numerical')
    config = ht.get_config()
    print("Details of transformation")
    print(config)
    print("Name of columns for which transformation have  been used:")
    col=[i for i in col if i not in col1]
    print(col)

In [None]:
    #merging datasets
    df2=gan_data
    df1=new_data
    ht.fit(df2)
    transformed_df2 = ht.transform(df2)
    ht.fit(df1)
    transformed_df1 = ht.transform(df1)
    col=df1.columns
    dff2=transformed_df2
    dff1=transformed_df1
    for i in col:
        if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] > tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]:
            dff2=dff2.sort_values(by=[i])
            dff1=dff1.sort_values(by=[i])
            dff2[i]=dff1[i]
    #reversing conversion
    
    reversed_dff = ht.reverse_transform(dff2)
    
    
    print("Hybrid quality score")
    quality_report4 = evaluate_quality(
    real_data,
    reversed_dff,
    metadata) 
    
    #qualityscore
    tab3=quality_report4.get_details('Column Shapes')
    print("Hybrid dataset report:")
    print(tab3)
    print(" Combined Quality score")
    tab1['Quality Score_3']=tab3['Quality Score']
    
    

In [None]:
tab1

In [None]:
from rdt import HyperTransformer
from rdt.transformers.categorical import LabelEncoder
from sklearn import preprocessing
def hybrid(real_data,df1,df2,metadata,col):
    print("First dataset quality score:")
    quality_report1 = evaluate_quality(
    real_data,
    df1,
    metadata)
    print("Second dataset quality score:")
    quality_report2 = evaluate_quality(
    real_data,
    df2,
    metadata)
    tab1=quality_report1.get_details('Column Shapes')
    tab2=quality_report2.get_details('Column Shapes')
    print("--------------First dataset report:-------------")
    print(tab1)
    print("--------------Second dataset report:------------")
    print(tab2)
    print("------------- Combined Quality score------------")
    tab1['Quality Score_1']=tab1['Quality Score']
    tab1['Quality Score_2']=tab2['Quality Score']
    tab1=tab1.drop('Quality Score',axis=1)
    print(tab1)
    #conversion of data
    
    le = preprocessing.LabelEncoder()
    
    #Transformaer update(only for those variable that are not of good quality)
    
    col1 =[i for i in col if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] < tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]]  
    # do not transform the credit_card or age columns
    ht.remove_transformers(column_names=col1)
    # do not transform any categorical columns in the dataset
    ht.remove_transformers_by_sdtype(sdtype='numerical')
    config = ht.get_config()
    print("Details of transformation")
    print(config)
    print("Name of columns for which transformation have  been used:")
    col=[i for i in col if i not in col1]
    print(col)
    
    
    #merging datasets 
    
    dff2=transformed_df2
    dff1=transformed_df1
    for i in col:
        if tab1.loc[tab1['Column']==i, 'Quality Score_1'].values[0] > tab1.loc[tab1['Column']==i, 'Quality Score_2'].values[0]:
            dff2=dff2.sort_values(by=[i])
            dff1=dff1.sort_values(by=[i])
            dff2[i]=dff1[i]
    #reversing conversion
    
    reversed_dff = ht.reverse_transform(dff2)
    
    
    print("Hybrid quality score")
    quality_report4 = evaluate_quality(
    real_data,
    reversed_dff,
    metadata) 
    
    #qualityscore
    tab3=quality_report4.get_details('Column Shapes')
    print("Hybrid dataset report:")
    print(tab3)
    print(" Combined Quality score")
    tab1['Quality Score_3']=tab3['Quality Score']
    
    
    return tab1

In [39]:
from rdt import HyperTransformer
ht = HyperTransformer()
ht.detect_initial_config(data=real_data)
col = real_data.columns
col

Index(['checking_status', 'duration', 'credit_history', 'purpose',
       'credit_amount', 'savings_status', 'employment',
       'installment_commitment', 'personal_status', 'other_parties',
       'residence_since', 'property_magnitude', 'age', 'other_payment_plans',
       'housing', 'existing_credits', 'job', 'num_dependents', 'own_telephone',
       'foreign_worker', 'class'],
      dtype='object')

In [40]:
config = ht.get_config()
config

{
    "sdtypes": {
        "checking_status": "categorical",
        "duration": "numerical",
        "credit_history": "categorical",
        "purpose": "categorical",
        "credit_amount": "numerical",
        "savings_status": "categorical",
        "employment": "categorical",
        "installment_commitment": "numerical",
        "personal_status": "categorical",
        "other_parties": "categorical",
        "residence_since": "numerical",
        "property_magnitude": "categorical",
        "age": "numerical",
        "other_payment_plans": "categorical",
        "housing": "categorical",
        "existing_credits": "numerical",
        "job": "categorical",
        "num_dependents": "numerical",
        "own_telephone": "categorical",
        "foreign_worker": "categorical",
        "class": "categorical"
    },
    "transformers": {
        "checking_status": FrequencyEncoder(),
        "duration": FloatFormatter(),
        "credit_history": FrequencyEncoder(),
        "pu

In [26]:
from rdt.transformers.categorical import LabelEncoder

for feature in col:
    if config["transformers"][feature] == 'FrequencyEncoder()' :
        config["transformers"][feature] = LabelEncoder()

In [27]:
config


{
    "sdtypes": {
        "checking_status": "categorical",
        "duration": "numerical",
        "credit_history": "categorical",
        "purpose": "categorical",
        "credit_amount": "numerical",
        "savings_status": "categorical",
        "employment": "categorical",
        "installment_commitment": "numerical",
        "personal_status": "categorical",
        "other_parties": "categorical",
        "residence_since": "numerical",
        "property_magnitude": "categorical",
        "age": "numerical",
        "other_payment_plans": "categorical",
        "housing": "categorical",
        "existing_credits": "numerical",
        "job": "categorical",
        "num_dependents": "numerical",
        "own_telephone": "categorical",
        "foreign_worker": "categorical",
        "class": "categorical"
    },
    "transformers": {
        "checking_status": FrequencyEncoder(),
        "duration": FloatFormatter(),
        "credit_history": FrequencyEncoder(),
        "pu

In [35]:
for feature in col:
    if config["transformers"][feature] == 'FrequencyEncoder()' :
          print(feature)

In [41]:



for feature in col:
    if config["transformers"][feature].__class__.__name__ == 'FrequencyEncoder':
        config["transformers"][feature] = LabelEncoder()

config

{
    "sdtypes": {
        "checking_status": "categorical",
        "duration": "numerical",
        "credit_history": "categorical",
        "purpose": "categorical",
        "credit_amount": "numerical",
        "savings_status": "categorical",
        "employment": "categorical",
        "installment_commitment": "numerical",
        "personal_status": "categorical",
        "other_parties": "categorical",
        "residence_since": "numerical",
        "property_magnitude": "categorical",
        "age": "numerical",
        "other_payment_plans": "categorical",
        "housing": "categorical",
        "existing_credits": "numerical",
        "job": "categorical",
        "num_dependents": "numerical",
        "own_telephone": "categorical",
        "foreign_worker": "categorical",
        "class": "categorical"
    },
    "transformers": {
        "checking_status": LabelEncoder(),
        "duration": FloatFormatter(),
        "credit_history": LabelEncoder(),
        "purpose": 