# Imports

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, average_precision_score, confusion_matrix, ConfusionMatrixDisplay, accuracy_score, f1_score

In [2]:
from sdv.metadata import Metadata
from sdv.single_table import CopulaGANSynthesizer
from sdv.single_table import TVAESynthesizer
from sdv.single_table import CTGANSynthesizer

from ctgan import CTGAN
from ctgan import TVAE

In [3]:
%load_ext autoreload
%autoreload 2
from models.kan_ctgan import KANCTGAN
from models.kan_ctabgan import KANCTABGAN
from models.kan_tvae import KANTVAE

In [4]:
from catboost import CatBoostClassifier, Pool
from catboost.utils import eval_metric

In [5]:
import sdmetrics
from sdmetrics.reports.single_table import QualityReport

# german_credit_data_target

## preproc

In [6]:
df1 = pd.read_csv('data/german_credit_data_target.csv', index_col=0)
df1.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,,little,1169,6,radio/TV,good
1,22,female,2,own,little,moderate,5951,48,radio/TV,bad
2,49,male,1,own,little,,2096,12,education,good
3,45,male,2,free,little,little,7882,42,furniture/equipment,good
4,53,male,2,free,little,little,4870,24,car,bad


In [7]:
df1['Job'] = df1['Job'].astype(str)

In [8]:
target = 'Risk'

In [9]:
df1['Saving accounts'] = df1['Saving accounts'].fillna('no_info')
df1['Checking account'] = df1['Checking account'].fillna('no_info')

r = {"good":0, "bad": 1}
df1[target] = df1[target].map(r)

df1.head()

Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose,Risk
0,67,male,2,own,no_info,little,1169,6,radio/TV,0
1,22,female,2,own,little,moderate,5951,48,radio/TV,1
2,49,male,1,own,little,no_info,2096,12,education,0
3,45,male,2,free,little,little,7882,42,furniture/equipment,0
4,53,male,2,free,little,little,4870,24,car,1


## train_test

In [10]:
X_train, X_test, y_train, y_test = train_test_split(df1.drop(target, axis=1), df1[target], 
                                                    test_size=0.3, random_state=42,
                                                    stratify=df1[target])
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

X_test.head()

(700, 9) (300, 9) (700,) (300,)


Unnamed: 0,Age,Sex,Job,Housing,Saving accounts,Checking account,Credit amount,Duration,Purpose
0,47,male,2,own,no_info,no_info,3578,48,radio/TV
1,23,male,2,own,little,moderate,882,13,radio/TV
2,31,male,2,own,little,rich,4473,36,radio/TV
3,33,female,2,own,little,no_info,2831,30,radio/TV
4,21,male,1,own,little,little,1289,12,furniture/equipment


In [11]:
X_train_0 = X_train[y_train==0]
X_train_1 = X_train[y_train==1]

y_train_ = np.concatenate([np.zeros(len(X_train_0)), np.ones(len(X_train_1))])

In [12]:
int_cols = [col for col in X_train.columns if X_train[col].dtype == 'int' and col != target]
num_cols = [col for col in X_train.columns if X_train[col].dtype != 'O' and col != target]
cat_cols = [col for col in X_train.columns if X_train[col].dtype == 'O' and col != target]

print(num_cols, cat_cols)

['Age', 'Credit amount', 'Duration'] ['Sex', 'Job', 'Housing', 'Saving accounts', 'Checking account', 'Purpose']


## Gen

In [13]:
ep = 150 # epochs

### SDV

In [14]:
metadata = Metadata.detect_from_dataframe(
    data=X_train_0)

sdv_ctgan = CTGANSynthesizer(metadata, epochs=ep)
sdv_tvae = TVAESynthesizer(metadata, epochs=ep)
sdv_copulagan = CopulaGANSynthesizer(metadata, epochs=ep)

sdv_ctgan.fit(X_train_0)
sdv_tvae.fit(X_train_0)
sdv_copulagan.fit(X_train_0)

X_train_0_sdv_ctgan = sdv_ctgan.sample(len(X_train_0))
X_train_0_sdv_tvae = sdv_tvae.sample(len(X_train_0))
X_train_0_sdv_copulagan = sdv_copulagan.sample(len(X_train_0))

In [15]:
metadata = Metadata.detect_from_dataframe(
    data=X_train_1)

sdv_ctgan = CTGANSynthesizer(metadata, epochs=ep)
sdv_tvae = TVAESynthesizer(metadata, epochs=ep)
sdv_copulagan = CopulaGANSynthesizer(metadata, epochs=ep)

sdv_ctgan.fit(X_train_1)
sdv_tvae.fit(X_train_1)
sdv_copulagan.fit(X_train_1)

X_train_1_sdv_ctgan = sdv_ctgan.sample(len(X_train_1))
X_train_1_sdv_tvae = sdv_tvae.sample(len(X_train_1))
X_train_1_sdv_copulagan = sdv_copulagan.sample(len(X_train_1))

In [16]:
X_train_sdv_ctgan = pd.concat([X_train_0_sdv_ctgan, X_train_1_sdv_ctgan]).reset_index(drop=True)
X_train_sdv_tvae = pd.concat([X_train_0_sdv_tvae, X_train_1_sdv_tvae]).reset_index(drop=True)
X_train_sdv_copulagan = pd.concat([X_train_0_sdv_copulagan, X_train_1_sdv_copulagan]).reset_index(drop=True)

### ctgan

In [17]:
ctgan = CTGAN(epochs=ep)
tvae = TVAE(epochs=ep)

ctgan.set_random_state(42)
tvae.set_random_state(42)

ctgan.fit(X_train[y_train==0], cat_cols)
tvae.fit(X_train[y_train==0], cat_cols)

X_train_0_ctgan = ctgan.sample(len(X_train_0))
X_train_0_tvae = tvae.sample(len(X_train_0))

In [18]:
ctgan = CTGAN(epochs=ep)
tvae = TVAE(epochs=ep)

ctgan.set_random_state(42)
tvae.set_random_state(42)

ctgan.fit(X_train[y_train==1], cat_cols)
tvae.fit(X_train[y_train==1], cat_cols)

X_train_1_ctgan = ctgan.sample(len(X_train_1))
X_train_1_tvae = tvae.sample(len(X_train_1))

In [19]:
X_train_ctgan = pd.concat([X_train_0_ctgan, X_train_1_ctgan]).reset_index(drop=True)
X_train_tvae = pd.concat([X_train_0_tvae, X_train_1_tvae]).reset_index(drop=True)

### kan variations

In [20]:
kan_ctgan = KANCTGAN(epochs=ep)
kan_tvae = KANTVAE(epochs=ep)

kan_ctgan.set_random_state(42)
kan_tvae.set_random_state(42)

kan_ctgan.fit(X_train[y_train==0], cat_cols)
kan_tvae.fit(X_train[y_train==0], cat_cols)

X_train_0_kan_ctgan = kan_ctgan.sample(len(X_train_0))
X_train_0_kan_tvae = kan_tvae.sample(len(X_train_0))

In [21]:
kan_ctgan = KANCTGAN(epochs=ep)
kan_tvae = KANTVAE(epochs=ep)

kan_ctgan.set_random_state(42)
kan_tvae.set_random_state(42)

kan_ctgan.fit(X_train[y_train==1], cat_cols)
kan_tvae.fit(X_train[y_train==1], cat_cols)

X_train_1_kan_ctgan = kan_ctgan.sample(len(X_train_1))
X_train_1_kan_tvae = kan_tvae.sample(len(X_train_1))

In [22]:
X_train_kan_ctgan = pd.concat([X_train_0_kan_ctgan, X_train_1_kan_ctgan]).reset_index(drop=True)
X_train_kan_tvae = pd.concat([X_train_0_kan_tvae, X_train_1_kan_tvae]).reset_index(drop=True)

In [23]:
# takes a long time to learn and works poorly
#
# kan_ctabgan = KANCTABGAN(pd.concat([X_train, y_train], axis=1), 
#                          categorical_columns=cat_cols,
#                          integer_columns=int_cols,
#                          non_categorical_columns=num_cols,
#                          problem_type= {'Classification': target})
# kan_ctabgan.fit()
# X_train_kan_ctabgan = kan_ctabgan.generate_samples()
# X_train_kan_ctabgan['Risk'] = X_train_kan_ctabgan['Risk'].round()

## modeling

In [24]:
catboost_params = {'max_depth': 7, 'learning_rate': 0.035, 
                   'min_data_in_leaf': 1295, 'subsample': 0.95, 
                   'grow_policy': 'SymmetricTree', 'loss_function': 'Logloss',
                   'random_seed': 42}

In [25]:
X_train_data = {
    'real': X_train,

    'sdv_ctgan': X_train_sdv_ctgan,
    'sdv_tvae': X_train_sdv_tvae,
    'sdv_copulagan': X_train_sdv_copulagan,

    'ctgan': X_train_ctgan,
    'tvae': X_train_tvae,

    'kan_ctgan': X_train_kan_ctgan,
    'kan_tvae': X_train_kan_tvae,

    # 'kan_ctabgab': X_train_kan_ctabgan,

}

In [26]:
results = pd.DataFrame(columns = ['name', 'auc', 'avp', 'acc', 'f1'])
for name, data in X_train_data.items():
    model = CatBoostClassifier(**catboost_params)
    if name=='real':
        model.fit(
            data,
            y_train,
            cat_features=cat_cols,
            verbose=False)
    # elif name=='kan_ctabgab':
    #     model.fit(
    #         data.drop(target, axis=1),
    #         data[target],
    #         cat_features=cat_cols,
    #         verbose=False)
    else:
        model.fit(
            data,
            y_train_,
            cat_features=cat_cols,
            verbose=False)

    y_pred = model.predict(X_test)
    auc = roc_auc_score(y_test, y_pred)
    avp = average_precision_score(y_test, y_pred)
    acc = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    results.loc[len(results)] = [name, auc, avp, acc, f1]


In [27]:
results.sort_values('auc', ascending=False)

Unnamed: 0,name,auc,avp,acc,f1
7,kan_tvae,0.679365,0.407216,0.613333,0.567164
2,sdv_tvae,0.669841,0.399292,0.6,0.558824
5,tvae,0.669841,0.399292,0.6,0.558824
6,kan_ctgan,0.652381,0.417895,0.726667,0.506024
0,real,0.638095,0.412258,0.733333,0.473684
4,ctgan,0.621429,0.372857,0.643333,0.488038
1,sdv_ctgan,0.603175,0.37463,0.706667,0.413333
3,sdv_copulagan,0.556349,0.330415,0.636667,0.369942


## sdmetrics

In [28]:
metadata = Metadata.detect_from_dataframe(
    data=X_train)

results = pd.DataFrame(columns = ['name', 'Column Shapes', 'Column Pair Trends', 'Average'])
for name, data in X_train_data.items():
    model = CatBoostClassifier(**catboost_params)
    my_report = QualityReport()
    my_report.generate(X_train, data, metadata.to_dict()['tables']['table'])


    results.loc[len(results)] = [name, 
                                 my_report.get_properties()['Score'].tolist()[0], 
                                 my_report.get_properties()['Score'].tolist()[1], 
                                 my_report.get_score()]


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 2930.57it/s]|
Column Shapes Score: 100.0%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 1059.88it/s]|
Column Pair Trends Score: 100.0%

Overall Score (Average): 100.0%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 3110.73it/s]|
Column Shapes Score: 87.16%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 988.87it/s]|
Column Pair Trends Score: 79.67%

Overall Score (Average): 83.42%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 3054.10it/s]|
Column Shapes Score: 65.44%

(2/2) Evaluating Column Pair Trends: |██████████| 36/36 [00:00<00:00, 934.50it/s]|
Column Pair Trends Score: 52.93%

Overall Score (Average): 59.19%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 9/9 [00:00<00:00, 3112.01it/s]|
Column Shapes Score: 87.59%

(2/2) Evaluating Column Pair Tren

In [29]:
results.sort_values('Average', ascending=False)

Unnamed: 0,name,Column Shapes,Column Pair Trends,Average
0,real,1.0,1.0,1.0
6,kan_ctgan,0.918254,0.83591,0.877082
1,sdv_ctgan,0.871587,0.796736,0.834162
3,sdv_copulagan,0.875873,0.771384,0.823629
4,ctgan,0.856032,0.786877,0.821455
2,sdv_tvae,0.654444,0.529298,0.591871
7,kan_tvae,0.63746,0.507973,0.572717
5,tvae,0.63619,0.494146,0.565168


# loans_full_schema

## preproc

In [30]:
# dataset info - https://www.openintro.org/data/index.php?data=loans_full_schema

In [31]:
df2 = pd.read_csv('data/loans_full_schema.csv')

In [32]:
df2['emp_title'] = df2['emp_title'].fillna('unemp')
df2['emp_length'] = df2['emp_length'].astype('Int64').astype('str')
df2['emp_length'] = df2['emp_length'].fillna('unk')

In [33]:
df2['debt_to_income'] = df2['debt_to_income'].fillna(0) # Solo debt is 0 or .fillna(df2['debt_to_income_joint'])
df2['annual_income_joint'] = df2['annual_income_joint'].fillna(df2['annual_income'])
df2['debt_to_income_joint'] = df2['debt_to_income_joint'].fillna(df2['debt_to_income'])

In [34]:
df2 = df2.drop(['num_accounts_120d_past_due'], axis=1)

In [35]:
temp1 = (df2['verification_income_joint'].isna()) & (df2['application_type']=='individual')
df2.loc[temp1, 'verification_income_joint'] = df2.loc[temp1, 'verification_income_joint'].fillna('Solo')
df2['verification_income_joint'] = df2['verification_income_joint'].fillna('Not Verified')
del temp1

In [36]:
# months_since_last_credit_inquiry - Number of months since the last credit inquiry on this applicant.
# months_since_90d_late - Months since the last time the applicant was 90 days late on a payment.
# months_since_last_delinq - Months since the last delinquency.

In [37]:
temp1 = (df2['months_since_last_delinq'].isna()) & (df2['delinq_2y']!=0)
df2.loc[temp1, 'months_since_last_delinq'] = df2.loc[temp1, 'months_since_last_delinq'].fillna(24)
df2['months_since_last_delinq'] = df2['months_since_last_delinq'].fillna(-1)
del temp1

In [38]:
df2['months_since_last_credit_inquiry'] = df2['months_since_last_credit_inquiry'].fillna(-1)
df2['months_since_90d_late'] = df2['months_since_90d_late'].fillna(-1)

In [39]:
df2 = df2.drop(['issue_month'], axis=1)

In [40]:
df2['term'] = df2['term'].astype(str)

In [41]:
target_cols = ['sub_grade', 
               'emp_title', # its not connected with target, but has too many cat values
               'paid_late_fees', 'paid_interest', 'paid_principal', 'paid_total', 'balance', 'loan_status']
df2 = df2.drop(target_cols, axis=1)
df2.head()

Unnamed: 0,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,debt_to_income_joint,delinq_2y,...,public_record_bankrupt,loan_purpose,application_type,loan_amount,term,interest_rate,installment,grade,initial_listing_status,disbursement_method
0,3,NJ,MORTGAGE,90000.0,Verified,18.01,90000.0,Solo,18.01,0,...,0,moving,individual,28000,60,14.07,652.53,C,whole,Cash
1,10,HI,RENT,40000.0,Not Verified,5.04,40000.0,Solo,5.04,0,...,1,debt_consolidation,individual,5000,36,12.61,167.54,C,whole,Cash
2,3,WI,RENT,40000.0,Source Verified,21.15,40000.0,Solo,21.15,0,...,0,other,individual,2000,36,17.09,71.4,D,fractional,Cash
3,1,PA,RENT,30000.0,Not Verified,10.16,30000.0,Solo,10.16,0,...,0,debt_consolidation,individual,21600,36,6.72,664.19,A,whole,Cash
4,10,CA,RENT,35000.0,Verified,57.96,57000.0,Verified,37.66,0,...,0,credit_card,joint,23000,36,14.07,786.87,C,whole,Cash


In [42]:
target = 'grade'
int_cols = [col for col in df2.columns if df2[col].dtype == 'int' and col != target]
num_cols = [col for col in df2.columns if df2[col].dtype != 'O' and col != target]
cat_cols = [col for col in df2.columns if df2[col].dtype == 'O' and col != target]

print(num_cols, cat_cols)

['annual_income', 'debt_to_income', 'annual_income_joint', 'debt_to_income_joint', 'delinq_2y', 'months_since_last_delinq', 'earliest_credit_line', 'inquiries_last_12m', 'total_credit_lines', 'open_credit_lines', 'total_credit_limit', 'total_credit_utilized', 'num_collections_last_12m', 'num_historical_failed_to_pay', 'months_since_90d_late', 'current_accounts_delinq', 'total_collection_amount_ever', 'current_installment_accounts', 'accounts_opened_24m', 'months_since_last_credit_inquiry', 'num_satisfactory_accounts', 'num_accounts_30d_past_due', 'num_active_debit_accounts', 'total_debit_limit', 'num_total_cc_accounts', 'num_open_cc_accounts', 'num_cc_carrying_balance', 'num_mort_accounts', 'account_never_delinq_percent', 'tax_liens', 'public_record_bankrupt', 'loan_amount', 'interest_rate', 'installment'] ['emp_length', 'state', 'homeownership', 'verified_income', 'verification_income_joint', 'loan_purpose', 'application_type', 'term', 'initial_listing_status', 'disbursement_method']


## train_test

In [43]:
X_train, X_test, y_train, y_test = train_test_split(df2.drop(target, axis=1), df2[target], 
                                                    test_size=0.3, random_state=42,
                                                    stratify=df2[target])
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

X_train.reset_index(drop=True, inplace=True)
X_test.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_test.reset_index(drop=True, inplace=True)

X_test.head()

(7000, 44) (3000, 44) (7000,) (3000,)


Unnamed: 0,emp_length,state,homeownership,annual_income,verified_income,debt_to_income,annual_income_joint,verification_income_joint,debt_to_income_joint,delinq_2y,...,tax_liens,public_record_bankrupt,loan_purpose,application_type,loan_amount,term,interest_rate,installment,initial_listing_status,disbursement_method
0,10,NY,RENT,39000.0,Source Verified,21.05,39000.0,Solo,21.05,1,...,0,0,debt_consolidation,individual,8000,36,24.84,317.41,fractional,Cash
1,10,VA,MORTGAGE,155000.0,Not Verified,17.64,155000.0,Solo,17.64,0,...,0,1,debt_consolidation,individual,10000,36,10.41,324.61,whole,Cash
2,5,WA,MORTGAGE,46000.0,Verified,24.47,46000.0,Solo,24.47,0,...,0,0,debt_consolidation,individual,6700,36,14.08,229.26,fractional,Cash
3,10,TN,MORTGAGE,110000.0,Not Verified,6.71,110000.0,Solo,6.71,0,...,0,0,debt_consolidation,individual,12000,36,5.31,361.33,whole,Cash
4,4,MN,MORTGAGE,30000.0,Verified,45.78,80000.0,Verified,29.95,0,...,0,0,debt_consolidation,joint,15100,60,12.62,340.65,whole,Cash


In [44]:
X_train_sdv_ctgan = pd.DataFrame()
X_train_sdv_tvae = pd.DataFrame()
X_train_sdv_copulagan = pd.DataFrame()
X_train_ctgan = pd.DataFrame()
X_train_tvae = pd.DataFrame()
X_train_kan_ctgan = pd.DataFrame()
X_train_kan_tvae = pd.DataFrame()

y_train_gen = pd.Series()

grades = y_train.unique()
grades.sort()

ep = 150

for g in grades:
    X_train_g = X_train[y_train==g]
    n = len(X_train_g)
    y_train_gen = pd.concat([y_train_gen, y_train[y_train==g]]).reset_index(drop=True)

    metadata = Metadata.detect_from_dataframe(data=X_train_g)

    sdv_ctgan = CTGANSynthesizer(metadata, epochs=ep)
    sdv_tvae = TVAESynthesizer(metadata, epochs=ep)
    sdv_copulagan = CopulaGANSynthesizer(metadata, epochs=ep)
    
    ctgan = CTGAN(epochs=ep)
    tvae = TVAE(epochs=ep)
    kan_ctgan = KANCTGAN(epochs=ep)
    kan_tvae = KANTVAE(epochs=ep)

    ctgan.set_random_state(42)
    tvae.set_random_state(42)
    kan_ctgan.set_random_state(42)
    kan_tvae.set_random_state(42)

    sdv_ctgan.fit(X_train_g)
    sdv_tvae.fit(X_train_g)
    sdv_copulagan.fit(X_train_g)

    ctgan.fit(X_train_g, cat_cols)
    tvae.fit(X_train_g, cat_cols)

    kan_ctgan.fit(X_train_g, cat_cols)
    kan_tvae.fit(X_train_g, cat_cols)

    X_train_sdv_ctgan = pd.concat([X_train_sdv_ctgan, sdv_ctgan.sample(n)]).reset_index(drop=True)
    X_train_sdv_tvae = pd.concat([X_train_sdv_tvae, sdv_tvae.sample(n)]).reset_index(drop=True)
    X_train_sdv_copulagan = pd.concat([X_train_sdv_copulagan, sdv_copulagan.sample(n)]).reset_index(drop=True)
    X_train_ctgan = pd.concat([X_train_ctgan, ctgan.sample(n)]).reset_index(drop=True)
    X_train_tvae = pd.concat([X_train_tvae, tvae.sample(n)]).reset_index(drop=True)
    X_train_kan_ctgan = pd.concat([X_train_kan_ctgan, kan_ctgan.sample(n)]).reset_index(drop=True)
    X_train_kan_tvae = pd.concat([X_train_kan_tvae, kan_tvae.sample(n)]).reset_index(drop=True)

In [45]:
X_train.to_csv('data/d2_synth/X_train.csv')
X_test.to_csv('data/d2_synth/X_test.csv')
y_test.to_csv('data/d2_synth/y_test.csv')
y_train.to_csv('data/d2_synth/y_train.csv')
y_train_gen.to_csv('data/d2_synth/y_train_gen.csv')
X_train_sdv_ctgan.to_csv('data/d2_synth/X_train_sdv_ctgan.csv')
X_train_sdv_tvae.to_csv('data/d2_synth/X_train_sdv_tvae.csv')
X_train_sdv_copulagan.to_csv('data/d2_synth/X_train_sdv_copulagan.csv')
X_train_ctgan.to_csv('data/d2_synth/X_train_ctgan.csv')
X_train_tvae.to_csv('data/d2_synth/X_train_tvae.csv')
X_train_kan_ctgan.to_csv('data/d2_synth/X_train_kan_ctgan.csv')
X_train_kan_tvae.to_csv('data/d2_synth/X_train_kan_tvae.csv')

In [46]:
y_train = pd.get_dummies(y_train)
y_test = pd.get_dummies(y_test)
y_train_gen = pd.get_dummies(y_train_gen)

In [47]:
results = pd.DataFrame(columns=['name', 'acc', 'mean_acc_per_class', 'hamming', 'prec_avg', 'rec_avg', 'f1_avg'])

train_pool = Pool(X_train.drop('interest_rate', axis=1), y_train, cat_features=cat_cols)
test_pool = Pool(X_test.drop('interest_rate', axis=1), y_test, cat_features=cat_cols)
model = CatBoostClassifier(
    loss_function='MultiLogloss',
    class_names=list(y_train.columns))

model.fit(train_pool, verbose=False)

test_predict = model.predict(X_test.drop('interest_rate', axis=1))

accuracy = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Accuracy')[0]
accuracy_per_class = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Accuracy:type=PerClass')
hamming = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'HammingLoss')[0]
mean_accuracy_per_class = sum(accuracy_per_class) / len(accuracy_per_class)
prec_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Precision'))
rec_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Recall'))
f1_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'F1'))

results.loc[len(results)] = ['Real', accuracy, mean_accuracy_per_class, hamming, prec_avg, rec_avg, f1_avg]

Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0


In [48]:
results

Unnamed: 0,name,acc,mean_acc_per_class,hamming,prec_avg,rec_avg,f1_avg
0,Real,0.452,0.914333,0.085667,0.615829,0.262325,0.338158


In [49]:
X_train_data = {
    'sdv_ctgan': X_train_sdv_ctgan,
    'sdv_tvae': X_train_sdv_tvae,
    'sdv_copulagan': X_train_sdv_copulagan, 
    'ctgan': X_train_ctgan,
    'tvae': X_train_tvae, 
    'kan_ctgan': X_train_kan_ctgan, 
    'kan_tvae': X_train_kan_tvae, 
}

In [50]:
for name, data in X_train_data.items():
    model = CatBoostClassifier(
        loss_function='MultiLogloss',
        class_names=list(y_train.columns))

    train_pool = Pool(data.drop('interest_rate', axis=1), y_train_gen, cat_features=cat_cols)
    model.fit(train_pool, verbose=False)

    test_predict = model.predict(X_test.drop('interest_rate', axis=1))
    
    accuracy = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Accuracy')[0]
    accuracy_per_class = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Accuracy:type=PerClass')
    hamming = eval_metric(y_test.astype(int).to_numpy(), test_predict, 'HammingLoss')[0]
    mean_accuracy_per_class = sum(accuracy_per_class) / len(accuracy_per_class)
    prec_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Precision'))
    rec_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'Recall'))
    f1_avg = np.mean(eval_metric(y_test.astype(int).to_numpy(), test_predict, 'F1'))

    results.loc[len(results)] = [name, accuracy, mean_accuracy_per_class, hamming, prec_avg, rec_avg, f1_avg]


Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0
Number of the positive class predictions is 0. Setting Precision metric value to the default 0


In [51]:
results.sort_values('f1_avg', ascending=False)

Unnamed: 0,name,acc,mean_acc_per_class,hamming,prec_avg,rec_avg,f1_avg
0,Real,0.452,0.914333,0.085667,0.615829,0.262325,0.338158
7,kan_tvae,0.197333,0.841286,0.158714,0.268137,0.132641,0.155398
6,kan_ctgan,0.214333,0.839333,0.160667,0.203492,0.131939,0.144069
2,sdv_tvae,0.193667,0.847762,0.152238,0.242928,0.112532,0.141822
1,sdv_ctgan,0.229333,0.81619,0.18381,0.190946,0.129988,0.139235
5,tvae,0.192333,0.837333,0.162667,0.214568,0.113611,0.130051
3,sdv_copulagan,0.212,0.824667,0.175333,0.174157,0.115879,0.116112
4,ctgan,0.189333,0.825,0.175,0.18762,0.107576,0.11449


In [52]:
metadata = Metadata.detect_from_dataframe(
    data=X_train)

results = pd.DataFrame(columns = ['name', 'Column Shapes', 'Column Pair Trends', 'Average'])
for name, data in X_train_data.items():
    my_report = QualityReport()
    my_report.generate(X_train, data, metadata.to_dict()['tables']['table'])


    results.loc[len(results)] = [name, 
                                 my_report.get_properties()['Score'].tolist()[0], 
                                 my_report.get_properties()['Score'].tolist()[1], 
                                 my_report.get_score()]


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 44/44 [00:00<00:00, 625.56it/s]|
Column Shapes Score: 89.92%

(2/2) Evaluating Column Pair Trends: |██████████| 946/946 [00:01<00:00, 764.70it/s]|
Column Pair Trends Score: 87.11%

Overall Score (Average): 88.51%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 44/44 [00:00<00:00, 670.73it/s]|
Column Shapes Score: 83.9%

(2/2) Evaluating Column Pair Trends: |██████████| 946/946 [00:01<00:00, 759.83it/s]|
Column Pair Trends Score: 78.96%

Overall Score (Average): 81.43%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 44/44 [00:00<00:00, 657.57it/s]|
Column Shapes Score: 90.79%

(2/2) Evaluating Column Pair Trends: |██████████| 946/946 [00:01<00:00, 703.47it/s]|
Column Pair Trends Score: 88.06%

Overall Score (Average): 89.43%

Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 44/44 [00:00<00:00, 653.76it/s]|
Column Shapes Score: 88.3%

(2/2) Evaluating Column Pa

In [53]:
results.sort_values('Average', ascending=False)

Unnamed: 0,name,Column Shapes,Column Pair Trends,Average
2,sdv_copulagan,0.907934,0.88059,0.894262
0,sdv_ctgan,0.899173,0.871064,0.885118
5,kan_ctgan,0.908615,0.763544,0.836079
1,sdv_tvae,0.839027,0.789575,0.814301
3,ctgan,0.883027,0.704808,0.793917
4,tvae,0.807455,0.706119,0.756787
6,kan_tvae,0.779864,0.708767,0.744315
