In [13]:
%%capture

!pip install sdv
!pip install ucimlrepo
!pio install sdmetrics


In [15]:
import sdv
from ctgan import CTGAN
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings

In [3]:
warnings.filterwarnings('ignore')

## Loading data

In [4]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [5]:
y['income'] = y['income'].str.replace('.', '', regex=False)

In [6]:
all_data = pd.concat([X, y], axis=1)

## Fit CTGAN

In [7]:
# Names of the columns that are discrete
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income'
]

ctgan = CTGAN(epochs=100)
ctgan.fit(all_data, discrete_columns)

# Create synthetic data
#synthetic_data = ctgan.sample(1000)

## Classification results

In [8]:
def preprocess_data(data):
    df = data.copy()

    le_dict = {}
    for column in discrete_columns:
        if column in df.columns:
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column].astype(str))
            le_dict[column] = le

    return df, le_dict


def train_and_evaluate(X_train, X_test, y_train, y_test, n_splits=20):
    param_grid = {
        'n_estimators': [10, 50, 100],
        'min_child_weight': [5, 10, 20],
        'max_depth': [1, 10],
        'gamma': [0.0, 1.0]
    }

    scores = []

    for i in range(n_splits):
        X_train_split, X_val, y_train_split, y_val = train_test_split(
            X_train, y_train, test_size=0.111, random_state=i
        )

        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        grid_search = GridSearchCV(
            model, param_grid, scoring='roc_auc', cv=3
        )

        grid_search.fit(X_train_split, y_train_split)

        best_model = XGBClassifier(**grid_search.best_params_,
                                 use_label_encoder=False,
                                 eval_metric='auc')

        best_model.fit(X_train, y_train)

        y_pred = best_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)

X_processed, le_dict = preprocess_data(X)
y_processed = LabelEncoder().fit_transform(y)

X_train_real, X_test, y_train_real, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42
)

# Generate synthetic data of the same size
synthetic_data = ctgan.sample(len(X_train_real))
synthetic_data_processed, _ = preprocess_data(synthetic_data)

synthetic_y = synthetic_data_processed['income']
synthetic_X = synthetic_data_processed.drop('income', axis=1)


real_mean, real_std = train_and_evaluate(
    X_train_real, X_test, y_train_real, y_test
)

synthetic_mean, synthetic_std = train_and_evaluate(
    synthetic_X, X_test, synthetic_y, y_test
)

print("Результаты оценки классификаторов:")
print(f"Реальные данные - AUC: {real_mean:.3f} ± {real_std:.3f}")
print(f"Синтетические данные - AUC: {synthetic_mean:.3f} ± {synthetic_std:.3f}")

Результаты оценки классификаторов:
Реальные данные - AUC: 0.923 ± 0.001
Синтетические данные - AUC: 0.882 ± 0.000


## Metrics


In [9]:
synthetic_data = ctgan.sample(len(all_data))

In [10]:
from sdv.metadata import SingleTableMetadata

metadata = SingleTableMetadata()
metadata.detect_from_dataframe(data=all_data)
metadata = metadata.to_dict()

# Low-order statistics

### Column-wise density estimation

In [11]:
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport

qual_report = QualityReport()
qual_report.generate(all_data, synthetic_data, metadata)

diag_report = DiagnosticReport()
diag_report.generate(all_data, synthetic_data, metadata)

quality =  qual_report.get_properties()
fig = qual_report.get_visualization(property_name='Column Shapes')
fig.show()
diag = diag_report.get_properties()

Shape = quality['Score'][0]
Trend = quality['Score'][1]
print(Shape)
print(Trend)


Quality = (Shape + Trend) / 2

shapes = qual_report.get_details(property_name='Column Shapes')
trends = qual_report.get_details(property_name='Column Pair Trends')
validity = diag_report.get_details('Data Validity')
structure = diag_report.get_details('Data Structure')

#print(shapes)
#print(trends)
#print(validity)
#print(structure )


Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 38.13it/s]|
Column Shapes Score: 88.56%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:03<00:00, 28.06it/s]|
Column Pair Trends Score: 78.43%

Overall Score (Average): 83.5%

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 120.28it/s]|
Data Validity Score: 94.92%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 139.44it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 97.46%



0.8856375951160096
0.7842857319497145


In [24]:
cat_cols = ['workclass', 'education', 'marital-status', 'occupation',
            'relationship', 'race', 'sex', 'native-country', 'income']
numeric_cols = ['age', 'fnlwgt', 'education-num','capital-gain', 'capital-loss',
                'hours-per-week']

In [26]:
from sdmetrics.visualization import get_column_plot
for i in numeric_cols:
    fig = get_column_plot(
        real_data=all_data,
        synthetic_data=synthetic_data,
        column_name= i,
        plot_type='distplot'
    )

    fig.show()

In [27]:
fig = qual_report.get_visualization(property_name='Column Pair Trends')
fig.show()

### Pair-wise column correlation

In [None]:
from sdmetrics.column_pairs import CorrelationSimilarity

CorrelationSimilarity.compute(
    real_data=all_data[numeric_cols],
    synthetic_data=synthetic_data[numeric_cols],
    coefficient='Pearson'
)

0.9975088475417567

In [None]:
from sdmetrics.column_pairs import ContingencySimilarity
from itertools import combinations
import statistics

res = []
for col1, col2 in combinations(all_data.columns, 2):
  continious_cols = []
  if col1 in numeric_cols:
    continious_cols.append(col1)
  if col2 in numeric_cols:
    continious_cols.append(col2)
  similarity = ContingencySimilarity.compute(
      real_data=all_data[[col1, col2]],
      synthetic_data=synthetic_data[[col1, col2]],
      continuous_column_names=continious_cols
  )
  res.append(similarity)
  #print(col1, col2, similarity)
print(statistics.mean(res))

0.8623072258263282


### Classifier Two Sample Test

In [None]:
from sdmetrics.single_table import LogisticDetection

LogisticDetection.compute(
    real_data=all_data,
    synthetic_data=synthetic_data,
    metadata=metadata
)

0.6273446290173219

# High-order metrics

In [None]:
#%%capture

!pip install synthcity[full]

In [None]:
from sklearn.preprocessing import OneHotEncoder
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.metrics import eval_detection, eval_performance, eval_statistical



In [None]:
num_real_data = all_data[numeric_cols]
cat_real_data = all_data[cat_cols]

num_real_data_np = num_real_data.to_numpy()
cat_real_data_np = cat_real_data.to_numpy().astype('str')


num_syn_data = synthetic_data[numeric_cols]
cat_syn_data = synthetic_data[cat_cols]

num_syn_data_np = num_syn_data.to_numpy()
cat_syn_data_np = cat_syn_data.to_numpy().astype('str')

In [None]:
encoder = OneHotEncoder()
encoder.fit(cat_real_data_np)

cat_real_data_oh = encoder.transform(cat_real_data_np).toarray()
cat_syn_data_oh = encoder.transform(cat_syn_data_np).toarray()

In [None]:
le_real_data = pd.DataFrame(np.concatenate((num_real_data_np, cat_real_data_oh), axis = 1)).astype(float)
le_real_num = pd.DataFrame(num_real_data_np).astype(float)
le_real_cat = pd.DataFrame(cat_real_data_oh).astype(float)


le_syn_data = pd.DataFrame(np.concatenate((num_syn_data_np, cat_syn_data_oh), axis = 1)).astype(float)
le_syn_num = pd.DataFrame(num_syn_data_np).astype(float)
le_syn_cat = pd.DataFrame(cat_syn_data_oh).astype(float)

In [None]:
X_syn_loader = GenericDataLoader(le_syn_data)
X_real_loader = GenericDataLoader(le_real_data)

In [None]:
quality_evaluator = eval_statistical.AlphaPrecision()
qual_res = quality_evaluator.evaluate(X_real_loader, X_syn_loader)
qual_res = {
    k: v for (k, v) in qual_res.items() if "naive" in k
}  # use the naive implementation of AlphaPrecision
qual_score = np.mean(list(qual_res.values()))

print('alpha precision: {:.6f}, beta recall: {:.6f}'.format(qual_res['delta_precision_alpha_naive'], qual_res['delta_coverage_beta_naive'] ))

Alpha_Precision_all = qual_res['delta_precision_alpha_naive']
Beta_Recall_all = qual_res['delta_coverage_beta_naive']

alpha precision: 0.680812, beta recall: 0.221543


# DCR

In [None]:
from sklearn.model_selection import train_test_split
import torch

In [None]:
train_data, test_data = train_test_split(all_data, test_size=0.5, random_state=42)

In [None]:
ctgan.fit(train_data, discrete_columns)
synthetic_data = ctgan.sample(len(train_data))

In [None]:
num_ranges = []
for i in numeric_cols:
    num_ranges.append(train_data[i].max() - train_data[i].min())
num_ranges = np.array(num_ranges)

num_train_data = train_data[numeric_cols]
cat_train_data = train_data[cat_cols]
num_synthetic_data = synthetic_data[numeric_cols]
cat_synthetic_data = synthetic_data[cat_cols]
num_test_data = test_data[numeric_cols]
cat_test_data = test_data[cat_cols]

num_train_data_np = num_train_data.to_numpy()
cat_train_data_np = cat_train_data.to_numpy().astype('str')
num_synthetic_data_np = num_synthetic_data.to_numpy()
cat_synthetic_data_np = cat_synthetic_data.to_numpy().astype('str')
num_test_data_np = num_test_data.to_numpy()
cat_test_data_np = cat_test_data.to_numpy().astype('str')

encoder = OneHotEncoder(handle_unknown='ignore')
encoder.fit(cat_train_data_np)

cat_train_data_oh = encoder.transform(cat_train_data_np).toarray()
cat_synthetic_data_oh = encoder.transform(cat_synthetic_data_np).toarray()
cat_test_data_oh = encoder.transform(cat_test_data_np).toarray()

num_train_data_np = num_train_data_np / num_ranges
num_synthetic_data_np = num_synthetic_data_np / num_ranges
num_test_data_np = num_test_data_np / num_ranges

train_data_np = np.concatenate([num_train_data_np, cat_train_data_oh], axis=1)
synthetic_data_np = np.concatenate([num_synthetic_data_np, cat_synthetic_data_oh], axis=1)
test_data_np = np.concatenate([num_test_data_np, cat_test_data_oh], axis=1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'
train_data_th = torch.tensor(train_data_np).to(device)
synthetic_data_th = torch.tensor(synthetic_data_np).to(device)
test_data_th = torch.tensor(test_data_np).to(device)

dcrs_train = []
dcrs_test = []
batch_size = 100

for i in range((synthetic_data_th.shape[0] // batch_size) + 1):
    if i != (synthetic_data_th.shape[0] // batch_size):
        batch_synthetic_data_th = synthetic_data_th[i*batch_size: (i+1) * batch_size]
    else:
        batch_synthetic_data_th = synthetic_data_th[i*batch_size:]

    dcr_train = (batch_synthetic_data_th[:, None] - train_data_th).abs().sum(dim=2).min(dim=1).values
    dcr_test = (batch_synthetic_data_th[:, None] - test_data_th).abs().sum(dim=2).min(dim=1).values
    dcrs_train.append(dcr_train)
    dcrs_test.append(dcr_test)

dcrs_train = torch.cat(dcrs_train)
dcrs_test = torch.cat(dcrs_test)

score = (dcrs_train < dcrs_test).nonzero().shape[0] / dcrs_train.shape[0]

print('DCR Score, a value closer to 0.5 is better')
print(f'DCR Score = {score}')

DCR Score, a value closer to 0.5 is better
DCR Score = 0.5031325498546333
