In [1]:
%%capture

!pip install sdv
!pip install ucimlrepo

In [10]:
import sdv
from ctgan import CTGAN
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder
import warnings

In [3]:
warnings.filterwarnings('ignore')

## Loading data

In [11]:
# fetch dataset
adult = fetch_ucirepo(id=2)

# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

# metadata
print(adult.metadata)

# variable information
print(adult.variables)

{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [12]:
y['income'] = y['income'].str.replace('.', '', regex=False)

In [13]:
all_data = pd.concat([X, y], axis=1)

## Fit CTGAN

In [14]:
# Names of the columns that are discrete
discrete_columns = [
    'workclass',
    'education',
    'marital-status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native-country',
    'income'
]

ctgan = CTGAN(epochs=100)
ctgan.fit(all_data, discrete_columns)

# Create synthetic data
#synthetic_data = ctgan.sample(1000)

## Classification results

In [15]:
def preprocess_data(data):
    df = data.copy()

    le_dict = {}
    for column in discrete_columns:
        if column in df.columns:
            le = LabelEncoder()
            df[column] = le.fit_transform(df[column].astype(str))
            le_dict[column] = le

    return df, le_dict


def train_and_evaluate(X_train, X_test, y_train, y_test, n_splits=20):
    param_grid = {
        'n_estimators': [10, 50, 100],
        'min_child_weight': [5, 10, 20],
        'max_depth': [1, 10],
        'gamma': [0.0, 1.0]
    }

    scores = []

    for i in range(n_splits):
        X_train_split, X_val, y_train_split, y_val = train_test_split(
            X_train, y_train, test_size=0.111, random_state=i
        )

        model = XGBClassifier(use_label_encoder=False, eval_metric='logloss')
        grid_search = GridSearchCV(
            model, param_grid, scoring='roc_auc', cv=3
        )

        grid_search.fit(X_train_split, y_train_split)

        best_model = XGBClassifier(**grid_search.best_params_,
                                 use_label_encoder=False,
                                 eval_metric='auc')

        best_model.fit(X_train, y_train)

        y_pred = best_model.predict_proba(X_test)[:, 1]
        score = roc_auc_score(y_test, y_pred)
        scores.append(score)

    return np.mean(scores), np.std(scores)

X_processed, le_dict = preprocess_data(X)
y_processed = LabelEncoder().fit_transform(y)

X_train_real, X_test, y_train_real, y_test = train_test_split(
    X_processed, y_processed, test_size=0.2, random_state=42
)

# Generate synthetic data of the same size
synthetic_data = ctgan.sample(len(X_train_real))
synthetic_data_processed, _ = preprocess_data(synthetic_data)

synthetic_y = synthetic_data_processed['income']
synthetic_X = synthetic_data_processed.drop('income', axis=1)


real_mean, real_std = train_and_evaluate(
    X_train_real, X_test, y_train_real, y_test
)

synthetic_mean, synthetic_std = train_and_evaluate(
    synthetic_X, X_test, synthetic_y, y_test
)

print("Результаты оценки классификаторов:")
print(f"Реальные данные - AUC: {real_mean:.3f} ± {real_std:.3f}")
print(f"Синтетические данные - AUC: {synthetic_mean:.3f} ± {synthetic_std:.3f}")

Результаты оценки классификаторов:
Реальные данные - AUC: 0.923 ± 0.001
Синтетические данные - AUC: 0.873 ± 0.000
