In [1]:
%%capture

!pip install sdv
!pip install ucimlrepo
!pio install sdmetrics
!pip install -U kaleido
!pip install synthcity[full]

In [2]:
import sdv
from ctgan import CTGAN
from ucimlrepo import fetch_ucirepo
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from xgboost import XGBClassifier, XGBRegressor
from sklearn.metrics import roc_auc_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder
import warnings
from sdv.metadata import SingleTableMetadata
from sdmetrics.reports.single_table import QualityReport, DiagnosticReport
from sdmetrics.visualization import get_column_plot
from sdmetrics.column_pairs import CorrelationSimilarity
from sdmetrics.column_pairs import ContingencySimilarity
from itertools import combinations
import statistics
from sdmetrics.single_table import LogisticDetection
from sklearn.preprocessing import OneHotEncoder
from synthcity.plugins.core.dataloader import GenericDataLoader
from synthcity.metrics import eval_detection, eval_performance, eval_statistical
import torch
from sklearn.preprocessing import OrdinalEncoder

In [3]:
class Metrics:
    """Utility, fidelity and privacy metrics for a real/synthetic table pair.

    The instance keeps references to the real and synthetic DataFrames plus the
    column roles and exposes one method per metric.  ``classification_metrics``
    and ``dcr_score`` fit their own CTGAN on a split of ``real_df`` instead of
    using ``synthetic_df``; every other metric compares ``real_df`` with
    ``synthetic_df`` directly.

    Args:
        real_df: Real table (pandas DataFrame).
        synthetic_df: Synthetic table with the same columns as ``real_df``.
        numeric_cols: Names of the numeric columns.
        cat_cols: Names of the categorical columns (may include the target).
        target_col: Name of the column predicted in the utility benchmark.
        classification: ``True`` scores the utility benchmark with ROC AUC
            (XGBClassifier); ``False`` scores it with MSE (XGBRegressor).
        epochs: CTGAN training epochs used by the methods that fit their own
            generator.
    """

    def __init__(self, real_df, synthetic_df, numeric_cols, cat_cols, target_col, classification=True, epochs=300):
        self.real_df = real_df
        self.synthetic_df = synthetic_df
        self.numeric_cols = numeric_cols
        self.cat_cols = cat_cols
        self.target_col = target_col
        # The SDMetrics calls below receive the metadata as a plain dict, so it
        # is detected once from the real table and converted immediately.
        detected = SingleTableMetadata()
        detected.detect_from_dataframe(data=self.real_df)
        self.metadata = detected.to_dict()
        self.classification = classification
        self.epochs = epochs

    def preprocess_data(self, data):
        """Fit one ordinal encoder per categorical column and encode ``data``.

        Values are cast to ``str`` before encoding.  Only the columns of
        ``self.cat_cols`` that are present in ``data`` are touched.

        Returns:
            tuple: ``(encoded_copy, enc_dict)`` where ``enc_dict`` maps column
            name to its fitted ``OrdinalEncoder``.
        """
        df = data.copy()
        enc_dict = {}
        for column in self.cat_cols:
            if column in df.columns:
                # Categories unseen at fit time are encoded as -1 on transform.
                enc = OrdinalEncoder(
                    handle_unknown='use_encoded_value',
                    unknown_value=-1
                )
                arr = df[[column]].astype(str)
                df[column] = enc.fit_transform(arr).astype(int)
                enc_dict[column] = enc
        return df, enc_dict

    def encode_data(self, data, enc_dict):
        """Encode ``data`` with encoders previously fitted by ``preprocess_data``.

        Columns without an entry in ``enc_dict`` are left unchanged.

        Returns:
            DataFrame: an encoded copy of ``data``.
        """
        df = data.copy()
        for column in self.cat_cols:
            if column in df.columns and column in enc_dict:
                enc = enc_dict[column]
                arr = df[[column]].astype(str)
                df[column] = enc.transform(arr).astype(int)
        return df

    def _make_estimator(self, eval_metric=None, **params):
        """Build the XGBoost model that matches ``self.classification``."""
        if self.classification:
            return XGBClassifier(use_label_encoder=False, eval_metric=eval_metric, **params)
        # ``use_label_encoder`` / ``eval_metric`` are classifier-side settings
        # here, so the regressor is built from the tuned parameters only.
        return XGBRegressor(**params)

    def train_and_evaluate(self, X_train, X_test, y_train, y_test, n_splits=20):
        """Tune and score an XGBoost model ``n_splits`` times.

        For every repetition the hyper-parameters are grid-searched on a
        random ~88.9% subsample of the training data (seeded with the
        repetition index), then a model with the best parameters is refitted
        on the full training data and scored on the test data.

        Classification is scored with ROC AUC on the probability of class 1
        (binary targets only); regression is scored with mean squared error.

        Returns:
            tuple: ``(mean_score, std_score)`` over the repetitions.
        """
        param_grid = {
            'n_estimators': [10, 50, 100],
            'min_child_weight': [5, 10, 20],
            'max_depth': [1, 10],
            'gamma': [0.0, 1.0]
        }

        # The model-selection criterion has to match the task: 'roc_auc'
        # cannot be computed for a continuous target.
        if self.classification:
            search_scoring = 'roc_auc'
        else:
            search_scoring = 'neg_mean_squared_error'

        scores = []

        for seed in range(n_splits):
            # Only the training part of the split is used; the held-out part
            # merely varies which rows the grid search sees.
            X_train_split, _, y_train_split, _ = train_test_split(
                X_train, y_train, test_size=0.111, random_state=seed
            )

            grid_search = GridSearchCV(
                self._make_estimator(eval_metric='logloss'),
                param_grid, scoring=search_scoring, cv=3
            )
            grid_search.fit(X_train_split, y_train_split)

            best_model = self._make_estimator(eval_metric='auc', **grid_search.best_params_)
            best_model.fit(X_train, y_train)

            if self.classification:
                y_pred = best_model.predict_proba(X_test)[:, 1]
                score = roc_auc_score(y_test, y_pred)
            else:
                y_pred = best_model.predict(X_test)
                score = mean_squared_error(y_test, y_pred)
            scores.append(score)

        return np.mean(scores), np.std(scores)

    def classification_metrics(self):
        """Compare a model trained on real rows with one trained on synthetic rows.

        ``real_df`` is split 80/20; a fresh CTGAN is fitted on the training
        part only and sampled to the same size.  Both training sets are scored
        on the same real test part via ``train_and_evaluate``.  Despite the
        name, the method also covers regression (``classification=False``).

        Returns:
            dict: ``real_mean``, ``real_std``, ``synthetic_mean``,
            ``synthetic_std``.
        """
        X = self.real_df.drop(columns=[self.target_col])
        y = self.real_df[self.target_col]

        X_train_real, X_test, y_train_real, y_test = train_test_split(
            X, y, test_size=0.2, random_state=42
        )

        # The generator never sees the test rows.
        ctgan = CTGAN(epochs=self.epochs)
        ctgan.fit(pd.concat([X_train_real, y_train_real], axis=1), self.cat_cols)
        synthetic_data = ctgan.sample(len(X_train_real))

        X_train_synthetic = synthetic_data.drop(self.target_col, axis=1)
        y_train_synthetic = synthetic_data[self.target_col]

        # Encoders are fitted on the real training features and reused for the
        # test and synthetic features so all three share one code book.
        X_train_real, enc_dict_X = self.preprocess_data(X_train_real)
        X_test = self.encode_data(X_test, enc_dict_X)
        X_train_synthetic = self.encode_data(X_train_synthetic, enc_dict_X)

        if self.target_col in self.cat_cols:
            enc_target = OrdinalEncoder(
                handle_unknown='use_encoded_value',
                unknown_value=-1
            )

            y_tr = y_train_real.astype(str).to_frame()
            y_train_real = enc_target.fit_transform(y_tr).ravel().astype(int)
            y_train_synthetic = enc_target.transform(
                y_train_synthetic.astype(str).to_frame()
            ).ravel().astype(int)
            y_test = enc_target.transform(
                y_test.astype(str).to_frame()
            ).ravel().astype(int)

        real_mean, real_std = self.train_and_evaluate(
            X_train_real, X_test, y_train_real, y_test
        )
        synthetic_mean, synthetic_std = self.train_and_evaluate(
            X_train_synthetic, X_test, y_train_synthetic, y_test
        )

        if self.classification:
            print("Classificator results (AUC):")
        else:
            print("Regression results (MSE):")
        print(f"Real data - : {real_mean:.3f} ± {real_std:.3f}")
        print(f"Synthetic data - : {synthetic_mean:.3f} ± {synthetic_std:.3f}")

        return {
            'real_mean': real_mean, 'real_std': real_std,
            'synthetic_mean': synthetic_mean, 'synthetic_std': synthetic_std
        }

    def density_estimation(self) -> dict:
        """Run the SDMetrics quality and diagnostic reports and show their plots.

        Returns:
            dict: ``overall_quality`` (``shapes``, ``trends`` and their mean as
            ``total_score``) and ``details`` (the per-property detail tables of
            both reports).
        """
        qual_report = QualityReport()
        qual_report.generate(self.real_df, self.synthetic_df, self.metadata)

        diag_report = DiagnosticReport()
        diag_report.generate(self.real_df, self.synthetic_df, self.metadata)

        fig_shape = qual_report.get_visualization(property_name='Column Shapes')
        fig_trend = qual_report.get_visualization(property_name='Column Pair Trends')
        fig_shape.show()
        fig_trend.show()

        # Look the scores up by property name rather than by row position, so
        # the result does not depend on the order of the properties table.
        property_scores = qual_report.get_properties().set_index('Property')['Score']
        shape_score = property_scores['Column Shapes']
        trend_score = property_scores['Column Pair Trends']

        metrics = {
            'overall_quality': {
                'shapes': shape_score,
                'trends': trend_score,
                'total_score': (shape_score + trend_score) / 2
            },
            'details': {
                'column_shapes': qual_report.get_details(property_name='Column Shapes'),
                'column_trends': qual_report.get_details(property_name='Column Pair Trends'),
                'data_validity': diag_report.get_details('Data Validity'),
                'data_structure': diag_report.get_details('Data Structure')
            }
        }

        print(f"\n{' METRICS REPORT ':=^80}")
        print(f"Column Shapes Score: {metrics['overall_quality']['shapes']:.3f}")
        print(f"Column Trends Score: {metrics['overall_quality']['trends']:.3f}")
        print(f"Overall Score: {metrics['overall_quality']['total_score']:.3f}")

        return metrics

    def visualize_distr(self):
        """Show a real-vs-synthetic distribution plot for every numeric column."""
        for column in self.numeric_cols:
            fig = get_column_plot(
                real_data=self.real_df,
                synthetic_data=self.synthetic_df,
                column_name=column,
                plot_type='distplot'
            )

            fig.show()

    def correlation_similarity(self):
        """Print and return the SDMetrics ``CorrelationSimilarity`` (Pearson) score.

        Returns:
            float: the score reported by SDMetrics.
        """
        # NOTE(review): CorrelationSimilarity is a column-pair metric; passing
        # all numeric columns at once may only score the first pair - confirm
        # against the installed SDMetrics version.
        correlation_score = CorrelationSimilarity.compute(
            real_data=self.real_df[self.numeric_cols],
            synthetic_data=self.synthetic_df[self.numeric_cols],
            coefficient='Pearson'
        )
        print(f'correlation score: {correlation_score:.3f}')
        return correlation_score

    def contingency_similarity(self):
        """Average ``ContingencySimilarity`` over all pairs of numeric columns.

        Both columns of each pair are declared continuous to SDMetrics.

        Returns:
            float: mean pair score, or ``nan`` when there are fewer than two
            numeric columns (no pair to score).
        """
        pair_scores = []
        for col1, col2 in combinations(self.numeric_cols, 2):
            similarity = ContingencySimilarity.compute(
                real_data=self.real_df[[col1, col2]],
                synthetic_data=self.synthetic_df[[col1, col2]],
                continuous_column_names=[col1, col2]
            )
            pair_scores.append(similarity)

        # statistics.mean raises on an empty list; report "undefined" instead
        # of aborting the whole metric collection.
        if pair_scores:
            mean_similarity = statistics.mean(pair_scores)
        else:
            mean_similarity = float('nan')
        print(f'contingency similarity score: {mean_similarity:.3f}')
        return mean_similarity

    def logistic_detection(self):
        """Print and return the SDMetrics ``LogisticDetection`` score.

        Returns:
            float: the score reported by SDMetrics.
        """
        log_detection_score = LogisticDetection.compute(
            real_data=self.real_df,
            synthetic_data=self.synthetic_df,
            metadata=self.metadata
        )
        print(f'log detection score: {log_detection_score:.3f}')
        return log_detection_score

    def alpha_precision(self):
        """Compute synthcity's alpha-precision / beta-recall ("naive" variants).

        Numeric columns are used as they are; categorical columns are one-hot
        encoded with an encoder fitted on the real table.

        Returns:
            tuple: ``(delta_precision_alpha_naive, delta_coverage_beta_naive)``.
        """
        num_real_data = self.real_df[self.numeric_cols]
        cat_real_data = self.real_df[self.cat_cols]

        num_syn_data = self.synthetic_df[self.numeric_cols]
        cat_syn_data = self.synthetic_df[self.cat_cols]

        # A synthetic category that never occurs in the real table becomes an
        # all-zero one-hot group instead of raising during transform.
        encoder = OneHotEncoder(handle_unknown='ignore')
        cat_real_data_oh = encoder.fit_transform(cat_real_data.astype(str)).toarray()
        cat_syn_data_oh = encoder.transform(cat_syn_data.astype(str)).toarray()

        le_real_data = pd.DataFrame(np.concatenate([num_real_data.to_numpy(), cat_real_data_oh], axis=1))
        le_syn_data = pd.DataFrame(np.concatenate([num_syn_data.to_numpy(), cat_syn_data_oh], axis=1))

        X_real_loader = GenericDataLoader(le_real_data)
        X_syn_loader = GenericDataLoader(le_syn_data)

        quality_evaluator = eval_statistical.AlphaPrecision()
        qual_res = quality_evaluator.evaluate(X_real_loader, X_syn_loader)

        # Keep only the entries whose key contains "naive".
        qual_res = {k: v for (k, v) in qual_res.items() if "naive" in k}

        print(f'Alpha Precision: {qual_res["delta_precision_alpha_naive"]:.6f}, '
              f'Beta Recall: {qual_res["delta_coverage_beta_naive"]:.6f}')

        return qual_res['delta_precision_alpha_naive'], qual_res['delta_coverage_beta_naive']

    @staticmethod
    def _nearest_l1_distance(queries, reference, batch_size=100):
        """Return, for every row of ``queries``, the smallest L1 distance to ``reference``.

        Both arguments are 2-D torch tensors with the same number of columns.
        ``queries`` is processed ``batch_size`` rows at a time to bound the
        size of the intermediate (batch, reference rows, columns) tensor.
        """
        nearest = []
        for start in range(0, queries.shape[0], batch_size):
            batch = queries[start:start + batch_size]
            nearest.append((batch[:, None] - reference).abs().sum(dim=2).min(dim=1).values)
        if not nearest:
            return queries.new_empty(0)
        return torch.cat(nearest)

    def dcr_score(self):
        """Distance-to-closest-record score on the numeric columns.

        ``real_df`` is split 50/50; a fresh CTGAN is fitted on the first half
        and sampled to the same size.  Each numeric column is divided by its
        range in the training half, then for every synthetic row the L1
        distance to the closest training row and to the closest held-out row
        is computed.

        Returns:
            float: share of synthetic rows that are strictly closer to the
            training half than to the held-out half.
        """
        train_data, test_data = train_test_split(self.real_df, test_size=0.5, random_state=42)

        ctgan = CTGAN(epochs=self.epochs)
        ctgan.fit(train_data, self.cat_cols)
        synthetic_data = ctgan.sample(len(train_data))

        num_train_data_np = train_data[self.numeric_cols].to_numpy().astype(float)
        num_synthetic_data_np = synthetic_data[self.numeric_cols].to_numpy().astype(float)
        num_test_data_np = test_data[self.numeric_cols].to_numpy().astype(float)

        num_ranges = np.array(
            [train_data[col].max() - train_data[col].min() for col in self.numeric_cols]
        ).astype(float)
        # A constant column has range 0; dividing by it would turn every
        # distance into NaN/inf, so such columns are left unscaled.
        num_ranges[num_ranges == 0] = 1.0
        num_train_data_np /= num_ranges
        num_synthetic_data_np /= num_ranges
        num_test_data_np /= num_ranges

        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        train_data_th = torch.tensor(num_train_data_np).to(device)
        synthetic_data_th = torch.tensor(num_synthetic_data_np).to(device)
        test_data_th = torch.tensor(num_test_data_np).to(device)

        batch_size = 100
        dcrs_train = self._nearest_l1_distance(synthetic_data_th, train_data_th, batch_size)
        dcrs_test = self._nearest_l1_distance(synthetic_data_th, test_data_th, batch_size)

        score = (dcrs_train < dcrs_test).sum().item() / dcrs_train.shape[0]
        print(f'DCR Score = {score:.6f}')

        return score

    def collect_all_metrics(self):
        """Run every metric once and gather the results in a single dict.

        Returns:
            dict: the keys of ``classification_metrics`` plus
            ``density_metrics``, ``correlation_score``,
            ``contingency_similarity_score``, ``log_detection_score``,
            ``alpha_precision``, ``beta_recall`` and ``dcr_score``.
        """
        metrics_dict = {}

        # Downstream-model utility (classification or regression)
        classification_metrics_result = self.classification_metrics()
        metrics_dict.update(classification_metrics_result)

        # Density estimation (SDMetrics quality / diagnostic reports)
        density_metrics = self.density_estimation()
        metrics_dict['density_metrics'] = density_metrics

        # Correlation similarity
        correlation_score = self.correlation_similarity()
        metrics_dict['correlation_score'] = correlation_score

        # Contingency similarity
        contingency_similarity_score = self.contingency_similarity()
        metrics_dict['contingency_similarity_score'] = contingency_similarity_score

        # Logistic detection
        log_detection_score = self.logistic_detection()
        metrics_dict['log_detection_score'] = log_detection_score

        # Alpha Precision
        alpha_precision, beta_recall = self.alpha_precision()
        metrics_dict['alpha_precision'] = alpha_precision
        metrics_dict['beta_recall'] = beta_recall

        # DCR Score
        dcr_score_value = self.dcr_score()
        metrics_dict['dcr_score'] = dcr_score_value

        return metrics_dict

## CTGAN
### Adult


In [3]:
# Download the UCI "Adult" dataset (repository id 2).
adult = fetch_ucirepo(id=2)

# Features and targets, both delivered as pandas DataFrames.
X, y = adult.data.features, adult.data.targets

# Dataset-level metadata, then the per-variable description.
print(adult.metadata)
print(adult.variables)


{'uci_id': 2, 'name': 'Adult', 'repository_url': 'https://archive.ics.uci.edu/dataset/2/adult', 'data_url': 'https://archive.ics.uci.edu/static/public/2/data.csv', 'abstract': 'Predict whether annual income of an individual exceeds $50K/yr based on census data. Also known as "Census Income" dataset. ', 'area': 'Social Science', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 48842, 'num_features': 14, 'feature_types': ['Categorical', 'Integer'], 'demographics': ['Age', 'Income', 'Education Level', 'Other', 'Race', 'Sex'], 'target_col': ['income'], 'index_col': None, 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 1996, 'last_updated': 'Tue Sep 24 2024', 'dataset_doi': '10.24432/C5XW20', 'creators': ['Barry Becker', 'Ronny Kohavi'], 'intro_paper': None, 'additional_info': {'summary': "Extraction was done by Barry Becker from the 1994 Census database.  A set of reasonably clean records was extracted using the fol

In [4]:
all_data = pd.concat([X, y], axis=1)
# Strip a trailing '.' from the income labels before anything is fitted, so the
# generator, the exported CSV and every metric see one spelling per class.
all_data['income'] = all_data['income'].str.replace(r'\.$', '', regex=True)

In [5]:
# Categorical columns of the Adult table (the target 'income' included);
# every remaining column is treated as numeric.
cat_cols = [
    'workclass', 'education', 'marital-status', 'occupation',
    'relationship', 'race', 'sex', 'native-country', 'income',
]
numeric_cols = [name for name in all_data.columns if name not in cat_cols]

In [6]:
# Train CTGAN for 300 epochs on the full Adult table.
ctgan = CTGAN(epochs=300)
ctgan.fit(all_data, discrete_columns=cat_cols)

In [7]:
# Draw as many synthetic rows as the real table has.
synthetic_data = ctgan.sample(all_data.shape[0])

In [8]:
# Persist the synthetic Adult table next to the notebook (row index included).
synthetic_data.to_csv(path_or_buf='adult_CTGAN.csv')

In [9]:
# Bundle the Adult tables with their column roles; 'income' is the prediction target.
ctgan_metrics = Metrics(
    real_df=all_data,
    synthetic_df=synthetic_data,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    target_col='income',
)

In [18]:
# Drop one trailing '.' from the income label, in the real and the synthetic table alike.
all_data['income'] = all_data['income'].apply(
    lambda label: label if label[-1] != '.' else label[:-1]
)
synthetic_data['income'] = synthetic_data['income'].apply(
    lambda label: label if label[-1] != '.' else label[:-1]
)

In [22]:
# Run every metric for the Adult tables and keep the resulting dict.
metrics = ctgan_metrics.collect_all_metrics()

Classificator results:
Real data - AUC: 0.923 ± 0.001
Synthetic data - AUC: 0.878 ± 0.000
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 15/15 [00:00<00:00, 38.96it/s]|
Column Shapes Score: 86.33%

(2/2) Evaluating Column Pair Trends: |██████████| 105/105 [00:02<00:00, 39.43it/s]|
Column Pair Trends Score: 81.18%

Overall Score (Average): 83.76%

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 15/15 [00:00<00:00, 122.99it/s]|
Data Validity Score: 96.63%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 265.18it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 98.31%




Column Shapes Score: 0.863
Column Trends Score: 0.812
Overall Score: 0.838
correlation score: 0.978
contingency similarity score: 0.883
log detection score: 0.567
Alpha Precision: 0.662991, Beta Recall: 0.261488
DCR Score = 0.498423


### Default

In [24]:
# Download the UCI "Default of Credit Card Clients" dataset (repository id 350).
default_of_credit_card_clients = fetch_ucirepo(id=350)

# Features and targets, both delivered as pandas DataFrames.
X, y = (
    default_of_credit_card_clients.data.features,
    default_of_credit_card_clients.data.targets,
)

# Dataset-level metadata, then the per-variable description.
print(default_of_credit_card_clients.metadata)
print(default_of_credit_card_clients.variables)


{'uci_id': 350, 'name': 'Default of Credit Card Clients', 'repository_url': 'https://archive.ics.uci.edu/dataset/350/default+of+credit+card+clients', 'data_url': 'https://archive.ics.uci.edu/static/public/350/data.csv', 'abstract': "This research aimed at the case of customers' default payments in Taiwan and compares the predictive accuracy of probability of default among six data mining methods.", 'area': 'Business', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 30000, 'num_features': 23, 'feature_types': ['Integer', 'Real'], 'demographics': ['Sex', 'Education Level', 'Marital Status', 'Age'], 'target_col': ['Y'], 'index_col': ['ID'], 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2009, 'last_updated': 'Fri Mar 29 2024', 'dataset_doi': '10.24432/C55S3H', 'creators': ['I-Cheng Yeh'], 'intro_paper': {'ID': 365, 'type': 'NATIVE', 'title': 'The comparisons of data mining techniques for the predictive accuracy of 

In [25]:
# Features and target side by side in one table.
all_data_ucirepo = pd.concat((X, y), axis='columns')

In [27]:
# X2, X3, X4 and the target Y are treated as categorical; the rest as numeric.
cat_cols = ['X2', 'X3', 'X4', 'Y']
numeric_cols = [name for name in all_data_ucirepo.columns if name not in cat_cols]

# Train CTGAN for 300 epochs on the full table.
ctgan = CTGAN(epochs=300)
ctgan.fit(all_data_ucirepo, discrete_columns=cat_cols)

In [28]:
# Draw as many synthetic rows as the real table has.
synthetic_data = ctgan.sample(all_data_ucirepo.shape[0])

# Evaluate the pair with 'Y' as the prediction target.
ctgan_metrics = Metrics(
    real_df=all_data_ucirepo,
    synthetic_df=synthetic_data,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    target_col='Y',
)
metrics = ctgan_metrics.collect_all_metrics()

Classificator results:
Real data - AUC: 0.778 ± 0.000
Synthetic data - AUC: 0.757 ± 0.005
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 24/24 [00:00<00:00, 48.84it/s]|
Column Shapes Score: 90.7%

(2/2) Evaluating Column Pair Trends: |██████████| 276/276 [00:04<00:00, 67.18it/s]|
Column Pair Trends Score: 84.94%

Overall Score (Average): 87.82%

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 24/24 [00:00<00:00, 362.45it/s]|
Data Validity Score: 97.26%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 183.45it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 98.63%




Column Shapes Score: 0.907
Column Trends Score: 0.849
Overall Score: 0.878
correlation score: 0.964
contingency similarity score: 0.937
log detection score: 0.645
Alpha Precision: 0.698198, Beta Recall: 0.203227
DCR Score = 0.506733


### Shoppers

In [29]:
# Download the UCI "Online Shoppers Purchasing Intention" dataset (repository id 468).
online_shoppers_purchasing_intention_dataset = fetch_ucirepo(id=468)

# Features and targets, both delivered as pandas DataFrames.
X, y = (
    online_shoppers_purchasing_intention_dataset.data.features,
    online_shoppers_purchasing_intention_dataset.data.targets,
)

# Dataset-level metadata, then the per-variable description.
print(online_shoppers_purchasing_intention_dataset.metadata)
print(online_shoppers_purchasing_intention_dataset.variables)

{'uci_id': 468, 'name': 'Online Shoppers Purchasing Intention Dataset', 'repository_url': 'https://archive.ics.uci.edu/dataset/468/online+shoppers+purchasing+intention+dataset', 'data_url': 'https://archive.ics.uci.edu/static/public/468/data.csv', 'abstract': 'Of the 12,330 sessions in the dataset,\n84.5% (10,422) were negative class samples that did not\nend with shopping, and the rest (1908) were positive class\nsamples ending with shopping.', 'area': 'Business', 'tasks': ['Classification', 'Clustering'], 'characteristics': ['Multivariate'], 'num_instances': 12330, 'num_features': 17, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['Revenue'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2018, 'last_updated': 'Thu Jan 11 2024', 'dataset_doi': '10.24432/C5F88Q', 'creators': ['C. Sakar', 'Yomi Kastro'], 'intro_paper': {'ID': 367, 'type': 'NATIVE', 'title': 'Real-time prediction of online shoppers’ pur

In [30]:
# Features and target side by side in one table.
all_data_shoppers = pd.concat((X, y), axis='columns')

In [32]:
# Month, VisitorType, Weekend and the target Revenue are treated as categorical;
# the rest as numeric.
cat_cols = ['Month', 'VisitorType', 'Weekend', 'Revenue']
numeric_cols = [name for name in all_data_shoppers.columns if name not in cat_cols]

# Train CTGAN for 300 epochs on the full table.
ctgan = CTGAN(epochs=300)
ctgan.fit(all_data_shoppers, discrete_columns=cat_cols)

In [33]:
# Draw as many synthetic rows as the real table has.
synthetic_data = ctgan.sample(all_data_shoppers.shape[0])

# Evaluate the pair with 'Revenue' as the prediction target.
ctgan_metrics = Metrics(
    real_df=all_data_shoppers,
    synthetic_df=synthetic_data,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    target_col='Revenue',
)
metrics = ctgan_metrics.collect_all_metrics()

Classificator results:
Real data - AUC: 0.925 ± 0.001
Synthetic data - AUC: 0.875 ± 0.000
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 18/18 [00:00<00:00, 91.01it/s]|
Column Shapes Score: 82.84%

(2/2) Evaluating Column Pair Trends: |██████████| 153/153 [00:01<00:00, 107.01it/s]|
Column Pair Trends Score: 88.99%

Overall Score (Average): 85.91%

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 18/18 [00:00<00:00, 461.21it/s]|
Data Validity Score: 90.78%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 201.79it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 95.39%




Column Shapes Score: 0.828
Column Trends Score: 0.890
Overall Score: 0.859
correlation score: 0.909
contingency similarity score: 0.892
log detection score: 0.749
Alpha Precision: 0.888029, Beta Recall: 0.291901
DCR Score = 0.493755


### Magic

In [3]:
# Download the UCI "MAGIC Gamma Telescope" dataset (repository id 159).
magic_gamma_telescope = fetch_ucirepo(id=159)

# Features and targets, both delivered as pandas DataFrames.
X, y = magic_gamma_telescope.data.features, magic_gamma_telescope.data.targets

# Dataset-level metadata, then the per-variable description.
print(magic_gamma_telescope.metadata)
print(magic_gamma_telescope.variables)


{'uci_id': 159, 'name': 'MAGIC Gamma Telescope', 'repository_url': 'https://archive.ics.uci.edu/dataset/159/magic+gamma+telescope', 'data_url': 'https://archive.ics.uci.edu/static/public/159/data.csv', 'abstract': 'Data are MC generated to simulate registration of high energy gamma particles in an atmospheric Cherenkov telescope', 'area': 'Physics and Chemistry', 'tasks': ['Classification'], 'characteristics': ['Multivariate'], 'num_instances': 19020, 'num_features': 10, 'feature_types': ['Real'], 'demographics': [], 'target_col': ['class'], 'index_col': None, 'has_missing_values': 'no', 'missing_values_symbol': None, 'year_of_dataset_creation': 2004, 'last_updated': 'Tue Dec 19 2023', 'dataset_doi': '10.24432/C52C8B', 'creators': ['R. Bock'], 'intro_paper': None, 'additional_info': {'summary': "The data are MC generated (see below) to simulate registration of high energy gamma particles in a ground-based atmospheric Cherenkov gamma telescope using the imaging technique. Cherenkov gamm

In [4]:
# Features and target side by side in one table.
all_data_telescope = pd.concat((X, y), axis='columns')

In [5]:
# Only the target 'class' is categorical; every other column is numeric.
cat_cols = ['class']
numeric_cols = [name for name in all_data_telescope.columns if name not in cat_cols]

# Train CTGAN for 300 epochs on the full table.
ctgan = CTGAN(epochs=300)
ctgan.fit(all_data_telescope, discrete_columns=cat_cols)

In [6]:
# Draw as many synthetic rows as the real table has.
synthetic_data = ctgan.sample(all_data_telescope.shape[0])

# Evaluate the pair with 'class' as the prediction target.
ctgan_metrics = Metrics(
    real_df=all_data_telescope,
    synthetic_df=synthetic_data,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    target_col='class',
)
metrics = ctgan_metrics.collect_all_metrics()

Classificator results:
Real data - AUC: 0.929 ± 0.001
Synthetic data - AUC: 0.836 ± 0.000
Generating report ...

(1/2) Evaluating Column Shapes: |██████████| 11/11 [00:00<00:00, 51.05it/s]|
Column Shapes Score: 91.35%

(2/2) Evaluating Column Pair Trends: |██████████| 55/55 [00:00<00:00, 79.69it/s]|
Column Pair Trends Score: 89.36%

Overall Score (Average): 90.35%

Generating report ...

(1/2) Evaluating Data Validity: |██████████| 11/11 [00:00<00:00, 325.97it/s]|
Data Validity Score: 99.92%

(2/2) Evaluating Data Structure: |██████████| 1/1 [00:00<00:00, 156.60it/s]|
Data Structure Score: 100.0%

Overall Score (Average): 99.96%




Column Shapes Score: 0.913
Column Trends Score: 0.894
Overall Score: 0.904
correlation score: 0.890
contingency similarity score: 0.832
log detection score: 0.667
Alpha Precision: 0.860894, Beta Recall: 0.102001
DCR Score = 0.502629


### Beijing

In [4]:
# Download the UCI "Beijing PM2.5" dataset (repository id 381).
beijing_pm2_5 = fetch_ucirepo(id=381)

# Features and targets, both delivered as pandas DataFrames.
X, y = beijing_pm2_5.data.features, beijing_pm2_5.data.targets

# Dataset-level metadata, then the per-variable description.
print(beijing_pm2_5.metadata)
print(beijing_pm2_5.variables)


{'uci_id': 381, 'name': 'Beijing PM2.5', 'repository_url': 'https://archive.ics.uci.edu/dataset/381/beijing+pm2+5+data', 'data_url': 'https://archive.ics.uci.edu/static/public/381/data.csv', 'abstract': 'This hourly data set contains the PM2.5 data of US Embassy in Beijing. Meanwhile, meteorological data from Beijing Capital International Airport are also included. ', 'area': 'Climate and Environment', 'tasks': ['Regression'], 'characteristics': ['Multivariate', 'Time-Series'], 'num_instances': 43824, 'num_features': 11, 'feature_types': ['Integer', 'Real'], 'demographics': [], 'target_col': ['pm2.5'], 'index_col': ['No'], 'has_missing_values': 'yes', 'missing_values_symbol': 'NaN', 'year_of_dataset_creation': 2015, 'last_updated': 'Sat Mar 16 2024', 'dataset_doi': '10.24432/C5JS49', 'creators': ['Song Chen'], 'intro_paper': {'ID': 432, 'type': 'NATIVE', 'title': "Assessing Beijing's PM2.5 pollution: severity, weather impact, APEC and winter heating", 'authors': 'Xuan Liang, T. Zou, Bi

In [5]:
# Features and target side by side in one table.
all_data_beijing = pd.concat((X, y), axis='columns')

In [6]:
# Keep only the rows without any missing value.
all_data_beijing = all_data_beijing.dropna(axis=0, how='any')

In [None]:
# 'cbwd' is the only categorical column; every other column, the target
# included, is numeric.
cat_cols = ['cbwd']
numeric_cols = [name for name in all_data_beijing.columns if name not in cat_cols]

# Train CTGAN for 300 epochs on the cleaned table.
ctgan = CTGAN(epochs=300)
ctgan.fit(all_data_beijing, discrete_columns=cat_cols)

In [None]:
# Draw as many synthetic rows as the real table has.
synthetic_data = ctgan.sample(all_data_beijing.shape[0])

# 'pm2.5' is a continuous target, so the utility benchmark runs in regression mode.
ctgan_metrics = Metrics(
    real_df=all_data_beijing,
    synthetic_df=synthetic_data,
    numeric_cols=numeric_cols,
    cat_cols=cat_cols,
    target_col='pm2.5',
    classification=False,
)
metrics = ctgan_metrics.collect_all_metrics()