In [1]:
# Enable IPython's autoreload extension in mode 2, so edits to imported
# modules are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
import uuid
import time
from typing import List

import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.utils.validation import check_is_fitted
from statsmodels.tsa.stattools import adfuller
from IPython.display import Code, display
import inspect

import avh.utility_functions as utils
from avh.data_issues import (
    IssueTransfomer,
    NumericIssueTransformer,
    CategoricalIssueTransformer,
    IncreasedNulls,
    SchemaChange,
    DistributionChange,
    UnitChange,
    CasingChange,
    DQIssueDatasetGenerator,
    VolumeChangeUpsample,
    VolumeChangeDownsample,
    NumericPerturbation,
)

from avh.data_generation import (
    DataColumn,
    NumericColumn,
    CategoricalColumn,
    NormalNumericColumn,
    UniformNumericColumn,
    BetaNumericColumn,
    StaticCategoricalColumn,
    RandomCategoricalColumn,
    DataGenerationPipeline,
)

from avh.metrics import (
    Metric,
    SingleDistributionMetric,
    TwoDistributionMetric,
    RowCount,
    DistinctCount,
    DistinctRatio,
    CompleteRatio,
    Mean,
    Median,
    Range,
    Min,
    Max,
    Sum,
    MeanDigitLength,
    MeanPunctuationLength,
    MeanStringLength,
    EMD,
    KsDist,
    CohenD,
    KlDivergence,
    JsDivergence,
)

from avh.constraints import (
    Constraint,
    ConstantConstraint,
    ChebyshevConstraint,
    CantelliConstraint,
    CLTConstraint,
    ConjuctivDQProgram,
)

from avh.auto_validate_by_history import AVH

In [3]:
import jupyter_black

# Activate jupyter_black for this session.
# NOTE(review): presumably this auto-formats executed code cells with Black —
# confirm against the jupyter_black documentation.
jupyter_black.load()

In [4]:
# Seeded generator shared by the demo pipeline below, so runs are repeatable.
rng = np.random.default_rng(seed=42)

In [5]:
# Demo columns: two normal columns given as (name, mean, std).
# Further column types that can be switched on for experiments:
#   UniformNumericColumn("houses", 30, 50, dtype=np.int32)
#   RandomCategoricalColumn("pets", ["dog", "cat", "snail"])
#   RandomCategoricalColumn("pets2", ["dog", "cat", "snail"])
#   RandomCategoricalColumn("text")
demo_columns = [
    NormalNumericColumn("money", 300, 10),
    NormalNumericColumn("height", 18, 2),
]

# Issue transformers registered per column name.
# Other configurations that were tried:
#   ("height", [IncreasedNulls(p=0.99)])
#   ("pets", [IncreasedNulls(p=0.5)])
demo_issues = [("money", [IncreasedNulls(p=0.9)])]

pipeline = DataGenerationPipeline(
    columns=demo_columns,
    issues=demo_issues,
    random_state=rng,
)

In [6]:
# Metric space M: the metric classes available for building constraints.
M = [
    RowCount,
    DistinctRatio,
    DistinctCount,
    CompleteRatio,
    Min,
    Max,
    Mean,
    Median,
    Sum,
    Range,
    EMD,
    JsDivergence,
    KlDivergence,
    KsDist,
    CohenD,
]

# Constraint estimators E. Their order is irrelevant, because the Q space
# is constructed from the cartesian product of E and M.
E = [CLTConstraint, ChebyshevConstraint, CantelliConstraint]

# History H: 31 generated batches, each requested with size 1000.
H = [pipeline.generate(1000) for _ in range(31)]

In [7]:
# Build the AVH instance with a fixed random_state and verbosity level 3.
# NOTE(review): columns=["money"] presumably limits processing to that column —
# the result displayed further down only contains a 'money' entry; confirm in AVH.
avh = AVH(random_state=42, verbose=3, columns=["money"])

In [8]:
# Run AVH on the history H.
# NOTE(review): fpr_target=0.05 is presumably the false-positive-rate budget for
# the generated constraints — confirm against AVH.generate.
PS = avh.generate(H, fpr_target=0.05)

creating D(C)...: 23it [00:03,  6.50it/s] 
creating P(S) (with joblib)...: 100%|██████████| 1/1 [00:00<00:00,  4.97it/s]

2024-05-06 18:50:48|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history._generate_parallel Took 3772.1150 ms to execute.
2024-05-06 18:50:48|DEBUG|utility_functions.wrapper: avh.auto_validate_by_history.generate Took 3772.5592 ms to execute.





In [9]:
# Show the result returned by avh.generate for inspection.
PS

{'money': ChebyshevConstraint(29341.3798 <= Sum <= 30690.2296, FPR = 0.0204)
 CantelliConstraint(0.0000 <= EMD <= 10.9308, FPR = 0.0051), FPR = 0.025484}

### Synthetic benchmark data generation

**Benchmark setup**  
columns:
- Numeric:
    - count: 1000
    - dtypes:
        - int: 30%
        - float: 70%
    - distributions:
        - uniform: 10%
        - normal: 10%
        - beta: 80%
- Categorical:
    - count: 1000

In [8]:
rng = np.random.default_rng(42)
n = 1000


def make_numeric_column(column_name, rng):
    """Draw one randomly parameterised numeric column.

    The random draws happen in a fixed order (dtype, distribution, scale,
    sign, then the distribution-specific parameters), so a given generator
    state always produces the same column.

    Parameters
    ----------
    column_name : str
        Name handed to the column constructor.
    rng : numpy.random.Generator
        Source of every random draw; its state is advanced by this call.

    Returns
    -------
    UniformNumericColumn, NormalNumericColumn or BetaNumericColumn
        The column matching the drawn distribution.
    """
    # Shared draws: 30% int32 / 70% float32, then the distribution family.
    dtype = rng.choice([np.int32, np.float32], p=[0.3, 0.7])
    distribution = rng.choice(["uniform", "normal", "beta"], p=[0.1, 0.1, 0.8])

    # Magnitude of the value range and its sign (20% negative, 80% positive).
    scale = rng.integers(1, 100000)
    sign = rng.choice([-1, 1], p=[0.2, 0.8])

    if distribution == "uniform":
        low = rng.uniform(0, scale)
        high = rng.uniform(low, scale)
        if sign == -1:
            # Mirror the interval onto the negative axis; swapping the
            # bounds keeps low <= high.
            low, high = high * sign, low * sign
        column = UniformNumericColumn(column_name, low, high, dtype=dtype)
    elif distribution == "normal":
        mean = rng.uniform(0, scale) * sign
        std = rng.uniform(0, 10)
        column = NormalNumericColumn(column_name, mean, std, dtype=dtype)
    elif distribution == "beta":
        alpha_param = rng.uniform(0.1, 10)
        beta_param = rng.uniform(0.1, 10)
        column = BetaNumericColumn(
            column_name, alpha_param, beta_param, scale=scale * sign, dtype=dtype
        )

    return column


# Columns are built in index order, so the sequence of draws from `rng` is fixed.
numeric_columns = [make_numeric_column(f"numeric_{i}", rng) for i in range(n)]

pipeline = DataGenerationPipeline(numeric_columns, random_state=rng)

In [10]:
# Full benchmark history: 31 batches, each requested with size 100000.
# NOTE(review): if `pipeline` is the 1000-column benchmark pipeline, this keeps
# a very large amount of data in memory at once — confirm the kernel can hold it.
H_FULL = [pipeline.generate(100_000) for _ in range(31)]