In [1]:
from utils import AuditTrail
import pandas as pd
import numpy as np

In [2]:
def gerar_dataframes_teste(
    n_samples: int = 1000,
    n_num_features: int = 3,
    n_cat_low: int = 1,
    n_cat_high: int = 1,
    seed: int = 42
) -> dict:
    """Generate synthetic DataFrames for exercising dataset-audit comparisons.

    The base frame contains, in this column order:

    * ``client_id``      - ``0 .. n_samples - 1``
    * ``target``         - random 0/1 integers
    * ``bool_flag``      - random booleans
    * ``missing_95pct``  - ``'ok'`` in roughly 5% of rows, NaN elsewhere
    * ``num_feat_{i}``   - normal(0, 1 + i) draws, ``i < n_num_features``
    * ``cat_low_{i}``    - random choice of ``'A'``/``'B'``/``'C'``
    * ``cat_high_{i}``   - random choice of ``'Grupo_0'`` .. ``'Grupo_99'``

    Parameters
    ----------
    n_samples : int
        Number of rows in the base frame.
    n_num_features : int
        Number of numeric feature columns.
    n_cat_low : int
        Number of low-cardinality (3 levels) categorical columns.
    n_cat_high : int
        Number of high-cardinality (100 levels) categorical columns.
    seed : int
        Seed passed to ``np.random.seed``. Note that this reseeds NumPy's
        *global* random state.

    Returns
    -------
    dict
        ``"original"``: the base frame.
        ``"merge_errado"``: a copy of the base frame without the
        ``missing_95pct`` column and without its last
        ``int(0.05 * n_samples)`` rows.
    """
    np.random.seed(seed)

    # The order of the random draws below is deliberate: keeping it fixed
    # keeps the generated values stable for a given seed.
    target = np.random.randint(0, 2, n_samples)
    bool_flag = np.random.choice([True, False], size=n_samples)

    # Build the mostly-missing column as an object array holding real NaN
    # values. np.where(mask, np.nan, 'ok') would promote to a string dtype and
    # turn NaN into the literal text 'nan', i.e. a column with nothing missing.
    is_missing = np.random.rand(n_samples) < 0.95
    missing_95pct = np.full(n_samples, np.nan, dtype=object)
    missing_95pct[~is_missing] = 'ok'

    # ID, target and auxiliary columns
    columns = {
        'client_id': np.arange(n_samples),
        'target': target,
        'bool_flag': bool_flag,
        'missing_95pct': missing_95pct,
    }

    # Numeric features: the standard deviation grows with the feature index
    for i in range(n_num_features):
        columns[f'num_feat_{i}'] = np.random.normal(loc=0, scale=1 + i, size=n_samples)

    # Low-cardinality categorical features
    for i in range(n_cat_low):
        columns[f'cat_low_{i}'] = np.random.choice(['A', 'B', 'C'], size=n_samples)

    # High-cardinality categorical features (label list built once, not per column)
    group_labels = [f"Grupo_{j}" for j in range(100)]
    for i in range(n_cat_high):
        columns[f'cat_high_{i}'] = np.random.choice(group_labels, size=n_samples)

    # A single construction from one dict keeps the column order above and
    # avoids building and concatenating several intermediate frames.
    df = pd.DataFrame(columns)

    # Derived variant: one column removed and the trailing 5% of rows removed.
    # Slice with an explicit stop index: `iloc[:-n]` with n == 0 (small
    # n_samples) would select zero rows instead of all of them.
    n_dropped_rows = int(0.05 * n_samples)
    df_merge_errado = (
        df.drop(columns='missing_95pct')
        .iloc[:n_samples - n_dropped_rows]
        .copy()
    )

    # Only these two variants are produced; further variants (type change,
    # distribution shift, duplicated rows, all-missing column) are not generated.
    return {
        "original": df,
        "merge_errado": df_merge_errado,
    }


In [3]:
# Generate the synthetic fixtures: two million rows with 200 numeric features
# plus two low- and two high-cardinality categorical features.
dfs = gerar_dataframes_teste(
    seed=123,
    n_samples=2_000_000,
    n_num_features=200,
    n_cat_low=2,
    n_cat_high=2,
)

# Unpack the two variants the generator returns.
df1, df2 = dfs['original'], dfs['merge_errado']
# The remaining variants ('mudou_tipo', 'mudou_distribuicao', 'duplicado',
# 'all_missing') are disabled in the generator, so they are not unpacked here.

In [4]:
# Create the audit helper with "client_id" as the default key and "target" as
# the target column; every boolean switch below is turned on.
# NOTE(review): the effect of each option is defined in utils.AuditTrail,
# which is not visible from this notebook.
audit = AuditTrail(
    default_keys=["client_id"],
    target_col='target',
    auto_detect_types=True,
    track_histograms=True,
    track_distributions=True,
    enable_logging=True,
)

In [5]:
# Alternative input (disabled): load the two datasets from CSV files instead of generating them.
# df1 = pd.read_csv('Dataset_Original.csv')
# df2 = pd.read_csv('Dataset_P_s-Merge__Simulado_.csv')

In [None]:
# Register the base frame (df1 = dfs['original']) under the snapshot name "original".
# NOTE(review): what a snapshot stores is defined by utils.AuditTrail — not visible here.
audit.take_snapshot(df1, name="original")

In [None]:
# Ask the audit helper to describe the "original" snapshot; as the last expression
# of the cell, any returned value is displayed. (Output format presumably defined
# by utils.AuditTrail — confirm there.)
audit.describe_snapshot("original")

In [None]:
# Register the degraded frame (df2 = dfs['merge_errado']: the 'missing_95pct' column
# and the trailing ~5% of rows removed) under the snapshot name "merge_errado".
audit.take_snapshot(df2, name="merge_errado")

In [None]:
# Compare the two snapshots by name. The generator removes one column and the
# trailing rows from the second frame, so those are the differences this
# comparison is presumably meant to surface — verify against AuditTrail's output.
audit.compare_snapshots("original", "merge_errado")