In [4]:
from audittrail import AuditTrail
import pandas as pd
import numpy as np

In [2]:
def gerar_dataframes_teste(
    n_samples: int = 1000,
    n_num_features: int = 3,
    n_cat_low: int = 1,
    n_cat_high: int = 1,
    seed: int = 42
) -> dict:
    np.random.seed(seed)

    # ID e target
    base = {
        'client_id': np.arange(n_samples),
        'target': np.random.randint(0, 2, n_samples),
        'bool_flag': np.random.choice([True, False], size=n_samples),
        'missing_95pct': np.where(np.random.rand(n_samples) < 0.95, np.nan, 'ok')
    }

    # Variáveis numéricas
    num_data = {
        f'num_feat_{i}': np.random.normal(loc=0, scale=1 + i, size=n_samples)
        for i in range(n_num_features)
    }

    # Variáveis categóricas de baixa cardinalidade
    cat_low_data = {
        f'cat_low_{i}': np.random.choice(['A', 'B', 'C'], size=n_samples)
        for i in range(n_cat_low)
    }

    # Variáveis categóricas de alta cardinalidade
    cat_high_data = {
        f'cat_high_{i}': np.random.choice([f"Grupo_{j}" for j in range(100)], size=n_samples)
        for i in range(n_cat_high)
    }

    # Concatena tudo com axis=1
    df = pd.concat(
        [pd.DataFrame(d) for d in [base, num_data, cat_low_data, cat_high_data]],
        axis=1
    )

    # Derivações
    df_merge_errado = df.drop(columns='missing_95pct').iloc[:-int(0.05 * n_samples)].copy()

    # df_tipo = df.copy()
    # if n_num_features > 0:
    #     df_tipo['num_feat_0'] = df_tipo['num_feat_0'].astype(str)

    # df_dist = df.copy()
    # if n_num_features > 0:
    #     df_dist['num_feat_0'] += np.random.normal(0, 3, size=n_samples)
    # if n_cat_low > 0:
    #     df_dist['cat_low_0'] = df_dist['cat_low_0'].replace({'A': 'B'})

    # df_duplicado = pd.concat([df, df.iloc[:10]], ignore_index=True)

    # df_all_missing = df.copy()
    # df_all_missing['all_missing'] = np.nan

    return {
        "original": df,
        "merge_errado": df_merge_errado,
        #"mudou_tipo": df_tipo,
        #"mudou_distribuicao": df_dist,
        #"duplicado": df_duplicado,
        #"all_missing": df_all_missing
    }


In [14]:
dfs = gerar_dataframes_teste(
    n_samples=20_000,
    n_num_features=10,
    n_cat_low=4,
    n_cat_high=2,
    seed=123
)

df1 = dfs['original']
df2 = dfs['merge_errado']
#df3 = dfs['mudou_tipo']
#df4 = dfs['mudou_distribuicao']
#df5 = dfs['duplicado']
#df6 = dfs['all_missing']

In [15]:
trail = AuditTrail(
    track_histograms=True,
    track_distributions=True,
    enable_logging=True,
    auto_detect_types=True,
    target_col='target',
    default_keys=["client_id"]
)

In [16]:
trail.take_snapshot(df1, name="original")

In [17]:
# Após transformações...
trail.take_snapshot(df2, name="filtrado")

In [18]:
trail.compare_snapshots("original", "filtrado")


🔍 Comparando 'original' vs 'filtrado':

▶️ Shape:
  original: (20000, 20) vs filtrado: (19000, 19)

▶️ Diferença de valores ausentes:
 missing_95pct   NaN
dtype: float64 

▶️ Duplicatas nas chaves:
  original: 0 vs filtrado: 0

▶️ Mudança na média de variáveis numéricas:
client_id    -500.000000
target         -0.000766
num_feat_0     -0.001988
num_feat_1     -0.000494
num_feat_2      0.002796
num_feat_3     -0.010567
num_feat_4     -0.003681
num_feat_5      0.004641
num_feat_6     -0.010024
num_feat_7     -0.028219
num_feat_8     -0.018411
num_feat_9     -0.001460
Name: mean, dtype: float64

▶️ KS-test e PSI para variáveis:
  bool_flag: KS=1.000, PSI=0.000
  cat_high_0: KS=0.320, PSI=0.000
  cat_high_1: KS=0.340, PSI=0.000
  cat_low_0: KS=1.000, PSI=0.000
  cat_low_1: KS=1.000, PSI=0.000
  cat_low_2: KS=1.000, PSI=0.000
  cat_low_3: KS=1.000, PSI=0.000
  client_id: KS=0.000, PSI=0.199
  num_feat_0: KS=0.000, PSI=0.199
  num_feat_1: KS=0.000, PSI=0.199
  num_feat_2: KS=0.000, PSI=0.19

In [19]:
trail.describe_snapshot("original")


📄 Descrição do snapshot 'original':

▶️ Shape: (20000, 20)
▶️ Chaves de duplicação: ['client_id']
   • Duplicatas nas chaves: 0

🧱 Tipos de dados:


client_id          int32
target             int32
bool_flag           bool
missing_95pct     object
num_feat_0       float64
num_feat_1       float64
num_feat_2       float64
num_feat_3       float64
num_feat_4       float64
num_feat_5       float64
num_feat_6       float64
num_feat_7       float64
num_feat_8       float64
num_feat_9       float64
cat_low_0         object
cat_low_1         object
cat_low_2         object
cat_low_3         object
cat_high_0        object
cat_high_1        object
dtype: object


🔎 Colunas detectadas automaticamente:
   • Numéricas (11): ['bool_flag', 'num_feat_0', 'num_feat_1', 'num_feat_2', 'num_feat_3', 'num_feat_4', 'num_feat_5', 'num_feat_6', 'num_feat_7', 'num_feat_8', 'num_feat_9']
   • Categóricas (5): ['missing_95pct', 'cat_low_0', 'cat_low_1', 'cat_low_2', 'cat_low_3']

🕳️ Valores ausentes:
  ✅ Nenhuma coluna com valores ausentes.

📊 Estatísticas numéricas:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
client_id,20000.0,9999.5,5773.647028,0.0,4999.75,9999.5,14999.25,19999.0
target,20000.0,0.49645,0.5,0.0,0.0,0.0,1.0,1.0
num_feat_0,20000.0,0.006334,1.002797,-3.60259,-0.67413,0.0047,0.683578,4.185027
num_feat_1,20000.0,-0.006488,2.001615,-7.677566,-1.359606,-0.025104,1.363081,7.287609
num_feat_2,20000.0,0.005131,3.001864,-11.783489,-2.024251,0.025318,2.020079,12.442578
num_feat_3,20000.0,-0.053214,3.963022,-17.627736,-2.713851,-0.044251,2.59953,14.602492
num_feat_4,20000.0,-0.009496,4.980469,-20.320761,-3.393812,0.01873,3.394554,18.673099
num_feat_5,20000.0,0.059577,5.996961,-23.918198,-3.993086,0.056095,4.108762,23.721435
num_feat_6,20000.0,0.091882,6.994593,-29.780242,-4.556245,0.076166,4.792688,30.095259
num_feat_7,20000.0,0.073954,8.002805,-36.857039,-5.289758,0.111299,5.429993,32.351081



🏷️ Estatísticas categóricas:


Unnamed: 0,count,unique,top,freq
missing_95pct,20000,2,,19056
cat_low_0,20000,3,A,6785
cat_low_1,20000,3,B,6751
cat_low_2,20000,3,C,6722
cat_low_3,20000,3,C,6730
cat_high_0,20000,100,Grupo_16,233
cat_high_1,20000,100,Grupo_97,235



📈 Histogramas (categorias apenas):
  missing_95pct: 2 valores distintos (top 3: {'nan': 19056, 'ok': 944})
  cat_low_0: 3 valores distintos (top 3: {'A': 6785, 'C': 6610, 'B': 6605})
  cat_low_1: 3 valores distintos (top 3: {'B': 6751, 'A': 6625, 'C': 6624})
  cat_low_2: 3 valores distintos (top 3: {'C': 6722, 'A': 6683, 'B': 6595})
  cat_low_3: 3 valores distintos (top 3: {'C': 6730, 'A': 6676, 'B': 6594})
