In [1]:
# Load IPython's autoreload extension and set it to mode 2, which re-imports
# all modules before each cell executes so edits to project code are picked
# up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Read the processed dataset from its parquet file and display it
import pandas as pd

data = pd.read_parquet(path='../../data/processed/data_processed.parquet')
data

Unnamed: 0,account_number,user_id,transaction_date,transaction_amount,transaction_type,fraction_flag
0,aabf72379081113950,1ca3c448-0279-46a6-8f97-97b06d7ce3c9,2025-01-02 09:51:51.628826880,90.50,debit,no fraccionada
1,aabl46199073805045,901bcdef-b56f-48ce-bd66-971e88476c56,2025-01-07 10:41:55.302610944,258.64,debit,no fraccionada
2,aaeg88841649433869,9f708368-cb3c-48ca-a392-57316b79afcc,2025-01-04 00:10:32.930148096,196.79,debit,no fraccionada
3,aafh61149854104782,4c6a70f4-8ede-444d-876c-f68c3f1be0d0,2025-01-01 07:41:08.506103040,95.11,debit,no fraccionada
4,aafj92785234615121,8d244e3e-c4da-4ddb-a0b1-5abaa6a27967,2025-01-08 14:42:41.516628992,921.28,debit,fraccionada
...,...,...,...,...,...,...
15195,zzvu82915110913519,f8cda88b-436d-46e2-b83c-fe0be037e5ed,2025-01-07 14:31:48.443721984,362.73,credit,no fraccionada
15196,zzvx10748935558717,23e2fcb4-72d8-467d-894a-05e430b187ef,2025-01-02 08:19:45.090318080,314.03,debit,no fraccionada
15197,zzwe93426747284460,497ec6d1-081f-46dc-b8d9-a88aef0bea4f,2025-01-01 01:28:22.675819008,307.59,debit,no fraccionada
15198,zzwk36251231154111,2d174fc9-6f7c-45ea-a72a-6d8eb5122df8,2025-01-04 07:54:49.409979904,724.54,debit,fraccionada


In [None]:
from pandera import Column, DataFrameSchema, Check
from pandera.errors import SchemaError
from datetime import datetime

# Define the validation schema: one Column per expected field, declaring its
# dtype, its value checks and that nulls are not allowed.
schema = DataFrameSchema({
    # Account numbers must be between 10 and 30 characters long.
    'account_number': Column(str, Check.str_length(10, 30), nullable=False),
    # The sample rows show user_id as 36-character UUID strings. The first
    # positional argument of Check.str_length is only the *minimum* length,
    # so both bounds are given explicitly to enforce an exact length of 36.
    'user_id': Column(
        str,
        Check.str_length(min_value=36, max_value=36),
        nullable=False
    ),
    'transaction_date': Column(datetime, nullable=False),
    # Amounts must be strictly positive.
    'transaction_amount': Column(float, Check.greater_than(0), nullable=False),
    'transaction_type': Column(
        str,
        Check.isin(['debit', 'credit']),
        nullable=False
    ),
    'fraction_flag': Column(
        str,
        Check.isin(['fraccionada', 'no fraccionada']),
        nullable=False
    ),
})

# Validate the DataFrame against the schema.
# NOTE: on failure the error is only printed; validated_data is not assigned
# by this cell in that case, so do not rely on it after a failed validation.
try:
    validated_data = schema.validate(data)
    print('Datos validados correctamente.')
except SchemaError as e:
    print(f'Error en la validación: {e}')

Datos validados correctamente.


In [16]:
import yaml

# Build a {column name: dtype name} mapping from the schema. The dtype is
# rendered with str() and any "<class '...'>" wrapper is stripped so only the
# plain type name remains.
schema_as_dict = dict(
    (name, str(column.dtype).replace("<class '", "").replace("'>", ""))
    for name, column in schema.columns.items()
)

# Write the mapping to the YAML config file in block (non-flow) style
yaml_file_path = '../configs/schema.yaml'

with open(yaml_file_path, 'w') as yaml_file:
    yaml.dump(schema_as_dict, stream=yaml_file, default_flow_style=False)