In [1]:
# Enable IPython's autoreload extension. Mode 2 re-imports edited modules
# before each cell executes, so changes to local .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [2]:
# Load the datasets: read the processed parquet file (the path is relative to
# the directory the notebook runs from).
import pandas as pd

data = pd.read_parquet('../../data/processed/data_processed.parquet')
# Preview 10 rows; the fixed random_state makes the sample reproducible.
data.sample(10, random_state=42)

Unnamed: 0,merchant_id,_id,subsidiary,transaction_date,account_number,user_id,transaction_amount,transaction_type,fraction_flag
2293209,817d18cd3c31e40e9bff0566baae7758,d0ab2acbc3d334f4770b692757a61acf,e8505c5d5d68106db9b1b952ba02abef,2021-11-26 17:17:44,73f7afa3e638a00c01436f95b6eb314d,886bbf3ac7497d4a04d192330ff1d783,392.334031,DEBITO,NO_FRACCIONADA
1759290,817d18cd3c31e40e9bff0566baae7758,fecb49bd63077d55d885789c559e3c89,6ffb37fe7169210e65528c840afe6e9c,2021-09-03 13:55:52,43ac40d8178558bf7aee78b7b6a40dee,68b2b7eed6613f4889d7bbbf73afb0a5,23.77782,DEBITO,NO_FRACCIONADA
3077467,817d18cd3c31e40e9bff0566baae7758,38df8e488caaba5dcca8459734fd3d32,dd399a26a594b802591b88c80af17c58,2021-04-14 16:08:39,55b6e771931f1959e9a7100333dc4f77,b7057677b8d85d46af88b63999cf144a,118.8891,DEBITO,NO_FRACCIONADA
1490932,817d18cd3c31e40e9bff0566baae7758,6a5da11fb472e89117363e59db83c9e7,59c8459c4824bd2ee47b3088d5f9a6c7,2021-11-23 12:01:00,0c65879f76d3ce66926cc87f9720694e,58d2123f1269605c33d97bd8f016d2cc,1188.891002,DEBITO,NO_FRACCIONADA
1918623,838a8fa992a4aa2fb5a0cf8b15b63755,6e982367f81e3de1220ced22fc200ffe,6c2787d7a0ca8404222e1ac52d83dc59,2021-03-18 13:10:27,43a1445d5a9753f5e658d91e547f6e1c,7232bbd3bdbdf3bceebb920a65f147ce,237.7782,DEBITO,NO_FRACCIONADA
1900897,817d18cd3c31e40e9bff0566baae7758,04613821f1d2de7724fe1f2a1f563e92,206e1d9176370f63314f3527945537a0,2021-11-27 18:38:24,300627771a4ab8327e682bca8158c323,71235780adf77d551ba283614c257ada,297.222751,DEBITO,NO_FRACCIONADA
1268331,817d18cd3c31e40e9bff0566baae7758,69bfd5f4e0a9f870dca4ecf23c2eaffd,8082983022839f3cf85e4523a7d4e79c,2021-08-28 15:07:05,0a74b4cb280015bd23749342c36532c4,4b81cc6f23731192c62090a46990cffc,225.88929,DEBITO,NO_FRACCIONADA
3278198,817d18cd3c31e40e9bff0566baae7758,9b542e227cc5fa1e4d3d16e50f907cb4,e1a4ee4a99bfcb4b1f23f9a0eca58221,2021-06-20 12:08:08,fc5b4a9050753e95ef3f63d829daa2af,c3154c61ddb18a1b34292605fc258dbc,11.88891,DEBITO,NO_FRACCIONADA
1863930,817d18cd3c31e40e9bff0566baae7758,73ae4db3dad8e18ddaad1c6a31229288,1394d4a77b636fc0938dc8e8702a2c38,2021-07-17 17:51:34,ef6530e55eadd3934c2b8d5e8994b267,6ef4b20bb32fe1080fa07e9e7b949b65,23.77782,DEBITO,NO_FRACCIONADA
3478757,817d18cd3c31e40e9bff0566baae7758,80721b813b138701cd479741ba08e0b8,0653f1f7c424a215bea01d0582b14b27,2021-08-30 14:08:51,3a5c9365402aa6d1e00dbb7d2d548c5b,cf125ce1335c67514b4a800481969f57,23.77782,DEBITO,NO_FRACCIONADA


In [3]:
from pydantic import BaseModel, Field, field_validator, ValidationError
from typing import Literal
from datetime import datetime

# Define the schema using Pydantic
class TransactionModel(BaseModel):
    """Pydantic schema describing one transaction row.

    Field constraints:
        merchant_id, id, subsidiary, account_number, user_id:
            required strings of 6 to 36 characters.
        transaction_date: must already be a ``datetime`` instance (see
            ``validate_transaction_date``).
        transaction_amount: required float, strictly greater than 0.
        transaction_type: either 'DEBITO' or 'CREDITO'.
        fraction_flag: either 'FRACCIONADA' or 'NO_FRACCIONADA'.

    Note:
        The ``id`` attribute is populated from the input key ``_id`` through
        its alias. ``model_dump()`` emits the attribute name (``id``) unless
        it is called with ``by_alias=True``.
    """

    merchant_id: str = Field(..., min_length=6, max_length=36)
    # Input data uses the key '_id'; the alias maps it onto the `id` attribute.
    id: str = Field(..., min_length=6, max_length=36, alias='_id')
    subsidiary: str = Field(..., min_length=6, max_length=36)
    transaction_date: datetime
    account_number: str = Field(..., min_length=6, max_length=36)
    user_id: str = Field(..., min_length=6, max_length=36)
    transaction_amount: float = Field(..., gt=0)
    transaction_type: Literal['DEBITO', 'CREDITO']
    fraction_flag: Literal['FRACCIONADA', 'NO_FRACCIONADA']

    @field_validator('transaction_date', mode='before')
    @classmethod
    def validate_transaction_date(cls, value):
        """Reject a ``transaction_date`` that is not already a datetime.

        Runs in 'before' mode, i.e. ahead of Pydantic's own parsing, so
        values such as strings are refused instead of being coerced.
        Subclasses of ``datetime`` pass the ``isinstance`` check.

        Args:
            value: the raw input value for ``transaction_date``.

        Returns:
            The unchanged ``value`` when it is a ``datetime`` instance.

        Raises:
            ValueError: if ``value`` is not a ``datetime`` instance.
        """
        if not isinstance(value, datetime):
            raise ValueError('transaction_date must be a datetime object')
        return value

In [5]:
# Validate a DataFrame
def validate_dataframe(df: pd.DataFrame):
    """Validate every row of ``df`` against ``TransactionModel``.

    Each row is converted to a ``{column name: value}`` mapping and passed to
    ``TransactionModel`` as keyword arguments. All rows are checked before
    any failure is reported, so a single run surfaces every invalid row.

    Args:
        df: frame whose column names match the model's input keys.

    Returns:
        A list with one ``TransactionModel`` instance per row, in row order
        (an empty list for an empty frame).

    Raises:
        ValueError: if at least one row fails validation. The per-row error
            details are printed before the exception is raised.
    """
    errors = []
    validated_rows = []
    columns = list(df.columns)

    # itertuples(name=None) lazily yields plain tuples (index label first).
    # Unlike iterrows() it does not build a Series per row, which is far
    # cheaper on large frames and keeps each column's own value type instead
    # of upcasting the row to a common dtype. Plain tuples are used because
    # namedtuples would rename columns such as '_id'.
    for idx, *values in df.itertuples(index=True, name=None):
        try:
            transaction = TransactionModel(**dict(zip(columns, values)))
        except ValidationError as e:
            errors.append((idx, e.errors()))
        else:
            validated_rows.append(transaction)

    if errors:
        for idx, error in errors:
            print(f'Error in row {idx}: {error}')
        raise ValueError('Validation failed for one or more rows.')

    print('All data validated successfully.')
    return validated_rows

# Run the validation and rebuild a DataFrame from the validated models.
# A ValueError (e.g. the one raised by validate_dataframe) is reported by
# printing its message instead of propagating.
try:
    validated_data = pd.DataFrame(
        [model.model_dump() for model in validate_dataframe(data)]
    )
    print('DataFrame validated successfully.')
except ValueError as exc:
    print(exc)

All data validated successfully.
DataFrame validated successfully.
