In [2]:
# Setup cell: imports, project paths, and logger for the data-quality pipeline.

# Standard library
import os
import sys

# Third-party
import pandas as pd

# PROJECT_ROOT is the parent of the current working directory.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# Make the local `src` package importable. Guarded so that re-running this
# cell does not keep appending duplicate entries to sys.path.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Build paths from PROJECT_ROOT with os.path.join rather than string
# concatenation, so every location is anchored to the same root.
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
input_path = os.path.join(DATA_DIR, "raw", "customers_raw.csv")

# Local imports -- these must come after the sys.path update above.
# Part 1: Data Profiler
from src.part1.dataprofiler import DataProfiler
from src.utils.logger_config import setup_pipeline_logger

# Part 2: PII Detector
from src.part2.pii_detector import PIIDetector

# Part 3: Data Validator
from src.part3.data_validator import FintechGXValidator

# Initialize the pipeline logger used by the rest of the notebook.
logger = setup_pipeline_logger()

logger.info("Pipeline Logger initialized. Ready for Data Quality Analysis.")

# Verify the data directory exists. isdir (not exists) so that a stray file
# named "data" is not mistaken for the directory. A missing directory is only
# logged here; execution continues, as before.
if not os.path.isdir(DATA_DIR):
    logger.error("Data directory not found!")
else:
    logger.info("Data directory verified.")

INFO: Pipeline Logger initialized. Ready for Data Quality Analysis.
INFO: Data directory verified.


Part 1: Data Profiler

In [3]:
# Build the profiler for the raw input file, then run the full analysis,
# which is asked to write its report to the path below.
quality_report_path = '../data/reports/data_quality_report.txt'
profiler = DataProfiler(input_path=input_path)
profiler.run_full_analysis(output_report_path=quality_report_path)

INFO: Loaded 10 rows from customers_raw.csv
INFO: Analysis complete. Report saved to: ../data/reports/data_quality_report.txt


Part 2: PII Detector

In [4]:
# Reuse the DataFrame exposed by the profiler (profiler.df) as the scan input.
detector = PIIDetector(profiler.df)

# Run the scan, then write the PII report from the scan result.
pii_report_path = '../data/reports/pii_detection_report.txt'
detector.scan_pii().generate_report(pii_report_path)

# Quick visual check in the notebook: one summary line per PII category.
print("\nPII Scan Summary:")
for pii_label, result_key in (("Emails", "emails"), ("Phones", "phones")):
    print(f"{pii_label} found: {detector.risk_results[result_key]}")

INFO: Starting PII scanning process...
INFO: Scan complete. Found 10 emails and 10 phone numbers.
INFO: PII Detection Report saved to ../data/reports/pii_detection_report.txt

PII Scan Summary:
Emails found: 10
Phones found: 10


Part 3: Data Validator

In [5]:
# Wrap the profiled DataFrame in the validator.
gx_engine = FintechGXValidator(profiler.df)

# Build the expectations, then validate; the report path is passed to validate().
validation_report_path = '../data/reports/validation_results.txt'
results = gx_engine.build_expectations().validate(report_path=validation_report_path)

# Show only the overall pass/fail flag of the validation result.
print(f"Validation Success: {results.success}")

INFO: Building Strict Expectations for suite: fintech_suite
INFO: Starting GX 1.x Validation execution...


Calculating Metrics:   0%|          | 0/84 [00:00<?, ?it/s]

INFO: Forensic GX Report saved to ../data/reports/validation_results.txt
Validation Success: False
