In [10]:
import os
import sys

import pandas as pd

# Resolve the project root (the parent of the notebook's working directory)
# once, and derive the paths in this cell from it so they cannot drift apart.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))

# This cell may be re-executed in a live kernel; register the root only once
# so sys.path does not accumulate duplicate entries.
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

# Raw customer extract consumed by the profiler cell.
input_path = os.path.join(PROJECT_ROOT, 'data', 'raw', 'customers_raw.csv')

# The project imports below resolve through the sys.path entry added above,
# so they must stay after the path setup.

#part 1: Data Profiler
from src.part1.dataprofiler import DataProfiler
from src.utils.logger_config import setup_pipeline_logger

#part 2: PII Detector
from src.part2.pii_detector import PIIDetector

#part 3: Data Validator
from src.part3.data_validator import FintechGXValidator

#part 4: Data Remediator
from src.part4.cleaning import DataRemediator

#part 5: Data Masker
from src.part5.data_masker import DataMasker

# Initialize our specialized logger
logger = setup_pipeline_logger()

logger.info("Pipeline Logger initialized. Ready for Data Quality Analysis.")

# Verify the data directory exists. The check is anchored on PROJECT_ROOT
# (the same base as input_path) and requires an actual directory, not just
# any filesystem entry with that name.
data_dir = os.path.join(PROJECT_ROOT, 'data')
if os.path.isdir(data_dir):
    logger.info("Data directory verified.")
else:
    logger.error("Data directory not found!")

INFO: Pipeline Logger initialized. Ready for Data Quality Analysis.
INFO: Data directory verified.


Part 1: Data Profiler

In [3]:
# Build the profiler on the raw customer file, then run the full analysis,
# writing the data-quality report to the reports folder.
quality_report_path = '../data/reports/data_quality_report.txt'
profiler = DataProfiler(input_path=input_path)
profiler.run_full_analysis(output_report_path=quality_report_path)

INFO: Loaded 10 rows from customers_raw.csv
INFO: Analysis complete. Report saved to: ../data/reports/data_quality_report.txt


Part 2: PII Detector

In [4]:
# Feed the dataframe held by the Part 1 profiler into the PII detector.
pii_report_path = '../data/reports/pii_detection_report.txt'
detector = PIIDetector(profiler.df)

# Scan first, then write the report from the object the scan returns.
(
    detector
    .scan_pii()
    .generate_report(pii_report_path)
)

# Echo the headline counts so they are visible without opening the report.
print("\nPII Scan Summary:")
print(f"Emails found: {detector.risk_results['emails']}")
print(f"Phones found: {detector.risk_results['phones']}")

INFO: Starting PII scanning process...
INFO: Scan complete. Found 10 emails and 10 phone numbers.
INFO: PII Detection Report saved to ../data/reports/pii_detection_report.txt

PII Scan Summary:
Emails found: 10
Phones found: 10


Part 3: Data Validator

In [5]:
# Validate the profiled (still uncleaned) dataframe.
validation_report_path = '../data/reports/validation_results.txt'
gx_engine = FintechGXValidator(profiler.df)

# Step 1: build the expectations; step 2: validate and write the report.
prepared_engine = gx_engine.build_expectations()
results = prepared_engine.validate(report_path=validation_report_path)

# Overall pass/fail flag of the validation run.
print(f"Validation Success: {results.success}")

INFO: Building Strict Expectations for suite: fintech_suite
INFO: Starting GX 1.x Validation execution...


Calculating Metrics:   0%|          | 0/84 [00:00<?, ?it/s]

INFO: Forensic GX Report saved to ../data/reports/validation_results.txt
Validation Success: False


Part 4: Data Remediator

In [9]:
# 1. Initialize Remediator
# NOTE(review): whether DataRemediator copies profiler.df or edits it in place
# is not visible from this notebook — confirm, because the Part 5 cell passes
# profiler.df to generate_masked_sample alongside the masked data.
remediator = DataRemediator(profiler.df)

# 2. Execute Cleaning Pipeline
remediator.normalize_names().normalize_phones().normalize_dates().handle_missing()

# 3. Re-validate the cleaned frame under its own suite name
gx_engine_v2 = FintechGXValidator(remediator.df, suite_name="cleaned_suite")
results_after = gx_engine_v2.build_expectations().validate(report_path='../data/reports/validation_final.txt')

# Surface remaining failures instead of silently assuming the cleaning worked.
if not results_after.success:
    logger.warning(
        "Re-validation still reports %s unsuccessful expectation(s).",
        results_after.statistics['unsuccessful_expectations'],
    )

# 4. Generate the Log
# The "before" count is read from the Part 3 validation result (`results`)
# rather than hard-coded, so the log stays correct when the input data or the
# expectation suite changes.
remediator.generate_log(
    output_path='../data/reports/cleaning_log.txt',
    validation_before=results.statistics['unsuccessful_expectations'],
    validation_after=results_after.statistics['unsuccessful_expectations']
)

# 5. Save final CSV
# to_csv does not create missing parent folders, so make sure it exists first.
os.makedirs('../data/processed', exist_ok=True)
remediator.df.to_csv('../data/processed/customers_cleaned.csv', index=False)
logger.info("Pipeline Execution Complete. Golden Dataset generated.")

INFO: Building Strict Expectations for suite: cleaned_suite
INFO: Starting GX 1.x Validation execution...


Calculating Metrics:   0%|          | 0/84 [00:00<?, ?it/s]

INFO: Forensic GX Report saved to ../data/reports/validation_final.txt
INFO: Cleaning log saved to ../data/reports/cleaning_log.txt
INFO: Pipeline Execution Complete. Golden Dataset generated.


Part 5: Data Masker

In [11]:
# Output locations for the masked dataset and the comparison sample.
masked_csv_path = '../data/processed/customers_masked.csv'
masked_sample_path = '../data/reports/masked_sample.txt'

# Start from the dataframe held by the Part 4 remediator.
masker = DataMasker(remediator.df)

# Apply every masking step as one chain: names, emails, phones, addresses, DOB.
(
    masker
    .mask_names()
    .mask_emails()
    .mask_phones()
    .mask_addresses()
    .mask_dob()
)

# Persist the masked dataset and keep the returned frame for display.
masked_df = masker.save_masked_data(masked_csv_path)

# Write the sample report, passing the profiler's dataframe for comparison.
masker.generate_masked_sample(profiler.df, masked_sample_path)

display(masked_df.head(5))

INFO: Masked dataset saved to ../data/processed/customers_masked.csv
INFO: Masked sample report generated at ../data/reports/masked_sample.txt


Unnamed: 0,customer_id,first_name,last_name,email,phone,date_of_birth,address,income,account_status,created_date
0,1,J***,D***,j***@gmail.com,***-***-4567,1985-**-**,[MASKED ADDRESS],75000.0,active,2024-01-10
1,2,J***,S***,j***@company.com,***-***-6543,1990-**-**,[MASKED ADDRESS],95000.0,active,2024-01-11
2,3,N***,J***,b***@email.com,***-***-5678,1988-**-**,[MASKED ADDRESS],0.0,suspended,2024-01-12
3,4,M***,B***,m***@gmail.com,***-***-6789,,[MASKED ADDRESS],120000.0,unknown,2024-01-13
4,5,R***,N***,r***@yahoo.com,***-***-7890,2005-**-**,[MASKED ADDRESS],55000.0,active,
