# ✅ BÀI 4: DATA VALIDATION WITH PYTHON

## Mục tiêu:
- Validate data quality
- Check business rules
- Referential integrity
- Custom validation rules
- Validation reports

In [None]:
# Setup
import pandas as pd
import numpy as np
import sys
# Make the course's helper scripts importable. Skip the append when the
# directory is already registered, so re-running this cell does not keep
# adding duplicate entries to sys.path.
scripts_dir = '/home/jovyan/week-03-04-python-etl/scripts'
if scripts_dir not in sys.path:
    sys.path.append(scripts_dir)

from db_connector import DatabaseConnector
from validators import DataValidator, validate_dataframe

# Confirmation message; the check-mark emoji is restored from mis-decoded bytes.
print("✅ Setup complete!")

---
## 📊 PART 1: Load Test Data

In [None]:
# Create sample data with quality issues
# Deliberately dirty values: a missing name, a missing and a malformed email,
# ages of -5 and 200, and an 'unknown' status.
sample_columns = {
    'customer_id': list(range(1, 11)),
    'customer_name': [
        'John Doe', 'Jane Smith', None, 'Bob Wilson', 'Alice Brown',
        'Charlie Davis', 'Eve White', 'Frank Black', 'Grace Green', 'Henry Blue',
    ],
    'email': [
        'john@test.com', 'invalid-email', 'jane@test.com', 'bob@test.com', None,
        'charlie@test.com', 'eve@test.com', 'frank@test.com', 'grace@test.com', 'henry@test.com',
    ],
    'age': [25, 30, 35, -5, 200, 40, 28, 32, 45, 50],
    'country': [
        'Vietnam', 'USA', 'UK', 'Vietnam', 'USA',
        'UK', 'Vietnam', 'USA', 'UK', 'Vietnam',
    ],
    'status': [
        'active', 'active', 'inactive', 'pending', 'active',
        'unknown', 'active', 'inactive', 'active', 'pending',
    ],
    # One consecutive calendar day per row, starting 2024-01-01.
    'created_at': pd.date_range(start='2024-01-01', periods=10, freq='D'),
}
test_data = pd.DataFrame(sample_columns)

print(f"Test data shape: {test_data.shape}")
test_data

---
## 🎯 EXERCISE 1: Basic Validations

In [None]:
# TODO: Check for null values in critical columns
# Critical columns to test for missing values.
required_columns = ['customer_id', 'customer_name', 'email']

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_not_null(required_columns)

# Display the outcome of the null check
validator.print_report()

In [None]:
# TODO: Check for unique customer_id
# Column(s) whose values must not repeat.
key_columns = ['customer_id']

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_unique(key_columns)

# Display the outcome of the uniqueness check
validator.print_report()

In [None]:
# TODO: Check data types
# Expected type name for each column under test.
expected_types = {
    'customer_id': 'int',
    'customer_name': 'string',
    'age': 'int',
    'created_at': 'datetime',
}

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_data_type(expected_types)

# Display the outcome of the type check
validator.print_report()

---
## 🎯 EXERCISE 2: Range Validations

In [None]:
# TODO: Validate age range (18-100)
# Inclusive bounds, named once so the check and the listing below agree.
min_age, max_age = 18, 100

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_range('age', min_age, max_age)

validator.print_report()

# Show invalid records
ages = test_data['age']
age_out_of_range = (ages < min_age) | (ages > max_age)
invalid_ages = test_data[age_out_of_range]
print("\nInvalid age records:")
print(invalid_ages[['customer_id', 'customer_name', 'age']])

---
## 🎯 EXERCISE 3: Pattern Validations

In [None]:
# TODO: Validate email format
import re

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
# Anchored pattern: local part, '@', domain, and a TLD of at least two letters.
email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'
validator.check_pattern('email', email_pattern, 'valid email format')

validator.print_report()

# Show invalid emails
# Keep the match result in a local mask instead of writing an 'email_valid'
# column into test_data: the DataFrame is reused by the later exercises and
# should not gain helper columns as a side effect of this cell.
email_is_valid = test_data['email'].astype(str).str.match(email_pattern, na=False)
invalid_emails = test_data[~email_is_valid]
print("\nInvalid emails:")
print(invalid_emails[['customer_id', 'customer_name', 'email']])

---
## 🎯 EXERCISE 4: Value List Validations

In [None]:
# TODO: Validate status values
# Status values accepted by this check.
valid_statuses = ['active', 'inactive', 'pending']

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_values_in_list('status', valid_statuses)

validator.print_report()

# Show invalid statuses
status_is_known = test_data['status'].isin(valid_statuses)
invalid_status = test_data[~status_is_known]
print("\nInvalid status records:")
print(invalid_status[['customer_id', 'customer_name', 'status']])

In [None]:
# TODO: Validate country values
# Country values accepted by this check.
valid_countries = ['Vietnam', 'USA', 'UK', 'Singapore', 'Japan']

validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
validator.check_values_in_list('country', valid_countries)

# Display the outcome of the country check
validator.print_report()

---
## 🎯 EXERCISE 5: Referential Integrity

In [None]:
# Create orders data
order_columns = {
    'order_id': list(range(1, 6)),
    'customer_id': [1, 2, 3, 99, 5],  # customer_id 99 doesn't exist
    # One consecutive calendar day per order, starting 2024-01-01.
    'order_date': pd.date_range(start='2024-01-01', periods=5, freq='D'),
    'total_amount': [100, 200, 150, 300, 250],
}
orders_data = pd.DataFrame(order_columns)

print("Orders data:")
print(orders_data)

In [None]:
# TODO: Check referential integrity
validator = DataValidator(orders_data, "orders")

# YOUR CODE HERE
# Every orders.customer_id is checked against test_data.customer_id.
validator.check_referential_integrity(
    column='customer_id',
    reference_df=test_data,
    reference_column='customer_id',
)

validator.print_report()

# Show orphaned records
known_customer_ids = test_data['customer_id']
has_known_customer = orders_data['customer_id'].isin(known_customer_ids)
orphaned = orders_data[~has_known_customer]
print("\nOrphaned orders:")
print(orphaned)

---
## 🎯 EXERCISE 6: Custom Validations

In [None]:
# TODO: Custom validation - check if created_at is not in future
def check_not_future_date(df):
    """Return True when no ``created_at`` value is later than the current time.

    The cutoff is ``pd.Timestamp.now()``, taken at the moment of the call.
    """
    cutoff = pd.Timestamp.now()
    is_future = df['created_at'] > cutoff
    return not is_future.any()

# Fresh validator so the report contains only this custom check.
validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
# Register the future-date rule under its own name and failure message.
future_date_check = {
    'check_name': 'no_future_dates',
    'check_func': check_not_future_date,
    'error_message': 'Found records with future dates',
}
validator.check_custom(**future_date_check)

validator.print_report()

In [None]:
# TODO: Custom validation - check if active customers have valid email
def check_active_has_email(df):
    """Return True when every active customer has a well-formed email.

    A row fails when its ``status`` is ``'active'`` and its ``email`` is
    missing, empty, whitespace-only, or does not match the address pattern.
    Rows with any other status are ignored, and a frame with no active rows
    passes.

    Args:
        df: DataFrame with ``status`` and ``email`` columns.

    Returns:
        bool: True if no active row has a missing or malformed email.
    """
    # Same address pattern the notebook uses for its email format checks.
    email_pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$'

    active_emails = df.loc[df['status'] == 'active', 'email']
    # na=False counts missing emails as invalid; empty and whitespace-only
    # strings fail because they do not match the pattern.
    is_valid = active_emails.astype(str).str.match(email_pattern, na=False)
    return bool(is_valid.all())

# Fresh validator so the report contains only this custom check.
validator = DataValidator(test_data, "test_customers")

# YOUR CODE HERE
# Register the active-customer email rule under its own name and message.
active_email_check = {
    'check_name': 'active_has_email',
    'check_func': check_active_has_email,
    'error_message': 'Active customers without valid email',
}
validator.check_custom(**active_email_check)

validator.print_report()

---
## 🎯 EXERCISE 7: Complete Validation Pipeline

In [None]:
# TODO: Run all validations at once
# Each check_* call is chained onto the previous one, and the final result is
# kept as `validator` for the report and summary below.
validator = (
    DataValidator(test_data, "test_customers")
    # YOUR CODE HERE - Chain all validations
    .check_not_null(['customer_id', 'customer_name', 'email'])
    .check_unique(['customer_id'])
    .check_data_type({
        'customer_id': 'int',
        'customer_name': 'string',
        'age': 'int'
    })
    .check_range('age', 18, 100)
    .check_pattern('email', r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', 'email format')
    .check_values_in_list('status', ['active', 'inactive', 'pending'])
    .check_values_in_list('country', ['Vietnam', 'USA', 'UK', 'Singapore', 'Japan'])
)

# Print comprehensive report
validator.print_report()

# Get summary
summary = validator.get_summary()
# Header emoji restored: the literal held mis-decoded UTF-8 bytes.
print("\n📊 Summary:")
for key, value in summary.items():
    print(f"  {key}: {value}")

In [None]:
# Get detailed results as DataFrame
results_df = validator.get_results()
# Header emoji restored: the literal held mis-decoded UTF-8 bytes.
print("\n📋 Detailed Results:")
# Left as the cell's last expression so the notebook renders the table.
results_df

---
## 🎯 EXERCISE 8: Quick Validation with Rules Dictionary

In [None]:
# TODO: Use validate_dataframe helper function
# (regex, description) pair for the email column.
email_rule = (r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$', 'email format')

# Rule groups keyed by rule kind, in the same order as before.
validation_rules = dict(
    not_null=['customer_id', 'customer_name'],
    unique=['customer_id'],
    data_types=dict(customer_id='int', customer_name='string', age='int'),
    ranges=dict(age=(18, 100)),
    patterns=dict(email=email_rule),
    no_duplicates=['customer_id'],
)

# YOUR CODE HERE
validator = validate_dataframe(test_data, validation_rules, "quick_validation")

# Display the outcome of all rule groups
validator.print_report()

---
## 🎯 EXERCISE 9: Validate Database Data

In [None]:
# TODO: Load and validate real database data
# Query text kept separate from the call that runs it.
customers_query = """
    SELECT * FROM analytics.customers
    LIMIT 1000
"""

db = DatabaseConnector()

# Load customers
customers = db.read_sql(customers_query)

print(f"Loaded {len(customers)} customers")
# Preview the first rows as the cell output
customers.head()

In [None]:
# TODO: Validate database customers
# Columns that must be populated, must be unique, and their expected types.
required_columns = ['customer_id', 'customer_name', 'email', 'country']
unique_columns = ['customer_id', 'email']
expected_types = {
    'customer_id': 'int',
    'customer_name': 'string',
    'email': 'string',
    'country': 'string',
}

db_validation_rules = {
    'not_null': required_columns,
    'unique': unique_columns,
    'data_types': expected_types,
}

# YOUR CODE HERE
db_validator = validate_dataframe(customers, db_validation_rules, "database_customers")

# Display the outcome for the database sample
db_validator.print_report()

---
## 🎯 CHALLENGE: Build Validation Report

In [None]:
# TODO: Create comprehensive validation report
import matplotlib.pyplot as plt
import seaborn as sns

def create_validation_report(validator):
    """
    Create visual validation report.

    Draws a 2x2 matplotlib figure from the validator's results: a pass/fail
    pie chart, failed-record counts per failed check, a summary table, and a
    bar chart of checks grouped by the first '_'-separated word of their name.

    Args:
        validator: Object exposing ``get_results()`` (a DataFrame with
            ``passed``, ``failed_count`` and ``check_name`` columns) and
            ``get_summary()`` (a mapping with ``dataset_name``,
            ``total_checks``, ``passed``, ``failed``, ``pass_rate`` and
            ``total_failed_records`` keys).

    Returns:
        The matplotlib Figure that was drawn. It is also displayed through
        ``plt.show()`` before returning.
    """
    results = validator.get_results()
    summary = validator.get_summary()

    # Boolean view of the pass flag, shared by the pie chart and the
    # failed-check filter below.
    passed_mask = results['passed'].astype(bool)

    # Create figure
    fig, axes = plt.subplots(2, 2, figsize=(15, 10))
    fig.suptitle(f"Validation Report: {summary['dataset_name']}", fontsize=16, fontweight='bold')

    # 1. Pass/Fail pie chart
    # Count each outcome explicitly and attach its own label and colour.
    # value_counts() orders by frequency and drops absent outcomes, so pairing
    # it with a fixed ['Passed', 'Failed'] list can swap the labels or raise a
    # length mismatch when every check has the same outcome.
    ax1 = axes[0, 0]
    n_passed = int(passed_mask.sum())
    n_failed = len(passed_mask) - n_passed
    wedges = [
        (n_passed, 'Passed', '#2ecc71'),
        (n_failed, 'Failed', '#e74c3c'),
    ]
    wedges = [wedge for wedge in wedges if wedge[0] > 0]
    if wedges:
        sizes, labels, colors = zip(*wedges)
        ax1.pie(sizes, labels=labels, autopct='%1.1f%%', colors=colors, startangle=90)
    else:
        # No checks at all: a pie of zero total cannot be drawn.
        ax1.text(0.5, 0.5, 'No checks to report', ha='center', va='center', fontsize=14)
        ax1.axis('off')
    ax1.set_title('Overall Pass Rate')

    # 2. Failed checks bar chart
    ax2 = axes[0, 1]
    # Ascending sort puts the check with the most failed records at the top
    # of the horizontal bar chart.
    failed = results[~passed_mask].sort_values('failed_count', ascending=True)
    if len(failed) > 0:
        ax2.barh(failed['check_name'], failed['failed_count'], color='#e74c3c')
        ax2.set_xlabel('Failed Records')
        ax2.set_title('Failed Checks')
    else:
        ax2.text(0.5, 0.5, 'All checks passed!', ha='center', va='center', fontsize=14, color='green')
        ax2.axis('off')

    # 3. Summary table
    ax3 = axes[1, 0]
    ax3.axis('tight')
    ax3.axis('off')
    summary_data = [
        ['Total Checks', summary['total_checks']],
        ['Passed', summary['passed']],
        ['Failed', summary['failed']],
        ['Pass Rate', summary['pass_rate']],
        ['Failed Records', summary['total_failed_records']]
    ]
    table = ax3.table(cellText=summary_data, colLabels=['Metric', 'Value'],
                     cellLoc='left', loc='center', colWidths=[0.6, 0.4])
    table.auto_set_font_size(False)
    table.set_fontsize(10)
    table.scale(1, 2)
    ax3.set_title('Summary Statistics')

    # 4. Check types distribution
    # Group checks by the text before the first underscore of check_name.
    ax4 = axes[1, 1]
    check_types = results['check_name'].str.split('_').str[0].value_counts()
    ax4.bar(check_types.index, check_types.values, color='#3498db')
    ax4.set_xlabel('Check Type')
    ax4.set_ylabel('Count')
    ax4.set_title('Validation Types')
    ax4.tick_params(axis='x', rotation=45)

    plt.tight_layout()
    plt.show()

    return fig

# Test with our validator
# Bind the returned Figure to a name: the function already calls plt.show(),
# and leaving the Figure as the cell's last expression would make the
# notebook render the same chart a second time.
report_fig = create_validation_report(validator)

---
## 📚 KEY TAKEAWAYS

### Data Validation Best Practices:

1. **Validate Early** - Check data quality as soon as possible
2. **Comprehensive Checks** - Cover all aspects (nulls, types, ranges, patterns)
3. **Business Rules** - Validate domain-specific rules
4. **Referential Integrity** - Check relationships between tables
5. **Custom Validations** - Create specific checks for your use case
6. **Report & Monitor** - Track validation results over time
7. **Fail Fast** - Stop processing if critical validations fail
8. **Document Rules** - Clear documentation of validation logic