# Data Exploration Example

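This notebook demonstrates how to use the data pipeline for exploratory data analysis: it loads a CSV file (falling back to generated sample data), profiles the dataset, infers a database schema, and generates an HTML report.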

In [None]:
# Import necessary libraries
import sys
sys.path.append('../../src')

import numpy as np
import pandas as pd
from data_pipeline.core.config import Config
from data_pipeline.sources import CSVSource, JSONSource, S3Source
from data_pipeline.utils import DataProfiler, SchemaInferrer

## Load Configuration

In [None]:
# Read the 'development' configuration from the environments config directory
config_dir = '../../config/environments'
environment_name = 'development'
config = Config.from_yaml(config_dir, environment_name)
print(f"Loaded configuration: {config.name}")

## Load Data from CSV

In [None]:
# Create a CSV source rooted at the raw data directory
csv_options = {'base_path': '../../data/raw', 'encoding': 'utf-8'}
csv_source = CSVSource(csv_options)

# Ask the source what it can see and print at most the first ten entries
available_files = csv_source.list_sources()
print("Available CSV files:")
preview_files = available_files[:10]
for file_name in preview_files:
    print(f"  - {file_name}")

In [None]:
# Load a sample CSV file (replace with your actual file).
#
# `df` is chosen in this order:
#   1. the first entry of `available_files`, read through `csv_source`;
#   2. if `available_files` is empty, a 1000-row synthetic customer table;
#   3. if anything above raises, a 100-row fallback table.
#
# Synthetic values come from a seeded NumPy generator so that
# Restart Kernel -> Run All reproduces the same data. NumPy is used directly
# because the `pd.np` alias is no longer available in current pandas releases.
SAMPLE_SEED = 42
sample_rng = np.random.default_rng(SAMPLE_SEED)

try:
    # Example: load the first CSV file if available
    if available_files:
        sample_file = available_files[0]
        df = csv_source.read(sample_file)
        print(f"Loaded {len(df)} rows from {sample_file}")
        print(f"Columns: {list(df.columns)}")
        display(df.head())
    else:
        # Create sample data for demonstration
        n_rows = 1000
        df = pd.DataFrame({
            'id': range(1, n_rows + 1),
            'name': [f'Customer_{i}' for i in range(1, n_rows + 1)],
            # Upper bounds are exclusive, matching the previous randint calls
            'age': sample_rng.integers(18, 80, n_rows),
            'salary': sample_rng.integers(30000, 150000, n_rows),
            'department': sample_rng.choice(['Sales', 'Marketing', 'Engineering', 'HR'], n_rows),
            'join_date': pd.date_range('2020-01-01', periods=n_rows, freq='D'),
            'is_active': sample_rng.choice([True, False], n_rows, p=[0.8, 0.2])
        })
        print("Created sample dataset for demonstration")
        display(df.head())
except Exception as e:
    # Deliberate best-effort: report the problem and keep the notebook usable
    print(f"Error loading data: {e}")
    # Create sample data as fallback (note: different columns than the demo table)
    n_fallback_rows = 100
    df = pd.DataFrame({
        'id': range(1, n_fallback_rows + 1),
        'value': sample_rng.standard_normal(n_fallback_rows),
        'category': sample_rng.choice(['A', 'B', 'C'], n_fallback_rows)
    })
    print("Using fallback sample data")

## Data Profiling

In [None]:
# Set up the profiler with its sampling and correlation settings
profiler_settings = {'sample_size': 10000, 'correlation_threshold': 0.7}
profiler = DataProfiler(profiler_settings)

# Build the profile for the loaded DataFrame
profile = profiler.profile_dataset(df, "Sample Dataset")

# Print every entry of the profile's 'dataset_info' section
print("Dataset Overview:")
info = profile['dataset_info']
for field, field_value in info.items():
    print(f"  {field}: {field_value}")

In [None]:
# Summarise the 'data_quality' section of the profile
print("\nData Quality Summary:")
quality = profile['data_quality']
print(f"  Overall Score: {quality['overall_score']:.1f}%")
print(f"  Completeness: {quality['completeness_score']:.1f}%")
print(f"  Duplicate Rows: {quality['duplicate_rows']} ({quality['duplicate_percentage']:.1f}%)")

# Only mention an issue category when it has entries
quality_issues = quality['quality_issues']

high_null_columns = quality_issues['high_null_columns']
if high_null_columns:
    print(f"  High Null Columns: {high_null_columns}")

potential_id_columns = quality_issues['potential_id_columns']
if potential_id_columns:
    print(f"  Potential ID Columns: {potential_id_columns}")

In [None]:
# Print the profile's recommendations as a numbered list, one blank line after each
print("\nRecommendations:")
for position, recommendation in enumerate(profile['recommendations'], start=1):
    priority_tag = recommendation['priority'].upper()
    print(f"  {position}. [{priority_tag}] {recommendation['title']}")
    print(f"     {recommendation['description']}")
    print()

## Schema Inference

In [None]:
# Set up the schema inferrer with its sampling and cardinality settings
inferrer_settings = {'sample_size': 5000, 'cardinality_threshold': 50}
schema_inferrer = SchemaInferrer(inferrer_settings)

# Infer a schema for the loaded DataFrame under the name "sample_table"
schema = schema_inferrer.infer_schema(df, "sample_table")

# Headline figures from the inferred schema
print(f"Schema for table: {schema['table_name']}")
print(f"Total columns: {schema['total_columns']}")
print(f"Total rows analyzed: {schema['total_rows']}")
print(f"Sample rows: {schema['sample_rows']}")

In [None]:
# Walk the per-column entries of the inferred schema and print their details
print("\nColumn Analysis:")
for column, details in schema['columns'].items():
    print(f"\n{column}:")
    print(f"  Type: {details['pandas_dtype']} -> {details['inferred_sql_type']}")
    print(f"  Nullable: {details['nullable']} ({details['null_percentage']:.1f}% nulls)")
    print(f"  Unique values: {details['unique_count']} ({details['cardinality']} cardinality)")
    print(f"  Quality score: {details['data_quality_score']:.1f}%")

    # Optional sections are printed only when they have content
    sample_values = details['sample_values']
    if sample_values:
        print(f"  Sample values: {sample_values[:5]}")

    patterns = details['patterns']
    if patterns:
        print(f"  Detected patterns: {patterns}")

    anomalies = details['anomalies']
    if anomalies:
        print(f"  Anomalies: {anomalies}")

In [None]:
# Print the database-oriented suggestions from the inferred schema;
# each section appears only when it has entries
print("\nDatabase Recommendations:")

pk_candidates = schema['primary_key_candidates']
if pk_candidates:
    print(f"Primary Key Candidates: {pk_candidates}")

fk_candidates = schema['foreign_key_candidates']
if fk_candidates:
    print("Foreign Key Candidates:")
    for candidate in fk_candidates:
        print(f"  - {candidate['column']} -> {candidate['referenced_table']} (confidence: {candidate['confidence']})")

not_null_columns = schema['constraints']['not_null']
if not_null_columns:
    print(f"NOT NULL constraints recommended: {not_null_columns}")

recommended_indexes = schema['indexes_recommended']
if recommended_indexes:
    print("\nIndex Recommendations:")
    for index_spec in recommended_indexes:
        print(f"  - {index_spec['type']} index on {index_spec['columns']} (reason: {index_spec['reason']})")

## Generate HTML Report

In [None]:
# Ask the profiler to write the profile as an HTML report and show the returned path
report_target = "../../reports/sample_data_profile.html"
report_path = profiler.generate_html_report(profile, report_target)
print(f"HTML report generated: {report_path}")
print("Open this file in a web browser to view the detailed report.")

## Next Steps

1. **Data Cleaning**: Based on the profiling results, clean the data using the processors
2. **Schema Creation**: Use the inferred schema to create database tables
3. **Data Validation**: Set up validation rules using Great Expectations
4. **Pipeline Automation**: Create YAML configurations for automated processing
5. **Monitoring**: Set up monitoring and alerting for data quality issues