# Exercises: Python Type System & Validation

Practice what you've learned by completing these exercises.

---

In [1]:
# Import required modules
from pydantic import BaseModel, Field, field_validator, model_validator
from typing import List, Dict, Optional, Union, Literal, Any
from enum import Enum

## Exercise 1: Database Connection Config

**Goal**: Create a `DatabaseConfig` model with proper types and constraints.

**Requirements**:
- `host`: str (required)
- `port`: int (default 5432, must be between 1-65535)
- `database`: str (required)
- `username`: str (required)
- `password`: Optional[str] (for security, make it optional)
- `ssl_enabled`: bool (default True)
- `timeout`: int (default 30, must be positive)

In [2]:
# TODO: Implement DatabaseConfig
class DatabaseConfig(BaseModel):
    host: str
    port: int = Field(default=5432, ge=1, le=65535)
    database: str
    username: str
    password: Optional[str] = None
    ssl_enabled: bool = Field(default=True)
    timeout: int = Field(default=30,ge=0)


# Test your implementation
db = DatabaseConfig(
    host="postgres.example.com",
    database="analytics",
    username="analyst"
)
print(db.model_dump_json(indent=2))

# Test validation - this should fail (port too high)
try:
    bad_db = DatabaseConfig(
        host="localhost",
        port=99999,
        database="test",
        username="user"
    )
except Exception as e:
    print(f"âœ… Validation caught error: {e}")

{
  "host": "postgres.example.com",
  "port": 5432,
  "database": "analytics",
  "username": "analyst",
  "password": null,
  "ssl_enabled": true,
  "timeout": 30
}
âœ… Validation caught error: 1 validation error for DatabaseConfig
port
  Input should be less than or equal to 65535 [type=less_than_equal, input_value=99999, input_type=int]
    For further information visit https://errors.pydantic.dev/2.6/v/less_than_equal


## Exercise 2: Production Environment Validation

**Goal**: Extend `DatabaseConfig` to prevent localhost in production.

**Requirements**:
- Add `environment`: Literal["dev", "staging", "prod"]
- Add a `@model_validator` that ensures:
  - If `environment == "prod"`, `host` cannot be "localhost" or "127.0.0.1"
  - If `environment == "prod"`, `ssl_enabled` must be True

In [3]:
# TODO: Implement ProductionDatabaseConfig
class ProductionDatabaseConfig(BaseModel):
    host: str
    port: int = Field(default=5432, ge=1, le=65535)
    database: str
    username: str
    password: Optional[str] = None
    ssl_enabled: bool = Field(default=True)
    timeout: int = Field(default=30,ge=0)
    environment: Literal["dev","staging","prod"]

    @model_validator(mode="after")
    def check_production(self):
        if self.environment == "prod":
            if self.host == "localhost" or self.host == "127.0.0.1":
                raise ValueError("host cannot be 'localhost' or '127.0.0.1'")
            elif self.ssl_enabled == False:
                raise ValueError("ssl_enabled must be set to True")
        return self

# Test - this should work
dev_db = ProductionDatabaseConfig(
    host="localhost",
    database="test",
    username="dev",
    environment="dev"
)
print("âœ… Dev with localhost OK")

# Test - this should fail (localhost in prod)
try:
    prod_db = ProductionDatabaseConfig(
        host="localhost",
        database="prod_db",
        username="prod_user",
        environment="prod"
    )
except ValueError as e:
    print(f"âœ… Caught prod localhost error: {e}")

# Test - this should fail (SSL disabled in prod)
try:
    prod_db = ProductionDatabaseConfig(
        host="prod.example.com",
        database="prod_db",
        username="prod_user",
        environment="prod",
        ssl_enabled=False
    )
except ValueError as e:
    print(f"âœ… Caught prod SSL error: {e}")

âœ… Dev with localhost OK
âœ… Caught prod localhost error: 1 validation error for ProductionDatabaseConfig
  Value error, host cannot be 'localhost' or '127.0.0.1' [type=value_error, input_value={'host': 'localhost', 'da..., 'environment': 'prod'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error
âœ… Caught prod SSL error: 1 validation error for ProductionDatabaseConfig
  Value error, ssl_enabled must be set to True [type=value_error, input_value={'host': 'prod.example.co...', 'ssl_enabled': False}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


## Exercise 3: SQL Transformation Config

**Goal**: Create a model for SQL transformations.

**Requirements**:
- `name`: str (must be valid Python identifier)
- `sql`: str (required, cannot be empty or whitespace)
- `description`: Optional[str]
- `parameters`: Dict[str, Any] (default empty dict)
- `enabled`: bool (default True)

**Validators**:
- Validate `name` is a valid Python identifier (use `str.isidentifier()`)
- Validate `sql` is not empty or just whitespace (use `str.strip()`)

In [4]:
# TODO: Implement TransformationConfig
class TransformationConfig(BaseModel):
    name: str
    sql: str
    description: Optional[str]
    parameters: Dict[str, Any] = Field(default=dict)
    enabled: bool = True

    @field_validator("name")
    @classmethod
    def validate_name(cls,v: str) -> str:
        if not v.isidentifier():
            raise ValueError(
                "name needs to be a valid python identifier"
            )
    @field_validator("sql")
    @classmethod
    def validate_sql(cls,v: str) -> str:
        if not v.strip():
            raise ValueError(
                "sql cannot be empty or have whitespace"
            )


# Test your implementation
transform = TransformationConfig(
    name="clean_sales",
    sql="SELECT * FROM sales WHERE amount > 0",
    description="Remove negative amounts",
    parameters={"min_amount": 0}
)
print(transform.model_dump_json(indent=2))

# Test - invalid name (has hyphen)
try:
    bad_transform = TransformationConfig(
        name="clean-sales",
        sql="SELECT 1"
    )
except ValueError as e:
    print(f"âœ… Invalid name caught: {e}")

# Test - empty SQL
try:
    bad_transform = TransformationConfig(
        name="test",
        sql="   "  # Just whitespace
    )
except ValueError as e:
    print(f"âœ… Empty SQL caught: {e}")

{
  "name": null,
  "sql": null,
  "description": "Remove negative amounts",
  "parameters": {
    "min_amount": 0
  },
  "enabled": true
}
âœ… Invalid name caught: 2 validation errors for TransformationConfig
name
  Value error, name needs to be a valid python identifier [type=value_error, input_value='clean-sales', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error
description
  Field required [type=missing, input_value={'name': 'clean-sales', 'sql': 'SELECT 1'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/missing
âœ… Empty SQL caught: 2 validation errors for TransformationConfig
sql
  Value error, sql cannot be empty or have whitespace [type=value_error, input_value='   ', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error
description
  Field required [type=missing, input_value={'name': 'test', 'sql': '   '}, input_type=dict]
    For further information 

## Exercise 4: File Format Config

**Goal**: Create configs for different file formats with format-specific options.

**Requirements**:
- Create `FileFormat` enum: CSV, PARQUET, JSON, AVRO
- Create `CompressionType` enum: NONE, GZIP, SNAPPY, LZ4
- Create `FileConfig` with:
  - `path`: str (required)
  - `format`: FileFormat (required)
  - `compression`: CompressionType (default NONE)
  - `options`: Dict[str, Any] (default empty)
  - Add validator: if format is CSV, options can have "delimiter" and "header"
  - Add validator: PARQUET and AVRO cannot use GZIP (not supported)

In [8]:
# TODO: Implement FileFormat, CompressionType, and FileConfig

class FileFormat(str, Enum):
    CSV = "csv"
    PARQUET = "parquet"
    JSON = "json"
    AVRO = "avro"

class CompressionType(str, Enum):
    NONE = "none"
    GZIP = "gzip"
    SNAPPY = "snappy"
    LZ4 = "lz4"

class FileConfig(BaseModel):
    path: str
    format: FileFormat
    compression: CompressionType = CompressionType.NONE
    options: Dict[str, Any] = Field(default_factory=dict)

    @model_validator(mode="after")
    def check_compression_compatibility(self):
        """Ensure compression is compatible with format."""
        # GZIP not supported for PARQUET and AVRO (they have their own)
        if self.format in [FileFormat.PARQUET, FileFormat.AVRO]:
            if self.compression == CompressionType.GZIP:
                raise ValueError(
                    f"{self.format.value} format does not support GZIP compression. "
                    "Use SNAPPY or LZ4 instead."
                )
        elif self.format == FileFormat.CSV:
            if not all(k in self.options for k in ["delimiter", "header"]):
                raise ValueError("Must set delimiter and header when using CSV")



        return self
# Test CSV with options
csv_file = FileConfig(
    path="/data/sales.csv",
    format=FileFormat.CSV,
    options={"delimiter": "|", "header": True}
)
print(csv_file)

# Test Parquet with compression
parquet_file = FileConfig(
    path="/data/sales.parquet",
    format=FileFormat.PARQUET,
    compression=CompressionType.SNAPPY
)
print(parquet_file)

# Test invalid combination (Parquet + GZIP)
try:
    bad_file = FileConfig(
        path="/data/test.parquet",
        format=FileFormat.PARQUET,
        compression=CompressionType.GZIP
    )
except ValueError as e:
    print(f"âœ… Invalid compression caught: {e}")

path='/data/sales.csv' format=<FileFormat.CSV: 'csv'> compression=<CompressionType.NONE: 'none'> options={'delimiter': '|', 'header': True}
path='/data/sales.parquet' format=<FileFormat.PARQUET: 'parquet'> compression=<CompressionType.SNAPPY: 'snappy'> options={}
âœ… Invalid compression caught: 1 validation error for FileConfig
  Value error, parquet format does not support GZIP compression. Use SNAPPY or LZ4 instead. [type=value_error, input_value={'path': '/data/test.parq...ssionType.GZIP: 'gzip'>}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


## Exercise 5: Data Quality Rules

**Goal**: Create a model for data quality validation rules.

**Requirements**:
- `rule_name`: str (valid identifier)
- `column`: str (required)
- `rule_type`: Literal["not_null", "unique", "range", "regex", "custom"]
- `parameters`: Dict[str, Any] (default empty)
- `severity`: Literal["warning", "error"] (default "error")

**Validators**:
- If `rule_type == "range"`, parameters must have "min" or "max"
- If `rule_type == "regex"`, parameters must have "pattern"
- If `rule_type == "custom"`, parameters must have "function"

In [9]:
# TODO: Implement DataQualityRule
class DataQualityRule(BaseModel):
    rule_name: str
    column: str
    rule_type: Literal["not_null", "unique", "range", "regex", "custom"]
    parameters: Dict[str, Any] = Field(default_factory=dict)
    severity: Literal["warning", "error"] = Field(default="error")

    @field_validator("rule_type")
    @classmethod
    def validate_rule_name(cls, v: str) -> str:
        """Ensure rule name is valid identifier."""
        if not v.isidentifier():
            raise ValueError(f"Rule name must be valid identifier: {v}")
        return v
    
    @model_validator(mode="after")
    def check_parameters(self):
        """Validate parameters based on rule type."""
        if self.rule_type == "range":
            if "min" not in self.parameters and "max" not in self.parameters:
                raise ValueError(
                    "Range rule requires at least 'min' or 'max' in parameters"
                )
        
        elif self.rule_type == "regex":
            if "pattern" not in self.parameters:
                raise ValueError(
                    "Regex rule requires 'pattern' in parameters"
                )
        
        elif self.rule_type == "custom":
            if "function" not in self.parameters:
                raise ValueError(
                    "Custom rule requires 'function' in parameters"
                )
        
        return self
# Test not_null rule
rule1 = DataQualityRule(
    rule_name="check_customer_id",
    column="customer_id",
    rule_type="not_null"
)
print(rule1)

# Test range rule
rule2 = DataQualityRule(
    rule_name="check_age_range",
    column="age",
    rule_type="range",
    parameters={"min": 0, "max": 120}
)
print(rule2)

# Test invalid range rule (missing parameters)
try:
    bad_rule = DataQualityRule(
        rule_name="bad_range",
        column="value",
        rule_type="range"
        # Missing min/max!
    )
except ValueError as e:
    print(f"âœ… Missing range parameters caught: {e}")

rule_name='check_customer_id' column='customer_id' rule_type='not_null' parameters={} severity='error'
rule_name='check_age_range' column='age' rule_type='range' parameters={'min': 0, 'max': 120} severity='error'
âœ… Missing range parameters caught: 1 validation error for DataQualityRule
  Value error, Range rule requires at least 'min' or 'max' in parameters [type=value_error, input_value={'rule_name': 'bad_range'...', 'rule_type': 'range'}, input_type=dict]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error


## Exercise 6: Complete ETL Pipeline Config

**Goal**: Combine all previous models into a complete ETL configuration.

**Requirements**:
- `name`: str (required)
- `source`: FileConfig (required)
- `transformations`: List[TransformationConfig] (at least one required)
- `quality_rules`: List[DataQualityRule] (default empty)
- `destination`: DatabaseConfig (required)
- `schedule`: Optional[str] (cron expression)
- `enabled`: bool (default True)

**Validators**:
- Ensure `transformations` list has at least one item
- Ensure all transformation names are unique
- Ensure all quality rule names are unique

In [18]:
# TODO: Implement ETLPipelineConfig
class ETLPipelineConfig(BaseModel):
    name: str
    source: FileConfig
    transformations: List[TransformationConfig] = Field(min_length=1)
    quality_rules: List[DataQualityRule] = Field(default_factory=list)
    destination: DatabaseConfig
    schedule: Optional[str] = None
    enabled: bool = Field(default=True)

    @field_validator('transformations')
    @classmethod
    def check_unique_transformation_names(cls, transformations: List[TransformationConfig]):
        """Ensure all transformation names are unique."""
        names = [t.name for t in transformations]
        if len(names) != len(set(names)):
            duplicates = [name for name in names if names.count(name) > 1]
            raise ValueError(f"Duplicate transformation names: {set(duplicates)}")
        return transformations
    
    @field_validator('quality_rules')
    @classmethod
    def check_unique_rule_names(cls, rules: List[DataQualityRule]):
        """Ensure all quality rule names are unique."""
        if not rules:
            return rules
        names = [r.rule_name for r in rules]
        if len(names) != len(set(names)):
            duplicates = [name for name in names if names.count(name) > 1]
            raise ValueError(f"Duplicate quality rule names: {set(duplicates)}")
        return rules
# Create a complete ETL pipeline
etl = ETLPipelineConfig(
    name="daily_sales_etl",
    source=FileConfig(
        path="/data/sales.csv",
        format=FileFormat.CSV,
        options={"delimiter": ",", "header": True}
    ),
    transformations=[
        TransformationConfig(
            name="filter_valid",
            sql="SELECT * FROM source WHERE amount > 0",
            description="Remove invalid sales"

        ),
        TransformationConfig(
            name="add_timestamp",
            sql="SELECT *, CURRENT_TIMESTAMP as processed_at FROM filtered",
            description="Add processing timestamp"
        )
    ],
    quality_rules=[
        DataQualityRule(
            rule_name="check_amount",
            column="amount",
            rule_type="range",
            parameters={"min": 0}
        )
    ],
    destination=DatabaseConfig(
        host="warehouse.example.com",
        database="analytics",
        username="etl_user"
    ),
    schedule="0 2 * * *"  # Daily at 2 AM
)
print(etl.model_dump_json(indent=2))

{
  "name": "daily_sales_etl",
  "source": {
    "path": "/data/sales.csv",
    "format": "csv",
    "compression": "none",
    "options": {
      "delimiter": ",",
      "header": true
    }
  },
  "transformations": [
    {
      "name": "filter_valid",
      "sql": "SELECT * FROM source WHERE amount > 0",
      "description": "Remove invalid sales"
    },
    {
      "name": "add_timestamp",
      "sql": "SELECT *, CURRENT_TIMESTAMP as processed_at FROM filtered",
      "description": "Add processing timestamp"
    }
  ],
  "quality_rules": [
    {
      "rule_name": "check_amount",
      "column": "amount",
      "rule_type": "range",
      "parameters": {
        "min": 0
      }
    }
  ],
  "destination": {
    "host": "warehouse.example.com",
    "database": "analytics",
    "username": "etl_user",
    "password": null
  },
  "schedule": "0 2 * * *",
  "enabled": true
}


## Exercise 7: Advanced - Union Types

**Goal**: Handle multiple source types in one pipeline.

**Requirements**:
- Create `APIConfig` model:
  - `url`: str (required)
  - `method`: Literal["GET", "POST"] (default "GET")
  - `headers`: Dict[str, str] (default empty)
  - `timeout`: int (default 30, must be positive)
- Create `Source` type alias as Union[FileConfig, DatabaseConfig, APIConfig]
- Modify ETLPipelineConfig to accept `source: Source`

In [20]:
class APIConfig(BaseModel):
    """API data source configuration."""
    url: str
    method: Literal["GET", "POST"] = "GET"
    headers: Dict[str, str] = Field(default_factory=dict)
    timeout: int = Field(default=30, gt=0)
    
    @field_validator('url')
    @classmethod
    def validate_url(cls, v: str) -> str:
        """Ensure URL starts with http:// or https://."""
        if not v.startswith(("http://", "https://")):
            raise ValueError("URL must start with http:// or https://")
        return v

# Union type for flexible sources
Source = Union[FileConfig, DatabaseConfig, APIConfig]

class FlexibleETLConfig(BaseModel):
    """ETL config that accepts any source type."""
    name: str
    source: Source
    transformations: List[TransformationConfig] = Field(min_length=1)
    quality_rules: List[DataQualityRule] = Field(default_factory=list)
    destination: DatabaseConfig
    schedule: Optional[str] = None
    enabled: bool = True

# Test with file source
file_etl = FlexibleETLConfig(
    name="file_pipeline",
    source=FileConfig(
        path="/data/file.csv",
        format=FileFormat.PARQUET
    ),
    transformations=[
        TransformationConfig(name="transform1", sql="SELECT * FROM source")
    ],
    destination=DatabaseConfig(
        host="warehouse.com",
        database="analytics",
        username="etl"
    )
)
print("âœ… File source ETL")

# Test with API source
api_etl = FlexibleETLConfig(
    name="api_pipeline",
    source=APIConfig(
        url="https://api.example.com/data",
        method="GET",
        headers={"Authorization": "Bearer token"}
    ),
    transformations=[
        TransformationConfig(name="parse_json", sql="SELECT * FROM json_table(source)")
    ],
    destination=DatabaseConfig(
        host="warehouse.com",
        database="analytics",
        username="etl"
    )
)
print("âœ… API source ETL")

# Test with database source
db_etl = FlexibleETLConfig(
    name="db_pipeline",
    source=DatabaseConfig(
        host="source-db.example.com",
        database="production",
        username="reader"
    ),
    transformations=[
        TransformationConfig(name="aggregate", sql="SELECT category, SUM(amount) FROM source GROUP BY category")
    ],
    destination=DatabaseConfig(
        host="warehouse.com",
        database="analytics",
        username="etl"
    )
)
print("âœ… Database source ETL")

print("\nâœ… All source types work with FlexibleETLConfig!")

âœ… File source ETL
âœ… API source ETL
âœ… Database source ETL

âœ… All source types work with FlexibleETLConfig!


## Bonus Exercise: Schema Migration

**Challenge**: Create a model for database schema migrations.

**Requirements**:
- `version`: str (format: "vX.Y.Z" where X, Y, Z are integers)
- `description`: str (required)
- `up_sql`: str (SQL to apply migration)
- `down_sql`: str (SQL to rollback migration)
- `applied_at`: Optional[str] (ISO timestamp)
- `checksum`: Optional[str] (MD5 hash of up_sql)

**Validators**:
- Validate version format with regex
- Ensure up_sql and down_sql are not empty
- Auto-compute checksum from up_sql if not provided

In [21]:
# TODO: Implement SchemaMigration
# Hint: Use field_validator and model_validator
# Hint: For checksum, use hashlib.md5

import hashlib
import re

class SchemaMigration(BaseModel):
    """Database schema migration model."""
    version: str
    description: str
    up_sql: str
    down_sql: str
    applied_at: Optional[str] = None
    checksum: Optional[str] = None
    
    @field_validator('version')
    @classmethod
    def validate_version(cls, v: str) -> str:
        """Ensure version follows vX.Y.Z format."""
        pattern = r'^v\d+\.\d+\.\d+$'
        if not re.match(pattern, v):
            raise ValueError(
                f"Version must be in format vX.Y.Z (e.g., v1.0.0), got: {v}"
            )
        return v
    
    @field_validator('up_sql', 'down_sql')
    @classmethod
    def validate_sql_not_empty(cls, v: str) -> str:
        """Ensure SQL is not empty or whitespace."""
        if not v.strip():
            raise ValueError("SQL cannot be empty or whitespace")
        return v
    
    @model_validator(mode="after")
    def compute_checksum(self):
        """Auto-compute checksum from up_sql if not provided."""
        if self.checksum is None:
            self.checksum = hashlib.md5(self.up_sql.encode()).hexdigest()
        return self

# Test migration
migration = SchemaMigration(
    version="v1.0.0",
    description="Add user_id column to orders table",
    up_sql="ALTER TABLE orders ADD COLUMN user_id INTEGER NOT NULL",
    down_sql="ALTER TABLE orders DROP COLUMN user_id"
)
print(migration.model_dump_json(indent=2))
print(f"\nâœ… Checksum auto-computed: {migration.checksum}")

# Test invalid version
try:
    bad_migration = SchemaMigration(
        version="1.0.0",  # Missing 'v' prefix
        description="Test",
        up_sql="SELECT 1",
        down_sql="SELECT 0"
    )
except ValueError as e:
    print(f"\nâœ… Invalid version caught: {e}")

# Test manual checksum
migration_with_checksum = SchemaMigration(
    version="v2.0.0",
    description="Add index",
    up_sql="CREATE INDEX idx_user_id ON orders(user_id)",
    down_sql="DROP INDEX idx_user_id",
    checksum="custom_checksum_123"
)
print(f"\nâœ… Manual checksum preserved: {migration_with_checksum.checksum}")

{
  "version": "v1.0.0",
  "description": "Add user_id column to orders table",
  "up_sql": "ALTER TABLE orders ADD COLUMN user_id INTEGER NOT NULL",
  "down_sql": "ALTER TABLE orders DROP COLUMN user_id",
  "applied_at": null,
  "checksum": "935995d44a3d3b96e8c38a14e15e877b"
}

âœ… Checksum auto-computed: 935995d44a3d3b96e8c38a14e15e877b

âœ… Invalid version caught: 1 validation error for SchemaMigration
version
  Value error, Version must be in format vX.Y.Z (e.g., v1.0.0), got: 1.0.0 [type=value_error, input_value='1.0.0', input_type=str]
    For further information visit https://errors.pydantic.dev/2.6/v/value_error

âœ… Manual checksum preserved: custom_checksum_123


---

## ðŸŽ‰ Completion Checklist

- [ ] Exercise 1: DatabaseConfig with constraints
- [ ] Exercise 2: Production environment validation
- [ ] Exercise 3: SQL transformation config
- [ ] Exercise 4: File format config with validators
- [ ] Exercise 5: Data quality rules
- [ ] Exercise 6: Complete ETL pipeline config
- [ ] Exercise 7: Union types for multiple sources
- [ ] Bonus: Schema migration model

Once complete, check your solutions against `solutions.ipynb`!