# Data Validation

## Data Validation Made Easy with Pandera

### Basic Building Blocks

In [None]:
import pandas as pd

# Sample customer transactions: integer IDs and ages, float amounts
df = pd.DataFrame(
    data=dict(
        customer_id=[1, 2, 3, 4, 5],
        age=[25, 30, 35, 40, 45],
        transaction_amount=[100.0, 50.0, 75.0, 125.0, 200.0],
    )
)

In [None]:
import pandera.pandas as pa


# Schema: one Column per field, each pairing a dtype with a value check
schema = pa.DataFrameSchema(
    columns={
        "customer_id": pa.Column(int, checks=pa.Check.ge(1), unique=True),  # <1>
        "age": pa.Column(int, checks=pa.Check.between(0, 120)),  # <2>
        "transaction_amount": pa.Column(float, checks=pa.Check.ge(0)),  # <3>
    }
)

# Run the checks against df and show the frame that validate() returns
validated_df = schema.validate(df)  # <4>
print(validated_df)

In [None]:
# Failing case: a repeated customer_id and ages outside the allowed range
invalid_df = pd.DataFrame(
    dict(
        customer_id=[1, 2, 2, 4, 5],  # ID 2 appears twice
        age=[25, 150, -5, 40, 45],  # 150 and -5 are out of range
        transaction_amount=[100.00, 50.00, 75.00, 125.00, 200.00],
    )
)

# validate() raises SchemaError for this frame; print it instead of stopping
try:
    schema.validate(invalid_df)
except pa.errors.SchemaError as exc:
    print('SchemaError:', exc)

### Checks

In [None]:
# Custom check: the lambda is applied to the column and must yield booleans
check_is_even = pa.Check(lambda s: s % 2 == 0)

schema = pa.DataFrameSchema(
    columns={"column1": pa.Column(int, checks=check_is_even)}
)
schema.validate(pd.DataFrame(data={"column1": [2, 4, 6, 8]}))

#### Built-in Checks

In [None]:
from datetime import datetime

# Built-in checks: minimum ID length, "@" in the email, signup date not in
# the future.
# NOTE: datetime.now() is evaluated once, when this schema object is created.
customer_schema = pa.DataFrameSchema(
    columns={
        "customer_id": pa.Column(str, checks=pa.Check.str_length(min_value=5)),
        "email": pa.Column(str, checks=pa.Check.str_contains("@")),
        "signup_date": pa.Column(datetime, checks=pa.Check.le(datetime.now())),
    }
)

In [None]:
# Customer table; signup dates start out as ISO-formatted strings
customer = pd.DataFrame(
    dict(
        customer_id=["CUST01", "CUST02", "CUST03"],
        email=["john@mail.com", "jane@mail.com", "bob@mail.com"],
        signup_date=["2023-01-01", "2023-02-15", "2023-03-30"],
    )
)
# Convert the strings to datetimes so the column matches the schema's dtype
customer["signup_date"] = customer["signup_date"].pipe(pd.to_datetime)

# Check the table against customer_schema and show the validated result
validated_df = customer_schema.validate(customer)
print("Validation passed!")
print(validated_df)

#### Column Check Groups

In [None]:
# Profit figures: two rows each for the NY and CA stores
df = pd.DataFrame(
    data={
        "store": ["NY", "CA", "NY", "CA"],
        "profit": [200.0, 300.0, 300.0, 400.0],
    }
)

# Group check: with groupby="store" the check function is handed the profit
# values keyed by store rather than the raw column.
schema = pa.DataFrameSchema(
    columns={
        "store": pa.Column(str),
        "profit": pa.Column(
            float,
            checks=pa.Check(
                # CA's mean profit must exceed NY's
                lambda by_store: by_store["CA"].mean() > by_store["NY"].mean(),
                groupby="store",
            ),
        ),
    }
)

# Run the group check against df
validated_df = schema.validate(df)
print("Validation passed!")

#### Wide Checks

In [None]:
# Revenue, expenses and profit, one record per row
df = pd.DataFrame(
    dict(
        revenue=[1000.0, 1500.0, 1200.0],
        expenses=[800.0, 1200.0, 900.0],
        profit=[200.0, 300.0, 300.0],
    )
)

# Wide check: supplied through the schema-level `checks`, so the function
# receives the whole DataFrame and can relate several columns to each other.
schema = pa.DataFrameSchema(
    columns={
        name: pa.Column(float) for name in ("revenue", "expenses", "profit")
    },
    checks=pa.Check(
        # Exact equality: profit must equal revenue minus expenses in each row
        lambda frame: frame["revenue"] - frame["expenses"] == frame["profit"]
    ),
)

validated_df = schema.validate(df)
print("Validation passed!")

### Validation Decorator

#### Check Input

In [None]:
from pandera import check_input


# Contract for the frame passed into calculate_grade()
input_schema = pa.DataFrameSchema(
    columns={
        "name": pa.Column(str),
        "age": pa.Column(int, checks=pa.Check.between(0, 120)),
        "score": pa.Column(float, checks=pa.Check.between(0, 100)),
    }
)


@check_input(input_schema)
def calculate_grade(data: pd.DataFrame):
    """Add a letter ``grade`` column derived from ``score`` and return ``data``.

    ``check_input`` validates ``data`` against ``input_schema`` before the
    body runs. Scores are bucketed as [0, 70] -> F, (70, 80] -> C,
    (80, 90] -> B and (90, 100] -> A. The new column is written onto
    ``data`` itself, and that same object is returned.
    """
    grade_edges = [0, 70, 80, 90, 100]
    grade_labels = ["F", "C", "B", "A"]
    grades = pd.cut(
        data["score"],
        bins=grade_edges,
        labels=grade_labels,
        include_lowest=True,  # make the first bucket include a score of 0
    )
    data["grade"] = grades
    return data

In [None]:
# Three students whose ages and scores fall inside input_schema's ranges
df = pd.DataFrame(
    dict(
        name=["John", "Jane", "Bob"],
        age=[25, 30, 35],
        score=[95.5, 88.3, 92.7],
    )
)
result = calculate_grade(df)
print(result)

#### Check Output

In [None]:
from pandera import check_output

# Contract for calculate_grade()'s result: the input columns plus a string
# `grade` column restricted to the four letter grades.
output_schema = pa.DataFrameSchema(
    columns={
        "name": pa.Column(str),
        "age": pa.Column(int, checks=pa.Check.between(0, 120)),
        "score": pa.Column(float, checks=pa.Check.between(0, 100)),
        "grade": pa.Column(
            str,
            checks=pa.Check(lambda x: x.isin(["A", "B", "C", "F"])),
        ),
    }
)


@check_input(input_schema)
@check_output(output_schema)
def calculate_grade(data: pd.DataFrame):
    """Add a letter ``grade`` column derived from ``score`` and return ``data``.

    ``data`` is validated against ``input_schema`` before the body runs, and
    the returned frame is validated against ``output_schema`` afterwards.
    Scores are bucketed as [0, 70] -> F, (70, 80] -> C, (80, 90] -> B and
    (90, 100] -> A. The new column is written onto ``data`` itself.

    Args:
        data: Frame with ``name``, ``age`` and ``score`` columns.

    Returns:
        The same frame with a string ``grade`` column added.

    Raises:
        pandera.errors.SchemaError: If the input or the output fails
            validation.
    """
    grades = pd.cut(
        data["score"],
        bins=[0, 70, 80, 90, 100],
        labels=["F", "C", "B", "A"],
        include_lowest=True,  # make the first bucket include a score of 0
    )
    # pd.cut yields a categorical column, but output_schema declares `grade`
    # as str; convert to plain strings so the output dtype check can pass.
    data["grade"] = grades.astype(str)
    return data

#### Check Both Inputs and Outputs

In [None]:
from pandera import check_io


@check_io(data=input_schema, out=output_schema)
def calculate_grade(data: pd.DataFrame):
    """Add a letter ``grade`` column derived from ``score`` and return ``data``.

    ``check_io`` validates the ``data`` argument against ``input_schema`` and
    the return value against ``output_schema``. Scores are bucketed as
    [0, 70] -> F, (70, 80] -> C, (80, 90] -> B and (90, 100] -> A. The new
    column is written onto ``data`` itself.

    Args:
        data: Frame with ``name``, ``age`` and ``score`` columns.

    Returns:
        The same frame with a string ``grade`` column added.

    Raises:
        pandera.errors.SchemaError: If the input or the output fails
            validation.
    """
    grades = pd.cut(
        data["score"],
        bins=[0, 70, 80, 90, 100],
        labels=["F", "C", "B", "A"],
        include_lowest=True,  # make the first bucket include a score of 0
    )
    # pd.cut yields a categorical column, but output_schema declares `grade`
    # as str; convert to plain strings so the output dtype check can pass.
    data["grade"] = grades.astype(str)
    return data

In [None]:
# Student records used to exercise the input and output validation
df = pd.DataFrame(
    data={
        "name": ["John", "Jane", "Bob"],
        "age": [25, 30, 35],
        "score": [95.5, 88.3, 92.7],
    },
)
result = calculate_grade(df)
print(result)

### Other Arguments for Column Validation

#### Deal with Null Values

In [None]:
# `nullable=True` opts a column into accepting missing values
schema = pa.DataFrameSchema(
    columns={
        "id": pa.Column(int),  # nullable not set: nulls are rejected
        "name": pa.Column(str, nullable=True),  # nulls accepted
        "age": pa.Column(float, nullable=True),  # nulls accepted
    }
)

# One missing name and one missing age; every id is present
df = pd.DataFrame(
    dict(
        id=[1, 2, 3],
        name=["John", None, "Mary"],
        age=[25.0, 30.0, None],
    )
)

validated_df = schema.validate(df)
print("Validation passed!")

#### Deal with Duplicates

In [None]:
# `unique=True` makes repeated values in that column a validation error
schema = pa.DataFrameSchema(
    columns={
        "id": pa.Column(int, unique=True),  # repeats rejected
        "name": pa.Column(str),  # repeats allowed
    }
)

# id 1 occurs twice, so this frame violates the unique constraint
df = pd.DataFrame(dict(id=[1, 1, 2], name=["John", "Jane", "Mary"]))

try:
    validated_df = schema.validate(df)
except pa.errors.SchemaError as exc:
    print("SchemaError:", exc)

#### Required Columns

In [None]:
# `required=False` marks a column as optional; the others must be present
schema = pa.DataFrameSchema(
    columns={
        "id": pa.Column(int),  # must be present
        "name": pa.Column(str),  # must be present
        "age": pa.Column(int, required=False),  # may be absent
    }
)

# No `age` column here; only the optional column is missing
df = pd.DataFrame(dict(id=[1, 2, 3], name=["John", "Jane", "Mary"]))

validated_df = schema.validate(df)
print("Validation passed!")

#### Match Patterns

In [None]:
# With regex=True the key is a pattern, so one Column definition covers
# every column whose name matches it.
schema = pa.DataFrameSchema(
    columns={
        # Applies to each column whose name starts with "score_"
        "score_.*": pa.Column(float, regex=True, nullable=True),
        # Plain keys are matched by exact column name
        "student_id": pa.Column(int),
        "name": pa.Column(str),
    }
)

# Three score columns, two of which contain a missing value
df = pd.DataFrame(
    dict(
        student_id=[1, 2, 3],
        name=["John", "Mary", "Bob"],
        score_math=[85.5, 90.0, None],
        score_science=[88.0, None, 92.5],
        score_history=[78.5, 88.5, 95.0],
    )
)

validated_df = schema.validate(df)
print("Validation passed!")

### Schema Model

In [None]:
from pandera.typing import Series, DataFrame
import hashlib


# Class-based schema for raw customer records
class CustomerSchema(pa.DataFrameModel):
    # Customer IDs are strings of 5 to 10 characters
    customer_id: Series[str] = pa.Field(
        str_length=dict(min_value=5, max_value=10)
    )
    # Every email must contain "@"
    email: Series[str] = pa.Field(str_contains="@")


# Class-based schema for customer records whose email has been anonymized
class AnonymizedCustomerSchema(pa.DataFrameModel):
    # Customer IDs are strings of 5 to 10 characters
    customer_id: Series[str] = pa.Field(
        str_length=dict(min_value=5, max_value=10)
    )
    # Exactly 32 characters (presumably an MD5 hex digest, given the hashlib
    # import in this cell; confirm against the code that produces it)
    anonymized_email: Series[str] = pa.Field(
        str_length=dict(min_value=32, max_value=32)
    )

### Export and Load From a YAML File

#### Export to YAML

In [None]:
from pathlib import Path

# Same customer transaction schema as in the first example
schema = pa.DataFrameSchema(
    columns={
        "customer_id": pa.Column(int, checks=pa.Check.ge(1), unique=True),
        "age": pa.Column(int, checks=pa.Check.between(0, 120)),
        "transaction_amount": pa.Column(float, checks=pa.Check.ge(0)),
    }
)

# Serialize the schema to YAML text and display it
yaml_schema = schema.to_yaml()
print("Exported YAML schema:")
print(yaml_schema)

In [None]:
# Persist the YAML text to disk so the schema can be shared with the team
schema_file = Path("customer_schema.yml")
schema_file.write_text(yaml_schema)

# Confirm what was written
print(f"Schema saved to {schema_file}")
print(f"File exists: {schema_file.exists()}")
print(f"File size: {schema_file.stat().st_size} bytes")

In [None]:
# Read the YAML text back from the file written above
yaml_content = schema_file.read_text()

# Rebuild the schema through the DataFrameSchema.from_yaml classmethod.
# This keeps `pa` bound to `pandera.pandas`, as imported earlier in the
# notebook, instead of re-importing top-level `pandera` under the same alias
# and silently changing what `pa` means for every later cell.
loaded_schema = pa.DataFrameSchema.from_yaml(yaml_content)

print("Schema loaded successfully from YAML!")
print(f"Schema type: {type(loaded_schema)}")
print(f"Schema columns: {list(loaded_schema.columns.keys())}")

In [None]:
# Same valid customer data as the first example, now checked with the schema
# that was rebuilt from YAML.
test_df = pd.DataFrame(
    dict(
        customer_id=[1, 2, 3, 4, 5],
        age=[25, 30, 35, 40, 45],
        transaction_amount=[100.0, 50.0, 75.0, 125.0, 200.0],
    )
)

print("Testing loaded schema with valid data:")
validated_df = loaded_schema.validate(test_df)
print("✓ Validation successful!")
print(validated_df.head())

In [None]:
# Invalid data: the loaded schema should reject it just like the original
invalid_test_df = pd.DataFrame(
    dict(
        customer_id=[1, 2, 2, 4, 5],  # ID 2 appears twice
        age=[25, 150, -5, 40, 45],  # 150 and -5 are out of range
        transaction_amount=[100.0, 50.0, 75.0, 125.0, 200.0],
    )
)

print("Testing loaded schema with invalid data:")
try:
    loaded_schema.validate(invalid_test_df)
except pa.errors.SchemaError as exc:
    print("✓ Schema correctly caught validation errors:")
    # Show only the first 100 characters of the error message
    print(f"  Error: {str(exc)[:100]}...")
else:
    print("This should not print")

## Best Practices for Data Validation

### Validate Data at the Point of Entry

In [None]:
def analyze_sales_data(sales_df: pd.DataFrame) -> dict:
    """Summarise sales as total revenue and the largest quantity sold.

    Nothing is validated up front, so bad values in ``price`` or
    ``quantity`` only surface as errors raised during the arithmetic below.

    Returns a dict with ``total_revenue`` (sum of price * quantity) and
    ``max_sale`` (maximum of the ``quantity`` column).
    """
    line_totals = sales_df["price"] * sales_df["quantity"]
    total_revenue = line_totals.sum()
    max_sale = sales_df["quantity"].max()
    return {"total_revenue": total_revenue, "max_sale": max_sale}


if __name__ == "__main__":
    # Deliberately dirty input: one non-numeric price, one non-numeric quantity
    data = pd.DataFrame(
        dict(
            price=[50, 100, "invalid", 75],
            quantity=[5, 3, 2, "error"],
        )
    )
    # Without validation the failure only appears once the analysis is running
    try:
        results = analyze_sales_data(data)
    except Exception as exc:
        print(f"Error during analysis: {exc}")
    else:
        print(results)

In [None]:
# Sales data contract: float prices and integer quantities, both >= 0
sales_schema = pa.DataFrameSchema(
    columns={
        "price": pa.Column(float, checks=[pa.Check.ge(0)]),
        "quantity": pa.Column(int, checks=[pa.Check.ge(0)]),
    }
)


@check_input(sales_schema)
def analyze_sales_data(sales_df: pd.DataFrame) -> dict:
    """Summarise sales as total revenue and the largest quantity sold.

    ``check_input`` validates ``sales_df`` against ``sales_schema`` before
    the body runs, so bad data is rejected at the point of entry.

    Returns a dict with ``total_revenue`` (sum of price * quantity) and
    ``max_sale`` (maximum of the ``quantity`` column).
    """
    line_totals = sales_df["price"] * sales_df["quantity"]
    total_revenue = line_totals.sum()
    max_sale = sales_df["quantity"].max()
    return {"total_revenue": total_revenue, "max_sale": max_sale}

### Validate Only Critical Columns

In [None]:
# Describe only the two columns the calculation reads: amount and store
schema = pa.DataFrameSchema(
    columns={
        "amount": pa.Column(float, checks=pa.Check.gt(0)),
        "store": pa.Column(str, checks=pa.Check.isin(["A", "B"])),
    }
)


@pa.check_input(schema)
def get_amount_by_store(df):
    """Return the total ``amount`` for each ``store``.

    ``df`` is validated against ``schema`` before the body runs; the body
    itself reads only the ``store`` and ``amount`` columns.
    """
    amounts_by_store = df.groupby("store")["amount"]
    return amounts_by_store.sum()

In [None]:
# customer_id and date are extra columns that the schema does not describe
df = pd.DataFrame(
    dict(
        customer_id=[1, 2, 3],
        amount=[100.0, 200.0, 300.0],
        date=["2023-01-01", "2023-01-02", "2023-01-03"],
        store=["A", "B", "A"],
    )
)
amount_by_store = get_amount_by_store(df)