# PANDERA SCHEMA

In [8]:
import pandas as pd
import pandera as pa

In [18]:
# Sample frame used by the validation examples: one integer column,
# one float column and one string column, six rows each.
df = pd.DataFrame(
    dict(
        column1=[1, 4, 0, 10, 9, 10],
        column2=[-1.3, -1.4, -2.9, -10.1, -20.4, -10],
        column3=[
            "value_1",
            "value_2",
            "value_3",
            "value_2",
            "value_1",
            "value_1",
        ],
    )
)

In [19]:
# Schema for `df`: each entry pairs a column name with its dtype and value checks.
schema = pa.DataFrameSchema(
    columns={
        # integers no greater than 10
        "column1": pa.Column(int, checks=pa.Check.le(10)),
        # floats strictly below -1.2
        "column2": pa.Column(float, checks=pa.Check.lt(-1.2)),
        "column3": pa.Column(
            str,
            checks=[
                # every value must begin with "value_"
                pa.Check.str_startswith("value_"),
                # Custom check: a callable that receives the column as a Series
                # and returns a bool or a boolean Series. This one returns a
                # single bool: True when splitting on "_" (expanded into a
                # frame) produces exactly two columns.
                pa.Check(lambda col: col.str.split("_", expand=True).shape[1] == 2),
            ],
        ),
    }
)

# lazy=True asks pandera to gather every failure into one SchemaErrors
# instead of stopping at the first; on success the validated frame is returned.
validated_df = schema.validate(df, lazy=True)
print(validated_df)

   column1  column2  column3
0        1     -1.3  value_1
1        4     -1.4  value_2
2        0     -2.9  value_3
3       10    -10.1  value_2
4        9    -20.4  value_1
5       10    -10.0  value_1


In [35]:
# Three ways to spell a column's dtype in a schema.
# NOTE: this rebinds `schema`, replacing the schema built for `df` in the
# earlier cell.
schema = pa.DataFrameSchema(
    columns={
        # 1) plain Python types
        "int_column": pa.Column(int),
        "float_column": pa.Column(float),
        "str_column": pa.Column(str),

        # 2) dtype names given as strings
        "int_column2": pa.Column("int64"),
        "float_column2": pa.Column("float64"),
        # NOTE(review): pandas' native string dtype (pandas >= 1.0) is spelled
        # "string"; the alias used here is "str" — confirm which is intended.
        "str_column2": pa.Column("str"),

        # 3) pandera DataType classes
        "int_column3": pa.Column(pa.Int),
        "float_column3": pa.Column(pa.Float),
        "str_column3": pa.Column(pa.String),
    }
)

## Schema Model

In [52]:
from pandera.typing import Series

class Schema(pa.SchemaModel):
    """Class-based schema with one annotated field per expected column.

    column1: int, values <= 10
    column2: float, values < -1.2
    column3: str, values start with "value_" and split on "_" into two parts

    NOTE(review): newer pandera releases presumably expose this base class as
    ``DataFrameModel`` — confirm against the installed version before renaming.
    """

    column1: Series[int] = pa.Field(le=10)
    column2: Series[float] = pa.Field(lt=-1.2)
    column3: Series[str] = pa.Field(str_startswith="value_")

    @pa.check("column3")
    def column_3_check(cls, series: Series[str]) -> Series[bool]:
        """Check that each column3 value has exactly two elements after splitting on '_'.

        Returns one boolean per row, matching the ``Series[bool]`` annotation,
        so every value is judged on its own rather than by the widest split
        found anywhere in the column.
        """
        # Row-wise part count; a missing value compares unequal to 2 and is flagged.
        return series.str.split("_").str.len() == 2

# Validate `df` against the class-based schema; as the cell's last expression,
# the result is what the notebook displays.
# NOTE(review): the output recorded for this cell reports column2/column3 as
# missing and column1 as float64 containing NaN, which does not match `df` as
# defined earlier in this notebook — presumably `df` was rebound by a cell that
# no longer exists. Re-run from a fresh kernel to refresh the output.
Schema.validate(df, lazy=True)

SchemaErrors: A total of 4 schema errors were found.

Error Counts
------------
- column_not_in_dataframe: 2
- schema_component_check: 2

Schema Error Summary
--------------------
                                                  failure_cases  n_failure_cases
schema_context  column  check                                                   
DataFrameSchema <NA>    column_in_dataframe  [column3, column2]                2
Column          column1 dtype('int64')                [float64]                1
                        not_nullable                      [nan]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


## Informative Errors

In [54]:
import pandas as pd

from pandera import Column, DataFrameSchema, Int, Check

# One-column schema: every element of column1 must lie in [0, 10].
# element_wise=True makes the lambda receive one value at a time, and `error`
# is the label shown for this check in the failure report.
simple_schema = DataFrameSchema({
    "column1": Column(
        Int, Check(lambda x: 0 <= x <= 10, element_wise=True,
                   error="range checker [0, 10]"))
})

# validation rule violated: -20 and 30 fall outside [0, 10]
fail_check_df = pd.DataFrame({
    "column1": [-20, 5, 10, 30],
})

# The failure is the point of this example, so catch it and show the report
# instead of letting the exception stop a "Run All" of the notebook.
try:
    simple_schema.validate(fail_check_df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(err)

SchemaErrors: A total of 1 schema errors were found.

Error Counts
------------
- schema_component_check: 1

Schema Error Summary
--------------------
                                             failure_cases  n_failure_cases
schema_context column  check                                               
Column         column1 range checker [0, 10]     [-20, 30]                2

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


In [56]:
# column name mis-specified: the frame has "foo" and "baz" but no "column1"
wrong_column_df = pd.DataFrame({
   "foo": ["bar"] * 10,
   "baz": [1] * 10
})

# The failure is the point of this example, so catch it and show the report
# instead of letting the exception stop a "Run All" of the notebook.
try:
    simple_schema.validate(wrong_column_df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(err)

SchemaErrors: A total of 1 schema errors were found.

Error Counts
------------
- column_not_in_dataframe: 1

Schema Error Summary
--------------------
                                           failure_cases  n_failure_cases
schema_context  column check                                             
DataFrameSchema <NA>   column_in_dataframe     [column1]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```
