# Pandera

> pandera is a Union.ai open source project that provides a flexible and expressive API for performing data validation on dataframe-like objects to make data processing pipelines more readable and robust. Dataframes contain information that pandera explicitly validates at runtime.

**REF**: https://pandera.readthedocs.io/en/stable/

## Last Updated

2023-11-18

In [2]:
import pandas as pd
import pandera as pa

In [27]:
# Sample frame that the schemas below are run against.
# "col1" ends with a missing value, so it is cast to pandas' nullable
# Int64 dtype; "col2" mixes float literals with the int literal -10 and
# is cast to float; "col3" holds plain strings and is left as inferred.

df = pd.DataFrame(
    {
        "col1": [1, 4, 0, 10, 9, None],
        "col2": [-1.3, -1.4, -2.9, -10.1, -20.4, -10],
        "col3": ["value_1", "value_2", "value_3", "value_2", "value_1", "value_1"],
    }
).astype({"col1": pd.Int64Dtype(), "col2": float})

In [30]:
# Object-style schema: one pa.Column per dataframe column.

# Checks applied to "col3": the built-in prefix check, plus a custom check
# that splits every value on "_" into separate columns and compares the
# resulting column count with 2 (a single result for the whole column).
col3_checks = [
    pa.Check.str_startswith("value_"),
    pa.Check(lambda s: s.str.split("_", expand=True).shape[1] == 2),
]

schema = pa.DataFrameSchema(
    columns={
        # Nullable integer column, values must be <= 10.
        "col1": pa.Column(pd.Int64Dtype(), checks=pa.Check.le(10), nullable=True),
        # Float column, values must be < -1.2.
        "col2": pa.Column(float, checks=pa.Check.lt(-1.2, ignore_na=True)),
        "col3": pa.Column(str, checks=col3_checks),
    }
)

# Run the validation and show the frame that comes back.
validated_df = schema.validate(df)
print(validated_df)

   col1  col2     col3
0     1  -1.3  value_1
1     4  -1.4  value_2
2     0  -2.9  value_3
3    10 -10.1  value_2
4     9 -20.4  value_1
5  <NA> -10.0  value_1


In [34]:
# Alternatively, you could do a dataframe model.

from pandera.typing import Series

class Schema(pa.DataFrameModel):
    # Class-based declaration of the column rules: each annotated attribute
    # names a column and its dtype, and pa.Field carries the constraints.

    # Nullable integer column; values must be <= 10.
    col1: pd.Int64Dtype() = pa.Field(le=10, nullable=True)
    # Float column; values must be < -1.2.
    col2: float = pa.Field(lt=-1.2)
    # String column; values must start with "value_".
    col3: str = pa.Field(str_startswith="value_")

    # Custom check registered for "col3".
    @pa.check("col3")
    def col_3_check(cls, series: Series[str]) -> bool:
        # Split every value on "_" into separate columns and compare the
        # resulting column count with 2. The comparison yields one scalar
        # for the whole column, not a per-row boolean mask.
        return series.str.split("_", expand=True).shape[1] == 2
    
# Validate the sample frame against the class-based schema.
Schema.validate(df)

   col1  col2     col3
0     1  -1.3  value_1
1     4  -1.4  value_2
2     0  -2.9  value_3
3    10 -10.1  value_2
4     9 -20.4  value_1
5  <NA> -10.0  value_1
