# DataFrame Schemas

In [5]:
import pandas as pd 
import pandera as pa

In [10]:
# Sample frame validated by the schema defined in the next cell.
# column2 mixes floats with the integer literal -10, so pandas stores it as float64.
df = pd.DataFrame(
    data=dict(
        column1=[1, 4, 0, 10, 9, 10],
        column2=[-1.3, -1.4, -2.9, -10.1, -20.4, -10],
        column3=["value_1", "value_2", "value_3", "value_2", "value_1", "value_1"],
    )
)

In [11]:
import pandera as pa
from pandera import Column, DataFrameSchema, Check, Index

# Schema for the sample frame: one entry per expected column, plus an
# integer index. strict=True and coerce=True apply to the whole frame.
schema = DataFrameSchema(
    columns={
        "column1": Column(int),
        "column2": Column(float, checks=Check(lambda s: s < -1.2)),
        # a column can carry several checks, passed as a list
        "column3": Column(
            str,
            checks=[
                Check(lambda s: s.str.startswith("value")),
                Check(lambda s: s.str.split("_", expand=True).shape[1] == 2),
            ],
        ),
    },
    index=Index(int),
    strict=True,
    coerce=True,
)

# lazy=True is passed so that failures are collected rather than raised one
# at a time; the validated frame is the cell's displayed value.
schema.validate(df, lazy=True)

Unnamed: 0,column1,column2,column3
0,1,-1.3,value_1
1,4,-1.4,value_2
2,0,-2.9,value_3
3,10,-10.1,value_2
4,9,-20.4,value_1
5,10,-10.0,value_1


# Column Validation
## Null Values in Columns

In [1]:
import numpy as np
import pandas as pd
import pandera as pa

from pandera import Check, Column, DataFrameSchema

df = pd.DataFrame({"column1": [5, 1, np.nan]})

# No `nullable` argument is given, so the NaN in column1 is expected to be
# reported as a `not_nullable` failure.
non_null_schema = DataFrameSchema({
    "column1": Column(float, Check(lambda x: x > 0))
})

# The failure is the demonstration. Catch and print it so the notebook still
# survives "Restart & Run All" instead of stopping at this cell.
try:
    non_null_schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(f"{type(err).__name__}: {err}")

SchemaErrors: A total of 1 schema errors were found.

Error Counts
------------
- schema_component_check: 1

Schema Error Summary
--------------------
                                    failure_cases  n_failure_cases
schema_context column  check                                      
Column         column1 not_nullable         [nan]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


In [2]:
# nullable=True: the column is allowed to contain NaN (Not a Number) values.
null_schema = DataFrameSchema(
    columns={
        "column1": Column(float, checks=Check(lambda x: x > 0), nullable=True),
    }
)

print(null_schema.validate(df))

   column1
0      5.0
1      1.0
2      NaN


## Coercing Types on Columns

In [3]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema

df = pd.DataFrame(data={"column1": [1, 2, 3]})

# coerce=True forces the column to the declared type (here int -> str)
# during validation instead of rejecting the mismatch.
schema = DataFrameSchema(columns={"column1": Column(str, coerce=True)})

validated_df = schema.validate(df)
print(validated_df)

  column1
0       1
1       2
2       3


In [5]:
df = pd.DataFrame({"column1": [1., 2., 3, np.nan]})
schema = DataFrameSchema({
    "column1": Column(int, coerce=True, nullable=True)
})

# An integer column cannot hold NaN, so coercing to int fails even though
# nullable=True. The failure is the demonstration: catch and print it so the
# notebook still survives "Restart & Run All".
try:
    validated_df = schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(f"{type(err).__name__}: {err}")

SchemaErrors: A total of 2 schema errors were found.

Error Counts
------------
- schema_component_check: 2

Schema Error Summary
--------------------
                                             failure_cases  n_failure_cases
schema_context column  check                                               
Column         column1 coerce_dtype('int64')         [nan]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


In [6]:
# Same nullable, coerced column declared as object and as float: both
# validate the NaN-containing frame from the previous cell.
schema_object = DataFrameSchema(
    columns={"column1": Column(object, coerce=True, nullable=True)}
)
schema_float = DataFrameSchema(
    columns={"column1": Column(float, coerce=True, nullable=True)}
)

# Show the dtype each schema coerces the column to.
for nullable_schema in (schema_object, schema_float):
    print(nullable_schema.validate(df).dtypes)

column1    object
dtype: object
column1    float64
dtype: object


## Required Columns

In [7]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema

df = pd.DataFrame(data={"column2": ["hello", "pandera"]})

schema = DataFrameSchema(
    columns={
        # required=False: the column may be absent from the dataframe
        "column1": Column(int, required=False),
        "column2": Column(str),
    }
)

validated_df = schema.validate(df)
print(validated_df)

   column2
0    hello
1  pandera


In [8]:
schema = DataFrameSchema({
    "column1": Column(int, required=True),  # required=True: the column must be present
    "column2": Column(str),
})

# df has no column1, so validation fails. The failure is the demonstration:
# catch and print it so the notebook still survives "Restart & Run All".
try:
    schema.validate(df)
except pa.errors.SchemaError as err:
    print(f"{type(err).__name__}: {err}")

SchemaError: column 'column1' not in dataframe
   column2
0    hello
1  pandera

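# Ordered Columns

Column order validation (`ordered=True`) is demonstrated in the "Validando el orden de las columnas" section below.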
## Stand-alone Column Validation

In [11]:
import pandas as pd
import pandera as pa

df = pd.DataFrame(
    data=dict(
        column1=[1, 2, 3],
        column2=["a", "b", "c"],
    )
)

# Stand-alone Column objects need a name to know which dataframe column
# they apply to.
column1_schema = pa.Column(int, name="column1")
column2_schema = pa.Column(str, name="column2")

# Option 1: call the Column object directly with the dataframe
df = column1_schema(df)
validated_df = column2_schema(df)

# Option 2: call the validate method explicitly
df = column1_schema.validate(df)
validated_df = column2_schema.validate(df)

# Option 3: chain both validations through DataFrame.pipe
validated_df = df.pipe(column1_schema).pipe(column2_schema)
print(validated_df)

   column1 column2
0        1       a
1        2       b
2        3       c
   column1 column2
0        1       a
1        2       b
2        3       c


## Column Regex Pattern Matching

In [14]:
import numpy as np
import pandas as pd
import pandera as pa

categories = ["A", "B", "C"]

np.random.seed(100)

# Dict entries are evaluated top to bottom, so with the seed above the
# four random draws happen in a fixed order.
dataframe = pd.DataFrame(
    data={
        "cat_var_1": np.random.choice(categories, size=100),
        "cat_var_2": np.random.choice(categories, size=100),
        "num_var_1": np.random.uniform(0, 10, size=100),
        "num_var_2": np.random.uniform(20, 30, size=100),
    }
)

# regex=True: the dictionary key is a pattern, so a single Column
# definition is applied to every matching dataframe column.
schema = pa.DataFrameSchema(
    columns={
        "num_var_.+": pa.Column(
            float,
            checks=pa.Check.greater_than_or_equal_to(0),
            regex=True,
        ),
        "cat_var_.+": pa.Column(
            pa.Category,
            checks=pa.Check.isin(categories),
            coerce=True,
            regex=True,
        ),
    }
)

print(schema.validate(dataframe).head())

  cat_var_1 cat_var_2  num_var_1  num_var_2
0         A         A   6.804147  24.743304
1         A         C   3.684308  22.774633
2         A         C   5.911288  28.416588
3         C         A   4.790627  21.951250
4         C         B   4.504166  28.563142
num_var_.+


In [15]:
np.random.seed(100)

# Same idea with two-level (MultiIndex) column labels; `categories` comes
# from the previous cell.
dataframe = pd.DataFrame(
    data={
        ("cat_var_1", "y1"): np.random.choice(categories, size=100),
        ("cat_var_2", "y2"): np.random.choice(categories, size=100),
        ("num_var_1", "x1"): np.random.uniform(0, 10, size=100),
        ("num_var_2", "x2"): np.random.uniform(0, 10, size=100),
    }
)

# With regex=True and a tuple key, there is one pattern per column level.
schema = pa.DataFrameSchema(
    columns={
        ("num_var_.+", "x.+"): pa.Column(
            float,
            checks=pa.Check.greater_than_or_equal_to(0),
            regex=True,
        ),
        ("cat_var_.+", "y.+"): pa.Column(
            pa.Category,
            checks=pa.Check.isin(categories),
            coerce=True,
            regex=True,
        ),
    }
)

print(schema.validate(dataframe).head())

  cat_var_1 cat_var_2 num_var_1 num_var_2
         y1        y2        x1        x2
0         A         A  6.804147  4.743304
1         A         C  3.684308  2.774633
2         A         C  5.911288  8.416588
3         C         A  4.790627  1.951250
4         C         B  4.504166  8.563142


## Handling Dataframe Columns not in the Schema

In [16]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema

# strict=True: dataframe columns that are not declared in the schema are
# rejected.
schema = DataFrameSchema(
    {"column1": Column(int)},
    strict=True)

df = pd.DataFrame({"column2": [1, 2, 3]})

# column2 is not in the schema, so validation fails. The failure is the
# demonstration: catch and print it so the notebook still survives
# "Restart & Run All".
try:
    schema.validate(df)
except pa.errors.SchemaError as err:
    print(f"{type(err).__name__}: {err}")

SchemaError: column 'column2' not in DataFrameSchema {'column1': <Schema Column(name=column1, type=DataType(int64))>}

In [17]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema

df = pd.DataFrame(
    data={
        "column1": ["drop", "me"],
        "column2": ["keep", "me"],
    }
)

# strict='filter': columns that are not in the schema are dropped during
# validation instead of raising an error.
schema = DataFrameSchema(columns={"column2": Column(str)}, strict='filter')

validated_df = schema.validate(df)
print(validated_df)

  column2
0    keep
1      me


## Validando el orden de las columnas

In [18]:
import pandas as pd
import pandera as pa

# ordered=True: the dataframe columns must appear in the same order as in
# the schema.
schema = pa.DataFrameSchema(
    columns={"a": pa.Column(int), "b": pa.Column(int)}, ordered=True
)
df = pd.DataFrame({"b": [1], "a": [1]})

# The columns are in the wrong order, so validation fails. The failure is the
# demonstration: catch and print it so the notebook still survives
# "Restart & Run All".
try:
    print(schema.validate(df))
except pa.errors.SchemaError as err:
    print(f"{type(err).__name__}: {err}")

SchemaError: column 'b' out-of-order

## Validando la unicidad conjunta de las columnas

In [24]:
import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema(
    columns={col: pa.Column(int) for col in ["a", "b", "c"]},
    # joint uniqueness: the combination of values in columns a and c must
    # not repeat across rows
    unique=["a", "c"],
)
df = pd.DataFrame.from_records([
    {"a": 1, "b": 2, "c": 3},
    {"a": 1, "b": 2, "c": 3},
])
print(df)

# Both rows share the same (a, c) pair, so validation fails. The failure is
# the demonstration: catch and print it so the notebook still survives
# "Restart & Run All".
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(f"{type(err).__name__}: {err}")

   a  b  c
0  1  2  3
1  1  2  3


SchemaErrors: A total of 1 schema errors were found.

Error Counts
------------
- duplicates: 1

Schema Error Summary
--------------------
                                                  failure_cases  n_failure_cases
schema_context  column check                                                    
DataFrameSchema a      multiple_fields_uniqueness           [1]                1
                c      multiple_fields_uniqueness           [3]                1

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


## Validación de índice

In [20]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema, Index, Check

# The `index` argument describes the dataframe index: here a string index
# whose labels must all start with "index_".
schema = DataFrameSchema(
    columns={"a": Column(int)},
    index=Index(
        str,
        checks=Check(lambda x: x.str.startswith("index_")),
    ),
)

df = pd.DataFrame(
    data={"a": [1, 2, 3]},
    index=["index_1", "index_2", "index_3"],
)

print(schema.validate(df))

         a
index_1  1
index_2  2
index_3  3


In [22]:
df = pd.DataFrame(
    data={"a": [1, 2, 3]},
    index=["foo1", "foo2", "foo3"])

# None of these labels start with "index_", so the index check of the schema
# from the previous cell fails. The failure is the demonstration: catch and
# print it so the notebook still survives "Restart & Run All".
try:
    schema.validate(df, lazy=True)
except pa.errors.SchemaErrors as err:
    print(f"{type(err).__name__}: {err}")

SchemaErrors: A total of 1 schema errors were found.

Error Counts
------------
- schema_component_check: 1

Schema Error Summary
--------------------
                                     failure_cases  n_failure_cases
schema_context column check                                        
Index          <NA>   <lambda>  [foo2, foo3, foo1]                3

Usage Tip
---------

Directly inspect all errors by catching the exception:

```
try:
    schema.validate(dataframe, lazy=True)
except SchemaErrors as err:
    err.failure_cases  # dataframe of schema errors
    err.data  # invalid dataframe
```


# Validación MultiIndex
## Columnas MultiIndex

In [23]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema, Index

# For MultiIndex columns, each schema key is a tuple with one entry per
# column level.
schema = DataFrameSchema(
    columns={
        ("foo", "bar"): Column(int),
        ("foo", "baz"): Column(str),
    }
)

df = pd.DataFrame(
    data={
        ("foo", "bar"): [1, 2, 3],
        ("foo", "baz"): ["a", "b", "c"],
    }
)

# Print the frame before and after validation.
print(df)
print(schema.validate(df))

  foo    
  bar baz
0   1   a
1   2   b
2   3   c
  foo    
  bar baz
0   1   a
1   2   b
2   3   c


## Índices MultiIndex

In [25]:
import pandas as pd
import pandera as pa

from pandera import Column, DataFrameSchema, Index, MultiIndex, Check

# A MultiIndex schema is a list of Index components, one per index level,
# each identified by its name.
schema = DataFrameSchema(
    columns={"column1": Column(int)},
    index=MultiIndex(
        [
            # level "index0": strings restricted to "foo" or "bar"
            Index(
                str,
                checks=Check(lambda s: s.isin(["foo", "bar"])),
                name="index0",
            ),
            # level "index1": integers
            Index(int, name="index1"),
        ]
    ),
)

df = pd.DataFrame(
    data={"column1": [1, 2, 3]},
    index=pd.MultiIndex.from_arrays(
        [["foo", "bar", "foo"], [0, 1, 2]],
        names=["index0", "index1"],
    ),
)

print(schema.validate(df))

               column1
index0 index1         
foo    0             1
bar    1             2
foo    2             3


# Obtener tipos de datos de Pandas

In [26]:
import pandas as pd
import pandera as pa

schema = pa.DataFrameSchema(
    columns={
        "column1": pa.Column(int),
        "column2": pa.Column(pa.Category),
        "column3": pa.Column(bool),
    },
)

# schema.dtypes maps each column name to its declared dtype; the string
# form of each dtype is handed to DataFrame.astype so the frame is cast to
# match the schema before it is validated.
df = (
    pd.DataFrame.from_dict(
        {
            "a": {"column1": 1, "column2": "valueA", "column3": True},
            "b": {"column1": 1, "column2": "valueB", "column3": True},
        },
        orient="index",
    )
    .astype({col: str(dtype) for col, dtype in schema.dtypes.items()})
    .sort_index(axis=1)
)

print(schema.validate(df))

   column1 column2  column3
a        1  valueA     True
b        1  valueB     True


# Transformaciones DataFrameSchema

In [27]:
import pandas as pd
import pandera as pa

data = pd.DataFrame(data={"col1": range(1, 6)})

# Base schema: col1 must hold integers greater than or equal to 0.
schema = pa.DataFrameSchema(
    columns={"col1": pa.Column(int, checks=pa.Check(lambda s: s >= 0))},
    strict=True,
)

# add_columns gives a schema extended with col2 and col3; it is bound to
# a new name and the base schema is still used as-is below.
transformed_schema = schema.add_columns(
    {
        # every value must equal "value"
        "col2": pa.Column(str, checks=pa.Check(lambda s: s == "value")),
        # every value must equal 0.0
        "col3": pa.Column(float, checks=pa.Check(lambda x: x == 0.0)),
    }
)

# validate the original data against the base schema
data = schema.validate(data)

# transform with pandas: add the two constant columns
transformed_data = data.assign(col2="value", col3=0.0)

# validate the transformed data against the extended schema
print(transformed_schema.validate(transformed_data))

   col1   col2  col3
0     1  value   0.0
1     2  value   0.0
2     3  value   0.0
3     4  value   0.0
4     5  value   0.0


In [30]:
import pandera as pa

# col2 and col3 are declared only so they can be removed; this cell never
# validates data, so their checks are not executed here.
schema = pa.DataFrameSchema(
    columns={
        "col1": pa.Column(int, checks=pa.Check(lambda s: s >= 0)),
        "col2": pa.Column(str, checks=pa.Check(lambda x: x <= 0)),
        "col3": pa.Column(object, checks=pa.Check(lambda x: x == 0)),
    },
    strict=True,
)

# remove_columns gives a schema without the listed columns.
new_schema = schema.remove_columns(["col2", "col3"])
print(new_schema)

<Schema DataFrameSchema(
    columns={
        'col1': <Schema Column(name=col1, type=DataType(int64))>
    },
    checks=[],
    coerce=False,
    dtype=None,
    index=None,
    strict=True
    name=None,
    ordered=False
)>


In [31]:
import pandera as pa

from pandera import Column, DataFrameSchema, Check, Index

schema = DataFrameSchema(
    columns={
        "column1": Column(int),
        "column2": Column(float),
    },
    index=Index(int, name="column3"),
    strict=True,
    coerce=True,
)

# set_index moves column1 out of the columns and into the index.
# append=True keeps the existing index (column3) alongside it, so the
# printed schema has a MultiIndex.
print(schema.set_index(["column1"], append=True))

<Schema DataFrameSchema(
    columns={
        'column2': <Schema Column(name=column2, type=DataType(float64))>
    },
    checks=[],
    coerce=True,
    dtype=None,
    index=<Schema MultiIndex(
        indexes=[
            <Schema Index(name=column3, type=DataType(int64))>
            <Schema Index(name=column1, type=DataType(int64))>
        ]
        coerce=False,
        strict=False,
        name=None,
        ordered=True
    )>,
    strict=True
    name=None,
    ordered=False
)>
