In [1]:
import fugue
import numpy as np
import pandas as pd
import pandera
import pyspark
from fugue import FugueWorkflow
from fugue_spark import SparkExecutionEngine
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset

In [2]:
from pyspark.sql import SparkSession

# Obtain the Spark session used by every Spark-backed cell below.
spark = SparkSession.builder.getOrCreate()

In [3]:
# Build a small demo frame made of two labelled partitions.
# A seeded generator keeps the data identical on every Restart & Run All.
rng = np.random.default_rng(42)
a = rng.integers(1, 100, 1000)  # upper bound is exclusive -> values in 1..99
b = rng.integers(1, 100, 1000)
test1 = pd.DataFrame({'a': a, 'b': b})
test2 = test1 + 100  # same rows shifted up by 100 -> values in 101..199
test1['partition'] = 'a'
test2['partition'] = 'b'
# DataFrame.append was removed in pandas 2.0; pd.concat stacks the two frames
# the same way (the original row labels are kept, as append did).
test = pd.concat([test1, test2])

In [4]:
# Move the pandas frame into Spark and expose column "a" under the name "col1".
spark_test = spark.createDataFrame(test).withColumnRenamed("a", "col1")
# Wrap the Spark frame so Great Expectations checks can be called on it.
sparkdf = SparkDFDataset(spark_test)
# Expect at least 95% of col1 to lie between 0 and 95.
# max_value used to be 9, which cannot produce the result recorded for this
# cell (1037 unexpected values out of 2000, the listed ones all in 96-99);
# 95 does, and it is the bound the later copy of this check uses.
sparkdf.expect_column_values_to_be_between(
    "col1",
    min_value=0,
    max_value=95,
    mostly=0.95,
    result_format="SUMMARY",
)

{
  "success": false,
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 2000,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1037,
    "unexpected_percent": 51.849999999999994,
    "unexpected_percent_total": 51.849999999999994,
    "unexpected_percent_nonmissing": 51.849999999999994,
    "partial_unexpected_list": [
      96,
      98,
      96,
      97,
      99,
      98,
      96,
      96,
      98,
      96,
      96,
      98,
      97,
      98,
      97,
      98,
      98,
      97,
      98,
      98
    ],
    "partial_unexpected_index_list": null,
    "partial_unexpected_counts": [
      {
        "value": 98,
        "count": 9
      },
      {
        "value": 96,
        "count": 6
      },
      {
        "value": 97,
        "count": 4
      },
      {
        "value": 99,
        "count": 1
      }
    ]
  }
}

In [None]:
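# NOTE: disabled draft of a Fugue transformer example. Before re-enabling it,
# import Iterable, Dict and Any from typing (not imported in the cells above).
# # schema: *, filled:double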
# def fillna(df:Iterable[Dict[str,Any]], value:float=0) -> Iterable[Dict[str,Any]]:
#     for row in df:
#         row["filled"] = (row["value"] or value)
#         yield row

# with FugueWorkflow(SparkExecutionEngine) as dag:
#     df = dag.load("file.parquet").transform(fillna)

In [None]:
from great_expectations.dataset.sparkdf_dataset import SparkDFDataset

# Wrap the raw Spark DataFrame rather than `sparkdf`: `sparkdf` is already a
# SparkDFDataset, so wrapping it again nests one wrapper inside another and
# goes one level deeper every time this cell is re-run.
sparkdf = SparkDFDataset(spark_test)

# Expect at least 95% of col1 to lie between 0 and 95.
sparkdf.expect_column_values_to_be_between(
    "col1",
    min_value=0,
    max_value=95,
    mostly=0.95,
    result_format="SUMMARY",
)

In [None]:
import pandera as pa
from pandera import Column, Check, DataFrameSchema

# Schema with a single rule: "price" is a float column checked against the range 5-10.
price_check = DataFrameSchema(
    columns={
        "price": Column(pa.Float, Check.in_range(5, 10)),
    }
)
# NOTE(review): `df` is not created in the cells shown above - presumably a
# pandas DataFrame with a "price" column; confirm before running.
price_check.validate(df)

# schema: *
def price_validation(df: pd.DataFrame) -> pd.DataFrame:
    """Run ``price_check`` against ``df`` and hand the frame back.

    The value returned by ``validate`` is discarded; the caller receives the
    same ``df`` object that was passed in.
    """
    price_check.validate(df)
    return df

# NOTE(review): `df` is assumed to be a pandas DataFrame with a "price" column;
# it is not created in the cells shown above.
with FugueWorkflow(SparkExecutionEngine) as dag:
    # Register the frame with this workflow first, the same way the final cell
    # does with dag.df(test). Calling .transform directly on the input frame
    # never attaches anything to `dag`.
    # The result gets its own name so `df` still refers to the input frame
    # when the cell is re-run.
    price_validated = dag.df(df).transform(price_validation)

In [None]:
import pandera as pa
from pandera import Column, Check, DataFrameSchema

# One schema per location; both check "price" as a float column, each with its own range.
price_check_FL = DataFrameSchema({"price": Column(pa.Float, Check.in_range(5, 10))})
price_check_CA = DataFrameSchema({"price": Column(pa.Float, Check.in_range(10, 15))})

# Lookup table: location code -> schema to apply.
price_checks = dict(CA=price_check_CA, FL=price_check_FL)

# schema: *
def price_validation(df: pd.DataFrame) -> pd.DataFrame:
    """Validate ``df`` with the schema registered for its location.

    The location is taken from the first row's ``location`` column and looked
    up in ``price_checks``; a location missing from that dict raises
    ``KeyError``. The frame is returned as received.

    Note: this replaces the ``price_validation`` defined in the earlier cell.
    """
    price_checks[df['location'].iloc[0]].validate(df)
    return df

# NOTE(review): `df` is assumed to be a pandas DataFrame with "price" and
# "location" columns; it is not created in the cells shown above.
with FugueWorkflow(SparkExecutionEngine) as dag:
    # Register the frame with this workflow first, the same way the final cell
    # does with dag.df(test); only then can it be partitioned and transformed
    # as part of `dag`.
    # The result gets its own name so `df` still refers to the input frame
    # when the cell is re-run.
    location_validated = (
        dag.df(df)
        .partition(by=["location"])
        .transform(price_validation)
    )

In [None]:
import pandera as pa
from pandera import Column, Check, DataFrameSchema

# Partition "a" holds the un-shifted values (1..99), so every "a" value must be
# below 100. The original called Check.is_b, which is not a pandera check;
# less_than(100) is the counterpart of the greater_than(99) rule used for
# partition "b".
schema_test1 = pa.DataFrameSchema({
    "a": Column(pa.Int, Check.less_than(100)),
})

# Partition "b" holds the values shifted up by 100 (101..199).
schema_test2 = pa.DataFrameSchema({
    "a": Column(pa.Int, Check.greater_than(99))
})

# Lookup table: partition label -> schema to apply.
partition_schema = {"a": schema_test1, "b": schema_test2}

In [None]:
# schema: *
def validator(df: pd.DataFrame) -> pd.DataFrame:
    """Validate ``df`` with the schema registered for its partition.

    The partition label is taken from the first row's ``partition`` column and
    looked up in ``partition_schema``; a label missing from that dict raises
    ``KeyError``. The frame is returned as received.
    """
    partition_schema[df['partition'].iloc[0]].validate(df)
    return df

In [None]:
from fugue import FugueWorkflow
from fugue_spark import SparkExecutionEngine

# Run `validator` once per value of the "partition" column and show five rows.
with FugueWorkflow(SparkExecutionEngine) as dag:
    df = (
        dag.df(test)
        .partition(by=["partition"])
        .transform(validator)
    )
    df.show(5)