In [0]:
%pip install great_expectations

In [0]:
#pip install great_expectations


#1- Import the following libraries
import great_expectations as gx
# pandas: holds the table as an in-memory DataFrame that GX validates below
import pandas as pd

# To check the installed GX Core version, run: print(gx.__version__)

# Step 2 - Read the bronze PLV table through Spark and collect it into a pandas DataFrame.
df = (
    spark.read
    .table("lnd_databricks_workspace.bronze.bronze_plv")
    .toPandas()
)

# Step 3 - Get a GX Data Context: the entry point to every other GX component.
context = gx.get_context()

# Step 4 - Connect GX to the data.
# A pandas Data Source owns a DataFrame Data Asset, which owns a Batch Definition
# covering the whole DataFrame. The DataFrame itself is only supplied at runtime,
# when the Batch is requested.
data_source = context.data_sources.add_pandas(name="pandas")
data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")

batch = batch_definition.get_batch(
    batch_parameters={"dataframe": df},
)

# Step 5 - Declare the Expectation: every value of the `inseecommuneprinc` column
# must fall between 0 and 99999 (bounds as passed below).
expectation = gx.expectations.ExpectColumnValuesToBeBetween(
    column="inseecommuneprinc",
    min_value=0,
    max_value=99999,
)

# Step 6 - Validate the Batch against the Expectation.
validation_result = batch.validate(expectation)

# Uncomment to inspect the full validation result:
# print(validation_result)

In [0]:
import dlt
from pyspark.sql.functions import current_timestamp, year, col
from datetime import datetime
from pyspark.sql import SparkSession
import pandas as pd
import great_expectations as gx


def _validation_result_to_row(validation_result):
    """Flatten one GX expectation validation result into a plain dict (one table row).

    Args:
        validation_result: The object returned by ``batch.validate(expectation)``.

    Returns:
        dict: Keys ``expectation_type``, ``column``, ``min_value``, ``max_value``,
        ``severity``, ``success``, ``error_message`` and ``date_test``.
    """
    config = validation_result["expectation_config"]
    kwargs = config["kwargs"]
    # Tolerate a missing/None exception_info so the lookup below never fails.
    exception_info = validation_result.get("exception_info", {}) or {}

    return {
        "expectation_type": str(config["type"]),
        "column": str(kwargs.get("column")),
        "min_value": kwargs.get("min_value"),
        "max_value": kwargs.get("max_value"),
        # Kept as str(): yields the text "None" when no severity is present.
        "severity": str(config.get("severity", None)),
        "success": bool(validation_result["success"]),
        # `or ""` prevents a None message from being stored as the literal
        # string "None" when no exception was raised.
        "error_message": str(exception_info.get("exception_message") or ""),
        # Timestamp taken from the Python process clock, formatted as text.
        "date_test": datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
    }


@dlt.table(
    comment="Table controle qualité qui regroupe les résultats des tests GE",
    partition_cols=["annee"]
)
def qualite_donnees_plv():
    """Run a GX range check on the bronze PLV table and return the outcome as a table.

    Reads ``lnd_databricks_workspace.bronze.bronze_plv``, validates that the
    ``inseecommuneprinc`` column lies between 0 and 99999 with Great Expectations,
    and returns a one-row Spark DataFrame describing the result, plus an
    ``annee`` column (year of Spark's current timestamp) used for partitioning.

    Returns:
        pyspark.sql.DataFrame: One row with the flattened validation result.
    """
    checked_column = "inseecommuneprinc"
    # NOTE(review): INSEE commune codes can contain letters (e.g. 2A/2B) and
    # leading zeros; a numeric range check presumes a numeric column - confirm.

    source_sdf = spark.read.table("lnd_databricks_workspace.bronze.bronze_plv")
    # Only the validated column needs to reach the driver. If the column is
    # absent, keep the full table so GX itself reports the problem, as before.
    if checked_column in source_sdf.columns:
        source_sdf = source_sdf.select(checked_column)
    plv_pdf = source_sdf.toPandas()

    # The Data Context is the entry point to the GX components used below.
    context = gx.get_context()

    # Data Source -> Data Asset -> Batch Definition; the pandas DataFrame is
    # handed over at runtime when the Batch is requested.
    data_source = context.data_sources.add_pandas("pandas")
    data_asset = data_source.add_dataframe_asset(name="pd dataframe asset")
    batch_definition = data_asset.add_batch_definition_whole_dataframe("batch definition")
    batch = batch_definition.get_batch(batch_parameters={"dataframe": plv_pdf})

    # Expect every value of the checked column to be within [0, 99999].
    expectation = gx.expectations.ExpectColumnValuesToBeBetween(
        column=checked_column, min_value=0, max_value=99999
    )
    validation_result = batch.validate(expectation)

    # Build a one-row pandas frame from the flattened result, then convert to Spark.
    result_pdf = pd.DataFrame([_validation_result_to_row(validation_result)])
    result_sdf = spark.createDataFrame(result_pdf)

    # Add the year of the record (partition column declared on the table).
    return result_sdf.withColumn("annee", year(current_timestamp()))