In [1]:
import os
from datetime import datetime

import pandas as pd

In [2]:
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.yaml_handler import YAMLHandler
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
    FilesystemStoreBackendDefaults
)
# from great_expectations.util import get_context
from gx_demo_dataprep import trips_sample_n
# YAML handler; its dump() is used further down to serialize the datasource
# config dict before it is passed to test_yaml_config().
yaml = YAMLHandler()

In [3]:
# Load the sample of NYC yellow-cab trips that the rest of the notebook validates.
# The argument to trips_sample_n is kept as a named constant so it can be tuned
# in one place (presumably the number of rows to sample -- confirm in gx_demo_dataprep).
TRIPS_SAMPLE_SIZE = 10_000
trips_df = trips_sample_n(TRIPS_SAMPLE_SIZE)
display(trips_df)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
1877889,1,2023-01-20 19:55:59,2023-01-20 20:07:20,1.0,2.00,1.0,N,230,237,1,13.50,5.0,0.5,4.00,0.00,1.0,24.00,2.5,0.00
2315708,2,2023-01-25 10:39:32,2023-01-25 10:53:53,1.0,2.19,1.0,N,236,142,1,15.60,0.0,0.5,2.94,0.00,1.0,22.54,2.5,0.00
2727528,1,2023-01-28 23:33:57,2023-01-28 23:46:14,1.0,3.50,1.0,N,148,140,1,16.30,3.5,0.5,4.25,0.00,1.0,25.55,2.5,0.00
1460191,2,2023-01-16 17:52:59,2023-01-16 18:21:22,5.0,9.90,1.0,N,138,237,1,41.50,5.0,0.5,11.66,6.55,1.0,69.96,2.5,1.25
2519411,2,2023-01-27 07:04:13,2023-01-27 07:31:40,1.0,7.65,1.0,N,132,95,2,35.20,0.0,0.5,0.00,0.00,1.0,37.95,0.0,1.25
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
558682,2,2023-01-07 14:14:39,2023-01-07 14:17:20,2.0,0.46,1.0,N,262,263,1,5.10,0.0,0.5,1.82,0.00,1.0,10.92,2.5,0.00
685283,1,2023-01-08 19:11:42,2023-01-08 19:22:07,1.0,0.80,1.0,N,186,170,1,8.60,2.5,0.5,5.00,0.00,1.0,17.60,2.5,0.00
3009108,1,2023-01-07 11:02:18,2023-01-07 11:17:11,,0.00,,,162,143,0,12.36,0.0,0.5,0.00,0.00,1.0,16.36,,
465008,1,2023-01-06 16:47:11,2023-01-06 16:51:14,1.0,0.60,1.0,N,164,234,1,5.80,5.0,0.5,5.00,0.00,1.0,17.30,2.5,0.00


In [4]:
# INITIALIZING THE DATA CONTEXT
# The store location is configurable instead of being tied to one user's home
# directory: set the GX_STORE_ROOT environment variable to choose it, otherwise
# a "gx_store_backend" folder under the current working directory is used.
# os.path.abspath() ensures the value is absolute even when a relative path is given.
root_directory = os.path.abspath(
    os.environ.get("GX_STORE_ROOT", "gx_store_backend")
)  # Must be an absolute path
data_context_configuration = DataContextConfig(
    store_backend_defaults=FilesystemStoreBackendDefaults(
        root_directory=root_directory
    )
    # store_backend_defaults=InMemoryStoreBackendDefaults()
)

# Build the context from the in-code project config defined above.
data_context = gx.get_context(project_config=data_context_configuration)

In [5]:
# CONFIGURE DATASOURCE

datasource_name = "nyc_yellowcab_trips"

# A pandas-backed datasource with a single runtime data connector. The connector
# is registered under the same name as the datasource and declares the batch
# identifier keys ("batch_id", "batch_datetime") that the RuntimeBatchRequest
# further down fills in.
datasource_config = {
    "name": datasource_name,
    "class_name": "Datasource",
    "execution_engine": {"class_name": "PandasExecutionEngine"},
    "data_connectors": {
        datasource_name: {
            "class_name": "RuntimeDataConnector",
            "batch_identifiers": ["batch_id", "batch_datetime"],
        }
    },
}

# Dry-run the config first; this prints a report of what gets instantiated.
data_context.test_yaml_config(yaml.dump(datasource_config))
# Use add_or_update (not add) so the cell can be re-run in the same kernel, and to
# stay consistent with the add_or_update_* calls used for the suite and checkpoint.
data_context.add_or_update_datasource(**datasource_config)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	nyc_yellowcab_trips:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x7fc5942b7400>

In [6]:
# CREATE BATCH REQUEST
dataset_name = datasource_name

# Take the timestamp exactly once so "batch_id" and "batch_datetime" always agree;
# two separate datetime.now() calls can straddle a second boundary and disagree.
batch_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name=dataset_name,  # the connector was registered under the datasource's name
    data_asset_name=dataset_name,
    runtime_parameters={"batch_data": trips_df},  # our nyc trips dataframe
    batch_identifiers={
        "batch_id": f"{dataset_name}_{batch_timestamp}",
        "batch_datetime": batch_timestamp,
    },
)

In [7]:
# WAYS OF SETTING UP YOUR EXPECTATIONS AND EVALUATING THEM

#  1. Create an empty Expectation Suite and use an Expectation Validator to evaluate your expectations
#  2. Create a set of Expectations and evaluate them using a Checkpoint

In [8]:
# Approach 1: create an empty Expectation Suite, then use a Validator to
# evaluate expectations against the batch.

# Suite name is derived from the dataset name so related artifacts share a prefix.
expectation_suite_name = f"{dataset_name}_expectation_suite"

# Per its name, add_or_update_* creates the suite or updates an existing one of the same name.
data_context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name
)

# Obtain a validator for the runtime batch (the trips dataframe) and the suite.
validator = data_context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)

In [9]:
# passenger_count has missing values in the sample shown earlier, so the
# not-null check is relaxed with mostly=0.9 rather than applied strictly.
validator.expect_column_values_to_not_be_null(column="passenger_count", mostly=0.9)
# trip_distance is checked without a tolerance. As the last expression in the
# cell, this call's validation result is what the cell displays.
validator.expect_column_values_to_not_be_null(column="trip_distance")

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "success": true,
  "result": {
    "element_count": 10000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  }
}

In [10]:
# Save the suite built up by the validator calls above.
# discard_failed_expectations=False: per the flag name, expectations that failed
# on this batch are kept in the saved suite rather than dropped.
validator.save_expectation_suite(discard_failed_expectations=False)

In [11]:
# CREATING A CHECKPOINT TO VALIDATE EXPECTATIONS
checkpoint_name = f"{dataset_name}_checkpoint"

# Only the checkpoint's identity and run-name pattern are configured here;
# the validations to execute are supplied later, when the checkpoint is run.
checkpoint_config = dict(
    name=checkpoint_name,
    config_version=1.0,
    class_name="SimpleCheckpoint",
    run_name_template=f"{dataset_name}_%Y%m%d-%H%M%S",
)

# Register the checkpoint (or update it if one with this name already exists);
# the returned object is the cell's displayed output.
data_context.add_or_update_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "nyc_yellowcab_trips_checkpoint",
  "profilers": [],
  "run_name_template": "nyc_yellowcab_trips_%Y%m%d-%H%M%S",
  "runtime_configuration": {},
  "validations": []
}

In [12]:
# VALIDATING RESULTS USING CHECKPOINT
# Pair the runtime batch with the expectation suite it should be validated against.
validation = {
    "batch_request": batch_request,
    "expectation_suite_name": expectation_suite_name,
}

# Run the previously registered checkpoint with that single validation.
checkpoint_result = data_context.run_checkpoint(
    checkpoint_name=checkpoint_name,
    validations=[validation],
)

print(checkpoint_result)

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

{
  "run_id": {
    "run_time": "2023-05-01T10:40:48.592822+03:00",
    "run_name": "nyc_yellowcab_trips_20230501-074048"
  },
  "run_results": {
    "ValidationResultIdentifier::nyc_yellowcab_trips_expectation_suite/nyc_yellowcab_trips_20230501-074048/20230501T074048.592822Z/bc66a628ab0a3649aaec88ce58c5e758": {
      "validation_result": {
        "meta": {
          "great_expectations_version": "0.16.10",
          "expectation_suite_name": "nyc_yellowcab_trips_expectation_suite",
          "run_id": {
            "run_time": "2023-05-01T10:40:48.592822+03:00",
            "run_name": "nyc_yellowcab_trips_20230501-074048"
          },
          "batch_spec": {
            "data_asset_name": "nyc_yellowcab_trips",
            "batch_data": "PandasDataFrame"
          },
          "batch_markers": {
            "ge_load_time": "20230501T074048.595409Z",
            "pandas_data_fingerprint": "576842a3e4f884973842dfea58588f29"
          },
          "active_batch_definition": {
       

In [13]:
# Fail the notebook run loudly if the checkpoint did not report overall success.
# `assert` is a statement, not a function, and the flag needs no `== True` comparison.
assert checkpoint_result["success"], (
    "Checkpoint validation failed; inspect checkpoint_result for details"
)