In [1]:
import pandas as pd
from datetime import datetime

In [2]:
import great_expectations as gx
from great_expectations.core.batch import RuntimeBatchRequest
from great_expectations.core.yaml_handler import YAMLHandler
from great_expectations.data_context.types.base import (
    DataContextConfig,
    InMemoryStoreBackendDefaults,
)
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.core.expectation_suite import ExpectationSuite
from gx_demo_dataprep import trips_sample_n
# Shared YAML handler; used further down to serialize the datasource config
# dict before it is passed to `test_yaml_config`.
yaml = YAMLHandler()

In [3]:
# Number of trip rows to pull for this demo; named so it is easy to tune.
TRIPS_SAMPLE_SIZE = 1000

# NOTE(review): if `trips_sample_n` samples randomly without a fixed seed, the
# rows (and therefore the validation outcome further down) can change between
# runs — confirm against gx_demo_dataprep.
trips_df = trips_sample_n(TRIPS_SAMPLE_SIZE)
display(trips_df)

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
2839059,2,2023-01-30 11:44:06,2023-01-30 11:59:14,1.0,3.46,1.0,N,161,112,1,19.1,0.0,0.5,5.93,6.55,1.0,35.58,2.5,0.0
2175355,2,2023-01-23 19:23:23,2023-01-23 19:26:30,1.0,0.38,1.0,N,163,237,1,5.1,2.5,0.5,2.32,0.00,1.0,13.92,2.5,0.0
2951391,1,2023-01-31 16:18:07,2023-01-31 16:40:08,1.0,1.50,1.0,N,237,143,1,14.9,5.0,0.5,2.00,0.00,1.0,23.40,2.5,0.0
949954,2,2023-01-11 18:04:18,2023-01-11 18:30:36,1.0,6.19,1.0,N,87,163,1,31.7,2.5,0.5,7.64,0.00,1.0,45.84,2.5,0.0
1459612,1,2023-01-16 17:06:50,2023-01-16 17:22:42,1.0,1.70,1.0,N,161,100,1,13.5,2.5,0.5,5.00,0.00,1.0,22.50,2.5,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1744412,1,2023-01-19 15:54:16,2023-01-19 16:09:11,2.0,1.00,1.0,N,186,230,2,10.7,2.5,0.5,0.00,0.00,1.0,14.70,2.5,0.0
671599,2,2023-01-08 16:43:35,2023-01-08 16:54:55,1.0,2.39,1.0,N,249,170,1,13.5,0.0,0.5,2.62,0.00,1.0,20.12,2.5,0.0
9293,1,2023-01-01 01:03:01,2023-01-01 01:12:46,1.0,2.40,1.0,N,79,141,1,12.8,3.5,0.5,3.56,0.00,1.0,21.36,2.5,0.0
2125149,2,2023-01-23 10:54:39,2023-01-23 11:19:54,1.0,9.96,1.0,N,162,138,1,42.2,5.0,0.5,11.55,6.55,1.0,69.30,2.5,0.0


In [4]:
# INITIALIZING THE DATA CONTEXT
# The context is built from an explicit project config whose store backends
# come from InMemoryStoreBackendDefaults (presumably nothing is written to
# disk — verify against the GX docs for the installed version).
in_memory_store_defaults = InMemoryStoreBackendDefaults()
data_context_configuration = DataContextConfig(store_backend_defaults=in_memory_store_defaults)

data_context = gx.get_context(project_config=data_context_configuration)

In [5]:
# CONFIGURE DATASOURCE

datasource_name = "nyc_yellowcab_trips"

# Runtime connector: batches are handed over at request time and must carry
# both identifiers listed here.
runtime_connector_config = dict(
    class_name="RuntimeDataConnector",
    batch_identifiers=["batch_id", "batch_datetime"],
)

# The connector is registered under the same name as the datasource itself.
datasource_config = dict(
    name=datasource_name,
    class_name="Datasource",
    execution_engine=dict(class_name="PandasExecutionEngine"),
    data_connectors={datasource_name: runtime_connector_config},
)

# Check the config can be instantiated, then register it with the context.
data_context.test_yaml_config(yaml.dump(datasource_config))
data_context.add_datasource(**datasource_config)

Attempting to instantiate class from config...
	Instantiating as a Datasource, since class_name is Datasource
	Successfully instantiated Datasource


ExecutionEngine class name: PandasExecutionEngine
Data Connectors:
	nyc_yellowcab_trips:RuntimeDataConnector

	Available data_asset_names (0 of 0):
		Note : RuntimeDataConnector will not have data_asset_names until they are passed in through RuntimeBatchRequest

	Unmatched data_references (0 of 0): []



<great_expectations.datasource.new_datasource.Datasource at 0x7f54e4ff6a40>

In [6]:
# CREATE BATCH REQUEST
# The data connector and data asset reuse the datasource's name.
dataset_name = datasource_name

# Take the timestamp exactly once: calling datetime.now() separately for each
# identifier could yield two different seconds, leaving `batch_id` and
# `batch_datetime` inconsistent for the same batch.
batch_timestamp = datetime.now().strftime("%Y%m%d%H%M%S")

batch_request = RuntimeBatchRequest(
    datasource_name=datasource_name,
    data_connector_name=dataset_name,
    data_asset_name=dataset_name,
    runtime_parameters={"batch_data": trips_df},  # our nyc trips dataframe
    # Keys must match the `batch_identifiers` declared on the data connector.
    batch_identifiers={
        "batch_id": f"{dataset_name}_{batch_timestamp}",
        "batch_datetime": batch_timestamp,
    },
)

In [7]:
# WAYS OF SETTING UP YOUR EXPECTATIONS AND EVALUATING THEM

#  1. Create an empty Expectation Suite and use an Expectation Validator to evaluate your expectations
#  2. Create a set of Expectations and evaluate them using a Checkpoint

In [8]:
# 2. Building a set of expectations and Evaluating them using a GX Checkpoint

expectation_suite_name = f"{dataset_name}_expectation_suite"

# One not-null expectation per column that must always be populated.
expectation_config_dicts = [
    {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {"column": column_name},
    }
    for column_name in ("passenger_count", "trip_distance")
]

# Turn each plain dict into an ExpectationConfiguration object.
expectations: list = [
    ExpectationConfiguration(**config_dict) for config_dict in expectation_config_dicts
]

# Assemble the suite and register (or overwrite) it in the data context.
expectation_suite = ExpectationSuite(expectation_suite_name=expectation_suite_name)
expectation_suite.add_expectation_configurations(expectations)

data_context.add_or_update_expectation_suite(expectation_suite=expectation_suite)


{
  "expectations": [
    {
      "kwargs": {
        "column": "passenger_count"
      },
      "expectation_type": "expect_column_values_to_not_be_null",
      "meta": {}
    },
    {
      "kwargs": {
        "column": "trip_distance"
      },
      "expectation_type": "expect_column_values_to_not_be_null",
      "meta": {}
    }
  ],
  "data_asset_type": null,
  "ge_cloud_id": null,
  "expectation_suite_name": "nyc_yellowcab_trips_expectation_suite",
  "meta": {
    "great_expectations_version": "0.16.10"
  }
}

In [9]:
# CREATING A CHECKPOINT TO VALIDATE EXPECTATIONS
checkpoint_name = f"{dataset_name}_checkpoint"

# No validations are attached here; they are supplied when the checkpoint is
# run. The run name template carries strftime-style placeholders.
checkpoint_config = dict(
    name=checkpoint_name,
    config_version=1.0,
    class_name="SimpleCheckpoint",
    run_name_template=f"{dataset_name}_%Y%m%d-%H%M%S",
)

data_context.add_or_update_checkpoint(**checkpoint_config)

{
  "action_list": [
    {
      "name": "store_validation_result",
      "action": {
        "class_name": "StoreValidationResultAction"
      }
    },
    {
      "name": "store_evaluation_params",
      "action": {
        "class_name": "StoreEvaluationParametersAction"
      }
    },
    {
      "name": "update_data_docs",
      "action": {
        "class_name": "UpdateDataDocsAction",
        "site_names": []
      }
    }
  ],
  "batch_request": {},
  "class_name": "Checkpoint",
  "config_version": 1.0,
  "evaluation_parameters": {},
  "module_name": "great_expectations.checkpoint",
  "name": "nyc_yellowcab_trips_checkpoint",
  "profilers": [],
  "run_name_template": "nyc_yellowcab_trips_%Y%m%d-%H%M%S",
  "runtime_configuration": {},
  "validations": []
}

In [10]:
# VALIDATING RESULTS USING CHECKPOINT
# Pair the runtime batch with the suite it should be validated against.
checkpoint_validation = dict(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

checkpoint_result = data_context.run_checkpoint(
    checkpoint_name=checkpoint_name,
    validations=[checkpoint_validation],
)

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

In [11]:
# Fail with context instead of a bare AssertionError when the checkpoint run
# reports that not every expectation was met.
assert checkpoint_result["success"], (
    f"Checkpoint '{checkpoint_name}' did not succeed: at least one expectation in "
    f"'{expectation_suite_name}' was not met for the validated batch."
)

AssertionError: 