# Load Dataframe to GreatExpectations (YellowTaxi)

In [45]:
import great_expectations as gx
import glob
import pandas

# Load the January 2023 yellow-taxi trips as a Great Expectations dataset so
# expectations can be declared directly on it.
yellow_jan_path = '../data/yellow_taxi/yellow_tripdata_2023-01.parquet'
df = gx.read_parquet(yellow_jan_path)

In [46]:
# Peek at the first rows to confirm the file loaded with the expected columns.
df.head()

Unnamed: 0,VendorID,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,RatecodeID,store_and_fwd_flag,PULocationID,DOLocationID,payment_type,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,airport_fee
0,2,2023-01-01 00:32:10,2023-01-01 00:40:36,1.0,0.97,1.0,N,161,141,2,9.3,1.0,0.5,0.0,0.0,1.0,14.3,2.5,0.0
1,2,2023-01-01 00:55:08,2023-01-01 01:01:27,1.0,1.1,1.0,N,43,237,1,7.9,1.0,0.5,4.0,0.0,1.0,16.9,2.5,0.0
2,2,2023-01-01 00:25:04,2023-01-01 00:37:49,1.0,2.51,1.0,N,48,238,1,14.9,1.0,0.5,15.0,0.0,1.0,34.9,2.5,0.0
3,1,2023-01-01 00:03:48,2023-01-01 00:13:25,0.0,1.9,1.0,N,138,7,1,12.1,7.25,0.5,0.0,0.0,1.0,20.85,0.0,1.25
4,2,2023-01-01 00:10:29,2023-01-01 00:21:19,1.0,1.43,1.0,N,107,79,1,11.4,1.0,0.5,3.28,0.0,1.0,19.68,2.5,0.0


In [47]:
# Non-null count per column; columns with a lower count than the others
# contain missing values.
df.count()

VendorID                 3066766
tpep_pickup_datetime     3066766
tpep_dropoff_datetime    3066766
passenger_count          2995023
trip_distance            3066766
RatecodeID               2995023
store_and_fwd_flag       2995023
PULocationID             3066766
DOLocationID             3066766
payment_type             3066766
fare_amount              3066766
extra                    3066766
mta_tax                  3066766
tip_amount               3066766
tolls_amount             3066766
improvement_surcharge    3066766
total_amount             3066766
congestion_surcharge     2995023
airport_fee              2995023
dtype: int64

##### Add Expectations

In [48]:
# Expect every passenger_count value to lie between 0 and 9 (bounds as given).
pass_cnt = df.expect_column_values_to_be_between(column='passenger_count', min_value=0, max_value=9)
# Assert on the boolean directly and surface the result payload on failure so
# the offending values are visible without re-running the expectation.
assert pass_cnt.success, f"passenger_count outside [0, 9]: {pass_cnt.result}"

In [49]:
# Vendor IDs accepted for this month's data.
vendors = [1, 2]
vendor_in_set = df.expect_column_values_to_be_in_set('VendorID', value_set=vendors)
# Assert on the boolean directly and include the result payload on failure so
# any unexpected vendor IDs are shown in the error.
assert vendor_in_set.success, f"unexpected VendorID values: {vendor_in_set.result}"

In [50]:
# Expect store_and_fwd_flag values to be exactly one character long.
flg_check = df.expect_column_value_lengths_to_be_between('store_and_fwd_flag', min_value=1, max_value=1)
# Assert on the boolean directly and include the result payload on failure.
assert flg_check.success, f"store_and_fwd_flag length is not 1: {flg_check.result}"

In [51]:
# Expect the table to have exactly 19 columns (min and max are both 19).
col_cnt = df.expect_table_column_count_to_be_between(min_value=19, max_value=19)
# Assert on the boolean directly and include the result payload on failure.
assert col_cnt.success, f"column count is not 19: {col_cnt.result}"

##### Generate an Expectation Suite from the manually created expectations above

In [52]:
# Display the suite accumulated on `df` by the expectation calls above.
df.get_expectation_suite()

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_column_values_to_be_between",
      "kwargs": {
        "column": "passenger_count",
        "min_value": 0,
        "max_value": 9
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_in_set",
      "kwargs": {
        "column": "VendorID",
        "value_set": [
          1,
          2
        ]
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_value_lengths_to_be_between",
      "kwargs": {
        "column": "store_and_fwd_flag",
        "min_value": 1,
        "max_value": 1
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_table_column_count_to_be_between",
      "kwargs": {
        "min_value": 19,
        "max_value": 19
      },
      "meta": {}
    }
  ],
  "data_asset_type": "Dataset",
  "meta": {
    "great_expectations_version": "0.18.8"
  }
}

In [53]:
import json
# Serialise the suite held by `df` to JSON so it can be reloaded and applied
# to other files.
manual_suite_dict = df.get_expectation_suite().to_json_dict()
with open("../gx/expectations/GreatExpectations_Demo_Manual.json", "w") as f:
    json.dump(manual_suite_dict, f)

# Import Your Suite & Validate the Next Dataset (YellowTaxi)

In [54]:
import json
# Reload the saved manual suite and attach it to the February 2023 file, so
# validate() checks the new month against the same expectations.
# The context manager closes the JSON file instead of leaking the handle.
with open('../gx/expectations/GreatExpectations_Demo_Manual.json') as suite_file:
    my_expextation_suite = json.load(suite_file)
my_df = gx.read_parquet("../data/yellow_taxi/yellow_tripdata_2023-02.parquet", expectation_suite=my_expextation_suite)
my_df.validate()

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "passenger_count",
          "min_value": 0,
          "max_value": 9
        },
        "meta": {}
      },
      "result": {
        "element_count": 2913955,
        "missing_count": 76817,
        "missing_percent": 2.6361766053353604,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": false,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "VendorID",
          "value

##### Fix Validation Issue (New Vendor) & ReValidate

In [55]:
# Widen the accepted vendor set to include 6, re-declare the expectation on
# the February dataset, and validate again.
vendors = [1, 2, 6]
my_df.expect_column_values_to_be_in_set(column='VendorID', value_set=vendors)
my_df.validate()

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_between",
        "kwargs": {
          "column": "passenger_count",
          "min_value": 0,
          "max_value": 9
        },
        "meta": {}
      },
      "result": {
        "element_count": 2913955,
        "missing_count": 76817,
        "missing_percent": 2.6361766053353604,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "unexpected_percent_nonmissing": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_set",
        "kwargs": {
          "column": "VendorID",
          "value_s

###### Overwrite Last Manual Save

In [56]:
# Persist the corrected suite. The widened VendorID set was declared on `my_df`
# (the February dataset), so that is the suite to save; `df` still carries the
# original [1, 2] set and would rewrite the file with unchanged content.
with open("../gx/expectations/GreatExpectations_Demo_Manual.json", "w") as f:
    json.dump(my_df.get_expectation_suite().to_json_dict(), f)

# User Configurable Profiling of New Dataset (GreenTaxi)

In [57]:
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler
# Load the January 2023 green-taxi file and let UserConfigurableProfiler
# build an expectation suite from it.
green_jan_path = "../data/green_taxi/green_tripdata_2023-01.parquet"
dataset = gx.read_parquet(green_jan_path)
profiler = UserConfigurableProfiler(dataset)
suite = profiler.build_suite()

Profiling:   0%|          | 0/20 [00:00<?, ?it/s, Column=VendorID]

Creating an expectation suite with the following expectations:

Table-Level Expectations
expect_table_columns_to_match_ordered_list
expect_table_row_count_to_be_between

Expectations by Column
Column Name: DOLocationID | Column Data Type: INT | Cardinality: MANY
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_set
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: PULocationID | Column Data Type: INT | Cardinality: MANY
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_set
expect_column_values_to_be_in_type_list
expect_column_val

In [58]:
# Write the suite now attached to the profiled dataset to the auto-suite file.
auto_suite_dict = dataset.get_expectation_suite().to_json_dict()
with open('../gx/expectations/GreatExpectations_Demo_Auto.json','w') as f:
    json.dump(auto_suite_dict, f)

# Import New Dataset and Validate from auto-generated profiler

In [59]:
import json
# Reload the profiler-generated suite and validate the February 2023
# green-taxi file against it.
# The context manager closes the JSON file instead of leaking the handle.
with open('../gx/expectations/GreatExpectations_Demo_Auto.json') as suite_file:
    my_expectation_suite = json.load(suite_file)
my_df = gx.read_parquet("../data/green_taxi/green_tripdata_2023-02.parquet", expectation_suite=my_expectation_suite)
my_df.validate()

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "kwargs": {
          "column_list": [
            "VendorID",
            "lpep_pickup_datetime",
            "lpep_dropoff_datetime",
            "store_and_fwd_flag",
            "RatecodeID",
            "PULocationID",
            "DOLocationID",
            "passenger_count",
            "trip_distance",
            "fare_amount",
            "extra",
            "mta_tax",
            "tip_amount",
            "tolls_amount",
            "ehail_fee",
            "improvement_surcharge",
            "total_amount",
            "payment_type",
            "trip_type",
            "congestion_surcharge"
          ]
        },
        "meta": {}
      },
      "result": {
        "observed_value": [
          "VendorID",
          "lpep_pickup_datetime",
          "lpep_dropoff_datetime",
          "store_an

#### Rerun profiler over both files

In [60]:
# Register the green-taxi folder as a pandas filesystem datasource with one
# parquet asset; the regex captures `year` and `month` from each file name so
# batches can be selected by those options.
# add_or_update_* makes the cell safe to re-run; the plain add_* call was
# flagged as "only needed first time" because it cannot be repeated once the
# datasource name is already registered in the context.
context = gx.get_context()
datasource_name = "green_taxi"
path_to_folder = "../data/green_taxi/"
datasource = context.sources.add_or_update_pandas_filesystem(
    name=datasource_name, base_directory=path_to_folder
)
asset_name = "green_taxi_asset"
batching_regex = r"green_tripdata_(?P<year>\d{4})-(?P<month>\d{2})\.parquet"
datasource.add_parquet_asset(name=asset_name, batching_regex=batching_regex)

ParquetAsset(name='green_taxi_asset', type='parquet', id=None, order_by=[], batch_metadata={}, batching_regex=re.compile('green_tripdata_(?P<year>\\d{4})-(?P<month>\\d{2})\\.parquet'), connect_options={}, splitter=None, engine='auto', columns=None, storage_options=None, use_nullable_dtypes=None, dtype_backend=None, kwargs=None)

In [61]:
# Fetch the parquet asset back from the context by datasource and asset name.
green_taxis = context.get_datasource("green_taxi").get_asset("green_taxi_asset")

In [62]:
# Show which options can be used to select batches, then request every batch
# whose file name carries year 2023 and list the file behind each one.
print(green_taxis.batch_request_options)
year_filter = {"year": "2023"}
batch_request = green_taxis.build_batch_request(options=year_filter)
batches = green_taxis.get_batch_list_from_batch_request(batch_request)
for batch in batches:
    print(batch.batch_spec)

('year', 'month', 'path')
{'path': '../data/green_taxi/green_tripdata_2023-01.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-02.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-03.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-04.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-05.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-06.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-07.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tripdata_2023-08.parquet', 'reader_method': 'read_parquet', 'reader_options': {}}
{'path': '../data/green_taxi/green_tri

In [63]:
# Add (or update) the named suite in the context, then obtain a validator that
# binds that suite to the 2023 batch request.
auto_suite_name = "GreatExpectations_Demo_Auto"
context.add_or_update_expectation_suite(auto_suite_name)
validator = context.get_validator(
    expectation_suite_name=auto_suite_name,
    batch_request=batch_request,
)

In [66]:
# Re-profile through the validator, leaving out the value-range, value-set and
# table-shape expectations listed below.
skipped_expectations = [
    "expect_column_max_to_be_between",
    "expect_column_mean_to_be_between",
    "expect_column_median_to_be_between",
    "expect_column_min_to_be_between",
    "expect_column_proportion_of_unique_values_to_be_between",
    "expect_column_quantile_values_to_be_between",
    "expect_column_values_to_be_in_set",
    "expect_table_columns_to_match_ordered_list",
    "expect_table_row_count_to_be_between",
]
profiler = UserConfigurableProfiler(validator, excluded_expectations=skipped_expectations)
suite = profiler.build_suite()

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling:   0%|          | 0/20 [00:00<?, ?it/s, Column=VendorID]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Creating an expectation suite with the following expectations:


Expectations by Column
Column Name: DOLocationID | Column Data Type: INT | Cardinality: MANY
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: PULocationID | Column Data Type: INT | Cardinality: MANY
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: RatecodeID | Column Data Type: FLOAT | Cardinality: VERY_FEW
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: VendorID | Column Data Type: INT | Cardinality: TWO
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: congestion_surcharge | Column Data Type: FLOAT | Cardinality: VERY_FEW
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: ehail_fee | Column Data Type: FLOAT | Cardinality: NONE
expect_column_values_to_be_in_type_list
expect_column_values_to_be_null


Column Name: ext

In [67]:
# Overwrite the auto-suite file with the suite currently held by the validator.
validator_suite_dict = validator.get_expectation_suite().to_json_dict()
with open("../gx/expectations/GreatExpectations_Demo_Auto.json", "w") as f:
    json.dump(validator_suite_dict, f)

In [68]:
import json
# Reload the re-profiled suite and validate the February 2023 green-taxi file
# against it.
# The context manager closes the JSON file instead of leaking the handle.
with open('../gx/expectations/GreatExpectations_Demo_Auto.json') as suite_file:
    my_expectation_suite = json.load(suite_file)
my_df = gx.read_parquet("../data/green_taxi/green_tripdata_2023-02.parquet", expectation_suite=my_expectation_suite)
my_df.validate()

{
  "success": true,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_not_be_null",
        "kwargs": {
          "column": "VendorID"
        },
        "meta": {}
      },
      "result": {
        "element_count": 64809,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_column_values_to_be_in_type_list",
        "kwargs": {
          "type_list": [
            "BIGINT",
            "BYTEINT",
            "ByteType()",
            "INT",
            "INT64",
            "INTEGER",
            "Int16Dtype",
            "Int32Dtype",
            "Int64