In [21]:
import great_expectations as gx
from great_expectations.checkpoint import Checkpoint

# Create a DataContext as an entry point to the GX Python API
context = gx.get_context()

In [22]:
datasource_name = "ds_postgres"
my_connection_string = (
    "postgresql+psycopg2://k6:k6@localhost:5432/k6"
)

pg_datasource = context.sources.add_postgres(
    name=datasource_name, connection_string=my_connection_string
)

In [23]:
pg_datasource.add_table_asset(
    name="postgres_data", table_name='streaming', schema_name="staging"
)

TableAsset(name='postgres_data', type='table', id=None, order_by=[], batch_metadata={}, splitter=None, table_name='streaming', schema_name='staging')

In [24]:
batch_request = pg_datasource.get_asset("postgres_data").build_batch_request()

In [25]:
expectation_suite_name = "validate_data"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

print(validator.head())

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

                                     id  labels  \
0  2c1f86cc-a95d-4b06-8e40-148f535396a9       0   
1  dec242b5-d4f4-42fe-adf5-4850074c8e39       0   
2  5c57e24e-bd48-40bd-a872-f48b834f3ddd       0   
3  ddab93d1-19cc-466c-9c33-1ec88a0359b6       1   
4  f0a11d68-4423-42e9-bb9c-a579e68162f9       0   

                                           input_ids  \
0  [101, 2043, 2017, 2360, 20868, 3089, 10440, 35...   
1  [101, 1061, 1054, 2057, 2725, 2023, 1024, 6986...   
2  [101, 1000, 28144, 12043, 1010, 5925, 3985, 27...   
3  [101, 4931, 1012, 1012, 1012, 2054, 2003, 2009...   
4  [101, 1000, 1027, 1027, 2047, 1027, 1027, 1063...   

                                      attention_mask  
0  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
1                        [1, 1, 1, 1, 1, 1, 1, 1, 1]  
2  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
3  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  
4  [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...  


In [26]:
validator.expect_column_values_to_not_be_null("id")
validator.expect_column_values_to_not_be_null("labels")
validator.expect_column_values_to_not_be_null("input_ids")
validator.expect_column_values_to_not_be_null("attention_mask")

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1137,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [27]:
uuid_v4_regex = r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
validator.expect_column_values_to_match_regex("id", uuid_v4_regex, mostly=1.0)
validator.expect_column_values_to_be_unique("id")

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/10 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1137,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [28]:
validator.expect_column_values_to_be_in_set("labels", [0, 1])

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 1137,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [29]:
json_array_of_ints = r"^\s*\[\s*\d+(?:\s*,\s*\d+)*\s*\]\s*$"
validator.expect_column_values_to_match_regex("input_ids", json_array_of_ints)
validator.expect_column_values_to_match_regex("input_ids", r"^\s*\[\s*101\b", mostly=0.99)

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1137,
    "unexpected_count": 1137,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "[101, 2043, 2017, 2360, 20868, 3089, 10440, 3508, 3252, 1010, 2054, 3599, 2079, 2017, 2812, 1029, 1045, 2444, 2006, 2019, 20868, 3089, 11644, 3796, 2029, 3594, 8964, 11867, 6657, 11451, 1010, 1998, 1045, 1005, 1049, 2025, 2521, 2013, 1996, 7205, 3149, 1012, 15177, 2063, 2035, 2298, 1996, 2168, 2802, 3103, 9447, 8464, 1012, 1045, 1005, 2310, 2288, 1037, 2210, 8622, 2051, 2085, 2000, 6723, 2070, 2062, 7171, 1010, 2295, 1045, 2123, 1005, 1056, 2228, 1045, 1005, 2222, 2131, 2041, 2004, 2521, 2004, 8108, 2669, 1012, 2288, 2151, 11186, 1029, 1032, 2831, 102]",
      "[101, 1061, 1054, 2057, 2725, 2023, 1024, 6986, 102]",
      "[101, 1000, 28144, 12043, 1010, 5925, 3985, 2758, 2023, 2003, 1996, 2171, 2005, 2008, 2792, 1012, 1045, 2079, 2113, 2045, 2003, 2525, 2019, 2792, 2007, 2008, 2171, 1010, 2021, 5925, 2758, 2009, 1005, 1055

In [30]:
mask_pattern = r"^\s*\[\s*[01](?:\s*,\s*[01])*\s*\]\s*$"
validator.expect_column_values_to_match_regex("attention_mask", mask_pattern)
validator.expect_column_values_to_match_regex("attention_mask", r"^\s*\[\s*1\b", mostly=1.0)

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

{
  "success": false,
  "result": {
    "element_count": 1137,
    "unexpected_count": 1137,
    "unexpected_percent": 100.0,
    "partial_unexpected_list": [
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1]",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]",
      "[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1

In [32]:
# Similar to a single file, create a checkpoint to validate the result
# Define the checkpoint
checkpoint = context.add_or_update_checkpoint(
    name="staging_checkpoint",
    validator=validator
)

# Get the result after validator
checkpoint_result = checkpoint.run()

# Quick view on the validation result
context.view_validation_result(checkpoint_result)

Calculating Metrics: 0it [00:00, ?it/s]

Opening in existing browser session.
