In [8]:
import os
import re

import great_expectations as gx

# Create a DataContext, the entry point to the GX Python API. Every later cell
# (datasource, expectation suite, validator, checkpoint) is created through it.
context = gx.get_context()

In [9]:
# Register the Postgres database as a GX datasource.
datasource_name = "my_ds2"

# Read the connection string from the environment so real credentials never
# have to be committed with the notebook. The fallback is the original local
# development database and should only ever point at a throwaway instance.
my_connection_string = os.environ.get(
    "GX_POSTGRES_CONNECTION_STRING",
    "postgresql+psycopg2://k6:k6@localhost:5432/k6",
)

pg_datasource = context.sources.add_postgres(
    name=datasource_name, connection_string=my_connection_string
)

In [10]:
# Expose table staging.test_1 as a table asset on the Postgres datasource.
# The call stays the cell's final expression so the created TableAsset is echoed.
pg_datasource.add_table_asset(
    table_name="test_1",
    schema_name="staging",
    name="my_ds2",
)

TableAsset(name='my_ds2', type='table', id=None, order_by=[], batch_metadata={}, splitter=None, table_name='test_1', schema_name='staging')

In [11]:
# Look up the asset registered under "my_ds2" and build a batch request from it.
# No options are passed to build_batch_request().
staging_asset = pg_datasource.get_asset("my_ds2")
batch_request = staging_asset.build_batch_request()

In [12]:
# Register the expectation suite (creating it, or updating one that already has
# this name) and bind it to the batch request through a validator.
expectation_suite_name = "validate_data"
context.add_or_update_expectation_suite(expectation_suite_name=expectation_suite_name)
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a table instead of the wrapped plain-text print() output.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

   labels                                          input_ids  \
0       0  {101,7526,2339,1996,10086,2015,2081,2104,2026,...   
1       0  {101,1040,1005,22091,2860,999,2002,3503,2023,4...   
2       0  {101,4931,2158,1010,1045,1005,1049,2428,2025,2...   
3       0  {101,1000,2062,1045,2064,1005,1056,2191,2151,2...   
4       0  {101,2017,1010,2909,1010,2024,2026,5394,1012,2...   

                                      attention_mask  
0  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...  
1  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...  
2  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...  
3  {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1...  
4        {1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1}  


In [15]:
# List the column names of the batch bound to the validator.
available_columns = validator.columns()
print(available_columns)

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

['labels', 'input_ids', 'attention_mask']


In [16]:
# Every column must be populated, and labels may only take the values 0 or 1.
for required_column in ("labels", "input_ids", "attention_mask"):
    validator.expect_column_values_to_not_be_null(required_column)
validator.expect_column_values_to_be_in_set("labels", [0, 1])

# The presence of an "id" column decides both whether the id checks run and
# which textual array encoding is expected for the token columns.
has_id_column = "id" in validator.columns()

if has_id_column:
    validator.expect_column_values_to_not_be_null("id")
    validator.expect_column_values_to_be_unique("id")

    # Lowercase, hyphenated UUID. The version digit is matched with [1-5], so
    # versions 1 through 5 are accepted, not only version 4.
    uuid_regex = r"^[0-9a-f]{8}-[0-9a-f]{4}-[1-5][0-9a-f]{3}-[89ab][0-9a-f]{3}-[0-9a-f]{12}$"
    validator.expect_column_values_to_match_regex("id", uuid_regex, mostly=1.0)

    # Token columns are JSON-style arrays, e.g. [1, 2, 3].
    input_ids_regex = r"^\s*\[\s*\d+(?:\s*,\s*\d+)*\s*\]\s*$"
    attention_mask_regex = r"^\s*\[\s*[01](?:\s*,\s*[01])*\s*\]\s*$"
else:
    # For test_1 the token columns are Postgres array strings, e.g. {1,2,3}.
    input_ids_regex = r"^\s*\{\s*\d+(?:\s*,\s*\d+)*\s*\}\s*$"
    attention_mask_regex = r"^\s*\{\s*[01](?:\s*,\s*[01])*\s*\}\s*$"

# input_ids holds non-negative integers; attention_mask holds only 0/1 flags.
validator.expect_column_values_to_match_regex("input_ids", input_ids_regex)
validator.expect_column_values_to_match_regex("attention_mask", attention_mask_regex)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/11 [00:00<?, ?it/s]

In [17]:
# Persist the expectations added through the validator before building the
# checkpoint. Nothing earlier in the notebook saves them, so without this step
# the suite kept by the context is still the one registered before any
# expectation was added. discard_failed_expectations=False keeps expectations
# that failed during the interactive run instead of dropping them.
validator.save_expectation_suite(discard_failed_expectations=False)

# Define (or update) the checkpoint that validates the batch bound to the validator.
checkpoint = context.add_or_update_checkpoint(
    name="staging_checkpoint",
    validator=validator,
)

# Run the validation and keep the result object.
checkpoint_result = checkpoint.run()

# Open the validation result for a quick visual check.
context.view_validation_result(checkpoint_result)

Calculating Metrics: 0it [00:00, ?it/s]

Opening in existing browser session.
