In [2]:
from dataclasses import dataclass
import string

import great_expectations as gx
import numpy as np
import pandas as pd

In [3]:
# Create the Great Expectations (GX) data context. It is the entry point to
# the GX Python API; the later cells use it to read the dataframe, run the
# checkpoint, and build the data docs.
context = gx.get_context()

In [7]:
# Let's make some synthetic data for use in GX.


@dataclass
class Record:
    """Represent one synthetic record.

    The field names match the columns built by ``create_random_records``.
    Every field is annotated as optional, so ``None`` is an allowed value.
    """

    # Plain integer value.
    field_int: int | None
    # Integer that the expectations later in the notebook check against ``min_value=0``.
    field_nonneg_int: int | None
    # Integer that the expectations later in the notebook check for non-null values.
    field_required_int: int | None
    # Floating-point value.
    field_float: float | None
    # String value.
    field_str: str | None


def create_random_records(n: int, missing_data_percent: float = 0.1) -> list["Record"]:
    """Create random records."""

    # We purposely put some malformed data in here.
    field_ints = np.random.randint(-100, 100, size=n)
    field_nonneg_int = np.random.randint(-10, 100, size=n)
    field_required_int = np.random.randint(-100, 100, size=n)
    field_float = np.random.rand(n)
    field_str = np.random.choice(list(string.ascii_lowercase), size=n, replace=True)

    fields = {
        "field_int": field_ints,
        "field_nonneg_int": field_nonneg_int,
        "field_required_int": field_required_int,
        "field_float": field_float,
        "field_str": field_str,
    }

    df = pd.DataFrame(fields)

    # Create random nulls.
    df = df.mask(
        np.random.choice(
            [True, False],
            size=df.shape,
            p=[missing_data_percent, 1 - missing_data_percent],
        )
    )

    return df

# Seed NumPy's global generator so the synthetic data, and therefore the
# validation results below, are the same on every Restart & Run All.
np.random.seed(42)

df = create_random_records(100)
df.head()

Unnamed: 0,field_int,field_nonneg_int,field_required_int,field_float,field_str
0,-6.0,74.0,89.0,0.149443,h
1,-85.0,39.0,-57.0,0.991908,b
2,,76.0,-37.0,,j
3,-76.0,30.0,-72.0,0.797462,m
4,-58.0,,29.0,0.918519,n


In [10]:
# Create a validator by reading the dataframe through the default pandas data source.
# A validator stores Expectations about the data it is associated with and performs introspection on that data.

validator = context.sources.pandas_default.read_dataframe(df)

In [11]:
# Domain knowledge Expectations:
#   - field_required_int must never be null.
#   - field_nonneg_int must be >= 0 (only a lower bound is given).
# Only the result of the last expectation is displayed as the cell output.
validator.expect_column_values_to_not_be_null("field_required_int")
validator.expect_column_values_to_be_between("field_nonneg_int", min_value=0)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "meta": {},
  "result": {
    "element_count": 100,
    "unexpected_count": 8,
    "unexpected_percent": 8.421052631578947,
    "partial_unexpected_list": [
      -5.0,
      -6.0,
      -9.0,
      -5.0,
      -7.0,
      -5.0,
      -6.0,
      -2.0
    ],
    "missing_count": 5,
    "missing_percent": 5.0,
    "unexpected_percent_total": 8.0,
    "unexpected_percent_nonmissing": 8.421052631578947
  },
  "success": false
}

In [12]:
# The expectations above reported failures in the synthetic data.
# Wrap the validator in a checkpoint so the same validation can be repeated.

checkpoint = gx.checkpoint.SimpleCheckpoint(name="quickstart_checkpoint", data_context=context, validator=validator)

# Run the checkpoint and keep its result.
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

In [15]:
# GX compiles the validation results into data docs; the call returns a
# mapping of site name to the URL of the generated index page.
context.build_data_docs()

{'local_site': 'file:///tmp/tmppxeqjb2w/index.html'}