In [1]:
from dataclasses import dataclass
import string

import great_expectations as gx
import numpy as np
import pandas as pd

In [2]:
# The data context gives us access to a bunch of utility and
# convenience methods; it's the entry point for GX in the Python API.
context = gx.get_context()

In [6]:
# Let's make some synthetic data for use in GX.

def create_random_records(
    n: int,
    missing_data_percent: float = 0.1,
    seed: "int | None" = None,
) -> pd.DataFrame:
    """Create ``n`` random records, with a random subset of cells nulled out.

    The frame has five columns: ``field_int``, ``field_nonneg_int``,
    ``field_required_int``, ``field_float`` and ``field_str``. The data is
    deliberately malformed so that validation has something to catch:
    ``field_nonneg_int`` can contain negative values, and every column
    (including ``field_required_int``) can contain nulls.

    Args:
        n: Number of rows to generate.
        missing_data_percent: Despite the name, this is a probability in
            ``[0, 1]`` (``0.1`` means 10%) that any individual cell is
            replaced with a null.
        seed: Optional seed for reproducible output. When ``None`` (the
            default) values are drawn from NumPy's global random state, so a
            prior ``np.random.seed(...)`` call is still honoured.

    Returns:
        A DataFrame with ``n`` rows and the five columns listed above.

    Raises:
        ValueError: If ``missing_data_percent`` is outside ``[0, 1]``.
    """
    if not 0.0 <= missing_data_percent <= 1.0:
        raise ValueError(
            "missing_data_percent must be a fraction between 0 and 1, "
            f"got {missing_data_percent!r}"
        )

    # The np.random module and a RandomState instance expose the same
    # randint/rand/choice functions, so either can serve as the source.
    rng = np.random if seed is None else np.random.RandomState(seed)

    # We purposely put some malformed data in here.
    # NOTE: the draw order below is kept stable so that seeded runs stay
    # reproducible across edits.
    fields = {
        "field_int": rng.randint(-100, 100, size=n),
        # Lower bound is negative on purpose: "non-negative" is violated.
        "field_nonneg_int": rng.randint(-10, 100, size=n),
        "field_required_int": rng.randint(-100, 100, size=n),
        "field_float": rng.rand(n),
        "field_str": rng.choice(list(string.ascii_lowercase), size=n, replace=True),
    }
    df = pd.DataFrame(fields)

    # Create random nulls: each cell is independently masked with
    # probability ``missing_data_percent``.
    null_mask = rng.choice(
        [True, False],
        size=df.shape,
        p=[missing_data_percent, 1 - missing_data_percent],
    )

    return df.mask(null_mask)

# Seed NumPy's global generator so the synthetic data (and every validation
# result derived from it) is the same on each Restart & Run All.
np.random.seed(42)

df = create_random_records(1000)
df.head()

Unnamed: 0,field_int,field_nonneg_int,field_required_int,field_float,field_str
0,-86.0,,-1.0,0.484639,b
1,-9.0,99.0,67.0,0.037128,j
2,42.0,,-94.0,0.159722,y
3,,92.0,58.0,0.772233,e
4,76.0,,-84.0,0.96032,c


In [7]:
# Create a validator by reading the dataframe.
# A validator stores Expectations about the data it's associated with
# and performs introspection on that data.

validator = context.sources.pandas_default.read_dataframe(df)

In [9]:
# Domain-knowledge Expectations:
#   - "field_required_int" must never be null.
#   - "field_nonneg_int" has a lower bound of 0 (no upper bound is given).
# Only the last call's result is displayed as the cell output.
validator.expect_column_values_to_not_be_null("field_required_int")
validator.expect_column_values_to_be_between("field_nonneg_int", min_value=0)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": false,
  "meta": {},
  "result": {
    "element_count": 1000,
    "unexpected_count": 92,
    "unexpected_percent": 10.32547699214366,
    "partial_unexpected_list": [
      -8.0,
      -8.0,
      -10.0,
      -6.0,
      -2.0,
      -7.0,
      -7.0,
      -4.0,
      -7.0,
      -6.0,
      -6.0,
      -8.0,
      -1.0,
      -9.0,
      -9.0,
      -1.0,
      -10.0,
      -2.0,
      -7.0,
      -6.0
    ],
    "missing_count": 109,
    "missing_percent": 10.9,
    "unexpected_percent_total": 9.2,
    "unexpected_percent_nonmissing": 10.32547699214366
  }
}

In [10]:
# The expectation above failed, so some of the data is bad. Uh-oh.
# Wrap the validator in a checkpoint so the same validation can be repeated.

checkpoint = gx.checkpoint.SimpleCheckpoint(
    name="quickstart_checkpoint",
    data_context=context,
    validator=validator,
)

# Execute the checkpoint and keep hold of its result.
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/15 [00:00<?, ?it/s]

In [11]:
# GX compiles all of this into a nice little set of Data Docs.
# The returned dict maps each site name to the URL of its index page.
context.build_data_docs()

{'local_site': 'file:///tmp/tmp7yyndvd8/index.html'}