In [1]:
import great_expectations as ge


# Get some reference data

In [2]:
# Load both CSV files through Great Expectations: the reference data first,
# then the new data that will be checked against it.
df_ref, df_new = (
    ge.read_csv(csv_path)
    for csv_path in ("data/reference.csv", "data/test.csv")
)


In [3]:
# Preview the first rows of the reference data.
df_ref.head()

Unnamed: 0.1,Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,117,7.7,3.8,6.7,2.2,virginica
1,145,6.7,3.0,5.2,2.3,virginica
2,115,6.4,3.2,5.3,2.3,virginica
3,61,5.9,3.0,4.2,1.5,versicolor
4,100,6.3,3.3,6.0,2.5,virginica


In [13]:
# List the distinct values found in the 'target' column of the reference data.
df_ref['target'].unique()

array(['virginica', 'versicolor', 'setosa'], dtype=object)

In [4]:
# "test.csv" contains some data quality issues!

# Show the last rows of the new data. In the recorded output the final row
# has a petal width of 123.456 and a target of 'bob', which illustrates the
# kind of problem the expectations below are meant to catch.
df_new.tail()

Unnamed: 0.1,Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
45,92,5.8,2.6,4.0,1.2,versicolor
46,77,6.7,3.0,5.0,1.7,versicolor
47,106,4.9,2.5,4.5,1.7,virginica
48,53,5.5,2.3,4.0,1.3,versicolor
49,138,6.0,3.0,4.8,123.456,bob


# Set up an expectation and apply it to the reference data

Examples:

```python
df_ref.expect_column_max_to_be_between("sepal length (cm)", 1, 5)
df_ref.expect_column_max_to_be_between("sepal length (cm)", 1, 100)
```


## Things to try

Try the expectation on different columns, for example:

- `sepal length (cm)`
- `petal width (cm)`



In [5]:
# Template for the expectation call made later in this notebook:
#   df_ref.expect_column_max_to_be_between(COLNAME, FROM, TO)

# Name of the column the expectation is built on.
COLUMN_NAME = "sepal length (cm)"

In [6]:
# Smallest value of the chosen column in the reference data.
df_ref[COLUMN_NAME].min()


4.3

In [7]:
# Largest value of the chosen column in the reference data.
df_ref[COLUMN_NAME].max()


7.7

In [10]:
# Call shape: df_ref.expect_column_max_to_be_between(COLNAME, FROM, TO)
#
# Derive the bounds from the reference data instead of hard-coding 4.3 / 7.7,
# so this cell stays correct when COLUMN_NAME is switched to another numeric
# column. float() turns the values (which may be NumPy scalars) into plain
# Python floats before they are stored in the expectation.
lower_bound = float(df_ref[COLUMN_NAME].min())
upper_bound = float(df_ref[COLUMN_NAME].max())

# Expect the column's maximum to lie within [lower_bound, upper_bound].
df_ref.expect_column_max_to_be_between(COLUMN_NAME, lower_bound, upper_bound)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": 7.7,
    "element_count": 100,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [None]:
# View expectations

In [11]:
# Collect the expectations recorded on df_ref so far into a suite ...
expectations = df_ref.get_expectation_suite()
# ... and display it as the cell output.
expectations

{
  "expectation_suite_name": "default",
  "ge_cloud_id": null,
  "data_asset_type": "Dataset",
  "expectations": [
    {
      "kwargs": {
        "column": "sepal length (cm)",
        "min_value": 4.3,
        "max_value": 7.7
      },
      "expectation_type": "expect_column_max_to_be_between",
      "meta": {}
    }
  ],
  "meta": {
    "great_expectations_version": "0.15.23"
  }
}

In [None]:
# Apply expectations

In [12]:
# Check the new, previously unseen dataset against the suite built from the
# reference data.
df_new.validate(expectation_suite=expectations)



{
  "statistics": {
    "evaluated_expectations": 1,
    "successful_expectations": 0,
    "unsuccessful_expectations": 1,
    "success_percent": 0.0
  },
  "evaluation_parameters": {},
  "results": [
    {
      "expectation_config": {
        "kwargs": {
          "column": "sepal length (cm)",
          "min_value": 4.3,
          "max_value": 7.7
        },
        "expectation_type": "expect_column_max_to_be_between",
        "meta": {}
      },
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": false,
      "result": {
        "observed_value": 7.9,
        "element_count": 50,
        "missing_count": null,
        "missing_percent": null
      },
      "meta": {}
    }
  ],
  "success": false,
  "meta": {
    "great_expectations_version": "0.15.23",
    "expectation_suite_name": "default",
    "run_id": {
      "run_name": null,
      "run_time": "2022-09-29T13:29:34.27977

In [14]:
# Add a second expectation: every 'target' value must be one of the three
# labels seen in the reference data.
df_ref.expect_column_values_to_be_in_set(
    column="target",
    value_set=["virginica", "versicolor", "setosa"],
)


{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 100,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [15]:
# Re-read the suite so it includes the newly added expectation, then validate
# the new dataset against the updated suite.
expectations = df_ref.get_expectation_suite()
df_new.validate(expectation_suite=expectations)

{
  "statistics": {
    "evaluated_expectations": 2,
    "successful_expectations": 0,
    "unsuccessful_expectations": 2,
    "success_percent": 0.0
  },
  "evaluation_parameters": {},
  "results": [
    {
      "expectation_config": {
        "kwargs": {
          "column": "sepal length (cm)",
          "min_value": 4.3,
          "max_value": 7.7
        },
        "expectation_type": "expect_column_max_to_be_between",
        "meta": {}
      },
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": false,
      "result": {
        "observed_value": 7.9,
        "element_count": 50,
        "missing_count": null,
        "missing_percent": null
      },
      "meta": {}
    },
    {
      "expectation_config": {
        "kwargs": {
          "column": "target",
          "value_set": [
            "virginica",
            "versicolor",
            "setosa"
          ]
        },


# Interactive sessions

Run these steps from the command line:
1. Make expectations
   1. Command line: `great_expectations suite new`
   1. Answer the prompts; this launches a Jupyter notebook
   1. Fill out the Jupyter notebook; it writes a YAML file and launches the HTML docs
1. Apply expectations
   1. Command line: `great_expectations checkpoint new`
