# Import Great Expectations

In [1]:
import pandas as pd
import great_expectations as ge

# Import data

In [2]:
filepath = "./data.csv"
df = ge.read_csv(filepath)
# convert data type of "hired_date" column from object to date-time
df['hired_date'] =  pd.to_datetime(df['hired_date'], format='%Y-%M-%d')
df.head(20)

Unnamed: 0,employee_id,employee_name,employee_age,employee_location,hired_date,salary,department,employer_customer_id
0,1,Thomas Daniels,30,Alabama,2018-01-05 00:04:00,112000,sale,25.0
1,2,Madison Carter,32,Colorado,2018-01-25 00:06:00,96000,marketing,
2,3,Joe Stephens,25,Florida,2018-01-20 00:09:00,88000,marketing,56.0
3,4,Steven Davis,32,Washington,2018-01-01 00:11:00,126000,development,54.0
4,5,Mary Dougherty,24,Colorado,2018-01-03 00:12:00,110000,sales,78.0
5,6,Andrew Jackson,34,Delaware,2019-01-15 00:02:00,102000,marketing,11.0
6,7,Michael Cohen,28,Florida,2019-01-20 00:05:00,99000,development,33.0
7,8,Diane Bradford,25,Washington,2019-01-01 00:08:00,100000,sales,89.0
8,9,Brittany Hicks,30,Colorado,2019-01-17 00:10:00,91000,marketing,
9,10,Marcus Brewer,26,Florida,2019-01-05 00:12:00,82000,development,78.0


# Check Data Types

In [3]:
df.dtypes

employee_id                      int64
employee_name                   object
employee_age                     int64
employee_location               object
hired_date              datetime64[ns]
salary                           int64
department                      object
employer_customer_id           float64
dtype: object

# Different Expectation Test

In [11]:
# validate, the column "employee_location" only include the state name 
#[Alabama, Colorado, Delaware, Florida, and Washington]
print('expect_employee_location_to Alabama, Colorado, Delaware, Florida, and Washington',
df.expect_column_values_to_be_in_set('employee_location',['Alabama','Colorado','Delaware','Florida','Washington']))

expect_employee_location_to Alabama, Colorado, Delaware, Florida, and Washington {
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_location",
      "value_set": [
        "Alabama",
        "Colorado",
        "Delaware",
        "Florida",
        "Washington"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_in_set"
  }
}


In [14]:
print('expect_department_to be sales, marketing, or development',
df.expect_column_values_to_be_in_set('department',['sales','marketing','development']))

expect_department_to be sales, marketing, or development {
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 1,
    "unexpected_percent": 5.0,
    "unexpected_percent_total": 5.0,
    "unexpected_percent_nonmissing": 5.0,
    "partial_unexpected_list": [
      "sale"
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "department",
      "value_set": [
        "sales",
        "marketing",
        "development"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_in_set"
  }
}


In [18]:
print(df.expect_table_columns_to_match_ordered_list(
    ['employee_id','employee_name','employee_age','employee_location',
     'hired_date','salary','department','employer_customer_id']))

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      "employee_id",
      "employee_name",
      "employee_age",
      "employee_location",
      "hired_date",
      "salary",
      "department",
      "employer_customer_id"
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column_list": [
        "employee_id",
        "employee_name",
        "employee_age",
        "employee_location",
        "hired_date",
        "salary",
        "department",
        "employer_customer_id"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_columns_to_match_ordered_list"
  }
}


In [17]:
print(df.expect_column_to_exist('employee_age'))
#The test will show success = True, because column 'employee_age' does exist in the table

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {},
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  }
}


In [19]:
#validate column 'customer_age' exist in the table
print(df.expect_column_to_exist('customer_age'))
#The test will be success = False, because column 'customer_age' does not exist in the table

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {},
  "expectation_config": {
    "kwargs": {
      "column": "customer_age",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_to_exist"
  }
}


In [20]:
#validate total colums count in the table
print(df.expect_table_column_count_to_be_between(min_value=0, max_value=8))
# The test will pass if max_value=8 or more. The test will fail if max_value=7 or less.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 8
  },
  "expectation_config": {
    "kwargs": {
      "min_value": 0,
      "max_value": 8,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_column_count_to_be_between"
  }
}


In [22]:
print(df.expect_table_column_count_to_equal(value = 9))
# The test will pass if value=8. The test will fail if value is other tahn 9.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 8
  },
  "expectation_config": {
    "kwargs": {
      "value": 9,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_column_count_to_equal"
  }
}


In [23]:
print(df.expect_table_row_count_to_be_between(min_value=0, max_value=20))
# The test will pass if max_value=20 or more. The test will fail if max_value=19 or less.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 20
  },
  "expectation_config": {
    "kwargs": {
      "min_value": 0,
      "max_value": 20,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_be_between"
  }
}


In [24]:
print(df.expect_table_row_count_to_equal(value = 20))
# The test will pass if value= 20. The test will fail if value is other than 20.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 20
  },
  "expectation_config": {
    "kwargs": {
      "value": 20,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_table_row_count_to_equal"
  }
}


In [13]:
print(df.expect_column_values_to_be_unique('hired_date'))
#the test will pass because all the employees were hired on different date.

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_unique",
    "meta": {},
    "kwargs": {
      "column": "hired_date",
      "result_format": "BASIC"
    }
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "success": true
}


In [26]:
print(df.expect_column_values_to_be_unique('salary'))
#the test will fail because all the some employees salary were is same.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 4,
    "unexpected_percent": 20.0,
    "unexpected_percent_total": 20.0,
    "unexpected_percent_nonmissing": 20.0,
    "partial_unexpected_list": [
      99000,
      100000,
      99000,
      100000
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "salary",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_unique"
  }
}


In [27]:
print(df.expect_column_values_to_not_be_null('employer_customer_id'))
#the test will fail because column 'employer_customer_id' has some null values.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "unexpected_count": 2,
    "unexpected_percent": 10.0,
    "unexpected_percent_total": 10.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "employer_customer_id",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_not_be_null"
  }
}


In [28]:
print(df.expect_column_values_to_be_of_type('employee_id', 'object'))
#the test will fail because column 'employer_id' has int64 data type. 

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "int64"
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_id",
      "type_": "object",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_of_type__aggregate"
  }
}


In [30]:
print(df.expect_column_values_to_be_in_type_list('employer_customer_id',['int64','object','float64']))
#the test will fail because column 'employer_customer_id' has float64 data type. 

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": "float64"
  },
  "expectation_config": {
    "kwargs": {
      "column": "employer_customer_id",
      "type_list": [
        "int64",
        "object",
        "float64"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "_expect_column_values_to_be_in_type_list__aggregate"
  }
}


In [18]:
print(df.expect_column_values_to_not_be_in_set('employee_age', [19,39], mostly=0.89))
# the test will show pass even we have provided the age is 19 between 39. 

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_not_be_in_set",
    "meta": {},
    "kwargs": {
      "column": "employee_age",
      "value_set": [
        19,
        39
      ],
      "mostly": 0.89,
      "result_format": "BASIC"
    }
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 2,
    "unexpected_percent": 10.0,
    "unexpected_percent_total": 10.0,
    "unexpected_percent_nonmissing": 10.0,
    "partial_unexpected_list": [
      39,
      39
    ]
  },
  "meta": {},
  "success": true
}


In [32]:
print(df.expect_column_values_to_be_between('employee_age', min_value=20, max_value=30))
# the test will show fail/false. beacuse all there are emloyees in the column, whose age in more than 30.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 7,
    "unexpected_percent": 35.0,
    "unexpected_percent_total": 35.0,
    "unexpected_percent_nonmissing": 35.0,
    "partial_unexpected_list": [
      32,
      32,
      34,
      39,
      36,
      39,
      37
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 20,
      "max_value": 30,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_between"
  }
}


In [33]:
print(df.expect_column_values_to_be_increasing('employee_id'))
# the test will show pass beacuse all the values of column employee_id is increasing incremently.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_id",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_increasing"
  }
}


In [35]:
print(df.expect_column_values_to_be_decreasing('employee_id'))
# the test will show fail/false beacuse all the values of column employee_id is mot decreasing incremently.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 19,
    "unexpected_percent": 95.0,
    "unexpected_percent_total": 95.0,
    "unexpected_percent_nonmissing": 95.0,
    "partial_unexpected_list": [
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      9,
      10,
      11,
      12,
      13,
      14,
      15,
      16,
      17,
      18,
      19,
      20
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_id",
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_be_decreasing"
  }
}


In [38]:
print(df.expect_column_value_lengths_to_be_between('employee_location', min_value=7, max_value=9, mostly =0.80))
#The test will be pass/true because 80 % values character length are between 7 and 9.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 4,
    "unexpected_percent": 20.0,
    "unexpected_percent_total": 20.0,
    "unexpected_percent_nonmissing": 20.0,
    "partial_unexpected_list": [
      "Washington",
      "Washington",
      "Washington",
      "Washington"
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_location",
      "min_value": 7,
      "max_value": 9,
      "mostly": 0.8,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_value_lengths_to_be_between"
  }
}


In [39]:
print(df.expect_column_value_lengths_to_equal('department', 10))
#The test will fail because not all the values character length is exactly equal to 10.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 20,
    "unexpected_percent": 100.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0,
    "partial_unexpected_list": [
      "sale",
      "marketing",
      "marketing",
      "development",
      "sales",
      "marketing",
      "development",
      "sales",
      "marketing",
      "development",
      "sales",
      "marketing",
      "sales",
      "development",
      "marketing",
      "development",
      "sales",
      "development",
      "marketing",
      "sales"
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "department",
      "value": 10,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_value_lengths_to_e

In [40]:
print(df.expect_column_values_to_match_regex('employee_location', '[washi]+', mostly=None))
#In this case, the test will be passed becesuse the regex '[washi]+' matches some of the values of the column 'employee_location'

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_location",
      "regex": "[washi]+",
      "mostly": null,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_match_regex"
  }
}


In [41]:
print(df.expect_column_values_to_not_match_regex('department', 'le', mostly=None))
# In this case, the test will false/fail because, regex 'le' matches some of the values of the column 'department'

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 7,
    "unexpected_percent": 35.0,
    "unexpected_percent_total": 35.0,
    "unexpected_percent_nonmissing": 35.0,
    "partial_unexpected_list": [
      "sale",
      "sales",
      "sales",
      "sales",
      "sales",
      "sales",
      "sales"
    ]
  },
  "expectation_config": {
    "kwargs": {
      "column": "department",
      "regex": "le",
      "mostly": null,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_not_match_regex"
  }
}


In [42]:
print(df.expect_column_values_to_match_regex_list("department", ['le','mark','ve'], match_on='any', mostly=None))
#In this case, the test will pass/true, because some of the values of the column 'department' does matches the regex list.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "department",
      "regex_list": [
        "le",
        "mark",
        "ve"
      ],
      "match_on": "any",
      "mostly": null,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_match_regex_list"
  }
}


In [43]:
print(df.expect_column_values_to_not_match_regex_list("department", ['lle','markk','vve']))
#In this case, the test will pass/true, because some of the values of the column 'department' does matches the regex list.


{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "expectation_config": {
    "kwargs": {
      "column": "department",
      "regex_list": [
        "lle",
        "markk",
        "vve"
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_values_to_not_match_regex_list"
  }
}


In [49]:
print(df.expect_column_values_to_match_strftime_format('hired_date', '%Y-%M-%d', mostly=None))
# In this case, the test will pass/true, because column'hired_date' datatype matches strftime_format.

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_match_strftime_format",
    "meta": {},
    "kwargs": {
      "column": "hired_date",
      "strftime_format": "%Y-%M-%d",
      "mostly": null,
      "result_format": "BASIC"
    }
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "success": true
}


In [50]:
print(df.expect_column_values_to_be_dateutil_parseable('hired_date', mostly=None))

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_dateutil_parseable",
    "meta": {},
    "kwargs": {
      "column": "hired_date",
      "mostly": null,
      "result_format": "BASIC"
    }
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "success": true
}


In [None]:
Test: expect_column_values_to_be_json_parseable

#write about this test
#works only when date column is string

In [51]:
print(df.expect_column_values_to_be_json_parseable('department', mostly=None))

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_json_parseable",
    "meta": {},
    "kwargs": {
      "column": "department",
      "mostly": null,
      "result_format": "BASIC"
    }
  },
  "result": {
    "element_count": 20,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 20,
    "unexpected_percent": 100.0,
    "unexpected_percent_total": 100.0,
    "unexpected_percent_nonmissing": 100.0,
    "partial_unexpected_list": [
      "sale",
      "marketing",
      "marketing",
      "development",
      "sales",
      "marketing",
      "development",
      "sales",
      "marketing",
      "development",
      "sales",
      "marketing",
      "sales",
      "development",
      "marketing",
      "development",
      "sales",
      "development",
      "marketing",
      "sales"
    ]
  },
  "meta": {},
 

In [44]:
print(df.expect_column_distinct_values_to_be_in_set('employee_age', [30,32]))
# In this case, the test will fail/false, because not all the unique values are not within the given range.


{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      20,
      22,
      24,
      25,
      26,
      28,
      29,
      30,
      32,
      34,
      36,
      37,
      39
    ],
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "value_set": [
        30,
        32
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_distinct_values_to_be_in_set"
  }
}


In [45]:
print(df.expect_column_distinct_values_to_equal_set('employee_age', [
      20,
      22,
      24,
      25,
      26,
      28,
      29,
      30,
      32,
      34,
      36,
      37,
      39
    ]))

#in this case, the test will pass/true, because all the distinct/unique values of the given column matches the given list values.


{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      20,
      22,
      24,
      25,
      26,
      28,
      29,
      30,
      32,
      34,
      36,
      37,
      39
    ],
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "value_set": [
        20,
        22,
        24,
        25,
        26,
        28,
        29,
        30,
        32,
        34,
        36,
        37,
        39
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_distinct_values_to_equal_set"
  }
}


In [46]:
print(df.expect_column_distinct_values_to_contain_set('employee_age', [20,22]))
# In this case, the test will pass, beacuse some of the values does exist in between 20 and 22.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": [
      20,
      22,
      24,
      25,
      26,
      28,
      29,
      30,
      32,
      34,
      36,
      37,
      39
    ],
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "value_set": [
        20,
        22
      ],
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_distinct_values_to_contain_set"
  }
}


In [47]:
print(df.expect_column_mean_to_be_between('employee_age', min_value=25, max_value=30 ))
#In this case, the test will pass, because the total mean is withhin the range.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 29.55,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 25,
      "max_value": 30,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_mean_to_be_between"
  }
}


In [None]:
Test: expect_column_median_to_be_between
In the below test, we'll test that, if the total median of the given column 'employee_age' are within the given range min_value=25, max_value=29.


In [48]:
print(df.expect_column_median_to_be_between('employee_age', min_value=25, max_value=29))
#In this case, the test will fail/false, because the total median is outside the range.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 29.5,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 25,
      "max_value": 29,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_median_to_be_between"
  }
}


In [49]:
print(df.expect_column_quantile_values_to_be_between('employee_age', {
        "quantiles": [0., 0.333, 0.6667, 1.],
        "value_ranges": [[19,21], [25,27], [31,33], [38,39]]
    }))
#In this case, the test will pass/true, because all of the quantiles are withing their given range.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": {
      "quantiles": [
        0.0,
        0.333,
        0.6667,
        1.0
      ],
      "values": [
        20,
        26,
        32,
        39
      ]
    },
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "quantile_ranges": {
        "quantiles": [
          0.0,
          0.333,
          0.6667,
          1.0
        ],
        "value_ranges": [
          [
            19,
            21
          ],
          [
            25,
            27
          ],
          [
            31,
            33
          ],
          [
            38,
            39
          ]
        ]
      },
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_co

In [65]:
print(df.expect_column_stdev_to_be_between('employee_age', min_value=25, max_value=30))

# The test will fail/false, because standard deviation is not in between the given range.

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "expectation_config": {
    "expectation_type": "expect_column_stdev_to_be_between",
    "meta": {},
    "kwargs": {
      "column": "employee_age",
      "min_value": 25,
      "max_value": 30,
      "result_format": "BASIC"
    }
  },
  "result": {
    "observed_value": 5.462551938811248,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {},
  "success": false
}


In [None]:
Test: expect_column_unique_value_count_to_be_between
In the below test, we'll test that, if the unique values of the given column 'employee_age' are within the given range min_value=1, max_value=3.


In [50]:
print(df.expect_column_unique_value_count_to_be_between('employee_age', min_value=1, max_value=3))
#In this case, the test will fail/false because the total count is not between the given range

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 13,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 1,
      "max_value": 3,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_unique_value_count_to_be_between"
  }
}


In [None]:
Test: expect_column_proportion_of_unique_values_to_be_between
In the below test, we'll test that, if the proportion of the unique values of the given column 'employee_age' are within the given range min_value=0, max_value=1.
#For example, in a column containing [1, 2, 2, 3, 3, 3, 4, 4, 4, 4], there are 4 unique values and 10 total values for a proportion of 0.4.

In [51]:
print(df.expect_column_proportion_of_unique_values_to_be_between('employee_age', min_value=0, max_value=1))
# In this case, the test will pass/true, because the proportion_of_unique_values is within the range.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 0.65,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 0,
      "max_value": 1,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_proportion_of_unique_values_to_be_between"
  }
}


In [53]:
#Not sure how it works
#print(df.expect_column_most_common_value_to_be_in_set('employee_age', [25,30]))

In [54]:
print(df.expect_column_sum_to_be_between('employee_age', min_value=200, max_value=400))
#In this case, the test will fail/false, because the sum of the column values are not  within the range.

{
  "success": false,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 591,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 200,
      "max_value": 400,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_sum_to_be_between"
  }
}


In [55]:
print(df.expect_column_min_to_be_between('employee_age', min_value=19, max_value=20))
#In this case, the test will fail/false, because the min value of the given column is not within the range.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 20,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 19,
      "max_value": 20,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_min_to_be_between"
  }
}


In [None]:
Test: expect_column_max_to_be_between
In the below test, we'll test that, if the maximum value of the given column 'employee_age' is within the given range min_value=38, max_value=40.


In [56]:
print(df.expect_column_max_to_be_between('employee_age', min_value=38, max_value=40))
#In this case, the test will fail/false, because the max value of the given column is not within the range.

{
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "observed_value": 39,
    "element_count": 20,
    "missing_count": null,
    "missing_percent": null
  },
  "expectation_config": {
    "kwargs": {
      "column": "employee_age",
      "min_value": 38,
      "max_value": 40,
      "result_format": "BASIC"
    },
    "meta": {},
    "expectation_type": "expect_column_max_to_be_between"
  }
}


In [57]:
df.get_expectation_suite(discard_failed_expectations=False)

{
  "meta": {
    "great_expectations_version": "0.13.19"
  },
  "expectations": [
    {
      "kwargs": {
        "column": "department",
        "value_set": [
          "sales",
          "marketing",
          "development"
        ]
      },
      "meta": {},
      "expectation_type": "expect_column_values_to_be_in_set"
    },
    {
      "kwargs": {
        "column_list": [
          "employee_id",
          "employee_name",
          "employee_age",
          "employee_location",
          "hired_date",
          "salary",
          "department",
          "employer_customer_id"
        ]
      },
      "meta": {},
      "expectation_type": "expect_table_columns_to_match_ordered_list"
    },
    {
      "kwargs": {
        "column": "employee_location",
        "value_set": [
          "Alabama",
          "Colorado",
          "Delaware",
          "Florida",
          "Washington"
        ]
      },
      "meta": {},
      "expectation_type": "expect_column_values_to_be_in_set