### Import libraries and data

In [1]:
import great_expectations as ge
import pandas as pd

In [2]:
# Location of the Product workbook used throughout this notebook.
url = (
    "https://github.com/hnawaz007/pythondataanalysis/blob/main/"
    "ETL%20Pipeline/Pytest/Session%20one/Product.xlsx?raw=true"
)

# Load the workbook from the URL into a pandas DataFrame and preview the first rows.
df = pd.read_excel(url)
df.head()

Unnamed: 0,ProductKey,ProductAlternateKey,ProductSubcategoryKey,WeightUnitMeasureCode,SizeUnitMeasureCode,EnglishProductName,StandardCost,FinishedGoodsFlag,Color,SafetyStockLevel,...,DaysToManufacture,ProductLine,DealerPrice,Class,Style,ModelName,EnglishDescription,StartDate,EndDate,Status
0,1,AR-5381,,,,Adjustable Race,,0,,1000,...,0,,,,,,,2003-07-01,NaT,Current
1,2,BA-8327,,,,Bearing Ball,,0,,1000,...,0,,,,,,,2003-07-01,NaT,Current
2,3,BE-2349,,,,BB Ball Bearing,,0,,800,...,1,,,,,,,2003-07-01,NaT,Current
3,4,BE-2908,,,,Headset Ball Bearings,,0,,800,...,0,,,,,,,2003-07-01,NaT,Current
4,5,BL-2036,,,,Blade,,0,,800,...,1,,,,,,,2003-07-01,NaT,Current


### Convert the pandas DataFrame to a Great Expectations dataset

In [3]:
# Wrap the pandas DataFrame in a Great Expectations dataset; the expect_* checks
# in the following cells are all called on this object.
my_df = ge.from_pandas(df)

In [4]:
# Show the type of the object returned by ge.from_pandas.
type(my_df)

great_expectations.dataset.pandas_dataset.PandasDataset

### GE Data Quality Tests

In [5]:
# Check the number of rows in the dataset: expect exactly 1000.
# NOTE(review): the recorded output reports observed_value 606, so this check
# returns "success": false — confirm whether 1000 is the intended row count.
my_df.expect_table_row_count_to_equal(1000)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": false,
  "result": {
    "observed_value": 606
  },
  "meta": {}
}

### Primary Key Test

In [6]:
# The primary-key column must be present in the table.
my_df.expect_column_to_exist(column="ProductKey")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {},
  "meta": {}
}

In [7]:
# A primary key may not repeat: every ProductKey value must be unique.
my_df.expect_column_values_to_be_unique(column="ProductKey")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 606,
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [8]:
# A primary key may not be missing: no null ProductKey values are allowed.
my_df.expect_column_values_to_not_be_null(column="ProductKey")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 606,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [9]:
# ProductKey must be stored as an integer type.
my_df.expect_column_values_to_be_in_type_list(
    column="ProductKey",
    type_list=["int", "int64"],
)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": "int64"
  },
  "meta": {}
}

### Test values in a set (list)

In [10]:
# List the distinct ProductLine values to see what the set check should allow.
df["ProductLine"].unique()

array([nan, 'R ', 'S ', 'M ', 'T '], dtype=object)

In [11]:
# Allowed ProductLine codes. Each code carries a trailing space, exactly as it
# appears in the distinct values listed for this column.
my_df.expect_column_values_to_be_in_set(
    column="ProductLine",
    value_set=["R ", "S ", "M ", "T "],
)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 606,
    "missing_count": 226,
    "missing_percent": 37.29372937293729,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [12]:
# Colours accepted in the "Color" column; each value is listed once, since
# repeating an entry adds nothing to a set-membership check.
# NOTE(review): the recorded run flags 7 "Silver/Black" values as unexpected.
# If that combination is legitimate, add "Silver/Black" to this list — confirm
# with the data owner before changing the expectation.
my_df.expect_column_values_to_be_in_set(
    "Color",
    ["Black", "Silver", "Red", "White", "Blue", "Multi", "Yellow", "Grey"],
)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": false,
  "result": {
    "element_count": 606,
    "missing_count": 254,
    "missing_percent": 41.914191419141915,
    "unexpected_count": 7,
    "unexpected_percent": 1.9886363636363635,
    "unexpected_percent_total": 1.155115511551155,
    "unexpected_percent_nonmissing": 1.9886363636363635,
    "partial_unexpected_list": [
      "Silver/Black",
      "Silver/Black",
      "Silver/Black",
      "Silver/Black",
      "Silver/Black",
      "Silver/Black",
      "Silver/Black"
    ]
  },
  "meta": {}
}

### Check that a column's max and mean fall within a range

In [13]:
# List the distinct SafetyStockLevel values before asserting a range on them.
df["SafetyStockLevel"].unique()

array([1000,  800,  500,   60,    4,  100], dtype=int64)

In [14]:
# Expect the maximum SafetyStockLevel to lie between 1 and 1000.
# The recorded run observed a maximum of 1000 and passed, so the upper bound
# is treated as inclusive.
my_df.expect_column_max_to_be_between("SafetyStockLevel", 1, 1000)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": 1000,
    "element_count": 606,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [15]:
# Expect the maximum DaysToManufacture to lie between 1 and 10
# (the recorded run observed a maximum of 4).
my_df.expect_column_max_to_be_between("DaysToManufacture", 1, 10)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": 4,
    "element_count": 606,
    "missing_count": null,
    "missing_percent": null
  },
  "meta": {}
}

In [16]:
# Expect the mean StandardCost to lie between 100 and 500.
# NOTE(review): the recorded run reports 211 missing values for this column;
# presumably the mean is computed over non-missing values only — confirm.
my_df.expect_column_mean_to_be_between("StandardCost", 100, 500)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "observed_value": 434.26582886075965,
    "element_count": 606,
    "missing_count": 211,
    "missing_percent": 34.81848184818482
  },
  "meta": {}
}

### Test Text columns (Nulls)

In [17]:
# Every product must have a name: no nulls allowed in EnglishProductName.
my_df.expect_column_values_to_not_be_null(column="EnglishProductName")

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 606,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {}
}

In [18]:
# Color may be partially missing, so the not-null check is relaxed with `mostly`.
# NOTE(review): mostly=0.55 presumably requires at least 55% of values to be
# non-null; the recorded run has about 41.9% nulls and passes — confirm the
# threshold semantics against the Great Expectations docs.
my_df.expect_column_values_to_not_be_null(
    column="Color",
    mostly=0.55,
)

{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "success": true,
  "result": {
    "element_count": 606,
    "unexpected_count": 254,
    "unexpected_percent": 41.914191419141915,
    "unexpected_percent_total": 41.914191419141915,
    "partial_unexpected_list": []
  },
  "meta": {}
}

### Save your test cases and re-use

In [19]:
# Display the expectation suite accumulated on my_df by the cells above.
# NOTE(review): the recorded listing starts at expect_column_to_exist, so the
# failed row-count check appears to be omitted — presumably failed expectations
# are discarded by default; confirm against the Great Expectations docs.
my_df.get_expectations_config()

{
  "ge_cloud_id": null,
  "expectations": [
    {
      "meta": {},
      "kwargs": {
        "column": "ProductKey"
      },
      "expectation_type": "expect_column_to_exist"
    },
    {
      "meta": {},
      "kwargs": {
        "column": "ProductKey"
      },
      "expectation_type": "expect_column_values_to_be_unique"
    },
    {
      "meta": {},
      "kwargs": {
        "column": "ProductKey"
      },
      "expectation_type": "expect_column_values_to_not_be_null"
    },
    {
      "meta": {},
      "kwargs": {
        "column": "ProductKey",
        "type_list": [
          "int",
          "int64"
        ]
      },
      "expectation_type": "expect_column_values_to_be_in_type_list"
    },
    {
      "meta": {},
      "kwargs": {
        "column": "ProductLine",
        "value_set": [
          "R ",
          "S ",
          "M ",
          "T "
        ]
      },
      "expectation_type": "expect_column_values_to_be_in_set"
    },
    {
      "meta": {},
      "kwarg

In [20]:
# Keep the expectation suite in a variable so it can be passed to validate()
# on another dataset.
config = my_df.get_expectations_config()

In [21]:
# Write the expectation suite to a JSON file; the same file name is passed to
# validate() later in the notebook.
my_df.save_expectation_suite('product.data.expectations.json')

In [22]:
# Read the same workbook again, this time through Great Expectations' own
# reader; validate() is called on the resulting object in the next cells.
df2 = ge.read_excel(url)

In [23]:
# Run the in-memory expectation suite against the freshly loaded dataset.
df2.validate(expectation_suite=config)

{
  "success": true,
  "results": [
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "ProductKey"
        },
        "expectation_type": "expect_column_to_exist"
      },
      "success": true,
      "result": {},
      "meta": {}
    },
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "meta": {},
        "kwargs": {
          "column": "ProductKey"
        },
        "expectation_type": "expect_column_values_to_be_unique"
      },
      "success": true,
      "result": {
        "element_count": 606,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
  

### Test with Config file

In [24]:
# Validate again, this time loading the expectation suite from the saved JSON
# file, and keep the results for the pass/fail check in the next cell.
test_results = df2.validate(expectation_suite="product.data.expectations.json")

In [25]:
# Act on the validation outcome: stop with an exception if the suite did not
# succeed as a whole, otherwise report that everything passed.
if not test_results["success"]:
    raise Exception("You've got issues.")
print("Awesome. All Data Quality Tests are green!")

Awesome. All Data Quality Tests are green!
