# Great Expectations Experience 001 #

In [None]:
import pandas as pd
import great_expectations as gx
import warnings

# Blanket filter: every warning category is ignored from this point on in the kernel session.
# NOTE(review): presumably meant to hide library deprecation noise — confirm, since it also hides genuine warnings.
warnings.filterwarnings("ignore")

In [None]:
# Create a Data Context for GX.
# Note: the `gx.from_pandas` call below does not receive this context as an argument.
context = gx.get_context()

# Location of the sample CSV (BigMart retail sales), kept in one named place
DATA_URL = "https://aiverse-sample-data-files.s3.eu-central-1.amazonaws.com/csv/bigmart_retail_sales.csv"

# Read the data file into a plain Pandas dataframe
df = pd.read_csv(DATA_URL)

# Wrap the Pandas dataframe as a GX dataframe so expect_* methods can be called on it
dfGE = gx.from_pandas(df)

# Preview only the first rows rather than rendering the entire dataset
display(dfGE.head())

# 1 - Check if column is NULL : Outlet_Size

We want to check whether the **Outlet_Size** column contains any NULL values. The total number of rows checked (`element_count`) and the number of failing rows (`unexpected_count`) can be reviewed in the result output.

In [None]:
# Run the not-null expectation against the Outlet_Size column
chk_null_outlet_size = dfGE.expect_column_values_to_not_be_null(
    column="Outlet_Size",
)

# Show the full validation result
print(chk_null_outlet_size)

Print total and failed counts.

In [None]:
# Pull the headline figures out of the not-null validation result
null_check_details = chk_null_outlet_size.result
success = chk_null_outlet_size.success
total = null_check_details["element_count"]          # rows checked
unexpected = null_check_details["unexpected_count"]  # rows that failed the check

# One print call, one line per figure
print(
    f"Success: {success}",
    f"Total rows: {total}",
    f"Unexpected rows (errors): {unexpected}",
    sep="\n",
)

# 2 - Check if column is between given values : Outlet_Establishment_Year

In [None]:
# Coerce the year column to a numeric dtype before the range comparison
est_year_numeric = pd.to_numeric(dfGE["Outlet_Establishment_Year"])
dfGE["Outlet_Establishment_Year"] = est_year_numeric

# Expect the establishment year to lie between 2004 and 2010
chk_is_between_est_year = dfGE.expect_column_values_to_be_between(
    column="Outlet_Establishment_Year",
    min_value=2004,
    max_value=2010,
)

# Show the full validation result
print(chk_is_between_est_year)

Print total and failed counts.

In [None]:
# Pull the headline figures out of the between-values validation result
between_check_details = chk_is_between_est_year.result
success = chk_is_between_est_year.success
total = between_check_details["element_count"]          # rows checked
unexpected = between_check_details["unexpected_count"]  # rows that failed the check

# One print call, one line per figure
print(
    f"Success: {success}",
    f"Total rows: {total}",
    f"Unexpected rows (errors): {unexpected}",
    sep="\n",
)

# 3 - Check if distinct column values are in a given list : Item_Type

We have a list that contains every item type in the data set except one (**Soft Drinks** in this example). If we check whether the distinct values of Item_Type exactly match our list, the **success** value of the result will be **false**.

## Failed example ##

In [None]:
# Expected item types, with 'Soft Drinks' left out
item_types_list_1 = [
    'Dairy',
    'Meat',
    'Fruits and Vegetables',
    'Household',
    'Baking Goods',
    'Snack Foods',
    'Frozen Foods',
    'Breakfast',
    'Health and Hygiene',
    'Hard Drinks',
    'Canned',
    'Breads',
    'Starchy Foods',
    'Others',
    'Seafood',
]

# Compare the distinct Item_Type values against the expected set
chk_is_in_list_item_type = dfGE.expect_column_distinct_values_to_equal_set(
    column="Item_Type",
    value_set=item_types_list_1,
)

# Show the full validation result
print(chk_is_in_list_item_type)

Print total count and success.

In [None]:
# Row count and overall outcome of the distinct-values check
total = chk_is_in_list_item_type.result["element_count"]
success = chk_is_in_list_item_type.success

# One print call, one line per figure
print(
    f"Total rows: {total}",
    f"Success: {success}",
    sep="\n",
)

## Passed example

After adding **Soft Drinks** to Item Types list, let's run the expectation again. This time, **success** will be **true** because all distinct values completely match with our list.

In [None]:
# Expected item types, this time including 'Soft Drinks'
item_types_list_2 = [
    'Dairy',
    'Soft Drinks',
    'Meat',
    'Fruits and Vegetables',
    'Household',
    'Baking Goods',
    'Snack Foods',
    'Frozen Foods',
    'Breakfast',
    'Health and Hygiene',
    'Hard Drinks',
    'Canned',
    'Breads',
    'Starchy Foods',
    'Others',
    'Seafood',
]

# Compare the distinct Item_Type values against the expected set
chk_is_in_list_item_type = dfGE.expect_column_distinct_values_to_equal_set(
    column="Item_Type",
    value_set=item_types_list_2,
)

# Show the full validation result
print(chk_is_in_list_item_type)

Print total count and success.

In [None]:
# Row count and overall outcome of the distinct-values check
total = chk_is_in_list_item_type.result["element_count"]
success = chk_is_in_list_item_type.success

# One print call, one line per figure
print(
    f"Total rows: {total}",
    f"Success: {success}",
    sep="\n",
)