# Data Context

In [1]:
# Create a data context

from great_expectations.data_context import FileDataContext

# Build the file-backed data context, using the current working directory
# as the project root.
context = FileDataContext.create(
    project_root_dir='./',
)

# Connect to a `Datasource`

In [2]:
# Register the pandas Datasource. The name must be unique among Datasources.
# `add_or_update_pandas` is used instead of `add_pandas` because the context
# is file-backed: the datasource is persisted, so a plain `add_pandas` fails
# with a "name already exists" error when the notebook is re-run.
datasource_name = 'csv-dairy-data'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Register the cleaned CSV file as a data asset on that Datasource.
asset_name = 'dairy-data'
path_to_data = 'dags/P2M3_Heru_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request that the validator consumes later on.
batch_request = asset.build_batch_request()

# Create an Expectation Suite

In [4]:
# Create (or update) the expectation suite that collects the checks below
expectation_suite_name = 'expectation-dairy-dataset'
context.add_or_update_expectation_suite(
    expectation_suite_name=expectation_suite_name,
)

# Bind the batch request to that suite through a validator
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name,
)

# Preview the first rows to confirm the batch loaded
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,location,total_land_area_(acres),number_of_cows,farm_size,date,product_id,product_name,brand,quantity_(liters/kg),price_per_unit,...,quantity_sold_(liters/kg),price_per_unit_(sold),approx._total_revenue(inr),customer_location,sales_channel,quantity_in_stock_(liters/kg),minimum_stock_threshold_(liters/kg),reorder_quantity_(liters/kg),id,location_code
0,Telangana,310.84,96,Medium,2022-02-17,5,Ice Cream,Dodla Dairy,222.4,85.72,...,7,82.24,575.68,Madhya Pradesh,Wholesale,215,19.55,64.03,1,IN-TG
1,Uttar Pradesh,19.19,44,Large,2021-12-01,1,Milk,Amul,687.48,42.61,...,558,39.24,21895.92,Kerala,Wholesale,129,43.17,181.1,2,IN-UP
2,Tamil Nadu,581.69,24,Medium,2022-02-28,4,Yogurt,Dodla Dairy,503.48,36.5,...,256,33.81,8655.36,Madhya Pradesh,Online,247,15.1,140.83,3,IN-TN
3,Telangana,908.0,89,Small,2019-06-09,3,Cheese,Britannia Industries,823.36,26.52,...,601,28.92,17380.92,Rajasthan,Online,222,74.5,57.68,4,IN-TG
4,Maharashtra,861.95,21,Medium,2020-12-14,8,Buttermilk,Mother Dairy,147.77,83.85,...,145,83.07,12045.15,Jharkhand,Retail,2,76.02,33.4,5,IN-MH


## Expectations

### Expectations Not to be Empty

In [5]:
# Expectation 1: `approx._total_revenue(inr)` must not contain missing values
validator.expect_column_values_to_not_be_null(
    column='approx._total_revenue(inr)',
)

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4325,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations to be Unique

In [6]:
# Expectation 2: every value in `id` must be unique
validator.expect_column_values_to_be_unique(
    column='id',
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4325,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations to be in Between

In [10]:
# Expectation 3: `price_per_unit` must fall within the range 0 to 100 (INR)
validator.expect_column_values_to_be_between(
    column='price_per_unit',
    min_value=0,
    max_value=100,
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4325,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations to be in Set

In [11]:
# Expectation 4: `product_id` must be one of the 10 known product ids.
# The mapping documents which product each id stands for; only the ids
# (the dict keys) are passed to the expectation.
product_names_by_id = {
    1: 'Milk',
    2: 'Butter',
    3: 'Cheese',
    4: 'Yogurt',
    5: 'Ice Cream',
    6: 'Curd',
    7: 'Lassi',
    8: 'Buttermilk',
    9: 'Paneer',
    10: 'Ghee',
}

validator.expect_column_values_to_be_in_set(
    'product_id',
    list(product_names_by_id),
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 4325,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations to be in type list

In [12]:
# Expectation 5: `quantity_(liters/kg)` must be stored as an integer or float type.
# NOTE(review): the observed dtype is float64, which is matched by 'float'.
# Confirm that 'integer' is a type name the pandas engine recognises
# ('int' may be the expected spelling) before relying on it for integer columns.
accepted_types = ['integer', 'float']
validator.expect_column_values_to_be_in_type_list(
    'quantity_(liters/kg)',
    accepted_types,
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations the number of rows to be between

In [13]:
# Expectation 6: the table must contain between 1 and 5000 rows
validator.expect_table_row_count_to_be_between(
    min_value=1,
    max_value=5000,
)

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 4325
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

### Expectations of the unique value count

In [14]:
# Expectation 7: `product_name` must have between 1 and 10 distinct values.
# NOTE(review): if exactly 10 distinct products are required, min_value
# would need to be 10 as well; confirm the intended bound.
validator.expect_column_unique_value_count_to_be_between(
    column='product_name',
    min_value=1,
    max_value=10,
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 10
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [17]:
# Persist the expectations to the suite; failed expectations are kept
# rather than discarded.
validator.save_expectation_suite(
    discard_failed_expectations=False,
)

## Checkpoints

A `Checkpoint` is the primary means for validating data in a production deployment of Great Expectations.

`Checkpoints` provide a convenient abstraction for **bundling the Validation of a Batch (or Batches) of data against an Expectation Suite (or several)**, as well as the Actions that should be taken after the validation.

In [18]:
# Register (or update) a checkpoint that validates through the validator
# configured earlier in the notebook.
checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [19]:
# Execute the checkpoint and keep its result object for later inspection
checkpoint_result = checkpoint_1.run()

Calculating Metrics:   0%|          | 0/30 [00:00<?, ?it/s]

## Data Docs

`Data Docs` translate `Expectations`, `Validation Results`, and other metadata into human-readable documentation, automatically compiling your data documentation from your data tests. **Data Docs are rendered as HTML files.** As such, you can open them with any browser.

In [20]:
# Render the Data Docs site; the call returns a mapping of site name to
# the index page URL, which is displayed as the cell output.
context.build_data_docs()

{'local_site': 'file:///Users/Heru/HCK18/Phase 2/GXMile/TestAirFlow/gx/uncommitted/data_docs/local_site/index.html'}