# 1. Import libraries

In [12]:
import pandas as pd
import great_expectations as gx

# 2. Load Data

In [13]:
# Read the cleaned dataset; the CSV path is relative to the working directory.
df = pd.read_csv("data/P2M3_ilham_maulud_data_clean.csv")

# Obtain the Great Expectations data context that the following cells build on.
context = gx.get_context()

In [14]:
# Register a pandas data source and a dataframe asset on the context.
data_source = context.data_sources.add_pandas("pandas")
data_asset = data_source.add_dataframe_asset(
    name="pd dataframe asset",
)

# Define a whole-dataframe batch; the dataframe itself is supplied at
# get_batch time through batch_parameters.
batch_definition = data_asset.add_batch_definition_whole_dataframe(
    "batch definition"
)
batch = batch_definition.get_batch(
    batch_parameters={"dataframe": df},
)

# 3. 7 Expectations 

## 3.1 To be unique

In [None]:
# Expectation 1: every value in invoice_id must be unique.
expect1 = gx.expectations.ExpectColumnValuesToBeUnique(
    column="invoice_id",
)

In [None]:
# Expectation 2: total must be at least 0. Only a lower bound is passed,
# so no maximum is enforced.
expect2 = gx.expectations.ExpectColumnValuesToBeBetween(
    column="total",
    min_value=0,
)

In [None]:
# Expectation 3: customer_type may only contain "Member" or "Normal".
expect3 = gx.expectations.ExpectColumnValuesToBeInSet(
    column="customer_type",
    value_set=["Member", "Normal"],
)

In [None]:
# Expectation 4: quantity must have one of the listed integer types
# ("int" or "int64").
expect4 = gx.expectations.ExpectColumnValuesToBeInTypeList(
    column="quantity",
    type_list=["int", "int64"],
)

In [None]:
# Expectation 5: the mean of the rating column must lie between 5 and 10.
# This constrains the column average, not each individual rating.
expect5 = gx.expectations.ExpectColumnMeanToBeBetween(
    column="rating",
    min_value=5,
    max_value=10,
)

In [None]:
# Expectation 6: unit_price must not contain missing values (null/NaN).
expect6 = gx.expectations.ExpectColumnValuesToNotBeNull(
    column="unit_price",
)

In [None]:
# Expectation 7: the median of the quantity column must lie between 1 and 10.
expect7 = gx.expectations.ExpectColumnMedianToBeBetween(
    column="quantity",
    min_value=1,
    max_value=10,
)

In [None]:
# Validate the batch against each of the seven expectations, in order,
# collecting one validation result per expectation.
results = [
    batch.validate(expectation)
    for expectation in (
        expect1,
        expect2,
        expect3,
        expect4,
        expect5,
        expect6,
        expect7,
    )
]

# Print every validation result in the same order as the expectations.
for validation_result in results:
    print(validation_result)

Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 422.25it/s]
Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 448.49it/s]
Calculating Metrics: 100%|██████████| 10/10 [00:00<00:00, 618.51it/s]
Calculating Metrics: 100%|██████████| 1/1 [00:00<00:00, 181.81it/s] 
Calculating Metrics: 100%|██████████| 4/4 [00:00<00:00, 305.08it/s]
Calculating Metrics: 100%|██████████| 8/8 [00:00<00:00, 601.46it/s] 


Calculating Metrics: 100%|██████████| 4/4 [00:00<00:00, 291.34it/s] 

{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_unique",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "invoice_id"
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0,
    "partial_unexpected_counts": [],
    "partial_unexpected_index_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}
{
  "success": true,
  "expectation_config": {
    "type": "expect_column_values_to_be_between",
    "kwargs": {
      "batch_id": "pandas-pd dataframe asset",
      "column": "total",
      "min_value": 0.0
    },
    "meta": {}
  },
  "result": {
    "element_count": 1000,
    "unexpected_count": 0,




Output explanation:

1. Invoice ID uniqueness

All 1000 invoice IDs are unique; there are no duplicates. This indicates that each transaction is properly recorded and that invoice_id can be used as a primary key.

2. total values at or above the minimum of 0

All total values are ≥ 0, so there are no negative transactions. The column is valid for sales analysis and revenue calculation.

3. customer_type values in set ["Member", "Normal"]

All values are valid according to the expected categories. There are no typos or strange values, making customer segmentation easier.

4. quantity values in type list ["int", "int64"]

All values are of integer type (int64). This ensures that quantity can be counted, summed, or analyzed without type errors.

5. rating mean between 5 and 10

Average customer rating = 6.9727, which is within the expected range. Overall customer satisfaction is good, not too low.

6. unit_price not null

There are no blank values for the unit price. All transactions have price information, which is essential for total and revenue calculations.

7. quantity median between 1 and 10

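Median quantity = 5, which is within the expected range of 1 to 10. This suggests a reasonable distribution of purchases, with a typical order size that shows no anomaly (the median is insensitive to extreme values, so outliers would need a separate check).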

Conclusion from the generated output:

The dataset is clean and consistent enough for further analysis. There are no major issues such as duplicate IDs, empty values, or incorrect data types. This makes the Exploratory Data Analysis (EDA) process and dashboard creation in Kibana smoother and more reliable.
