# **Introduction**

## **Profil**

Nama    : Intan Mukti Pebriana<br>
Batch   : HCK 019<br>
Objective   : Melakukan data validation dengan Great Expectations

---

In [41]:
# Optional one-time setup: uncomment the line below to install the pinned
# Great Expectations release used by this notebook. In a notebook, `%pip` is
# preferable to `!pip` because it targets the running kernel's environment.
# !pip install -q "great-expectations==0.18.19"

In [1]:
# Create the file-backed Great Expectations data context, rooted in the
# current working directory.

from great_expectations.data_context import FileDataContext

context = FileDataContext.create(
    project_root_dir='./',
)

In [2]:
# Register a pandas datasource on the data context.
# `add_or_update_pandas` keeps this cell re-runnable: plain `add_pandas`
# fails once a datasource with this name is already stored in the context.
datasource_name = 'csv-data-intan-new'
datasource = context.sources.add_or_update_pandas(datasource_name)

# Name and local path of the data asset.
# The path is a raw string so the Windows backslashes (`\B`, `\P`) are not
# parsed as (invalid) escape sequences; the resulting value is unchanged.
# NOTE(review): this is an absolute, machine-specific path - consider a path
# relative to the project so the notebook runs on other machines.
asset_name = 'melb-housing-new'
path_to_data = r'D:\Bootcamp Intan\Phase2\P2M3_Intan_data_clean.csv'
asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build the batch request for the CSV asset.
batch_request = asset.build_batch_request()

# Create (or overwrite) the expectation suite.
expectation_suite_name = 'expectation-melb-housing-new'
context.add_or_update_expectation_suite(expectation_suite_name)

# Create the validator that ties the batch to the suite.
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name=expectation_suite_name
)

# Inspect the first rows to confirm the data loaded as expected.
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,order_id,suburb,address,rooms,type,price,method,sellerg,date,distance,...,bathroom,car,land_size,building_area,year_built,council_area,lattitude,longtitude,region_name,property_count
0,2016-0001,Surrey Hills,1/10 Florence Rd,2,u,813000.0,S,Fletchers,2016-01-28,11.2,...,1,2,108,85,0,Whitehorse,-37.8276,145.1023,Southern Metropolitan,5457
1,2016-0002,Surrey Hills,999A Riversdale Rd,3,h,1205000.0,S,Fletchers,2016-01-28,11.2,...,1,2,490,129,0,Whitehorse,-37.8361,145.1006,Southern Metropolitan,5457
2,2016-0003,Fawkner,220 McBryde St,3,h,645000.0,SP,YPA,2016-02-04,12.4,...,1,4,661,129,0,Moreland,-37.7009,144.9776,Northern Metropolitan,5070
3,2016-0004,Elwood,4 Wilton Gr,3,t,1395000.0,S,Pride,2016-02-04,7.7,...,2,1,400,126,1996,Port Phillip,-37.8829,144.9797,Southern Metropolitan,8989
4,2016-0005,Richmond,234 Coppin St,3,h,1102000.0,S,Dingle,2016-02-04,2.6,...,2,0,194,129,0,Yarra,-37.8265,145.0022,Northern Metropolitan,14949


In [3]:
# Expectation 1: every value in the `order_id` column must be unique.

validator.expect_column_values_to_be_unique(column='order_id')

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [4]:
# Expectation 2: values in the `car` column must lie between 0 and 10.

validator.expect_column_values_to_be_between('car', min_value=0, max_value=10)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [8]:
# Expectation 3: the `method` column may only hold one of these categories:
#   S  - property sold
#   SP - property sold prior (sold before the auction)
#   PI - property passed in (not sold at auction)
#   SA - sold after auction
#   VB - vendor bid (a bid placed by the seller)

validator.expect_column_values_to_be_in_set(
    column='method',
    value_set=['S', 'SP', 'PI', 'SA', 'VB'],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [11]:
# Expectation 4: the `price` column must have an integer or float data type.
# NOTE(review): the original comment named `land_size`, but the call below
# validates `price` - confirm which column was intended.

validator.expect_column_values_to_be_in_type_list('price', ['int', 'float'])

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": "float64"
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [12]:
# Expectation 5: the `date` column must not contain missing values.

validator.expect_column_values_to_not_be_null(column='date')

Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [13]:
# Expectation 6: the distinct values of the `rooms` column must include 5 and 8.
# NOTE(review): the original comment said "5 and 10", but the call below
# checks for 5 and 8 - confirm which set was intended.

validator.expect_column_distinct_values_to_contain_set("rooms",[5, 8])

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": [
      1,
      2,
      3,
      4,
      5,
      6,
      7,
      8,
      10
    ],
    "details": {
      "value_counts": [
        {
          "value": 1,
          "count": 681
        },
        {
          "value": 2,
          "count": 3648
        },
        {
          "value": 3,
          "count": 5877
        },
        {
          "value": 4,
          "count": 2686
        },
        {
          "value": 5,
          "count": 595
        },
        {
          "value": 6,
          "count": 67
        },
        {
          "value": 7,
          "count": 10
        },
        {
          "value": 8,
          "count": 8
        },
        {
          "value": 10,
          "count": 1
        }
      ]
    }
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [15]:
# Expectation 7: the median of the `bedroom` column must fall inside an
# explicit valid range.
# Without `min_value` / `max_value` the expectation has no bounds to compare
# against, so it would succeed for any median and validate nothing.
# NOTE(review): 2-4 brackets the median of 3.0 observed on this dataset;
# adjust the bounds to the actual business rule.

validator.expect_column_median_to_be_between("bedroom", min_value=2, max_value=4)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 3.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# Expectation 8: row-wise comparison of two columns - `price` must be greater
# than or equal to `property_count` in every row.

validator.expect_column_pair_values_a_to_be_greater_than_b(
    column_A="price", column_B="property_count", or_equal=True
)

Calculating Metrics:   0%|          | 0/7 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
# Expectation 9: the number of distinct `region_name` values must be
# between 5 and 10.
validator.expect_column_unique_value_count_to_be_between(
    "region_name", min_value=5, max_value=10
)

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "observed_value": 8
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
# Expectation 10: the `post_code` column must not contain the specific
# value 3300.
validator.expect_column_values_to_not_be_in_set(
    column="post_code",
    value_set=[3300],
)

Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 13573,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [21]:
# Persist the expectations recorded on the validator into the expectation
# suite; failed expectations are kept rather than discarded.

validator.save_expectation_suite(
    discard_failed_expectations=False,
)

In [22]:
# Register (or overwrite) a checkpoint named 'checkpoint_1' that is bound to
# the validator built above.

checkpoint_1 = context.add_or_update_checkpoint(name='checkpoint_1', validator=validator)

In [23]:
# Run the checkpoint and keep the result object for inspection.

checkpoint_result = checkpoint_1.run()

# Display the overall outcome so a failed validation is visible in the
# notebook instead of being silently discarded.
checkpoint_result.success

Calculating Metrics:   0%|          | 0/63 [00:00<?, ?it/s]