In [1]:
import great_expectations as ge
import pandas as pd
import logging
from typing import Dict, Tuple, Any



In [2]:
# Load the two datasets used in this notebook: the raw house-pricing data
# (wrapped in Great Expectations further down) and the transformed training
# features that check_ranges() is later run against.
df = pd.read_csv("../data/01_raw/house-pricing.csv")
df_model_input = pd.read_csv("../data/05_model_input/X_train_transformed.csv")

In [3]:
# Preview the first rows of the raw data.
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
# Wrap the raw dataframe in a Great Expectations dataset so expectations can
# be declared on it.
gdf = ge.from_pandas(df)

In [5]:
# Expect the Id column to contain no nulls. The call is evaluated right away
# (its result is the cell output) and the expectation stays attached to gdf,
# so the later gdf.validate() re-checks it.
gdf.expect_column_values_to_not_be_null("Id")


{
  "result": {
    "element_count": 1460,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "success": true,
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [6]:
# Re-run every expectation declared on gdf so far and show the summary report.
gdf.validate()

{
  "success": true,
  "meta": {
    "great_expectations_version": "0.16.14",
    "expectation_suite_name": "default",
    "run_id": {
      "run_name": null,
      "run_time": "2023-06-09T11:35:03.389560+01:00"
    },
    "batch_kwargs": {
      "ge_batch_id": "495e642e-06b1-11ee-87f0-acde48001122"
    },
    "batch_markers": {},
    "batch_parameters": {},
    "validation_time": "20230609T103503.389475Z",
    "expectation_suite_meta": {
      "great_expectations_version": "0.16.14"
    }
  },
  "statistics": {
    "evaluated_expectations": 1,
    "successful_expectations": 1,
    "unsuccessful_expectations": 0,
    "success_percent": 100.0
  },
  "results": [
    {
      "result": {
        "element_count": 1460,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "partial_unexpected_list": []
      },
      "success": true,
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_

In [7]:
def check_nulls(df: pd.DataFrame, column: str = "Id") -> Any:
    """Declare a not-null expectation on one column of the dataframe.

    The dataframe is wrapped in a Great Expectations dataset and the
    expectation is registered on it. No rows are removed and nothing is
    raised here; call ``validate()`` on the returned object to get the
    validation report.

    Args:
    --
        df (pd.DataFrame): Dataframe to check for nulls.
        column (str): Name of the column that must not contain nulls.
            Defaults to "Id".
    Returns:
    --
        gdf: The Great Expectations dataset built by ``ge.from_pandas(df)``
            with the not-null expectation attached.
    """
    gdf = ge.from_pandas(df)
    gdf.expect_column_values_to_not_be_null(column)
    return gdf

In [8]:
import yaml

# Logger shared by the validation helpers defined in this notebook.
logger = logging.getLogger(__name__)

# Read the data-quality settings from the project configuration. safe_load
# uses the safe loader, which only builds plain Python objects.
# NOTE(review): consider moving `import yaml` to the imports cell at the top.
with open('../conf/base/parameters/data_quality.yml') as f:
    parameters = yaml.safe_load(f)

def check_ranges(df: pd.DataFrame, parameters: Dict[str, Any]) -> pd.DataFrame:
    """Check that every numeric column lies within a configured range.

    One ``expect_column_values_to_be_between`` expectation is declared per
    numeric column, all using the same bounds, and the whole suite is then
    validated with Great Expectations.

    Args:
    --
        df (pd.DataFrame): Dataframe whose numeric columns are checked.
        parameters (Dict[str, Any]): Configuration mapping. Must contain
            ``"num_quality_ranges"`` with ``"min"`` and ``"max"`` entries.
    Returns:
    --
        df (pd.DataFrame): The input dataframe, unchanged, when every
            expectation succeeds.
    Raises:
    --
        KeyError: If the range configuration is missing from ``parameters``.
        Exception: If at least one expectation fails. The message lists each
            failed expectation on its own line.
    """
    ranges = parameters["num_quality_ranges"]
    lower, upper = ranges["min"], ranges["max"]
    num_cols = df.select_dtypes(include=["number"]).columns

    gdf = ge.from_pandas(df)
    for column in num_cols:
        gdf.expect_column_values_to_be_between(column, lower, upper)

    results = gdf.validate()["results"]
    failed_expectations = [result for result in results if not result["success"]]

    # Lazy %-formatting; the two counts are separated explicitly (they used
    # to be glued together as "...: 14Failed Expectations: ...").
    logger.info(
        "Total Expectations: %d, Failed Expectations: %d",
        len(results),
        len(failed_expectations),
    )

    if failed_expectations:
        # Each entry starts with a newline; joining the strings (instead of
        # interpolating the list repr) keeps those newlines real, so the
        # report shows one failed expectation per line.
        report = "".join(
            f"\nFailed Expectation {idx}:"
            f"  Expectation Type: {failed['expectation_config']['expectation_type']}"
            f"  Column: {failed['expectation_config']['kwargs']['column']}"
            f"  Details: {failed['result']}"
            for idx, failed in enumerate(failed_expectations, start=1)
        )
        raise Exception(f"Data Quality Validation Failed:{report}")

    return df

In [9]:
# Run the range check on the transformed training features.
# NOTE: in the recorded run this call raises "Data Quality Validation Failed"
# with 14 failed expectations (see the output below), so a Restart & Run All
# stops at this cell.
# NOTE(review): several failing columns report values in the hundreds (e.g.
# BsmtFinSF2, EnclosedPorch); presumably those columns are not scaled like
# the others, so a single shared min/max may not suit them. Confirm against
# the preprocessing step.
check_ranges(df_model_input, parameters)

Exception: Data Quality Validation Failed: ["\nFailed Expectation 1:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__LotFrontage  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 5, 'unexpected_percent': 0.4280821917808219, 'unexpected_percent_total': 0.4280821917808219, 'unexpected_percent_nonmissing': 0.4280821917808219, 'partial_unexpected_list': [12.77137639050307, 5.1397974431346505, 12.77137639050307, 5.876639548397808, 5.455586916818861]}", "\nFailed Expectation 2:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__LotArea  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 18, 'unexpected_percent': 1.5410958904109588, 'unexpected_percent_total': 1.5410958904109588, 'unexpected_percent_nonmissing': 1.5410958904109588, 'partial_unexpected_list': [5.559054160841287, 8.99373898243268, 5.390553765728527, 7.414503677587988, 25.66385022187101, 36.32605920612729, 10.607744210078415, 13.199683909792718, 6.360707555771685, 8.753267278584888, 37.70226733937147, 6.540635827609264, 50.001823597349706, 9.889003707981278, 10.578566652483133, 6.208254817336332, 10.67509573886086, 11.573764512795575]}", "\nFailed Expectation 3:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__MasVnrArea  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 7, 'unexpected_percent': 0.5993150684931506, 'unexpected_percent_total': 0.5993150684931506, 'unexpected_percent_nonmissing': 0.5993150684931506, 'partial_unexpected_list': [6.307228915662651, 5.180722891566265, 5.554216867469879, 5.240963855421687, 8.301204819277109, 5.548192771084337, 5.873493975903615]}", "\nFailed Expectation 4:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__BsmtFinSF1  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 
0.0, 'unexpected_count': 1, 'unexpected_percent': 0.08561643835616438, 'unexpected_percent_total': 0.08561643835616438, 'unexpected_percent_nonmissing': 0.08561643835616438, 'partial_unexpected_list': [7.294729542302358]}", "\nFailed Expectation 5:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__BsmtFinSF2  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 131, 'unexpected_percent': 11.215753424657535, 'unexpected_percent_total': 11.215753424657535, 'unexpected_percent_nonmissing': 11.215753424657535, 'partial_unexpected_list': [228.0, 627.0, 128.0, 869.0, 645.0, 208.0, 1061.0, 149.0, 872.0, 377.0, 630.0, 544.0, 180.0, 182.0, 532.0, 1057.0, 175.0, 634.0, 311.0, 28.0]}", "\nFailed Expectation 6:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__TotalBsmtSF  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 1, 'unexpected_percent': 0.08561643835616438, 'unexpected_percent_total': 0.08561643835616438, 'unexpected_percent_nonmissing': 0.08561643835616438, 'partial_unexpected_list': [10.158966716343766]}", "\nFailed Expectation 7:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__1stFlrSF  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 1, 'unexpected_percent': 0.08561643835616438, 'unexpected_percent_total': 0.08561643835616438, 'unexpected_percent_nonmissing': 0.08561643835616438, 'partial_unexpected_list': [7.07027027027027]}", "\nFailed Expectation 8:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__LowQualFinSF  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 21, 'unexpected_percent': 1.797945205479452, 'unexpected_percent_total': 1.797945205479452, 'unexpected_percent_nonmissing': 1.797945205479452, 'partial_unexpected_list': [232.0, 
528.0, 397.0, 513.0, 80.0, 120.0, 80.0, 420.0, 384.0, 205.0, 234.0, 572.0, 53.0, 515.0, 390.0, 392.0, 481.0, 473.0, 144.0, 360.0]}", "\nFailed Expectation 9:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__GrLivArea  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 1, 'unexpected_percent': 0.08561643835616438, 'unexpected_percent_total': 0.08561643835616438, 'unexpected_percent_nonmissing': 0.08561643835616438, 'partial_unexpected_list': [6.451063829787234]}", "\nFailed Expectation 10:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__WoodDeckSF  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 1, 'unexpected_percent': 0.08561643835616438, 'unexpected_percent_total': 0.08561643835616438, 'unexpected_percent_nonmissing': 0.08561643835616438, 'partial_unexpected_list': [5.101190476190476]}", "\nFailed Expectation 11:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__OpenPorchSF  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 5, 'unexpected_percent': 0.4280821917808219, 'unexpected_percent_total': 0.4280821917808219, 'unexpected_percent_nonmissing': 0.4280821917808219, 'partial_unexpected_list': [6.418918918918919, 5.121621621621622, 6.702702702702703, 7.027027027027027, 5.283783783783784]}", "\nFailed Expectation 12:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__EnclosedPorch  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 160, 'unexpected_percent': 13.698630136986301, 'unexpected_percent_total': 13.698630136986301, 'unexpected_percent_nonmissing': 13.698630136986301, 'partial_unexpected_list': [164.0, 264.0, 242.0, 252.0, 192.0, 112.0, 162.0, 96.0, 81.0, 102.0, 128.0, 228.0, 184.0, 176.0, 36.0, 275.0, 158.0, 112.0, 120.0, 
192.0]}", "\nFailed Expectation 13:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__3SsnPorch  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 20, 'unexpected_percent': 1.7123287671232876, 'unexpected_percent_total': 1.7123287671232876, 'unexpected_percent_nonmissing': 1.7123287671232876, 'partial_unexpected_list': [162.0, 144.0, 180.0, 320.0, 407.0, 130.0, 216.0, 196.0, 180.0, 168.0, 290.0, 245.0, 153.0, 216.0, 168.0, 182.0, 144.0, 140.0, 508.0, 304.0]}", "\nFailed Expectation 14:  Expectation Type: expect_column_values_to_be_between  Column: numerical__numerical__ScreenPorch  Details: {'element_count': 1168, 'missing_count': 0, 'missing_percent': 0.0, 'unexpected_count': 97, 'unexpected_percent': 8.304794520547945, 'unexpected_percent_total': 8.304794520547945, 'unexpected_percent_nonmissing': 8.304794520547945, 'partial_unexpected_list': [189.0, 222.0, 155.0, 197.0, 271.0, 128.0, 260.0, 160.0, 120.0, 170.0, 216.0, 80.0, 95.0, 276.0, 116.0, 180.0, 176.0, 161.0, 224.0, 189.0]}"]