In [13]:
import great_expectations as ge
import pandas as pd
import logging
from typing import Dict, Tuple, Any

In [2]:
df = pd.read_csv("../data/01_raw/house-pricing.csv")

In [3]:
df.head()


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [4]:
gdf = ge.from_pandas(df)

In [11]:
gdf.expect_column_values_to_not_be_null("Id")


{
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  },
  "result": {
    "element_count": 1460,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "success": true
}

In [12]:
gdf.validate()

{
  "evaluation_parameters": {},
  "results": [
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "kwargs": {
          "column": "Id",
          "result_format": "BASIC"
        },
        "expectation_type": "expect_column_values_to_not_be_null",
        "meta": {}
      },
      "result": {
        "element_count": 1460,
        "unexpected_count": 0,
        "unexpected_percent": 0.0,
        "unexpected_percent_total": 0.0,
        "partial_unexpected_list": []
      },
      "meta": {},
      "success": true
    },
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "kwargs": {
          "column": "LotFrontage",
          "min_value": 0,
          "max_value": 200,
          "result_format": "BASIC"
        },
      

In [None]:
def preprocess_data(X_train: pd.DataFrame, X_test: pd.DataFrame,
                    parameters: Dict[str, Any]) -> Tuple[pd.DataFrame, Dict, Dict]:

In [15]:
def check_nulls(df: pd.DataFrame) -> Tuple[pd.DataFrame, Dict]:
    """Check for nulls in the dataframe.
    Args:
    --
        df (pd.DataFrame): Dataframe to check for nulls.
    Returns:
    --
        df (pd.DataFrame): Dataframe with nulls removed.
        describe_to_dict (dict): Description of the dataframe.
    """
    
    gdf = ge.from_pandas(df)
    gdf.expect_column_values_to_not_be_null("Id")
    return gdf

In [19]:
import yaml

In [28]:
with open('../conf/base/parameters/data_quality.yml') as f:
    parameters = yaml.load(f, Loader=yaml.loader.SafeLoader)

In [29]:
parameters

{'num_quality_ranges': {'max': 5, 'min': -5}}

In [30]:
df.head()



Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [33]:
def check_ranges(df: pd.DataFrame, parameters : Dict[str, Any]) -> Tuple[pd.DataFrame, Dict]:
    """Check for set of itens in categorcial variables.
    Args:
    --
        df (pd.DataFrame): Dataframe to check for nulls.
    Returns:
    --
        df (pd.DataFrame): Dataframe with nulls removed.
        describe_to_dict (dict): Description of the dataframe.
    """
    
    num_cols = df.select_dtypes(include=['number']).columns
    ranges = parameters["num_quality_ranges"]
    gdf = ge.from_pandas(df)
    for column in num_cols:
        print(column)
        gdf.expect_column_values_to_be_between(column,ranges['min'],ranges['max'])
    return gdf.validate()

In [34]:
check_ranges(df,parameters)

Id
MSSubClass
LotFrontage
LotArea
OverallQual
OverallCond
YearBuilt
YearRemodAdd
MasVnrArea
BsmtFinSF1
BsmtFinSF2
BsmtUnfSF
TotalBsmtSF
1stFlrSF
2ndFlrSF
LowQualFinSF
GrLivArea
BsmtFullBath
BsmtHalfBath
FullBath
HalfBath
BedroomAbvGr
KitchenAbvGr
TotRmsAbvGrd
Fireplaces
GarageYrBlt
GarageCars
GarageArea
WoodDeckSF
OpenPorchSF
EnclosedPorch
3SsnPorch
ScreenPorch
PoolArea
MiscVal
MoSold
YrSold
SalePrice


{
  "evaluation_parameters": {},
  "results": [
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "expectation_config": {
        "kwargs": {
          "column": "Id",
          "min_value": -5,
          "max_value": 5,
          "result_format": "BASIC"
        },
        "expectation_type": "expect_column_values_to_be_between",
        "meta": {}
      },
      "result": {
        "element_count": 1460,
        "missing_count": 0,
        "missing_percent": 0.0,
        "unexpected_count": 1455,
        "unexpected_percent": 99.65753424657534,
        "unexpected_percent_total": 99.65753424657534,
        "unexpected_percent_nonmissing": 99.65753424657534,
        "partial_unexpected_list": [
          6,
          7,
          8,
          9,
          10,
          11,
          12,
          13,
          14,
          15,
          16,
          17,
          18,
          19,


In [8]:
test_results = gdf.validate(expectation_suite="data_expectations.json")

AttributeError: 'PandasDataset' object has no attribute 'checkpoint'