# 07 - `Deepchecks` for `Data validation` and `Model validation`

__Goal__:
1. Data validation with an `html` output
2. Data validation with a `boolean` output
3. Model validation with an `html` output
4. Model validation with a `boolean` output

### Import

In [1]:
%load_ext autoreload
%autoreload 2
import joblib
import os
import shutil

import pandas as pd
from pathlib import Path

from deepchecks.tabular import Dataset as DeepChecksDataset
from deepchecks.tabular.suites import data_integrity
from deepchecks.tabular.suites import model_evaluation

from weather.data.prep_datasets import (
    Dataset,
    transform_dataset_and_create_target,
    prepare_binary_classification_tabular_data,
)
from weather.transformers.skl_transformer_makers import (
    make_dataset_ingestion_transformer,
    make_predictors_feature_engineering_transformer,
    make_remove_horizonless_rows_transformer,
    make_target_creation_transformer,
)
from weather.pipelines.definitions import (
    feature_names,
    metric,
    oldnames_newnames_dict,
    target_choice,
)

### Directory paths

In [2]:
# Raw data is read from `../data`, resolved against the current working directory.
data_dir = Path('../data')
# Deepchecks outputs go under `./deepchecks`; the folder is created if it does not exist yet.
deepchecks_dir = Path('./deepchecks')
deepchecks_dir.mkdir(exist_ok=True)

In [3]:
# Model artifacts are looked up under `<parent of cwd>/models/random_forest`.
# Both directories are created when absent; `exist_ok=True` keeps re-runs from failing.
models_dir = Path.cwd().parent / "models"
models_dir.mkdir(exist_ok=True)
model_subdir  = models_dir / "random_forest"
model_subdir.mkdir(exist_ok=True)

### Utilities

In [4]:
def validate_ingested_data(ingested_df, feature_names, target_choice):
    """Check `ingested_df` with the deepchecks data integrity suite.

    Args:
        ingested_df: Ingested data, handed as-is to the deepchecks `Dataset` constructor.
        feature_names: Object exposing `numerical` and `categorical` sequences of column names.
        target_choice: Object whose `input_name` column is appended to the features and is
            the only column declared categorical.

    Returns:
        The outcome of `passed()` on the suite result: True if all tests pass, False otherwise.
    """
    target_input_column = target_choice.input_name
    # NOTE(review): columns in `feature_names.categorical` are passed as features but are
    # not declared in `cat_features` — confirm this is intended.
    suite_input = DeepChecksDataset(
        ingested_df,
        features=feature_names.numerical + feature_names.categorical + [target_input_column],
        cat_features=[target_input_column],
    )
    # Build the default integrity suite, run it, and reduce the result to a boolean.
    return data_integrity().run(suite_input).passed()

def validate_model(dataset, trained_model, trained_predictors_feature_engineering_transformer, excluded_check=5):
    """Run the deepchecks model evaluation suite on the train/test splits of `dataset`.

    By default the check at index 5 is removed from the suite before it runs; in the suite
    listing displayed in this notebook that index is `WeakSegmentsPerformance`.
    NOTE(review): the index is positional, so it may point at a different check with
    another deepchecks version — confirm against the displayed suite.

    Args:
        dataset: Object exposing `train_x`, `train_y`, `test_x` and `test_y`.
        trained_model: Fitted model handed to the suite.
        trained_predictors_feature_engineering_transformer: Fitted transformer whose
            `transform` is applied to the train and test predictors.
        excluded_check: Index of the check to remove from the suite. Pass `None` to run
            the complete suite.

    Returns:
        The outcome of `passed()` on the suite result: True if all tests pass, False otherwise.
    """
    # Populate train_ds (no column is declared categorical)
    X_train = trained_predictors_feature_engineering_transformer.transform(dataset.train_x)
    y_train = dataset.train_y
    train_ds = DeepChecksDataset(X_train, label=y_train, cat_features=[])
    # Populate test_ds with the same transformer, so both splits share one feature space
    X_test = trained_predictors_feature_engineering_transformer.transform(dataset.test_x)
    y_test = dataset.test_y
    test_ds = DeepChecksDataset(X_test, label=y_test, cat_features=[])
    # Run model validation suite, optionally without one check
    evaluation_suite = model_evaluation()
    if excluded_check is not None:
        evaluation_suite = evaluation_suite.remove(excluded_check)
    results = evaluation_suite.run(train_ds, test_ds, trained_model)
    return results.passed()

# 1. Data validation with an `html` output

### Load raw data

In [5]:
# Read the raw development CSV from `data_dir` and print its column/dtype summary.
csv_file_name = 'weather_dataset_raw_development.csv'
df = pd.read_csv(data_dir / csv_file_name)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43848 entries, 0 to 43847
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   S_No                    43848 non-null  int64  
 1   Timestamp               43848 non-null  object 
 2   Location                43848 non-null  object 
 3   Temperature_C           43848 non-null  float64
 4   Apparent_Temperature_C  43848 non-null  float64
 5   Humidity                43848 non-null  float64
 6   Wind_speed_kmph         43848 non-null  float64
 7   Wind_bearing_degrees    43848 non-null  int64  
 8   Visibility_km           43848 non-null  float64
 9   Pressure_millibars      43848 non-null  float64
 10  Weather_conditions      43843 non-null  object 
dtypes: float64(6), int64(2), object(3)
memory usage: 3.7+ MB


###  Ingest

In [6]:
# Build the ingestion transformer from the project-level `target_choice` and
# `oldnames_newnames_dict` (presumably the column-rename mapping — confirm),
# apply it to the raw frame and preview the first ingested row.
dataset_ingestion_transformer = make_dataset_ingestion_transformer(target_choice, oldnames_newnames_dict)
ingested_df = dataset_ingestion_transformer.transform(df)
ingested_df.head(1)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2006-01-01 00:00:00+00:00,1.161111,0.85,16.6152,139,9.9015,1016.15,rain


### Transform into a `deepchecks.Dataset` object

In [7]:
# Columns handed to deepchecks as features; `Weather` is the only one declared categorical.
features = ["Temperature", "Humidity", "Wind_speed",  "Wind_bearing", "Visibility", "Pressure", "Weather"]
cat_features = ["Weather"] # must be a subset of features

# `ds_name` is reused further down to name the results subdirectory.
ds_name="ingested_data" 
ds = DeepChecksDataset(
    ingested_df,
    features=features,
    cat_features=cat_features,
)

### Check data integrity suite

In [8]:
# Instantiate the default data integrity suite; the bare name displays its checks and conditions.
integrity_suite = data_integrity()
integrity_suite

Data Integrity Suite: [
	0: IsSingleValue
		Conditions:
			0: Does not contain only a single value
	1: SpecialCharacters
		Conditions:
			0: Ratio of samples containing solely special character is less or equal to 0.1%
	2: MixedNulls
		Conditions:
			0: Number of different null types is less or equal to 1
	3: MixedDataTypes
		Conditions:
			0: Rare data types in column are either more than 10% or less than 1% of the data
	4: StringMismatch
		Conditions:
			0: No string variants
	5: DataDuplicates
		Conditions:
			0: Duplicate data ratio is less or equal to 5%
	6: StringLengthOutOfBounds
		Conditions:
			0: Ratio of string length outliers is less or equal to 0%
	7: ConflictingLabels
		Conditions:
			0: Ambiguous sample ratio is less or equal to 0%
	8: OutlierSampleDetection
	9: FeatureLabelCorrelation(ppscore_params={}, random_state=42)
		Conditions:
			0: Features' Predictive Power Score is less than 0.8
	10: FeatureFeatureCorrelation
		Conditions:
			0: Not more than 0 pairs are corre

### Set results directory

In [9]:
# Results for this dataset go in `<deepchecks_dir>/<ds_name>`; the parent folder
# was already created in the directory-paths cell.
deepchecks_subdir = deepchecks_dir / ds_name
deepchecks_subdir.mkdir(exist_ok=True)
print(deepchecks_subdir)

deepchecks\ingested_data


### Data Validation

In [10]:
# Run the integrity suite on the deepchecks dataset and keep the suite result for inspection.
results = integrity_suite.run(ds)

In [11]:
# Collect the public (non-underscore) attribute names of the suite result, one per line.
results_attributes = "\n".join(
    name for name in dir(results) if not name.startswith("_")
)
print(f"Attributes of `results`:\n\n{results_attributes}")

Attributes of `results`:

extra_info
failures
from_json
get_not_passed_checks
get_not_ran_checks
get_passed_checks
html_serializer
ipython_serializer
name
passed
results
results_with_conditions
results_with_display
results_without_conditions
results_without_display
save_as_cml_markdown
save_as_html
select_results
show
show_in_iframe
show_in_window
show_not_interactive
to_json
to_wandb
to_widget
widget_serializer


### Export results

In [12]:
# Export is currently disabled. When enabled, the report is saved as
# `integrity_suite_output.html` and then moved into `deepchecks_subdir`.
# NOTE(review): this section announces an `html` output, so the export probably
# should run — confirm before re-enabling.
#results.save_as_html("integrity_suite_output.html")
#shutil.move("integrity_suite_output.html", deepchecks_subdir)

# 2. Data validation with a `boolean` output

### Load raw data and ingest it

In [13]:
# Repeat the load and ingest steps of section 1: read the raw CSV, rebuild the
# ingestion transformer, apply it and preview the first ingested row.
csv_file_name = 'weather_dataset_raw_development.csv'
df = pd.read_csv(data_dir / csv_file_name)
dataset_ingestion_transformer = make_dataset_ingestion_transformer(target_choice, oldnames_newnames_dict)
ingested_df = dataset_ingestion_transformer.transform(df)
ingested_df.head(1)

Unnamed: 0,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather
2006-01-01 00:00:00+00:00,1.161111,0.85,16.6152,139,9.9015,1016.15,rain


### Data Validation

In [14]:
# `validate_ingested_data` reduces the whole integrity suite outcome to a single boolean.
data_integrity_suite_passed = validate_ingested_data(ingested_df, feature_names, target_choice)
print(f"The data integrity suite passed: {data_integrity_suite_passed}")

The data integrity suite passed: True


# 3. Model validation with an `html` output

### Load raw data, ingest, transform and split it

In [15]:
# Read the raw development CSV
csv_file_name = 'weather_dataset_raw_development.csv'
df = pd.read_csv(data_dir / csv_file_name)

# Ingest and transform
# NOTE: `dataset_ingestion_transformer` is not created in this cell; it is the one
# built in the earlier ingest cells, so those must have run first.
remove_horizonless_rows_transformer = make_remove_horizonless_rows_transformer(target_choice)
target_creation_transformer = make_target_creation_transformer(target_choice)
transformed_data, created_target = transform_dataset_and_create_target(
    df,   
    dataset_ingestion_transformer,
    remove_horizonless_rows_transformer,
    target_creation_transformer,
)

# Split: `dataset` exposes the `train_x`/`train_y`/`test_x`/`test_y` attributes used below
dataset = prepare_binary_classification_tabular_data(
    transformed_data,
    created_target,
)

### Load trained model and fitted transformer

In [16]:
# Load the persisted model and the fitted feature-engineering pipeline from `model_subdir`.
# NOTE: `joblib.load` unpickles the files, which can execute arbitrary code —
# only load artifacts from a trusted source.
trained_model = joblib.load(model_subdir / "model.pkl")
trained_predictors_feature_engineering_transformer = joblib.load(model_subdir / "predictors_feature_eng_pipeline.pkl")

### Populate `train_ds` and `test_ds`

In [17]:
# Populate train_ds: transformed training predictors + training target,
# with no column declared categorical
X_train = trained_predictors_feature_engineering_transformer.transform(dataset.train_x)
y_train = dataset.train_y
train_ds = DeepChecksDataset(X_train, label=y_train, cat_features=[])

# Populate test_ds the same way, reusing the already-fitted transformer (transform only)
X_test = trained_predictors_feature_engineering_transformer.transform(dataset.test_x)
y_test = dataset.test_y
test_ds = DeepChecksDataset(X_test, label=y_test, cat_features=[])

### Check model evaluation suite

In [18]:
# Instantiate the default model evaluation suite; the bare name displays its checks
# together with their indices (the index is what `validate_model` uses to drop a check).
evaluation_suite = model_evaluation()
evaluation_suite

Model Evaluation Suite: [
	0: TrainTestPerformance
		Conditions:
			0: Train-Test scores relative degradation is less than 0.1
	1: RocReport
		Conditions:
			0: AUC score for all the classes is greater than 0.7
	2: ConfusionMatrixReport
	3: PredictionDrift
		Conditions:
			0: Prediction drift score < 0.15
	4: SimpleModelComparison
		Conditions:
			0: Model performance gain over simple model is greater than 10%
	5: WeakSegmentsPerformance(n_to_show=5)
		Conditions:
			0: The relative performance of weakest segment is greater than 80% of average model performance.
	6: CalibrationScore
	7: RegressionErrorDistribution
		Conditions:
			0: Kurtosis value higher than -0.1
			1: Systematic error ratio lower than 0.01
	8: UnusedFeatures
		Conditions:
			0: Number of high variance unused features is less or equal to 5
	9: BoostingOverfit
		Conditions:
			0: Test score over iterations is less than 5% from the best score
	10: ModelInferenceTime
		Conditions:
			0: Average model inference time for 

### Model Validation

In [19]:
# Run the full evaluation suite (no check removed) on the train/test datasets and the model.
# Note that this rebinds `results`, previously the data integrity suite result.
results = evaluation_suite.run(train_ds, test_ds, trained_model)

In [20]:
# Collect the public (non-underscore) attribute names of the suite result, one per line.
results_attributes = "\n".join(
    name for name in dir(results) if not name.startswith("_")
)
print(f"Attributes of `results`:\n\n{results_attributes}")

Attributes of `results`:

extra_info
failures
from_json
get_not_passed_checks
get_not_ran_checks
get_passed_checks
html_serializer
ipython_serializer
name
passed
results
results_with_conditions
results_with_display
results_without_conditions
results_without_display
save_as_cml_markdown
save_as_html
select_results
show
show_in_iframe
show_in_window
show_not_interactive
to_json
to_wandb
to_widget
widget_serializer


In [21]:
# Display the checks that did not pass; in the output below these are the
# weak-segments performance results for the train and test datasets.
results.get_not_passed_checks()

[Weak Segments Performance - Train Dataset: {'weak_segments_list':   Accuracy Score          Feature1  \
 0       0.650619  num__Temperature   
 5       0.865618     num__Pressure   
 
                                Feature1 Range Feature2 Feature2 Range  \
 0  (-1.4711160063743591, -1.1533008217811584)                    None   
 5                   (1.0168207883834839, inf)                    None   
 
   % of Data                                 Samples in Segment  
 0      7.27  [2008-11-28 08:00:00+00:00, 2009-02-13 03:00:0...  
 5     13.99  [2008-01-23 06:00:00+00:00, 2006-12-28 09:00:0...  , 'avg_score': 0.915},
 Weak Segments Performance - Test Dataset: {'weak_segments_list':   Accuracy Score          Feature1                             Feature1 Range  \
 0       0.695332  num__Temperature  (-1.518832504749298, -1.1656718254089355)   
 
   Feature2 Feature2 Range % of Data  \
 0                    None      9.29   
 
                                   Samples in Segment  
 0

# 4. Model validation with a `boolean` output

### Load raw data, ingest, transform and split it

In [22]:
# Read the raw development CSV (same steps as in section 3)
csv_file_name = 'weather_dataset_raw_development.csv'
df = pd.read_csv(data_dir / csv_file_name)

# Ingest and transform
# NOTE: `dataset_ingestion_transformer` is not created in this cell; it is the one
# built in the earlier ingest cells, so those must have run first.
remove_horizonless_rows_transformer = make_remove_horizonless_rows_transformer(target_choice)
target_creation_transformer = make_target_creation_transformer(target_choice)
transformed_data, created_target = transform_dataset_and_create_target(
    df,   
    dataset_ingestion_transformer,
    remove_horizonless_rows_transformer,
    target_creation_transformer,
)

# Split: `dataset` is the object passed to `validate_model` below
dataset = prepare_binary_classification_tabular_data(
    transformed_data,
    created_target,
)

### Load trained model and fitted transformer

In [23]:
# Load the persisted model and the fitted feature-engineering pipeline from `model_subdir`.
# NOTE: `joblib.load` unpickles the files, which can execute arbitrary code —
# only load artifacts from a trusted source.
trained_model = joblib.load(model_subdir / "model.pkl")
trained_predictors_feature_engineering_transformer = joblib.load(model_subdir / "predictors_feature_eng_pipeline.pkl")

### Model Validation

In [24]:
# `validate_model` is called with its default `excluded_check=5`, i.e. the suite runs
# without the check listed at index 5 (`WeakSegmentsPerformance` in the listing above),
# and its outcome is reduced to a single boolean.
model_validation_suite_passed = validate_model(dataset, trained_model, trained_predictors_feature_engineering_transformer)
print(f"The model validation suite minus 'WeakSegmentsPerformance' passed: {model_validation_suite_passed}")

The model validation suite minus 'WeekSegmentPerformance' passed: True
