# 07b - Check a dataframe with `Deepchecks` testing

__Goal__:

1. Load a dataframe
2. Check the dataframe with `Deepchecks`

### Import

In [10]:
import pandas as pd
import shutil
from pathlib import Path

from deepchecks.tabular import Dataset
from deepchecks.tabular.checks import OutlierSampleDetection
from deepchecks.tabular.suites import data_integrity

In [2]:
# Folder layout: <deepchecks_dir>/<dataset name>/ holds both the pickled
# dataframe that is loaded below and the HTML reports written later on.
preprocessed_dataset_name = "dev"
deepchecks_dir = Path("../deepchecks")
deepchecks_subdir = deepchecks_dir.joinpath(preprocessed_dataset_name)

In [3]:
# Preview the path of the pickled dataframe that the next section loads.
deepchecks_subdir / f"{preprocessed_dataset_name}.pkl"

WindowsPath('../deepchecks/dev/dev.pkl')

# 1. Load a dataframe

In [4]:
# Load the pickled dataframe and show its first row as a sanity check.
# NOTE(review): unpickling executes arbitrary code embedded in the file, so
# only load pickles that were produced by a trusted pipeline.
dataset_pickle_path = deepchecks_subdir / f"{preprocessed_dataset_name}.pkl"
df = pd.read_pickle(dataset_pickle_path)
df.head(1)

Unnamed: 0,Timestamp,Temperature,Humidity,Wind_speed,Wind_bearing,Visibility,Pressure,Weather,Year,Month
0,2006-01-01 00:00:00+00:00,1.161111,0.85,16.6152,139,9.9015,1016.15,rain,2006,1


# 2. Check the dataframe with `Deepchecks`

### A. Define a Dataset object

In [5]:
# Column roles handed to the Deepchecks `Dataset` wrapper.
index_name = "Timestamp"
label = "Weather"
label_type = "binary"
features = [
    "Temperature",
    "Humidity",
    "Wind_speed",
    "Wind_bearing",
    "Visibility",
    "Pressure",
    "Year",
    "Month",
]
cat_features = ["Year", "Month"]  # must be a subset of features

# NOTE(review): `name` is assigned but never passed to `Dataset` below —
# confirm whether it was meant to label the dataset in the reports.
name = "Dev"

# NOTE(review): "Timestamp" is declared as the index column while the datetime
# is taken from the dataframe index — confirm that the dataframe index really
# carries the timestamps.
set_datetime_from_dataframe_index = True

ds = Dataset(
    df,
    label=label,
    label_type=label_type,
    features=features,
    cat_features=cat_features,
    index_name=index_name,
    set_datetime_from_dataframe_index=set_datetime_from_dataframe_index,
)

### B. Run the full suite of `Deepchecks`

In [6]:
# Left disabled: the call below references `ds_train`, `ds_test` and `rf_clf`,
# none of which are defined in the visible cells of this notebook.
# from deepchecks.tabular.suites import full_suite

# suite = full_suite()

# suite.run(train_dataset=ds_train, test_dataset=ds_test, model=rf_clf)

### C. Run the integrity suite of `Deepchecks`

#### Set and edit the suite:

In [7]:
# Build the built-in data-integrity suite; the bare name on the last line
# displays the checks and conditions the suite contains.
integrity_suite = data_integrity()
integrity_suite

Data Integrity Suite: [
	0: IsSingleValue
		Conditions:
			0: Does not contain only a single value
	1: SpecialCharacters
		Conditions:
			0: Ratio of samples containing solely special character is less or equal to 0.1%
	2: MixedNulls
		Conditions:
			0: Number of different null types is less or equal to 1
	3: MixedDataTypes
		Conditions:
			0: Rare data types in column are either more than 10% or less than 1% of the data
	4: StringMismatch
		Conditions:
			0: No string variants
	5: DataDuplicates
		Conditions:
			0: Duplicate data ratio is less or equal to 5%
	6: StringLengthOutOfBounds
		Conditions:
			0: Ratio of string length outliers is less or equal to 0%
	7: ConflictingLabels
		Conditions:
			0: Ambiguous sample ratio is less or equal to 0%
	8: OutlierSampleDetection
	9: FeatureLabelCorrelation(ppscore_params={}, random_state=42)
		Conditions:
			0: Features' Predictive Power Score is less than 0.8
	10: FeatureFeatureCorrelation
		Conditions:
			0: Not more than 0 pairs are corre

#### Run the suite:

In [8]:
# Run the integrity suite on the `Dataset` object `ds` and keep the results
# so they can be saved below.
results = integrity_suite.run(ds)

#### Save the results as an HTML file

In [9]:
# Write the report straight into the output folder. The previous version saved
# it to the working directory and then called `shutil.move` into the folder,
# which raises `shutil.Error` as soon as a report with the same name already
# exists there — i.e. on every re-run of the notebook.
integrity_report_path = deepchecks_subdir / "integrity_suite_output.html"

# Drop the report from an earlier run so the file name stays stable and the
# cell can be re-executed safely.
if integrity_report_path.exists():
    integrity_report_path.unlink()

results.save_as_html(str(integrity_report_path))

# Display where the report was written.
str(integrity_report_path)

'..\\deepchecks\\dev\\integrity_suite_output.html'

### D. Run `outlier sample detection` of `Deepchecks`

#### Run the check:

In [12]:
# Parameters of the outlier check, kept together so they are easy to tune.
outlier_check_params = {
    "nearest_neighbors_percent": 0.01,
    "extent_parameter": 3,
}

# Configure the check, then run it on the same `Dataset` object as the suite.
check = OutlierSampleDetection(**outlier_check_params)
result = check.run(ds)

#### Save the results as an HTML file

In [13]:
# Write the report straight into the output folder. The previous version saved
# it to the working directory and then called `shutil.move` into the folder,
# which raises `shutil.Error` as soon as a report with the same name already
# exists there — i.e. on every re-run of the notebook.
outlier_report_path = deepchecks_subdir / "outlier_sample_detection_output.html"

# Drop the report from an earlier run so the file name stays stable and the
# cell can be re-executed safely.
if outlier_report_path.exists():
    outlier_report_path.unlink()

result.save_as_html(str(outlier_report_path))

# Display where the report was written.
str(outlier_report_path)

'..\\deepchecks\\dev\\outlier_sample_detection_output.html'