In [1]:
import pandas as pd
import numpy as np
from sklearn import datasets

from evidently.report import Report
from evidently.metrics import DataDriftTable
from evidently.metrics import DatasetDriftMetric

In [2]:
df = datasets.fetch_openml(name='adult', version=2, as_frame='auto').frame
df.head()

  warn(


Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,38.0,Private,89814.0,HS-grad,9.0,Married-civ-spouse,Farming-fishing,Husband,White,Male,0.0,0.0,50.0,United-States,<=50K
2,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
3,44.0,Private,160323.0,Some-college,10.0,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688.0,0.0,40.0,United-States,>50K
4,18.0,,103497.0,Some-college,10.0,Never-married,,Own-child,White,Female,0.0,0.0,30.0,United-States,<=50K


In [3]:
group1 = df[~df.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]

group2 = df[df.education.isin(['Some-college', 'HS-grad', 'Bachelors'])]

# Comparing statistics

In [4]:
group1_age_stats = group1.age.describe().to_frame('group1_age_stats')
group2_age_stats = group2.age.describe().to_frame('group2_age_stats')

group1_age_stats.join(group2_age_stats)

Unnamed: 0,group1_age_stats,group2_age_stats
count,14155.0,34687.0
mean,40.300318,37.967509
std,14.518017,13.308068
min,17.0,17.0
25%,29.0,27.0
50%,39.0,36.0
75%,50.0,47.0
max,90.0,90.0


## Using KS test

The Kolmogorov-Smirnov (K-S) test is a non-parametric test used to determine if two datasets significantly differ from each other in terms of their distributions.

Null Hypothesis (H0): The K-S test starts with the assumption that the two datasets being compared are sampled from the same distribution.

Alternative Hypothesis (H1): The alternative hypothesis suggests that the two datasets come from different distributions.


In [5]:
from scipy.stats import ks_2samp

# comparing a subset of the same group1
statistic, p_value = ks_2samp(group1.age, group1.age.sample(frac=0.1))

print(statistic)
print(p_value > 0.05)

0.03351099709232136
True


In [6]:
# comparing two groups
statistic, p_value = ks_2samp(group1.age, group2.age)

print(statistic)
print(p_value > 0.05)

0.08272413894832231
False


# Wasserstein distance

The Wasserstein distance, also known as the Earth Mover's Distance (EMD) or Kantorovich-Rubinstein metric, is another measure used to quantify the difference between two probability distributions. 

The Wasserstein distance can be interpreted as the minimum "work" or cost required to transform one distribution into the other.

In [7]:
from scipy.stats import wasserstein_distance

# same group samples
distance = wasserstein_distance(group1.age.values, group1.age.sample(frac=.5, replace=True).values)
distance

0.23067257123505178

In [8]:
from scipy.stats import wasserstein_distance

# comparing groups
distance = wasserstein_distance(group1.age.values, group2.age.values)
distance

2.690774679475269

In [9]:
weights1 = group1.age.values / np.sum(group1.age.values)  # Normalize to ensure the sum of weights is 1
weights2 = group2.age.values / np.sum(group2.age.values)

# comparing groups normed
distance = wasserstein_distance(group1.age.values, group2.age.values, u_weights=weights1, v_weights=weights2)
distance

3.0301644690805345

# Using Evidently

In [10]:
data_drift_dataset_report = Report(metrics=[

    DatasetDriftMetric(),

    DataDriftTable(),    

])

data_drift_dataset_report.run(reference_data=group1[['age']], current_data=group2[['age']])

data_drift_dataset_report

In [11]:
group1.to_csv('group1.csv', index=False)
group2.to_csv('group2.csv', index=False)

# Using Great Expectations

In [12]:
#!great_expectations init or context = gx.get_context()

In [13]:
import great_expectations as gx

In [44]:
context = gx.get_context()  #gx.data_context.DataContext('gx') #for when the init command was used

INFO:great_expectations.data_context.types.base:Created temporary directory 'C:\Users\LUCASA~1\AppData\Local\Temp\tmp3fzsclgl' for ephemeral docs site


In [45]:
# Give your Datasource a name
datasource_name = 'validator'
datasource = context.sources.add_pandas(datasource_name)

# Give your first Asset a name
asset_name = 'group1'
path_to_data = 'group1.csv'

asset = datasource.add_csv_asset(asset_name, filepath_or_buffer=path_to_data)

# Build batch request
batch_request = asset.build_batch_request()

In [46]:
context.add_or_update_expectation_suite("my_expectation_suite")

{
  "expectation_suite_name": "my_expectation_suite",
  "ge_cloud_id": null,
  "expectations": [],
  "data_asset_type": null,
  "meta": {
    "great_expectations_version": "0.17.22"
  }
}

In [47]:
validator = context.get_validator(
    batch_request=batch_request,
    expectation_suite_name="my_expectation_suite",
)
validator.head()

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,class
0,25.0,Private,226802.0,11th,7.0,Never-married,Machine-op-inspct,Own-child,Black,Male,0.0,0.0,40.0,United-States,<=50K
1,28.0,Local-gov,336951.0,Assoc-acdm,12.0,Married-civ-spouse,Protective-serv,Husband,White,Male,0.0,0.0,40.0,United-States,>50K
2,34.0,Private,198693.0,10th,6.0,Never-married,Other-service,Not-in-family,White,Male,0.0,0.0,30.0,United-States,<=50K
3,63.0,Self-emp-not-inc,104626.0,Prof-school,15.0,Married-civ-spouse,Prof-specialty,Husband,White,Male,3103.0,0.0,32.0,United-States,>50K
4,55.0,Private,104996.0,7th-8th,4.0,Married-civ-spouse,Craft-repair,Husband,White,Male,0.0,0.0,10.0,United-States,<=50K


In [48]:
#from great_expectations_experimental.expectations import expect_column_wasserstein_distance_to_be_less_than 

In [18]:
validator.expect_column_values_to_not_be_null("age")


`result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.




Calculating Metrics:   0%|          | 0/6 [00:00<?, ?it/s]

{
  "success": true,
  "result": {
    "element_count": 14155,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": []
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [19]:
validator.expect_column_values_to_be_between("age", auto=True)





Generating Expectations:   0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/2 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/1 [00:00<?, ?it/s]

Profiling Dataset:         0%|          | 0/1 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]

Calculating Metrics:   0%|          | 0/4 [00:00<?, ?it/s]


`result_format` configured at the Validator-level will not be persisted. Please add the configuration to your Checkpoint config or checkpoint_run() method instead.




Calculating Metrics:   0%|          | 0/8 [00:00<?, ?it/s]

{
  "success": true,
  "expectation_config": {
    "expectation_type": "expect_column_values_to_be_between",
    "kwargs": {
      "column": "age",
      "min_value": 17.0,
      "max_value": 90.0,
      "mostly": 1.0,
      "strict_min": false,
      "strict_max": false
    },
    "meta": {
      "auto_generated_at": "20231019T134840.601525Z",
      "great_expectations_version": "0.17.22"
    }
  },
  "result": {
    "element_count": 14155,
    "unexpected_count": 0,
    "unexpected_percent": 0.0,
    "partial_unexpected_list": [],
    "missing_count": 0,
    "missing_percent": 0.0,
    "unexpected_percent_total": 0.0,
    "unexpected_percent_nonmissing": 0.0
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [20]:
validator.save_expectation_suite()

## Defining a Checkpoint and examining the data

In [21]:
# Give your Datasource a name
datasource_name_new = 'newdata'
datasource_new = context.sources.add_pandas(datasource_name_new)

# Give your Asset a name
asset_name_new = 'group2'
path_to_data_new = 'group2.csv'

asset_new = datasource_new.add_csv_asset(asset_name_new, filepath_or_buffer=path_to_data_new)

# Build batch request
batch_request_new = asset_new.build_batch_request()

In [23]:
checkpoint = context.add_or_update_checkpoint(
    name="my_checkpoint",
    validations=[
        {
            "batch_request": batch_request_new,
            "expectation_suite_name": "my_expectation_suite"
        }
    ]
)

In [24]:
checkpoint_result = checkpoint.run()

Calculating Metrics:   0%|          | 0/13 [00:00<?, ?it/s]

In [27]:
checkpoint_result.validation_result_url

In [26]:
context.build_data_docs() #all good with the null and range expectations

{'local_site': 'file://C:\\Users\\LUCASA~1\\AppData\\Local\\Temp\\tmp62a3p9w_\\index.html'}