In [1]:
import great_expectations as ge
import json
import pandas as pd
from urllib.request import urlopen

In [2]:
# Read the labeled projects and their tags, join the two tables on `id`,
# and wrap the merged frame in a Great Expectations PandasDataset so that
# expect_* methods can be called on it directly.
projects = pd.read_csv(
    "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/projects.csv"
)
tags = pd.read_csv(
    "https://raw.githubusercontent.com/GokuMohandas/Made-With-ML/main/datasets/tags.csv"
)
df = ge.dataset.PandasDataset(projects.merge(tags, on="id"))
print(f"{len(df)} projects")
df.head()  # preview the first 5 rows (the default for head)

955 projects


Unnamed: 0,id,created_on,title,description,tag
0,6,2020-02-20 06:43:18,Comparison between YOLO and RCNN on real world...,Bringing theory to experiment is cool. We can ...,computer-vision
1,7,2020-02-20 06:47:21,"Show, Infer & Tell: Contextual Inference for C...",The beauty of the work lies in the way it arch...,computer-vision
2,9,2020-02-24 16:24:45,Awesome Graph Classification,"A collection of important graph embedding, cla...",graph-learning
3,15,2020-02-28 23:55:26,Awesome Monte Carlo Tree Search,A curated list of Monte Carlo tree search pape...,reinforcement-learning
4,19,2020-03-03 13:54:31,Diffusion to Vector,Reference implementation of Diffusion2Vec (Com...,graph-learning


In [None]:
"""
Other expectations that can also be used:
1. Column pair values where a > b
2. Table row count
3. Column mean between a and b

Good practice:
1. Start with table-level expectations
2. Proceed to column-level expectations
"""
# Table-level check: the columns must be exactly these, in this order
df.expect_table_columns_to_match_ordered_list(
    column_list=[
        "id",
        "created_on",
        "title",
        "description",
        "tag",
    ]
)

# Each (title, description) combination should appear only once (detect data leaks!)
df.expect_compound_columns_to_be_unique(
    column_list=["title", "description"],
)

# Every row must carry a tag (no missing values)
df.expect_column_values_to_not_be_null(column="tag")

# Project ids must not repeat
df.expect_column_values_to_be_unique(column="id")

# Titles are expected to be strings
df.expect_column_values_to_be_of_type(column="title", type_="str")

# Tags are restricted to a fixed list of allowed categories.
# NOTE: this rebinds `tags`, which held the DataFrame read from tags.csv
# earlier in the notebook, to a plain list of strings.
tags = [
    "computer-vision",
    "graph-learning",
    "reinforcement-learning",
    "natural-language-processing",
    "mlops",
    "time-series",
]
df.expect_column_values_to_be_in_set(column="tag", value_set=tags)

In [5]:
# Expectation suite: gather the expectations established on df so far;
# discard_failed_expectations=False asks for failed ones to be kept as well.
expectation_suite = df.get_expectation_suite(discard_failed_expectations=False)

# Run the whole suite against df and print the report; with
# only_return_failures=True the "results" list holds only failing expectations.
validation_report = df.validate(
    expectation_suite=expectation_suite,
    only_return_failures=True,
)
print(validation_report)

{
  "evaluation_parameters": {},
  "success": true,
  "results": [],
  "meta": {
    "great_expectations_version": "0.15.15",
    "expectation_suite_name": "default",
    "run_id": {
      "run_name": null,
      "run_time": "2023-03-08T16:23:26.865853+00:00"
    },
    "batch_kwargs": {
      "ge_batch_id": "e3914e63-bdcb-11ed-8ec1-68545a16436a"
    },
    "batch_markers": {},
    "batch_parameters": {},
    "validation_time": "20230308T162326.865853Z",
    "expectation_suite_meta": {
      "great_expectations_version": "0.15.15"
    }
  },
  "statistics": {
    "evaluated_expectations": 6,
    "successful_expectations": 6,
    "unsuccessful_expectations": 0,
    "success_percent": 100.0
  }
}
