In [1]:
#!pip install great-expectations

In [2]:
import os
import pandas as pd
import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler
from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier
from great_expectations.exceptions import DataContextError

2022-01-25T14:47:05-0500 - INFO - Great Expectations logging enabled at 20 level by JupyterUX module.


  pd.set_option("display.max_colwidth", -1)


In [3]:
def _format_float_no_decimals(value):
    """Render a float with zero decimal places (e.g. 2.6 -> '3')."""
    return '%.0f' % value


# Display floats in DataFrames without a fractional part.
pd.set_option('display.float_format', _format_float_no_decimals)


  and should_run_async(code)


In [4]:
# Literal dict holding a bucket name and a file name. It is only displayed as
# the cell output; it is not bound to any variable.
{
  "bucket": "dess.work.temporal.public",
  "filename": "datos_final.csv"
}

{'bucket': 'dess.work.temporal.public', 'filename': 'datos_final.csv'}

In [5]:
from collections import Counter, defaultdict
import os
import pandas as pd
import great_expectations as ge
import great_expectations.jupyter_ux
from great_expectations.core.expectation_configuration import ExpectationConfiguration
from great_expectations.profile.user_configurable_profiler import UserConfigurableProfiler
from great_expectations.data_context.types.resource_identifiers import ExpectationSuiteIdentifier
from great_expectations.exceptions import DataContextError
from storage_service import S3Client

import warnings
# Silence every warning category from this point on.
warnings.simplefilter("ignore")


def _format_float_no_decimals(value):
    """Render a float with zero decimal places (e.g. 2.6 -> '3')."""
    return '%.0f' % value


# Display floats in DataFrames without a fractional part.
pd.set_option('display.float_format', _format_float_no_decimals)

# Module-level client used by Expectations.upload_to_s3.
s3_client = S3Client()


class Expectations:
    """Define, profile and save a Great Expectations suite for one CSV file.

    The constructor reads ``data/<filename>``, keeps the columns
    ``PatientNumber``, ``date``, ``stage`` and ``activity``, opens the data
    context and (re)creates the suite named by ``expectation_suite_name``.

    Attributes set by the constructor:
        df: the loaded data restricted to the four columns above.
        local_path: directory prefix of the suite's HTML page that
            ``upload_to_s3`` uploads and ``clear_results_folder`` removes.
        s3_bucket: bucket argument passed to ``s3_client.upload_file``.
        context: the Great Expectations data context.
        expectation_suite_name: name under which the suite is stored.
        suite: the suite object currently held by this instance.

    ``get_expectations`` additionally sets ``batch``; ``get_context`` sets
    ``suite_identifier``.
    """

    def __init__(self,filename,bucket):
        """Load the CSV and create an empty suite in the data context.

        Args:
            filename: name of a CSV file located under ``data/``.
            bucket: bucket name stored for ``upload_to_s3``.
        """
        df = ge.read_csv(f'data/{filename}')
        # NOTE(review): only these four columns are kept, yet
        # define_expectations() also declares expectations on
        # "TriagePriority" -- confirm which of the two is intended.
        df = df[['PatientNumber', 'date', 'stage', 'activity']]
        df.reset_index(drop = True, inplace = True)
        self.df = df
        self.local_path = 'great_expectations/uncommitted/data_docs/local_site/expectations/'
        self.s3_bucket = bucket
        self.context = ge.data_context.DataContext()
        self.expectation_suite_name = 'my_data_expectations'
        self.suite = self.context.create_expectation_suite(
            expectation_suite_name=self.expectation_suite_name,
            overwrite_existing=True,
        )

    def _report_path(self):
        """Return the path of the suite's HTML page under ``local_path``."""
        return self.local_path + self.expectation_suite_name + '.html'

    def _add_expectation(self, expectation_type, kwargs, note):
        """Add one expectation with a markdown note to ``self.suite``.

        Args:
            expectation_type: Great Expectations expectation name.
            kwargs: keyword arguments of the expectation.
            note: text stored under ``meta["notes"]["content"]``.
        """
        configuration = ExpectationConfiguration(
            expectation_type=expectation_type,
            kwargs=kwargs,
            meta={
                "notes": {
                    "format": "markdown",
                    "content": note
                }
            },
        )
        self.suite.add_expectation(
            expectation_configuration=configuration,
            overwrite_existing=True,
        )

    def get_context(self):
        """Save the current suite, then build and open its Data Docs page.

        Despite its name this method returns nothing; it stores the suite
        identifier on ``self.suite_identifier`` as a side effect.
        """
        self.context.save_expectation_suite(
            expectation_suite=self.suite,
            expectation_suite_name=self.expectation_suite_name,
        )
        self.suite_identifier = ExpectationSuiteIdentifier(
            expectation_suite_name=self.expectation_suite_name
        )
        self.context.build_data_docs(resource_identifiers=[self.suite_identifier])
        self.context.open_data_docs(resource_identifier=self.suite_identifier)

    def get_expectations(self):
        """Profile ``self.df`` and save the suite produced by the profiler.

        A batch is requested from the ``my_data_files_dir`` datasource and
        stored on ``self.batch``; ``UserConfigurableProfiler`` then builds a
        suite from it, which replaces ``self.suite`` and is saved under
        ``self.expectation_suite_name``. No validation run and no Data Docs
        build happen here.
        """
        df_ge = ge.from_pandas(self.df)
        batch_kwargs = {
            "datasource": 'my_data_files_dir',
            "dataset": df_ge,
        }
        # Use the configured suite name rather than repeating the literal.
        self.batch = self.context.get_batch(
            batch_kwargs=batch_kwargs,
            expectation_suite_name=self.expectation_suite_name)
        profiler = UserConfigurableProfiler(profile_dataset=self.batch)
        # NOTE(review): this rebinds self.suite, so the expectations added by
        # define_expectations() do not appear to be part of what gets saved
        # below -- confirm whether they should be merged into the result.
        self.suite = profiler.build_suite()

        # The earlier version requested a second batch here and discarded the
        # result; that call had no effect on self.batch and was dropped.

        # Saving our expectation suite
        self.context.save_expectation_suite(self.suite, self.expectation_suite_name)

    def upload_to_s3(self):
        """Upload the suite's HTML page to ``self.s3_bucket``.

        Relies on the module-level ``s3_client``.
        """
        s3_client.upload_file(self._report_path(), self.s3_bucket)

    def clear_results_folder(self):
        """Delete the suite's HTML page; ``os.remove`` raises if it is missing."""
        os.remove(self._report_path())

    def run(self):
        """Define the hand-written expectations, then profile and save.

        ``get_context``, ``upload_to_s3`` and ``clear_results_folder`` are
        currently not part of the run.

        Returns:
            True once both steps have completed.
        """
        self.define_expectations()
        self.get_expectations()
        return True

    def define_expectations(self):
        """Add the hand-written expectations to ``self.suite``.

        Registers, in order: the expected column set, one type expectation per
        column, one not-null expectation per column, and the allowed value
        sets for ``TriagePriority`` and ``activity``. Every expectation is
        added with ``overwrite_existing=True``.
        """
        # Verify column names
        self._add_expectation(
            "expect_table_columns_to_match_set",
            {
                "column_set": ["PatientNumber","date","stage","activity","TriagePriority"],
                'exact_match': True
            },
            "Column names are case sentisitive",
        )

        # Verify data types: (column, type_, note)
        # NOTE(review): "date" uses type_ "string" while the other text
        # columns use "str" -- confirm both spellings are accepted.
        column_types = (
            ("PatientNumber", 'int', "PatientNumber should be an integer value"),
            ("activity", "str", "activity should be a string value"),
            ("stage", "str", "stage should be a string value"),
            ("date", "string", "data should be a  string in yyyy-MM-dd HH:mm:ss format "),
            ("TriagePriority", "str", "TriagePriority should be a string value"),
        )
        for column, type_name, note in column_types:
            self._add_expectation(
                "expect_column_values_to_be_of_type",
                {"column": column, "type_": type_name},
                note,
            )

        # Identify missing values (each column listed exactly once)
        required_columns = ("PatientNumber", "stage", "date", "activity", "TriagePriority")
        for column in required_columns:
            self._add_expectation(
                "expect_column_values_to_not_be_null",
                {"column": column, "mostly": 1},
                "This data column cannot contain missing values.",
            )

        # Check expected values
        self._add_expectation(
            "expect_column_values_to_be_in_set",
            {
                "column": "TriagePriority",
                "value_set": ["Orange","Vert","Rouge"],
                "mostly" : 1
            },
            "Expected severity index values are:Orange,Vert,Rouge",
        )

        activity_values = [
            "TriageStartDateTime",
            "TriageEndDateTime",
            "biochimieBAStartDateTime",
            "biochimieTakeBADateTime",
            "biochimieResultBADateTime",
            "hematologieBAStartDateTime",
            "hematologieTakeBADateTime",
            "hematologieResultBADateTime",
            "coagulationBAStartDateTime",
            "coagulationTakeBADateTime",
            "coagulationResultBADateTime",
            "RXPrescriptionDateTime",
            "RXRealizationDateTime",
            "RMIRealizationDateTime",
            "RMIResultBADateTime",
            "biochimeBAStartDateTime",
            "biochimeTakeBADateTime",
            "biochimeResultBADateTime",
        ]
        # The note text (including its embedded line breaks) is kept verbatim.
        activity_note = """ Expected event values are TriageStartDateTime, TriageEndDateTime,biochimieBAStartDateTime,biochimieTakeBADateTime,
                    biochimieResultBADateTime, hematologieBAStartDateTime,  hematologieTakeBADateTime,
                                                                    hematologieResultBADateTime,  coagulationBAStartDateTime,coagulationTakeBADateTime,
                                                                    coagulationResultBADateTime, RXPrescriptionDateTime,RXRealizationDateTime,
                                                                    RMIRealizationDateTime,RMIResultBADateTime,biochimeBAStartDateTime,
                                                                    biochimeTakeBADateTime, biochimeResultBADateTime """
        self._add_expectation(
            "expect_column_values_to_be_in_set",
            {
                "column": "activity",
                "value_set": activity_values,
                "mostly" : 1
            },
            activity_note,
        )
    

In [6]:
# Build and run the expectation workflow; `x` is reused by the following cells.
x = Expectations(filename='test.csv', bucket='dess.work.temporal.public')
x.run()

    Your data context with this configuration version uses validation_operators, which are being deprecated.  Please update your configuration to be compatible with the version number 3.
2022-01-25T14:47:07-0500 - INFO - 	0 expectation(s) included in expectation_suite.


Profiling:   0%|          | 0/4 [00:00<?, ?it/s, Column=PatientNumber]

2022-01-25T14:47:07-0500 - INFO - 	21 expectation(s) included in expectation_suite.
Creating an expectation suite with the following expectations:

Table-Level Expectations
expect_table_columns_to_match_ordered_list
expect_table_row_count_to_be_between

Expectations by Column
Column Name: PatientNumber | Column Data Type: INT | Cardinality: VERY_MANY
expect_column_max_to_be_between
expect_column_mean_to_be_between
expect_column_median_to_be_between
expect_column_min_to_be_between
expect_column_proportion_of_unique_values_to_be_between
expect_column_quantile_values_to_be_between
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: activity | Column Data Type: STRING | Cardinality: VERY_FEW
expect_column_proportion_of_unique_values_to_be_between
expect_column_values_to_be_in_set
expect_column_values_to_be_in_type_list
expect_column_values_to_not_be_null


Column Name: date | Column Data Type: STRING | Cardinality: VERY_MANY
expect_column_proportion_o

True

In [7]:
# Validate the batch that get_expectations() stored on `x.batch`; the
# validation result is displayed as this cell's output.
x.batch.validate()

2022-01-25T14:47:07-0500 - INFO - 	21 expectation(s) included in expectation_suite.


{
  "results": [
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": true,
      "expectation_config": {
        "ge_cloud_id": null,
        "kwargs": {
          "column_list": [
            "PatientNumber",
            "date",
            "stage",
            "activity"
          ],
          "result_format": "BASIC"
        },
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "meta": {}
      },
      "result": {
        "observed_value": [
          "PatientNumber",
          "date",
          "stage",
          "activity"
        ]
      },
      "meta": {}
    },
    {
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      },
      "success": true,
      "expectation_config": {
        "ge_cloud_id": null,
        "kwargs": {
          "min_value": 6358,
 

In [9]:

# Open the Data Docs of the data context held by `x`.
x.context.open_data_docs()
