### To test data quality using Greate expectation

In [92]:
import os
import pandas as pd
import great_expectations as ge
from great_expectations.dataset import PandasDataset
from datetime import datetime
import json


In [2]:
data = pd.read_csv("../raw-data/data_split_17.csv")

df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,L54320,L,300.5,309.9,1382.0,46.5,134
1,M22001,M,300.4,France,1356.0,44.4,136
2,H36556,H,300.3,309.7,1452.0,,error
3,H36557,H,,309.7,1582.0,,144
4,M22004,M,300.4,309.8,1517.0,40.3,


In [4]:
# Wrap the DataFrame in Great Expectations
ge_df = PandasDataset(df)


In [23]:
ge_df.expect_table_columns_to_match_ordered_list(
    ["Product ID", "Type", "Air temperature [K]", "Process temperature [K]", 
     "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]
)
ge_df.expect_column_values_to_not_be_null("Product ID")
ge_df.expect_column_values_to_be_unique("Product ID")
ge_df.expect_column_values_to_match_regex("Product ID", "^[LMH]\\d+$")
ge_df.expect_column_values_to_not_be_null("Type")
# 2. Expect Type to be one of the predefined categories
ge_df.expect_column_values_to_be_in_set(
    column="Type",
    value_set=["L", "M", "H"]
)
# 3. Expect Air temperature [K] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Air temperature [K]",
    min_value=293,
    max_value=313,
    mostly=0.95  # Allow for up to 5% exceptions
)
# 4. Expect Process temperature [K] to be within a realistic range (numeric only)
ge_df.expect_column_values_to_be_of_type(column="Process temperature [K]", type_="float")
ge_df.expect_column_values_to_not_be_null(column="Air temperature [K]", mostly=0.9)
ge_df.expect_column_values_to_be_between(column="Air temperature [K]", min_value=298.0, max_value=310.0, mostly=0.9)
ge_df.expect_column_values_to_not_be_null(column="Rotational speed [rpm]", mostly=0.9)
# 5. Expect Rotational speed [rpm] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Rotational speed [rpm]",
    min_value=500,
    max_value=10000,
    mostly=0.98  # Allow for up to 2% exceptions
)
ge_df.expect_column_values_to_not_be_null(column="Torque [Nm]", mostly=0.9)
# 6. Expect Torque [Nm] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Torque [Nm]",
    min_value=10,
    max_value=80,
    mostly=0.95
)
ge_df.expect_column_values_to_not_be_null(column="Tool wear [min]", mostly=0.9)

# 8. Expect no null values in required columns
required_columns = [
    "Product ID", 
    "Type", 
    "Air temperature [K]", 
    "Process temperature [K]", 
    "Rotational speed [rpm]"
]

for col in required_columns:
    ge_df.expect_column_values_to_not_be_null(column=col)

# 10. Custom handling for errors (e.g., "error" values in Tool wear [min])
ge_df.expect_column_values_to_not_match_regex(
    column="Tool wear [min]",
    regex=r"error"
)


{
  "success": false,
  "result": {
    "element_count": 420,
    "missing_count": 34,
    "missing_percent": 8.095238095238095,
    "unexpected_count": 37,
    "unexpected_percent": 9.585492227979273,
    "unexpected_percent_total": 8.80952380952381,
    "unexpected_percent_nonmissing": 9.585492227979273,
    "partial_unexpected_list": [
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [18]:
# Show validation results
results = ge_df.validate()
print(type(results))

<class 'great_expectations.core.expectation_validation_result.ExpectationSuiteValidationResult'>


In [21]:
import json

# Validate the dataset
results = ge_df.validate()

# Convert results to a JSON-serializable dictionary
results_dict = results.to_json_dict()

# Save to a JSON file
with open("validation_results.json", "w") as f:
    json.dump(results_dict, f)

print("Validation results saved successfully.")



Validation results saved successfully.


In [22]:
# Load validation results
with open("validation_results.json", "r") as f:
    saved_results = json.load(f)

# Use the saved results for logging or comparisons
print(saved_results)

# Apply the expectation suite to a new dataset

new_df = pd.read_csv("../raw-data/data_split_7.csv")
ge_new_df = ge.from_pandas(new_df)

# Validate the new dataset (ensure you have the expectation suite set up)
validation_results = ge_new_df.validate()
print(validation_results)



{'success': False, 'results': [{'success': True, 'expectation_config': {'expectation_type': 'expect_table_columns_to_match_ordered_list', 'kwargs': {'column_list': ['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]'], 'result_format': 'BASIC'}, 'meta': {}}, 'result': {'observed_value': ['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']}, 'meta': {}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}, {'success': True, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'Product ID', 'result_format': 'BASIC'}, 'meta': {}}, 'result': {'element_count': 420, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_total': 0.0, 'partial_unexpected_list': []}, 'meta': {}, 'exception_info': {'raised_exception': False, '

In [29]:
results

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "kwargs": {
          "column_list": [
            "Product ID",
            "Type",
            "Air temperature [K]",
            "Process temperature [K]",
            "Rotational speed [rpm]",
            "Torque [Nm]",
            "Tool wear [min]"
          ],
          "result_format": "BASIC"
        },
        "meta": {}
      },
      "result": {
        "observed_value": [
          "Product ID",
          "Type",
          "Air temperature [K]",
          "Process temperature [K]",
          "Rotational speed [rpm]",
          "Torque [Nm]",
          "Tool wear [min]"
        ]
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_co

In [71]:
def extract_validation_statistics(file_name, ge_df, required_columns, suite, error_threshold=0.9):
    """
    Extract statistics from the validation results and return them for database insertion.

    Parameters:
    ge_df (Great Expectations DataFrame): The dataframe with expectations.
    file_name (str): The name of the file being processed.
    required_columns (list): List of required columns to check for null values.
    suite (Great Expectations Expectation Suite): The loaded expectation suite.
    error_threshold (float): The threshold for considering errors (default is 0.9 for 90%).

    Returns:
    dict: A dictionary containing the validation statistics for the file.
    """
    # Run the validation
    validation_results = ge_df.validate(expectation_suite=suite)

    # Extract the underlying pandas DataFrame
    # ge_df = ge_df.dataframe

    # Get total rows
    total_rows = len(ge_df)

    # Initialize counters for valid and invalid rows
    valid_rows = 0
    invalid_rows = 0
    error_details = {}

    # Loop through each validation result and accumulate errors
    for result in validation_results["results"]:
        if not result["success"]:
            invalid_rows += 1
            # Extract error details
            expectation_name = result["expectation_config"]["expectation_type"]
            error_details[expectation_name] = error_details.get(expectation_name, 0) + 1

    valid_rows = total_rows - invalid_rows

    # Calculate error rate
    error_rate = (invalid_rows / total_rows) * 100 if total_rows > 0 else 0

    # Check for missing columns (non-null requirement) based on the expected threshold
    missing_columns = []
    for col in required_columns:
        column_valid = ge_df[col].notnull().mean() >= error_threshold
        if not column_valid:
            missing_columns.append(col)

    # Prepare the statistics for the database
    stats = {
        "id": None,  # Placeholder for database auto-generated ID (can be handled by DB)
        "file_name": file_name,
        "total_rows": total_rows,
        "valid_rows": valid_rows,
        "invalid_rows": invalid_rows,
        "error_rate": error_rate,
        "error_details": json.dumps(error_details),  # Convert error details to JSON string
        "processed_at": datetime.now().isoformat(),  # Store current timestamp in ISO format
        "missing_columns": missing_columns,
    }

    return stats


In [72]:
required_columns = [
    "Product ID", "Type", "Air temperature [K]", "Process temperature [K]", 
     "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"
]

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,L51800,L,,311.3,1336.0,52.600000,
1,L51801,L,303.0,311.2,1341.0,,174
2,M19482,M,,311.1,1526.0,37.300000,176
3,L51803,L,303.0,311.1,,40.700000,179
4,M19484,M,303.0,311.1,1428.0,39.800000,181
...,...,...,...,...,...,...,...
415,M19895,M,304.0,313.2,1500.0,40.600000,error
416,L52216,L,304.0,313.2,1615.0,47.331165,127
417,M19897,M,304.1,313.2,1586.0,41.100000,129
418,L52218,L,304.1,313.3,1732.0,28.500000,


In [74]:
# Assuming you have context set up
suite_11 = context.get_expectation_suite("milling_machine_data_quality")


In [111]:
def check_criticality(validation_result):
    # Initialize variables for error rate calculation
    total_expectations = len(validation_result.get("results", []))
    #print(total_expectations)
    failed_expectations = sum(1 for result in validation_result.get("results", []) if not result.get("success", True))
    print(f"Ivalid_expectaion_count ---> {failed_expectations}")
    error_rate = (failed_expectations / total_expectations) * 100 if total_expectations > 0 else 0
    print(error_rate)
    # Determine overall criticality based on error rate
    if error_rate >= 50:
        overall_criticality = "High"
    elif 20 <= error_rate < 50:
        overall_criticality = "Medium"
    else:
        overall_criticality = "Low"
        
    return overall_criticality

In [113]:
directory_path = "../raw-data"

# Loop through each file in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".csv"):  # Process only CSV files
        # Build the full file path
        file_path = os.path.join(directory_path, file_name)
        
        # Read the CSV file into a pandas DataFrame
        new_df = pd.read_csv(file_path)
        
        # Convert the pandas DataFrame to a Great Expectations DataFrame
        ge_df = ge.from_pandas(new_df)
        # print(ge_df.head())  # Display the first few rows as an example
        
        suite = context.get_expectation_suite("milling_machine_data_quality")
        validation_result = ge_df.validate(expectation_suite=suite)
        
        c_level = check_criticality(validation_result=validation_result)
        print(f"Processing file: {file_name}")
        print(f"with c leve ---> {c_level}")
        

Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_24.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_14.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_8.csv
with c leve ---> High
Ivalid_expectaion_count ---> 13
81.25
Processing file: data_split_21.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_23.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_9.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_15.csv
with c leve ---> High
Ivalid_expectaion_count ---> 13
81.25
Processing file: data_split_11.csv
with c leve ---> High
Ivalid_expectaion_count ---> 14
87.5
Processing file: data_split_18.csv
with c leve ---> High
Ivalid_expectaion_count ---> 13
81.25
Processing file: data_split_12.csv
with c leve ---> High
Ivalid_expectaion_count ---> 13
81.25
Processing file: data

In [81]:
file_name = "data_split_9.csv"
new_df = pd.read_csv("../raw-data/data_split_9.csv")
ge_9_df = ge.from_pandas(new_df)
ge_9_df

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,L50960,L,302.4,311.1,1446.0,42.9,error
1,L50961,L,302.3,310.8,1465.0,43.8,7
2,L50962,L,302.2,310.8,1827.0,23.8,9
3,H33197,H,302.2,310.8,1397.0,47.6,11
4,L50964,L,302.2,310.8,,29.7,16
...,...,...,...,...,...,...,...
415,L51375,L,,311.0,,38.4,188
416,M19056,M,302.5,311.0,1563.0,35.6,190
417,L51377,L,302.4,311.0,1573.0,36.5,error
418,M19058,M,302.4,311.1,1432.0,44.5,195


In [82]:
val_stat_9 = extract_validation_statistics(file_name,ge_9_df,required_columns,suite_11)

In [76]:
val_stat_11

{'id': None,
 'file_name': 'data_split_11.csv',
 'total_rows': 420,
 'valid_rows': 415,
 'invalid_rows': 5,
 'error_rate': 1.1904761904761905,
 'error_details': '{"expect_column_values_to_not_be_null": 3, "expect_column_values_to_be_of_type": 1, "expect_column_values_to_not_match_regex": 1}',
 'processed_at': '2025-01-16T13:04:26.624186',
 'missing_columns': ['Tool wear [min]']}

In [79]:
val_stat_22

{'id': None,
 'file_name': 'data_split_22.csv',
 'total_rows': 420,
 'valid_rows': 414,
 'invalid_rows': 6,
 'error_rate': 1.4285714285714286,
 'error_details': '{"expect_column_values_to_be_between": 1, "expect_column_values_to_not_be_null": 3, "expect_column_values_to_be_of_type": 1, "expect_column_values_to_not_match_regex": 1}',
 'processed_at': '2025-01-16T13:05:42.766206',
 'missing_columns': ['Air temperature [K]',
  'Rotational speed [rpm]',
  'Torque [Nm]']}

In [83]:
val_stat_9

{'id': None,
 'file_name': 'data_split_9.csv',
 'total_rows': 420,
 'valid_rows': 416,
 'invalid_rows': 4,
 'error_rate': 0.9523809523809524,
 'error_details': '{"expect_column_values_to_not_be_null": 2, "expect_column_values_to_be_of_type": 1, "expect_column_values_to_not_match_regex": 1}',
 'processed_at': '2025-01-16T13:07:01.520663',
 'missing_columns': []}

In [59]:
validation_result_11 = ge_new_df.validate(expectation_suite=suite_11)


In [90]:
crit_9 = check_criticality(val_stat_9)
crit_11 = check_criticality(validation_result_11)
crit_22 = check_criticality(val_stat_22)

In [91]:
print(crit_22,crit_11,crit_9)

Medium Medium Medium


In [60]:
validation_result_11

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "kwargs": {
          "column_list": [
            "Product ID",
            "Type",
            "Air temperature [K]",
            "Process temperature [K]",
            "Rotational speed [rpm]",
            "Torque [Nm]",
            "Tool wear [min]"
          ]
        },
        "meta": {}
      },
      "result": {
        "observed_value": [
          "Product ID",
          "Type",
          "Air temperature [K]",
          "Process temperature [K]",
          "Rotational speed [rpm]",
          "Torque [Nm]",
          "Tool wear [min]"
        ]
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type":

In [85]:
data_doc_22=context.build_data_docs()

In [86]:
data_doc_22['local_site']

'file:///home/kuzhalogi/WorkSpace/EquipmentFailurePred/gx/uncommitted/data_docs/local_site/index.html'