### To test data quality using Greate expectation

In [20]:
import os
import pandas as pd
import great_expectations as gx
from great_expectations.dataset import PandasDataset
from datetime import datetime
import json
from great_expectations import DataContext


In [2]:
data = pd.read_csv("../raw-data/data_split_17.csv")

df = pd.DataFrame(data)

In [3]:
df.head()

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,M21660,M,301.1,France,1310,48.8,103
1,L53981,L,301.1,310.7,speed_error,48.2,106
2,M21662,M,301.1,310.7,1621,31.0,108
3,M21663,M,300.9,310.7,1367,43.4,111
4,L53984,L,301.0,310.6,1588,33.7,114


In [4]:
# Wrap the DataFrame in Great Expectations
ge_df = PandasDataset(df)


In [23]:
ge_df.expect_table_columns_to_match_ordered_list(
    ["Product ID", "Type", "Air temperature [K]", "Process temperature [K]", 
     "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"]
)
ge_df.expect_column_values_to_not_be_null("Product ID")
ge_df.expect_column_values_to_be_unique("Product ID")
ge_df.expect_column_values_to_match_regex("Product ID", "^[LMH]\\d+$")
ge_df.expect_column_values_to_not_be_null("Type")
# 2. Expect Type to be one of the predefined categories
ge_df.expect_column_values_to_be_in_set(
    column="Type",
    value_set=["L", "M", "H"]
)
# 3. Expect Air temperature [K] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Air temperature [K]",
    min_value=293,
    max_value=313,
    mostly=0.95  # Allow for up to 5% exceptions
)
# 4. Expect Process temperature [K] to be within a realistic range (numeric only)
ge_df.expect_column_values_to_be_of_type(column="Process temperature [K]", type_="float")
ge_df.expect_column_values_to_not_be_null(column="Air temperature [K]", mostly=0.9)
ge_df.expect_column_values_to_be_between(column="Air temperature [K]", min_value=298.0, max_value=310.0, mostly=0.9)
ge_df.expect_column_values_to_not_be_null(column="Rotational speed [rpm]", mostly=0.9)
# 5. Expect Rotational speed [rpm] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Rotational speed [rpm]",
    min_value=500,
    max_value=10000,
    mostly=0.98  # Allow for up to 2% exceptions
)
ge_df.expect_column_values_to_not_be_null(column="Torque [Nm]", mostly=0.9)
# 6. Expect Torque [Nm] to be within a realistic range
ge_df.expect_column_values_to_be_between(
    column="Torque [Nm]",
    min_value=10,
    max_value=80,
    mostly=0.95
)
ge_df.expect_column_values_to_not_be_null(column="Tool wear [min]", mostly=0.9)

# 8. Expect no null values in required columns
required_columns = [
    "Product ID", 
    "Type", 
    "Air temperature [K]", 
    "Process temperature [K]", 
    "Rotational speed [rpm]"
]

for col in required_columns:
    ge_df.expect_column_values_to_not_be_null(column=col)

# 10. Custom handling for errors (e.g., "error" values in Tool wear [min])
ge_df.expect_column_values_to_not_match_regex(
    column="Tool wear [min]",
    regex=r"error"
)


{
  "success": false,
  "result": {
    "element_count": 420,
    "missing_count": 34,
    "missing_percent": 8.095238095238095,
    "unexpected_count": 37,
    "unexpected_percent": 9.585492227979273,
    "unexpected_percent_total": 8.80952380952381,
    "unexpected_percent_nonmissing": 9.585492227979273,
    "partial_unexpected_list": [
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error",
      "error"
    ]
  },
  "meta": {},
  "exception_info": {
    "raised_exception": false,
    "exception_traceback": null,
    "exception_message": null
  }
}

In [5]:
# Show validation results
results = ge_df.validate()
print(type(results))

<class 'great_expectations.core.expectation_validation_result.ExpectationSuiteValidationResult'>


In [21]:
import json

# Validate the dataset
results = ge_df.validate()

# Convert results to a JSON-serializable dictionary
results_dict = results.to_json_dict()

# Save to a JSON file
with open("validation_results.json", "w") as f:
    json.dump(results_dict, f)

print("Validation results saved successfully.")



Validation results saved successfully.


In [22]:
# Load validation results
with open("validation_results.json", "r") as f:
    saved_results = json.load(f)

# Use the saved results for logging or comparisons
print(saved_results)

# Apply the expectation suite to a new dataset

new_df = pd.read_csv("../raw-data/data_split_7.csv")
ge_new_df = ge.from_pandas(new_df)

# Validate the new dataset (ensure you have the expectation suite set up)
validation_results = ge_new_df.validate()
print(validation_results)



{'success': False, 'results': [{'success': True, 'expectation_config': {'expectation_type': 'expect_table_columns_to_match_ordered_list', 'kwargs': {'column_list': ['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]'], 'result_format': 'BASIC'}, 'meta': {}}, 'result': {'observed_value': ['Product ID', 'Type', 'Air temperature [K]', 'Process temperature [K]', 'Rotational speed [rpm]', 'Torque [Nm]', 'Tool wear [min]']}, 'meta': {}, 'exception_info': {'raised_exception': False, 'exception_message': None, 'exception_traceback': None}}, {'success': True, 'expectation_config': {'expectation_type': 'expect_column_values_to_not_be_null', 'kwargs': {'column': 'Product ID', 'result_format': 'BASIC'}, 'meta': {}}, 'result': {'element_count': 420, 'unexpected_count': 0, 'unexpected_percent': 0.0, 'unexpected_percent_total': 0.0, 'partial_unexpected_list': []}, 'meta': {}, 'exception_info': {'raised_exception': False, '

In [6]:
results

{
  "success": true,
  "results": [],
  "evaluation_parameters": {},
  "statistics": {
    "evaluated_expectations": 0,
    "successful_expectations": 0,
    "unsuccessful_expectations": 0,
    "success_percent": null
  },
  "meta": {
    "great_expectations_version": "0.18.13",
    "expectation_suite_name": "default",
    "run_id": {
      "run_name": null,
      "run_time": "2025-02-03T09:50:37.910235+01:00"
    },
    "batch_kwargs": {
      "ge_batch_id": "e9c08cbc-e20b-11ef-8a73-a864f1e2a1ee"
    },
    "batch_markers": {},
    "batch_parameters": {},
    "validation_time": "20250203T085037.910188Z",
    "expectation_suite_meta": {
      "great_expectations_version": "0.18.13"
    }
  }
}

In [7]:
def extract_validation_statistics(file_name, ge_df, required_columns, suite, error_threshold=0.9):
    """
    Extract statistics from the validation results and return them for database insertion.

    Parameters:
    ge_df (Great Expectations DataFrame): The dataframe with expectations.
    file_name (str): The name of the file being processed.
    required_columns (list): List of required columns to check for null values.
    suite (Great Expectations Expectation Suite): The loaded expectation suite.
    error_threshold (float): The threshold for considering errors (default is 0.9 for 90%).

    Returns:
    dict: A dictionary containing the validation statistics for the file.
    """
    # Run the validation
    validation_results = ge_df.validate(expectation_suite=suite)

    # Extract the underlying pandas DataFrame
    # ge_df = ge_df.dataframe

    # Get total rows
    total_rows = len(ge_df)

    # Initialize counters for valid and invalid rows
    valid_rows = 0
    invalid_rows = 0
    error_details = {}

    # Loop through each validation result and accumulate errors
    for result in validation_results["results"]:
        if not result["success"]:
            invalid_rows += 1
            # Extract error details
            expectation_name = result["expectation_config"]["expectation_type"]
            error_details[expectation_name] = error_details.get(expectation_name, 0) + 1

    valid_rows = total_rows - invalid_rows

    # Calculate error rate
    error_rate = (invalid_rows / total_rows) * 100 if total_rows > 0 else 0

    # Check for missing columns (non-null requirement) based on the expected threshold
    missing_columns = []
    for col in required_columns:
        column_valid = ge_df[col].notnull().mean() >= error_threshold
        if not column_valid:
            missing_columns.append(col)

    # Prepare the statistics for the database
    stats = {
        "id": None,  # Placeholder for database auto-generated ID (can be handled by DB)
        "file_name": file_name,
        "total_rows": total_rows,
        "valid_rows": valid_rows,
        "invalid_rows": invalid_rows,
        "error_rate": error_rate,
        "error_details": json.dumps(error_details),  # Convert error details to JSON string
        "processed_at": datetime.now().isoformat(),  # Store current timestamp in ISO format
        "missing_columns": missing_columns,
    }

    return stats


In [8]:
required_columns = [
    "Product ID", "Type", "Air temperature [K]", "Process temperature [K]", 
     "Rotational speed [rpm]", "Torque [Nm]", "Tool wear [min]"
]

In [10]:
def check_criticality(validation_result):
    # Initialize variables for error rate calculation
    total_expectations = len(validation_result.get("results", []))
    #print(total_expectations)
    failed_expectations = sum(1 for result in validation_result.get("results", []) if not result.get("success", True))
    # print(f"Ivalid_expectaion_count ---> {failed_expectations}")
    error_rate = (failed_expectations / total_expectations) * 100 if total_expectations > 0 else 0
    # print(error_rate)
    # Determine overall criticality based on error rate
    if error_rate >= 50:
        overall_criticality = "High"
    elif 20 <= error_rate < 50:
        overall_criticality = "Medium"
    else:
        overall_criticality = "Low"
    
    error_stat_pot = [overall_criticality, failed_expectations, error_rate]
    
    return error_stat_pot

In [11]:
directory_path = "../raw-data"

val_stat = pd.DataFrame(columns=['file_name', 'criticality_level', 'failed_expection_count', 'error_rate'])
# Loop through each file in the directory
for file_name in os.listdir(directory_path):
    if file_name.endswith(".csv"):  # Process only CSV files
        # Build the full file path
        file_path = os.path.join(directory_path, file_name)
        
        # Read the CSV file into a pandas DataFrame
        new_df = pd.read_csv(file_path)
        
        # Convert the pandas DataFrame to a Great Expectations DataFrame
        ge_df = ge.from_pandas(new_df)
        # print(ge_df.head())  # Display the first few rows as an example
        
        context = DataContext()
        
        suite = context.get_expectation_suite("milling_machine_data_quality")
        validation_result = ge_df.validate(expectation_suite=suite)
        
        erro_stat_pot = check_criticality(validation_result=validation_result)
        val_stat.loc[len(val_stat)] = [file_name, erro_stat_pot[0], erro_stat_pot[1], erro_stat_pot[2]]
        # print(f"Processing file: {file_name}")
        # print(f"with c leve ---> {c_level}")
        
        

In [12]:
val_stat

Unnamed: 0,file_name,criticality_level,failed_expection_count,error_rate
0,data_split_24.csv,Low,2,12.5
1,data_split_14.csv,Low,0,0.0
2,data_split_8.csv,Medium,7,43.75
3,data_split_21.csv,High,9,56.25
4,data_split_23.csv,High,10,62.5
5,data_split_9.csv,Low,2,12.5
6,data_split_15.csv,High,9,56.25
7,data_split_11.csv,Medium,7,43.75
8,data_split_18.csv,Medium,4,25.0
9,data_split_12.csv,Low,0,0.0


In [13]:
file_name = "data_split_9.csv"
new_df = pd.read_csv("../raw-data/data_split_9.csv")
ge_9_df = ge.from_pandas(new_df)
ge_9_df

Unnamed: 0,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
0,XYZ123,L,301.7,310.5,1523,34.7,181
1,L50781,L,301.7,310.5,1377,43.1,183
2,M18462,M,301.7,310.6,1417,45.2,185
3,L50783,L,301.7,310.7,1542,40.0,188
4,L50784,L,301.8,310.8,1557,33.3,190
...,...,...,...,...,...,...,...
395,L51175,L,302.3,311.3,1621,31.6,135
396,M18856,M,302.3,311.3,1625,33.1,137
397,L51177,L,302.2,311.1,1394,53.7,140
398,L51178,L,302.1,311.0,1586,35.1,142


In [15]:
context = DataContext()        
suite = context.get_expectation_suite("milling_machine_data_quality")
validation_result_9 = ge_9_df.validate(expectation_suite=suite)

In [16]:
validation_result_9

{
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "kwargs": {
          "column_list": [
            "Product ID",
            "Type",
            "Air temperature [K]",
            "Process temperature [K]",
            "Rotational speed [rpm]",
            "Torque [Nm]",
            "Tool wear [min]"
          ]
        },
        "meta": {}
      },
      "result": {
        "observed_value": [
          "Product ID",
          "Type",
          "Air temperature [K]",
          "Process temperature [K]",
          "Rotational speed [rpm]",
          "Torque [Nm]",
          "Tool wear [min]"
        ]
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": true,
      "expectation_config": {
        "expectation_type":

In [None]:
import great_expectations as gx

context = gx.get_context()

datasource = context.get_datasource("milling_machine_data")
print("Available Data Connectors:", datasource.data_connectors.keys())
print(datasource.get_available_data_asset_names())

Available Data Connectors: dict_keys(['default_inferred_data_connector_name', 'default_runtime_data_connector_name'])
{'default_inferred_data_connector_name': ['data_split_9.csv', 'good_data', 'data_split_4.csv', 'data_split_3.csv', 'data_split_10.csv', 'data_split_5.csv', 'data_split_14.csv', 'data_split_13.csv', 'data_split_0.csv', 'data_split_15.csv', 'data_split_2.csv', 'data_split_1.csv', 'data_split_12.csv', 'data_split_17.csv', 'data_split_8.csv', 'data_split_24.csv', 'data_split_18.csv', 'data_split_7.csv', 'data_split_23.csv', 'bad_data', 'data_split_19.csv', 'data_split_11.csv', 'data_split_6.csv', 'data_split_21.csv', 'data_split_16.csv', 'data_split_22.csv', 'data_split_20.csv'], 'default_runtime_data_connector_name': ['my_runtime_asset_name']}


In [88]:
# batch_request = {
#     "datasource_name": "milling_machine_data",
#     "data_connector_name": "default_inferred_data_connector_name",  # Use the name from the previous step
#     "data_asset_name": "data_split_16.csv"  # Replace with the actual asset name
# }
from great_expectations.core.batch import BatchRequest
batch_request = BatchRequest(
    datasource_name="milling_machine_data",  # Replace with actual datasource name
    data_connector_name="default_inferred_data_connector_name",  # Replace with actual data connector name
    data_asset_name="data_split_16.csv",  # Replace with actual data asset
)

In [92]:
GREAT_EXPECTATION = '/home/kuzhalogi/WorkSpace/EquipmentFailurePred/gx'
SUITE_NAME = "milling_machine_data_quality"
suite = context.get_expectation_suite(SUITE_NAME)
print(suite)
validator = context.get_validator(batch_request=batch_request, expectation_suite=suite)

{
  "expectation_suite_name": "milling_machine_data_quality",
  "ge_cloud_id": null,
  "expectations": [
    {
      "expectation_type": "expect_table_columns_to_match_ordered_list",
      "kwargs": {
        "column_list": [
          "Product ID",
          "Type",
          "Air temperature [K]",
          "Process temperature [K]",
          "Rotational speed [rpm]",
          "Torque [Nm]",
          "Tool wear [min]"
        ]
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_not_be_null",
      "kwargs": {
        "column": "Product ID"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_be_unique",
      "kwargs": {
        "column": "Product ID"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_column_values_to_match_regex",
      "kwargs": {
        "column": "Product ID",
        "regex": "^[LMH]\\d+$"
      },
      "meta": {}
    },
    {
      "expectation_type": "expect_c

## Validation by each row

In [90]:
import great_expectations as gx
import pandas as pd

def validate_data_row_by_row(file_path: str, validator):
    df = pd.read_csv(file_path)  # Read the CSV file
    error_indices = []  # Store indices of erroneous rows
    
    for idx, row in df.iterrows():
        row_df = pd.DataFrame([row])  # Convert single row to DataFrame
        validation_result = validator.validate(row_df)
        
        # Ensure validation_result contains errors
        if not validation_result.success:
            error_info = validation_result.to_json_dict()  # Convert result to a dictionary
            
            # Check if there are any failed expectations
            if "results" in error_info:
                for result in error_info["results"]:
                    if not result.get("success", True):
                        error_indices.append(idx)  # Store index of failed row
                        break  # Stop checking further for this row

    return error_indices

# # Example usage
# file_path = "../raw-data/data_split_9.csv"
# error_rows = validate_data_row_by_row(file_path, validator)
# print("Rows with errors:", error_rows)


In [93]:
rows_with_errors = validate_data_row_by_row("../raw-data/data_split_16.csv",validator)

ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to validate using the provided value for expectation suite; does it need to be loaded from a dictionary?
ERROR:great_expectations.validator.validator:Unable to val

In [None]:
def validate_data(file_path: str):
    context = gx.data_context.DataContext(GREAT_EXPECTATION)
    suite = context.get_expectation_suite(SUITE_NAME)
    df = gx.read_csv(file_path)
    
    results = df.validate(expectation_suite=suite)
    capsule = {'results': results, 'file_path': file_path, 'errors': [],'stats':[]}
    if not results["success"]:
        for result in results["results"]:
            if not result["success"]:
                error = {
                    "expectation": result["expectation_config"],
                    "unexpected_value": result["result"].get("unexpected_list", [])
                }
                capsule['errors'].append(error)
    return capsule


In [28]:
def validate_data(file_path: str):
    context = gx.data_context.DataContext(GREAT_EXPECTATION)
    suite = context.get_expectation_suite(SUITE_NAME)
    df = gx.read_csv(file_path)
    
    results = df.validate(expectation_suite=suite)
    capsule = {'results': results, 'file_path': file_path, 'errors': [],'stats':[]}
    if not results["success"]:
        for result in results["results"]:
            if not result["success"]:
                error = {
                    "expectation": result["expectation_config"],
                    "unexpected_value": result["result"].get("unexpected_list", [])
                }
                capsule['errors'].append(error)
    return capsule


In [37]:
val_stat_9 = validate_data("../raw-data/data_split_11.csv")




In [38]:
val_stat_9

{'results': {
   "success": false,
   "results": [
     {
       "success": true,
       "expectation_config": {
         "expectation_type": "expect_table_columns_to_match_ordered_list",
         "kwargs": {
           "column_list": [
             "Product ID",
             "Type",
             "Air temperature [K]",
             "Process temperature [K]",
             "Rotational speed [rpm]",
             "Torque [Nm]",
             "Tool wear [min]"
           ]
         },
         "meta": {}
       },
       "result": {
         "observed_value": [
           "Product ID",
           "Type",
           "Air temperature [K]",
           "Process temperature [K]",
           "Rotational speed [rpm]",
           "Torque [Nm]",
           "Tool wear [min]"
         ]
       },
       "meta": {},
       "exception_info": {
         "raised_exception": false,
         "exception_message": null,
         "exception_traceback": null
       }
     },
     {
       "success": false,
     

### Utils.py

In [51]:
import os
import pandas as pd
import logging

def create_directories(base_dir: str, good_data_dir: str, bad_data_dir: str):
    """
    Create 'good_data' and 'bad_data' directories if they don't exist.
    """
    os.makedirs(good_data_dir, exist_ok=True)
    os.makedirs(bad_data_dir, exist_ok=True)
    print(f"Created directories: {good_data_dir}, {bad_data_dir}")

def move_file(file_path: str, target_dir: str):
    """
    Move a file to the specified target directory.
    """
    file_name = os.path.basename(file_path)
    new_path = os.path.join(target_dir, file_name)
    os.rename(file_path, new_path)
    print(f"File moved to {new_path}.")

def split_file(df: pd.DataFrame, error_indices: set, file_name: str, good_data_dir: str, bad_data_dir: str):
    """
    Split the DataFrame into good and bad rows, and save them to respective directories.
    """
    # Convert set to list for indexing
    error_indices_list = list(error_indices)
    
    good_rows = df.drop(error_indices_list)
    bad_rows = df.loc[error_indices_list]

    # Save good rows to good_data
    good_file_path = os.path.join(good_data_dir, file_name)
    good_rows.to_csv(good_file_path, index=False)
    print(f"Good rows saved to {good_file_path}.")

    # Save bad rows to bad_data
    bad_file_name = f"bad_{file_name}"
    bad_file_path = os.path.join(bad_data_dir, bad_file_name)
    bad_rows.to_csv(bad_file_path, index=False)
    print(f"Bad rows saved to {bad_file_path}.")

def validate_row(row: pd.Series, suite: gx.core.ExpectationSuite) -> bool:
    """
    Validate a single row against the Great Expectations suite.
    Returns True if the row passes all expectations, False otherwise.
    """
    row_df = row.to_frame().T  # Convert row to a single-row DataFrame
    validation_result = row_df.validate(expectation_suite=suite)
    return validation_result["success"]

def get_error_indices(df: pd.DataFrame, suite: gx.core.ExpectationSuite) -> set:
    """
    Iterate through the dataset row by row, apply GX checks, and store indices of failed rows.
    """
    error_indices = set()

    for index, row in df.iterrows():
        if not validate_row(row, suite):
            error_indices.add(index)

    return error_indices

### actual dag

In [54]:
def split_and_save_files(capsule):
    file_path = capsule["file_path"]
    df = pd.read_csv(file_path)

    # Load the Great Expectations suite
    context = gx.data_context.DataContext(GREAT_EXPECTATION)
    suite = context.get_expectation_suite(SUITE_NAME)

    # Determine the file name and base directory
    file_name = os.path.basename(file_path)
    base_dir = os.path.dirname(file_path)
    good_data_dir = os.path.join(base_dir, "good_data")
    bad_data_dir = os.path.join(base_dir, "bad_data")

    # Create directories if they don't exist
    create_directories(base_dir, good_data_dir, bad_data_dir)

    # Get indices of rows with errors
    error_indices = get_error_indices(df, suite)

    if not error_indices:
        # No errors: Move the file to good_data
        move_file(file_path, good_data_dir)
    elif len(error_indices) == len(df):
        # All rows have errors: Move the file to bad_data
        move_file(file_path, bad_data_dir)
    else:
        # Some rows have errors: Split the file
        split_file(df, error_indices, file_name, good_data_dir, bad_data_dir)

        # Remove the original file after splitting
        os.remove(file_path)
        print(f"Original file {file_name} removed after splitting.")

In [55]:
split_and_save_files(val_stat_9)




Created directories: ../raw-data/good_data, ../raw-data/bad_data


AttributeError: 'DataFrame' object has no attribute 'validate'

In [44]:
print(val_stat_9)

{'results': {
  "success": false,
  "results": [
    {
      "success": true,
      "expectation_config": {
        "expectation_type": "expect_table_columns_to_match_ordered_list",
        "kwargs": {
          "column_list": [
            "Product ID",
            "Type",
            "Air temperature [K]",
            "Process temperature [K]",
            "Rotational speed [rpm]",
            "Torque [Nm]",
            "Tool wear [min]"
          ]
        },
        "meta": {}
      },
      "result": {
        "observed_value": [
          "Product ID",
          "Type",
          "Air temperature [K]",
          "Process temperature [K]",
          "Rotational speed [rpm]",
          "Torque [Nm]",
          "Tool wear [min]"
        ]
      },
      "meta": {},
      "exception_info": {
        "raised_exception": false,
        "exception_message": null,
        "exception_traceback": null
      }
    },
    {
      "success": false,
      "expectation_config": {
        "expec

In [56]:
def validate_data(file_path: str):
    context = gx.data_context.DataContext(GREAT_EXPECTATION)
    suite = context.get_expectation_suite(SUITE_NAME)
    df = pd.read_csv(file_path)  # Load DataFrame manually
    
    invalid_rows = set()  # Store indices of bad rows
    error_details = []  # Store detailed error info per row
    
    # Loop through each row and validate columns one by one
    for idx, row in df.iterrows():
        row_errors = []  # Track errors for this row
        for expectation in suite.expectations:
            column = expectation.kwargs.get("column")
            expectation_type = expectation.expectation_type
            
            # Skip if column is not in data
            if column not in df.columns:
                continue  
            
            # Validate the specific row-column value
            value = row[column]
            validation_result = context.run_validation_operator(
                "action_list_operator",
                assets_to_validate=[gx.dataset.PandasDataset(pd.DataFrame({column: [value]}))],
                run_id=f"row_{idx}"
            )
            
            # Check if validation failed
            if not validation_result["success"]:
                row_errors.append({
                    "column": column,
                    "expectation": expectation_type,
                    "value": value
                })

        # If the row has errors, store its index
        if row_errors:
            invalid_rows.add(idx)
            error_details.append({"row": idx, "errors": row_errors})

    # Prepare capsule result
    capsule = {
        "file_path": file_path,
        "errors": error_details,
        "invalid_rows": list(invalid_rows)  # Convert set to list for JSON compatibility
    }

    return capsule


In [57]:
cap=validate_data("../raw-data/data_split_11.csv")




DataContextError: No validation operator `action_list_operator` was found in your project. Please verify this in your great_expectations.yml

In [None]:
def split_and_save_files(capsule):
    file_path = capsule["file_path"]
    df = pd.read_csv(file_path)
    invalid_rows = set(capsule.get("invalid_rows", []))  # Convert to set

    # Separate good and bad data
    bad_data = df.iloc[list(invalid_rows)]  # Rows that failed validation
    good_data = df.drop(index=list(invalid_rows))  # Remaining valid rows

    # Define save paths
    base_name = os.path.basename(file_path)
    good_file_path = os.path.join(GOOD_DATA_DIR, f"good_{base_name}")
    bad_file_path = os.path.join(BAD_DATA_DIR, f"bad_{base_name}")

    # Save files
    if not good_data.empty:
        good_data.to_csv(good_file_path, index=False)
        logging.info(f"✅ Good data saved: {good_file_path}")

    if not bad_data.empty:
        bad_data.to_csv(bad_file_path, index=False)
        logging.info(f"❌ Bad data saved: {bad_file_path}")

    return {"good_file": good_file_path, "bad_file": bad_file_path}
