<a href="https://colab.research.google.com/github/just-mubashir/data-validation/blob/main/data_validation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Optional one-time setup: uncomment to install the dependencies into the kernel.
# %pip install sagemaker_pyspark
# %pip install pydeequ
# %pip install pyspark  # provides SparkSession

import os
# SPARK_VERSION is set before `import pydeequ` on purpose — presumably pydeequ
# reads it at import time to pick the matching Deequ build; keep this order.
os.environ["SPARK_VERSION"]="3.3"
import pydeequ

import sagemaker_pyspark
from pyspark.sql import SparkSession, Row

# Colon-separated list of the jars bundled with sagemaker_pyspark (AWS-specific).
classpath = ":".join(sagemaker_pyspark.classpath_jars())

# Settings applied to the session builder, in order:
#   - put the AWS jars on the driver classpath
#   - pull in the Deequ artifact advertised by pydeequ
#   - exclude the f2j artifact advertised by pydeequ
spark_conf = {
    "spark.driver.extraClassPath": classpath,
    "spark.jars.packages": pydeequ.deequ_maven_coord,
    "spark.jars.excludes": pydeequ.f2j_maven_coord,
}

session_builder = SparkSession.builder
for conf_key, conf_value in spark_conf.items():
    session_builder = session_builder.config(conf_key, conf_value)

# Shared Spark session used by every later cell.
spark = session_builder.getOrCreate()

In [None]:
# Colab-only module; this cell will not import outside a Google Colab runtime.
from google.colab import files

# Opens the Colab upload dialog and keeps its return value in `uploaded`.
# NOTE(review): the following cell reads 'testData.csv' by name, so the uploaded
# file is presumably expected to carry exactly that name — confirm.
uploaded = files.upload()

In [None]:
import pandas as pd

# CSV expected in the current working directory.
file_name = 'testData.csv'

# Parse the CSV into a pandas DataFrame; later cells hand `data` to PyDeequ.
data = pd.read_csv(filepath_or_buffer=file_name)


In [None]:
from pydeequ.checks import *
from pydeequ.verification import *

# All constraints below are grouped under one Error-level check.
check = Check(spark, CheckLevel.Error, "Integrity checks")

# Declare the constraints and evaluate them against `data`.
# The verification itself is executed by .run().
checkResult = (
    VerificationSuite(spark)
    .onData(data)
    .addCheck(
        check.hasSize(lambda x: x >= 50)      # at least 50 rows
        .hasMin("price", lambda x: x > 0)     # smallest price is strictly positive
        .isComplete("status")                 # no missing values in `status`
        .isUnique("prev_sold_date")           # no repeated `prev_sold_date` values
        .isNonNegative("price")               # no negative prices
    )
    .run()
)

# Per-constraint outcome as a Spark DataFrame. It is built and shown once;
# the original cell repeated this identical conversion and output a second
# time at the end.
checkResult_df = VerificationResult.checkResultsAsDataFrame(spark, checkResult)
checkResult_df.show()

# Human-readable summary: list only the constraints that did not succeed.
if checkResult.status == "Success":
    print('Tests passed')
else:
    print('Errors found:')

    for check_json in checkResult.checkResults:
        if check_json['constraint_status'] != "Success":
            print(f"\t{check_json['constraint']} reason: {check_json['constraint_message']}")

In [None]:

import pydeequ
from pydeequ.checks import *
from pydeequ.verification import *
from pydeequ.analyzers import *
from pydeequ.anomaly_detection import *
from pydeequ.profiles import *

# Compute a handful of descriptive metrics over `data` in a single pass.
analysisResult = (
    AnalysisRunner(spark)
    .onData(data)
    .addAnalyzer(Size())                                          # row count
    .addAnalyzer(Completeness("price"))                           # share of non-missing prices
    .addAnalyzer(ApproxCountDistinct("city"))                     # approximate number of distinct cities
    .addAnalyzer(Mean("acre_lot"))                                # average lot size
    .addAnalyzer(Compliance("acre_lot", "acre_lot >= 140.0"))     # share of rows satisfying the predicate
    .addAnalyzer(Correlation("bed", "bath"))                      # correlation between bed and bath
    .run()
)

# Turn the successfully computed metrics into a Spark DataFrame and print it.
analysisResult_df = AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult)
analysisResult_df.show()

In [None]:
# Explicit imports so this cell runs on its own (the repository classes are not
# imported anywhere else in the notebook).
from pydeequ.analyzers import Size
from pydeequ.anomaly_detection import BatchNormalStrategy
from pydeequ.repository import InMemoryMetricsRepository, ResultKey
from pydeequ.verification import VerificationResult, VerificationSuite

# Detection strategy: flag the metric when it falls more than 3 standard
# deviations below/above the mean of the stored metric history.
anomaly_check = BatchNormalStrategy(lowerDeviationFactor=3.0, upperDeviationFactor=3.0, includeInterval=False)

# Anomaly checks compare the current metric with earlier runs, so the suite
# needs a metrics repository plus a key under which this run is stored.
# NOTE(review): an in-memory repository starts empty on every kernel restart,
# so expect the anomaly constraint to report a failure until earlier runs have
# been stored; use a persistent repository for real monitoring.
metrics_repository = InMemoryMetricsRepository(spark)
result_key = ResultKey(spark, ResultKey.current_milli_time(), {"dataset": "testData"})

# Run the verification suite with the anomaly check.
# addAnomalyCheck needs both the strategy and the analyzer whose metric is
# monitored. Size() (row count) is assumed here because the original cell did
# not name a metric — TODO confirm the intended metric.
anomalyResult = (
    VerificationSuite(spark)
    .onData(data)
    .useRepository(metrics_repository)
    .saveOrAppendResult(result_key)
    .addAnomalyCheck(anomaly_check, Size())
    .run()
)

# Convert the verification result to a Spark DataFrame.
anomalyResultDF = VerificationResult.checkResultsAsDataFrame(spark, anomalyResult)

# Show the results
anomalyResultDF.show()