In [None]:
import argparse
import os
# Pin the Spark version for pydeequ. It has to be in the environment before
# `import pydeequ` runs in the next cell, because pydeequ derives its Deequ
# Maven coordinate (used later as `pydeequ.deequ_maven_coord`) from it.
os.environ['SPARK_VERSION'] = '3.1'

# Respect a JAVA_HOME that is already exported by the user's shell/IDE.
# The macOS JDK 11 location below is only a fallback for machines where
# nothing is configured; it is not overwritten onto a working setup.
os.environ.setdefault(
    'JAVA_HOME',
    '/Library/Java/JavaVirtualMachines/jdk-11.0.12.jdk/Contents/Home',
)

In [None]:
import pydeequ
from pydeequ.analyzers import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import countDistinct, count, col, lit

In [None]:
# Create (or reuse) the Spark session. The Deequ jar is requested through
# `spark.jars.packages`, and the artifact named by `pydeequ.f2j_maven_coord`
# is excluded from dependency resolution.
spark = (
    SparkSession.builder
    .config("spark.jars.packages", pydeequ.deequ_maven_coord)
    .config("spark.jars.excludes", pydeequ.f2j_maven_coord)
    .getOrCreate()
)

In [None]:
# Load the input dataset from the local parquet file, then print its schema
# so the column names used by the analyzers below can be checked by eye.
parquet_reader = spark.read
df = parquet_reader.parquet('./data/data.parquet')
df.printSchema()

## PyDeequ Analyzer Research

In [None]:
# Columns whose completeness is measured, in the order the analyzers are
# registered.
completeness_columns = (
    'id',
    'vendor_code',
    'name',
    'type',
    'label',
    'price',
    'discount',
    'available_count',
    'preorder_count',
)

# (metric label, SQL predicate) pairs for the Compliance analyzers. Each
# predicate describes an out-of-range value, so these metrics track how much
# of the data matches a *bad* condition.
# The third label previously read 'availiable_count'; it now matches the
# actual column name used in its predicate.
compliance_rules = (
    ('discount less than 0', 'discount<0'),
    ('discount greater than 100', 'discount>100'),
    ('available_count less than 0', 'available_count<0'),
    ('preorder_count less than 0', 'preorder_count<0'),
)

# Register analyzers in a fixed order: row count, per-column completeness,
# range-violation checks, and finally distinctness of the `id` column.
analysis_builder = AnalysisRunner(spark).onData(df).addAnalyzer(Size())
for column_name in completeness_columns:
    analysis_builder = analysis_builder.addAnalyzer(Completeness(column_name))
for rule_label, rule_predicate in compliance_rules:
    analysis_builder = analysis_builder.addAnalyzer(
        Compliance(rule_label, rule_predicate)
    )

# `analyzer` holds the result of the run; it is passed to
# AnalyzerContext.successMetricsAsDataFrame in the next cell.
analyzer = analysis_builder.addAnalyzer(Distinctness('id')).run()

In [None]:
# Turn the metrics computed by the analysis run into a DataFrame.
success_metrics_df = AnalyzerContext.successMetricsAsDataFrame(
    spark,
    analyzer,
)

In [None]:
# Convert the metrics to pandas; left as the bare last expression of the cell
# so the notebook displays the resulting table.
success_metrics_df.toPandas()