In [0]:
####################
#CONFIG & ENV SETUP
####################
# Appeal state targeted by this notebook; it is interpolated into the audit and gold paths.
AppealState = "paymentPending"

# Environment settings are read from a JSON config file on DBFS.
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first row once: DataFrame.first() is a Spark action, so calling it per field
# re-runs the read for every value extracted.
config_row = config.first()
if config_row is None:
    # first() returns None for an empty DataFrame; fail with a clear message instead of a
    # "'NoneType' object is not subscriptable" TypeError.
    raise ValueError("dbfs:/configs/config.json contains no rows; cannot determine 'env' and 'lz_key'")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

print(f"env_code: {lz_key}")  # This won't be redacted
print(f"env_name: {env_name}")  # This won't be redacted

# Key Vault-backed secret scope name, derived from the landing-zone key and environment.
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
print(f"KeyVault_name: {KeyVault_name}")

# Service principal credentials
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")

# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"
 
 
# Configure OAuth (service-principal client-credentials) access to every storage account
# this notebook touches. All accounts take exactly the same five settings, so they are
# applied in a loop rather than as one copy-pasted stanza per account.
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

for storage_account in (
    curated_storage,     # curated storage (Delta tables)
    checkpoint_storage,  # checkpoint storage
    raw_storage,         # raw storage
    landing_storage,     # landing storage
    external_storage,    # external storage
):
    # Every setting key is suffixed with the account's ADLS Gen2 (dfs) host name.
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
    spark.conf.set(
        f"fs.azure.account.oauth.provider.type.{account_host}",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    )
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", oauth_token_endpoint)
 

# Setting variables for use in subsequent cells.
# The bronze, silver and gold containers all sit in the curated storage account, so the
# paths are built from `curated_storage` rather than re-deriving the account name in
# every f-string. The resulting strings are unchanged.
curated_host = f"{curated_storage}.dfs.core.windows.net"
appeals_dir = "ARIADM/ACTIVE/CCD/APPEALS"

bronze_path = f"abfss://bronze@{curated_host}/{appeals_dir}/"
silver_path = f"abfss://silver@{curated_host}/{appeals_dir}/"
# Audit output is a sub-folder of the silver path, one folder per appeal state.
audit_path = f"{silver_path}AUDIT/{AppealState}"
gold_path = f"abfss://gold@{curated_host}/{appeals_dir}/{AppealState}"

# Print all variables
variables = {
    "bronze_path": bronze_path,
    "silver_path": silver_path,
    "audit_path": audit_path,
    "gold_path": gold_path,
    "key_vault": KeyVault_name,
    "AppealState": AppealState,
}

display(variables)


import pandas as pd
import os
import re

from functools import reduce
from pyspark.sql import SparkSession, Row
from pyspark.sql.functions import col, to_date, coalesce, greatest, lit, explode, date_add, current_date, count, monotonically_increasing_id, array_contains, count, explode, year, month, dayofmonth, when, first, countDistinct, collect_list, concat_ws, array, size, lower
from pyspark.sql.window import Window
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DateType,
    ArrayType,
)
import json
# from docx import Document
# from docx.shared import Inches

In [0]:
### Setup Complete

In [0]:
#####################
#Used during development to quickly reload the helper libraries, as they don't auto-refresh when edited
#####################
import importlib
import functions.parquet_helper as pqhelper
import functions.sql_helper as sqlhelper
import functions.compare_data_helper as datahelper
import reporting.sql_to_parquet_html_report as html
import reporting.csv_report as csv

# Re-import each project helper module so edits made since the first import take effect
# without restarting the kernel. Order matches the import order above.
for _helper_module in (pqhelper, sqlhelper, datahelper, html, csv):
    importlib.reload(_helper_module)

In [0]:
#######################
#TEST 1 - MAIN TEST - Source (SQL) to Target(parquet)
# - Tests -
#- Check Schema
#- Check Row Counts
#- Check Row Hash (all data matches exactly)
#######################

from datetime import datetime
import os
from models.test_result import TestResult
import functions.parquet_helper as pqhelper
import functions.sql_helper as sqlhelper
import functions.compare_data_helper as datahelper
import reporting.sql_to_parquet_html_report as html
import reporting.csv_report as csv


###################
#SETUP / CONFIG
###################

# Root of the Parquet export; the main loop appends "<table>/" to this for each SQL table.
# NOTE(review): the storage account is hard-coded to "ingest00landingstg", whereas the setup
# cell configures OAuth for f"ingest{lz_key}landing{env_name}" - confirm the two agree in
# every environment this notebook is run in.
PARQUET_BASE_PATH = "abfss://landing@ingest00landingstg.dfs.core.windows.net/SQLServer/Sales/IRIS/dbo/"

# Accumulates one TestResult per check performed, across all tables.
results = []

# Setup SQL Database Connection
jdbc_hostname = "ingest00-legacy"
jdbc_port = 1433
database_name = "ARIA_STG"
username = "ARIA_databricks"
# NOTE(review): left blank here; whatever is set ends up in clear text inside SQL_JDBC_URL.
# Prefer loading it with dbutils.secrets.get(...) as the setup cell does for the service
# principal - the secret name to use is not visible from this notebook.
password = ""

# Defined once and derived from database_name (it was previously assigned twice with the
# same literal, which invites the two copies drifting apart).
SQL_SCHEMA = f"{database_name}.dbo"
SQL_JDBC_URL = f"jdbc:sqlserver://{jdbc_hostname}:{jdbc_port};databaseName={database_name};user={username};password={password};encrypt=false"

#####################
#Get SQL Table List
# Fetched through the project's sql_helper module using the JDBC URL built above.
# Each entry is iterated by the main loop below and concatenated into a Parquet folder
# path, so entries are presumably plain table-name strings - confirm in functions/sql_helper.
#####################
sql_tables = sqlhelper.get_sql_tables(SQL_JDBC_URL)
# print(str(len(sql_tables)))

######################
# Tables known to be empty: checked in the original SQL database and confirmed to have
# no folder/data in the Parquet file location. The main loop skips these.
######################
known_empty_tables = [
    "Spot_Work_LogShippingInfo",
    "Centre",
    "UserCopyEntity",
    "Spot_Log",
    "Spot_RefreshAutoClose",
    "Spot_Repl_Agents",
    "Spot_Repl_PubSub",
    "Spot_TableFragDetails",
    "Spot_TraceConsumer_Global",
    "Spot_TraceData",
    "Spot_TraceGlobalSummary",
    "Spot_Work_BufferCacheContents",
    "Spot_Work_Locks",
    "Spot_Work_OldestTranInfo",
    "Spot_Work1",
    "Spot_Work3",
    "AssociatedRep",
    "Spot_WorkFileDetails",
    "BundleRequest",
    "BundleAccess",
    "InterfaceRestartQueue",
]

########################
#Begin Main Test - Looping each table and check each one - building a report of pass/fail
########################
#Loop over all tables and check schema, row counts and row hashes
def _describe_error(exc, max_len=300):
    """Return a short one-line "ExceptionType: message" summary of exc for a report row.

    Only the first line of the message is kept and the result is capped at max_len
    characters, so a multi-line error does not swamp the results table.
    """
    message_lines = str(exc).strip().splitlines()
    summary = type(exc).__name__
    if message_lines:
        summary = f"{summary}: {message_lines[0]}"
    if len(summary) > max_len:
        summary = summary[: max_len - 3] + "..."
    return summary


# For every SQL table: locate its Parquet folder, load both sides, then run the schema,
# row-count and row-hash checks. Each step appends a TestResult to `results`; the first
# step that errors records a FAIL (with the cause) and the loop moves on to the next table.
#
# Handlers catch `Exception` rather than using a bare `except:` so that cancelling the
# notebook (KeyboardInterrupt) stops the run instead of being logged as a table failure.
for table in sql_tables:
    # Check if Empty Table (as they were not copied to parquet)
    if table in known_empty_tables:
        results.append(TestResult(table, "TABLE_CHECK", "PASS", f"Ignored Checking Empty table :{table}"))
        continue

    #############
    # Setup parquet location for table
    #############
    table_parquet_path = PARQUET_BASE_PATH + table + "/"
    try:
        parquet_folders = pqhelper.get_parquet_root_paths(table_parquet_path, dbutils)
    except Exception as exc:
        results.append(TestResult(table, "SEARCH_PARQUET", "FAIL", f"Failed to find Parquet Folder for table :{table} ({_describe_error(exc)})"))
        continue

    # Exactly one Parquet folder is expected; none or several is reported as a failure.
    if len(parquet_folders) != 1:
        results.append(TestResult(table, "SEARCH_PARQUET", "FAIL", f"Failed to find One (found : {str(len(parquet_folders))}) Parquet folder for table :{table}"))
        continue
    parquet_path = parquet_folders[0]

    #############
    # Load SQL Table into a Dataframe
    #############
    # The Transaction table must be wrapped in square brackets, else SQL raises a
    # reserved-word error. The bracketed form is kept in its own variable so that `table`
    # (used in every report row) is not overwritten and keeps matching the SQL table list.
    sql_table_name = f"[{table}]" if table == "Transaction" else table
    try:
        sql_df = sqlhelper.read_sql_table(sql_table_name, SQL_JDBC_URL, SQL_SCHEMA)
    except Exception as exc:
        results.append(TestResult(table, "READ_SQL", "FAIL", f"Failed to read SQL for table :{table} ({_describe_error(exc)})"))
        continue

    #############
    # Load Parquet files as a Dataframe
    #############
    try:
        pq_df = spark.read.parquet(parquet_path)
    except Exception as exc:
        results.append(TestResult(table, "READ_PQ", "FAIL", f"Failed to read Parquet for table :{table} ({_describe_error(exc)})"))
        continue

    #############
    # Compare SCHEMA
    #############
    try:
        results.append(datahelper.check_schema(table, sql_df, pq_df))
    except Exception as exc:
        results.append(TestResult(table, "CHECK_SCHEMA", "FAIL", f"Failed to check Schema for table :{table} ({_describe_error(exc)})"))
        continue

    #############
    # Check table Row Counts
    #############
    try:
        results.append(datahelper.check_row_counts(table, sql_df, pq_df))
    except Exception as exc:
        results.append(TestResult(table, "CHECK_ROW_COUNT", "FAIL", f"Failed to check row Count for table :{table} ({_describe_error(exc)})"))
        continue

    #############
    # Compare Row Data (converting row data to string and hash for source and target,
    # then comparing row hash values)
    #############
    try:
        # sql_h / pq_h are kept as notebook variables so they can be inspected after a run.
        result, sql_h, pq_h = datahelper.check_row_data(table, sql_df, pq_df, "")
        results.append(result)
    except Exception as exc:
        results.append(TestResult(table, "CHECK_ROW_DATA", "FAIL", f"Failed to check data for table :{table} ({_describe_error(exc)})"))
        continue

    #############
    # Report All Tests Run for Table
    #############
    results.append(TestResult(table, "TABLE_CHECK", "PASS", f"All Tests Have Run for : {table} (Schema/Record Count/Row Hash Check)"))


####################
#Print Results as DataFrame and Display
####################
# Column order for the on-screen results table.
ordered_cols = ["table_name", "test_type", "status", "message"]

# Turn the collected TestResult objects into a Spark DataFrame and show it.
results_df = spark.createDataFrame(results)
results_ordered_df = results_df.select(*ordered_cols)
display(results_ordered_df)

########################
# Create a results folder holding the CSV and HTML reports.
# Each run writes to its own timestamped sub-folder of "results/".
########################
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"results/{timestamp}/"
os.makedirs(output_dir, exist_ok=True)

# HTML report, followed by the CSV report, both produced from the same result list.
html_report_path = html.generate_html_report(results, output_dir, timestamp)
csv_report_path = csv.generate_csv_report(results, output_dir)

In [0]:
#######################
#TEST 2 - #Compare SQL Table List with Parquet Folder list
#######################

##RESULTS 21 Folder difference - due to empty tables in sql that are not copied over.
# Tables that are empty in SQL and therefore have no Parquet folder.
known_empty_tables = [
    "Spot_Work_LogShippingInfo",
    "Centre",
    "UserCopyEntity",
    "Spot_Log",
    "Spot_RefreshAutoClose",
    "Spot_Repl_Agents",
    "Spot_Repl_PubSub",
    "Spot_TableFragDetails",
    "Spot_TraceConsumer_Global",
    "Spot_TraceData",
    "Spot_TraceGlobalSummary",
    "Spot_Work_BufferCacheContents",
    "Spot_Work_Locks",
    "Spot_Work_OldestTranInfo",
    "Spot_Work1",
    "Spot_Work3",
    "AssociatedRep",
    "Spot_WorkFileDetails",
    "BundleRequest",
    "BundleAccess",
    "InterfaceRestartQueue",
]

#TODO - Add in using list above in results
#TODO add on reporting to file / table solution

###########
# SQL side: the set of table names, lower-cased so the comparison is case-insensitive.
###########
sql_tables = {table_name.lower() for table_name in sqlhelper.get_sql_tables(SQL_JDBC_URL)}

###########
# Parquet side: the set of directory names directly under the Parquet base path,
# with any trailing "/" stripped and lower-cased to match the SQL names.
###########
folders = set()
for entry in dbutils.fs.ls(PARQUET_BASE_PATH):
    if entry.isDir():
        folders.add(entry.name.rstrip("/").lower())
# Names are already lower-cased at this point, so no further normalisation is needed.
#print(str(len(folders)))

###########
# Compare SQL vs Parquet lists of tables/folders
###########
matching = sql_tables.intersection(folders)        # present on both sides
missing_folders = sql_tables.difference(folders)   # table exists, folder missing
extra_folders = folders.difference(sql_tables)     # folder exists, no table

###########
#Report Results
###########

# Summary counts first, then the individual differences in sorted order.
print(f"Tables in DB     : {len(sql_tables)}")
print(f"Folders in DBFS  : {len(folders)}")
print(f"Matching         : {len(matching)}")
print()

print(f"❌ Missing folders : {len(missing_folders)} - (table exists, folder missing):")
for table_name in sorted(missing_folders):
    print("  ", table_name)

# This heading reports the number of extra folders; it previously printed
# len(missing_folders), so the count did not match the list beneath it.
print(f"\n⚠️ Extra folders : {len(extra_folders)} - (folder exists, no table):")
for folder_name in sorted(extra_folders):
    print("  ", folder_name)
