In [0]:
# Load config and set up environment variables
config = spark.read.option("multiline", "true").json("dbfs:/configs/config.json")

# Fetch the first config row once: every .first() call is a separate Spark action,
# so reading it per field would re-read the JSON file each time.
config_row = config.first()
if config_row is None:
    # .first() returns None for an empty DataFrame; fail with a readable message
    # instead of "'NoneType' object is not subscriptable".
    raise ValueError("dbfs:/configs/config.json contains no rows - cannot resolve env / lz_key")

env_name = config_row["env"].strip().lower()
lz_key = config_row["lz_key"].strip().lower()

# Debug aids (per the original author, these values are not redacted in cell output):
# print(f"env_code: {lz_key}")
# print(f"env_name: {env_name}")

# Passed as the secret scope to dbutils.secrets.get below
# (presumably a Key Vault-backed scope, going by the name - confirm).
KeyVault_name = f"ingest{lz_key}-meta002-{env_name}"
# print(f"KeyVault_name: {KeyVault_name}")

# Service principal credentials (the secret key names are spelled "PRINCIPLE" on purpose:
# they must match the stored secret names exactly).
client_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-ID")
client_secret = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-CLIENT-SECRET")
tenant_id = dbutils.secrets.get(KeyVault_name, "SERVICE-PRINCIPLE-TENANT-ID")
 
# Storage account names
curated_storage = f"ingest{lz_key}curated{env_name}"
checkpoint_storage = f"ingest{lz_key}xcutting{env_name}"
raw_storage = f"ingest{lz_key}raw{env_name}"
landing_storage = f"ingest{lz_key}landing{env_name}"
external_storage = f"ingest{lz_key}external{env_name}"

# OAuth token endpoint of the tenant; identical for every storage account.
oauth_token_endpoint = f"https://login.microsoftonline.com/{tenant_id}/oauth2/token"

# Register the same service-principal OAuth settings for each storage account.
# The accounts are processed in the original order: curated (Delta tables),
# checkpoint, raw, landing, external.
for storage_account in (
    curated_storage,
    checkpoint_storage,
    raw_storage,
    landing_storage,
    external_storage,
):
    # Every fs.azure.* key is suffixed with the account's dfs endpoint host.
    account_host = f"{storage_account}.dfs.core.windows.net"
    spark.conf.set(f"fs.azure.account.auth.type.{account_host}", "OAuth")
    spark.conf.set(
        f"fs.azure.account.oauth.provider.type.{account_host}",
        "org.apache.hadoop.fs.azurebfs.oauth2.ClientCredsTokenProvider",
    )
    spark.conf.set(f"fs.azure.account.oauth2.client.id.{account_host}", client_id)
    spark.conf.set(f"fs.azure.account.oauth2.client.secret.{account_host}", client_secret)
    spark.conf.set(f"fs.azure.account.oauth2.client.endpoint.{account_host}", oauth_token_endpoint)
  
# Root folders of the bronze and silver APPEALS data, shared with subsequent cells.
# Both containers sit in the curated storage account; only the container name differs.
# Note that both paths end with a trailing "/".
bronze_path, silver_path = (
    f"abfss://{container}@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/"
    for container in ("bronze", "silver")
)

# Disabled: the audit and gold locations need `state_under_test`, which this cell does not set.
# audit_path = f"abfss://silver@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/AUDIT/{state_under_test}"
# gold_path = f"abfss://gold@ingest{lz_key}curated{env_name}.dfs.core.windows.net/ARIADM/ACTIVE/CCD/APPEALS/{state_under_test}"

# Disabled debug helper: show every resolved variable.
# variables = {
#     # "read_hive": read_hive,
#     "bronze_path": bronze_path,
#     "silver_path": silver_path,
#     "audit_path": audit_path,
#     "gold_path": gold_path,
#     "key_vault": KeyVault_name,
#     "AppealState": state_under_test
# }
# display(variables)

import json

# Disabled: resolve the latest JSON output folder under gold_path.
# json_location = dbutils.fs.ls(f"{gold_path}/")[-1]
# latest_json_location = json_location.name
# dbutils.fs.ls(f"{gold_path}/{latest_json_location}")
# json_path = f"{gold_path}/{latest_json_location}/JSON/"
# json_path = f"{gold_path}/{latest_json_location}/INVALID_JSON/"
# json_data = spark.read.format("json").load(json_path)


def _read_delta(table_path):
    """Return the Delta table stored at ``table_path`` as a Spark DataFrame."""
    return spark.read.format("delta").load(table_path)


# Create and load DataFrames.
# The previous try/except around the path strings was removed: building an f-string
# cannot fail in a recoverable way, and its handler referenced an undefined `e`, so any
# failure surfaced as an unrelated NameError. Errors now propagate unchanged.
# The path strings are kept exactly as before. silver_path / bronze_path already end
# with "/", so each location contains "//".
# NOTE(review): presumably collapsed by Hadoop path normalisation - confirm.
M1_silver = _read_delta(f"{silver_path}/silver_appealcase_detail")
M1_bronze = _read_delta(f"{bronze_path}/bronze_appealcase_crep_rep_floc_cspon_cfs")
M2_silver = _read_delta(f"{silver_path}/silver_caseapplicant_detail")
M3_silver = _read_delta(f"{silver_path}/silver_status_detail")
C = _read_delta(f"{silver_path}/silver_appealcategory_detail")
bhc = _read_delta(f"{bronze_path}/bronze_hearing_centres")
bat = _read_delta(f"{bronze_path}/bronze_appealtype")
docsr = _read_delta(f"{bronze_path}/bronze_documentsreceived")

#Can be removed later, added to allow developing of code in this notebook to begin with before moving to func files
from pyspark.sql.functions import (
    col, when, lit, array, struct, collect_list, 
    max as spark_max, date_format, row_number, expr, 
    size, udf, coalesce, concat_ws, concat, trim, year, split, datediff,
    collect_set, current_timestamp,transform, first, array_contains
)

In [0]:
######################
#Code to quickly reload Libraries during Development
######################
# import importlib
# import functions.compare_data_helper as datahelper
# import functions.sql_helper as sqlhelper
# import functions.parquet_helper as pqhelper
# import functions.test_run_helper as run_test
# import reporting.parquet_to_bronze_html_report as html
# import parquet_to_bronze_tests.Bronze_M1_Test as m1test
# import parquet_to_bronze_tests.Bronze_M2_Test as m2test
# import parquet_to_bronze_tests.Bronze_M3_Test as m3test
# import parquet_to_bronze_tests.Bronze_M4_Test as m4test
# import parquet_to_bronze_tests.Bronze_M5_Test as m5test
# import parquet_to_bronze_tests.Bronze_M6_Test as m6test
# import parquet_to_bronze_tests.Bronze_C_Test as ctest
# import parquet_to_bronze_tests.Bronze_D_Test as dtest
# import parquet_to_bronze_tests.Bronze_H_Test as htest


# importlib.reload(datahelper)
# importlib.reload(sqlhelper)
# importlib.reload(pqhelper)
# importlib.reload(run_test)
# importlib.reload(html)
# importlib.reload(m1test)
# importlib.reload(m2test)
# importlib.reload(m3test)
# importlib.reload(m4test)
# importlib.reload(m5test)
# importlib.reload(m6test)
# importlib.reload(ctest)
# importlib.reload(dtest)
# importlib.reload(htest)




In [0]:
#################
# PARQUET TO BRONZE ARIA TEST
#
# Status - 9 tables - 8 passing, 1 WIP (M3 table)
##################

# Root folder under which the reporting cell creates one timestamped results folder per run.
# NOTE(review): hard-coded to a personal workspace folder, so reports land in that user's
# area whoever runs the notebook - consider a shared or configurable location.
# NOTE(review): the folder is named "Bronze_To_Parquet" while this suite is titled
# "Parquet to Bronze" - confirm the intended name.
test_results_path= "/Workspace/Users/peter.gresty@hmcts.net/Results/Bronze_To_Parquet"

##########################
#MAIN SETUP FOR ALL TESTS
##########################
import functions.compare_data_helper as datahelper
import functions.sql_helper as sqlhelper
import functions.parquet_helper as pqhelper
from functools import reduce
from datetime import datetime
import os
import reporting.csv_report as csv
import reporting.parquet_to_bronze_html_report as html
import functions.test_run_helper as run_test
##Import Tests
import parquet_to_bronze_tests.Bronze_M1_Test as m1test
import parquet_to_bronze_tests.Bronze_M2_Test as m2test
import parquet_to_bronze_tests.Bronze_M3_Test as m3test
import parquet_to_bronze_tests.Bronze_M4_Test as m4test
import parquet_to_bronze_tests.Bronze_M5_Test as m5test
import parquet_to_bronze_tests.Bronze_M6_Test as m6test
import parquet_to_bronze_tests.Bronze_C_Test as ctest
import parquet_to_bronze_tests.Bronze_D_Test as dtest
import parquet_to_bronze_tests.Bronze_H_Test as htest

# Base location handed to the tests via config["PARQUET_BASE_PATH"]
# (presumably the source parquet files the bronze tables are compared against - confirm).
# NOTE(review): the storage account is hard-coded ("ingest00landingstg") instead of being
# built from lz_key / env_name like the storage names in the first cell - confirm whether
# this suite is meant to read from that account in every environment.
PARQUET_BASE_PATH = "abfss://landing@ingest00landingstg.dfs.core.windows.net/SQLServer/Sales/IRIS/dbo/"

##############
# Test Config
##############
# Placeholder so `results` always exists; each test run rebinds it.
# (Previously initialised twice in this cell.)
results = []

# Shared dependencies passed to every test_*_bronze_table(config) call.
# NOTE: this rebinds `config`, which the first cell uses for the config-file DataFrame.
config = {
    "pqhelper": pqhelper,
    "PARQUET_BASE_PATH": PARQUET_BASE_PATH,
    "dbutils": dbutils,
    "spark": spark,
    "run_tests": run_test,
}
#######################
#Tests
#######################
all_results = []

# Registry of the parquet-to-bronze table tests: (label, module, entry-point name).
# The entry point is resolved lazily with getattr, so a disabled test's function is
# never touched. Status notes are carried over from the original per-test comments.
BRONZE_TABLE_TESTS = [
    ("M1", m1test, "test_m1_bronze_table"),  # Passing
    ("M2", m2test, "test_m2_bronze_table"),  # Passing
    ("M3", m3test, "test_m3_bronze_table"),  # WIP - not quite finished
    ("M4", m4test, "test_m4_bronze_table"),  # Passing
    ("M5", m5test, "test_m5_bronze_table"),  # Passing
    ("M6", m6test, "test_m6_bronze_table"),  # Passing
    ("C", ctest, "test_c_bronze_table"),     # Passing
    ("D", dtest, "test_d_bronze_table"),     # Passing
    ("H", htest, "test_h_bronze_table"),     # status not recorded
]

# Labels of the tests to run. Only M1 was active before; add labels to run more tables.
ENABLED_BRONZE_TESTS = {"M1"}

for table_label, test_module, test_function_name in BRONZE_TABLE_TESTS:
    if table_label not in ENABLED_BRONZE_TESTS:
        continue
    run_table_test = getattr(test_module, test_function_name)
    # Each test returns its result records, a printable summary, and two DataFrames
    # (sql_h / pq_h) that can be displayed for debugging.
    results, result_text, sql_h, pq_h = run_table_test(config)
    all_results.extend(results)
    print(result_text)

# Debug aid: inspect the two DataFrames returned by the last test that ran.
# display(sql_h.orderBy("_row_hash"))
# display(pq_h.orderBy("_row_hash"))



##############
#Display Results
##############
# Column order used when presenting the collected test results.
ordered_cols = [
    "table_name",
    "test_type",
    "status",
    "message",
]
if all_results:
    results_df = spark.createDataFrame(all_results)
    results_ordered_df = results_df.select(*ordered_cols)
    display(results_ordered_df)
else:
    # spark.createDataFrame cannot infer a schema from an empty list, so report the
    # situation instead of failing with a schema-inference error.
    print("No test results to display - no test produced any result rows.")

In [0]:
#####################
#REPORTS AND RESULTS
#####################
if not all_results:
    # spark.createDataFrame cannot infer a schema from an empty list; fail with a clear
    # message before an empty results folder is created.
    raise ValueError("No test results to report - run the test cell first.")

# Display results
ordered_cols = [
    "table_name",
    "test_type",
    "status",
    "message",
]
results_df = spark.createDataFrame(all_results)
results_ordered_df = results_df.select(*ordered_cols)
display(results_ordered_df)

# Create a timestamped results folder under test_results_path for this run.
timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")
output_dir = f"{test_results_path}/{timestamp}/"
os.makedirs(output_dir, exist_ok=True)

# TODO - Choose output format (both reports are currently produced).
# Generate HTML report
html_report_path = html.generate_html_report(all_results, output_dir, timestamp)

# Generate CSV report
# (`csv` here is the project's reporting.csv_report module, not the standard-library csv.)
csv_report_path = csv.generate_csv_report(all_results, output_dir)