In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 40
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty string in this cell and is
# expected to be supplied by the caller before the run (presumably the Synapse
# pipeline that invokes this notebook - TODO confirm).

# Identifier of the run; only attached to the log records as a custom dimension
batch_id = ""

# Folder that is listed for invoice CSV files, and root folder of the taxpayer profile CSV
invoice_schema_applied_path = ""
taxpayer_profile_schema_applied_path = ""

# Prefix under which the statistics are written; the output folder name is appended
# without a separator, so the value is expected to end with '/'
statistics_path = ""

# data_separator is passed as `sep` when the CSV files are read;
# data_encoding is only recorded in the log dimensions within this notebook
data_separator = ""
data_encoding = ""

# Bounds of the period of analysis, parsed later with the "%d-%m-%Y" format
data_quality_min_date = ""
data_quality_max_date = ""

In [ ]:
# ---------------------------------------------------------------------------
# Sample Quality Index (SQi) weights: relative importance of the share of
# records dated below, above and inside the period of analysis.
# ---------------------------------------------------------------------------
SQi_in_range_weight = 2
SQi_below_range_weight = 3
SQi_above_range_weight = 3

# ---------------------------------------------------------------------------
# Completeness Index (Ci) weights.
# Raising the weight of one column raises its importance relative to the other
# columns that are measured for completeness.
# ---------------------------------------------------------------------------
# Identification of the document and of the parties
Ci_document_id_weight = 3
Ci_document_type_weight = 2
Ci_issued_date_weight = 3
Ci_issuer_id_weight = 3
Ci_issuer_type_weight = 2
Ci_activity_issuer_weight = 2
Ci_receiver_id_weight = 2

# Currency information
Ci_currency_weight = 2
Ci_exchange_rate_weight = 1

# Monetary totals
Ci_total_taxable_weight = 1
Ci_total_non_taxable_weight = 1
Ci_total_sales_weight = 1
Ci_total_discount_weight = 1
Ci_total_voucher_weight = 2
Ci_total_tax_weight = 2

# ---------------------------------------------------------------------------
# Cross Reference Index (xRefi) weights
# ---------------------------------------------------------------------------
xRefi_issuer_id_no_taxpayer_id_weight = 3

In [ ]:
import datetime
import csv
import pandas as pd
from datetime import date, datetime
from calendar import monthrange
import time
import pyodbc
import pyspark.sql.functions as F
from pyspark.sql.functions import col, year, month, dayofmonth, isnan, when, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, FloatType, DateType

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Connection string for Application Insights, read through mssparkutils from "keyvault"
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "AppInsightsConnectionString"
)

# Enable the OpenCensus integration for the standard `logging` module
config_integration.trace_integrations(['logging'])

# Logger of this notebook: INFO and above, exported through the Azure log handler
logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer behind the `tracer.span(...)` sections used throughout the notebook
tracer = Tracer(
    exporter=AzureExporter(connection_string=instrumentation_connection_string),
    sampler=AlwaysOnSampler(),
)

# Run-time parameters attached to the log record as custom dimensions
run_time_parameters = {
    'custom_dimensions': dict(
        batch_id=batch_id,
        statistics_path=statistics_path,
        data_encoding=data_encoding,
        data_separator=data_separator,
        invoice_schema_applied_path=invoice_schema_applied_path,
        taxpayer_profile_schema_applied_path=taxpayer_profile_schema_applied_path,
        notebook_name=mssparkutils.runtime.context['notebookname'],
    )
}

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Schema of the per-file data quality statistics: one row per input file holding the
# file name, the issued-date range and a set of counters. Every counter is declared
# as a nullable double. The order below is the column order of the schema.
_data_quality_counter_names = [
    # total number of records
    "num_recs",
    # number of records with a missing value, per invoice field
    "num_null_issuer_type",
    "num_null_issuer_id",
    "num_null_activity_issuer",
    "num_null_receiver_type",
    "num_null_receiver_id",
    "num_null_document_type",
    "num_null_document_id",
    "num_null_issued_date",
    "num_null_sales_terms",
    "num_null_credit_term",
    "num_null_currency",
    "num_null_exchange_rate_r",
    "num_null_payment_method1",
    "num_null_payment_method2",
    "num_null_payment_method3",
    "num_null_payment_method4",
    "num_null_payment_method5",
    "num_null_payment_method99",
    "num_null_total_taxable_services",
    "num_null_total_non_taxable_service",
    "num_null_total_taxable_goods",
    "num_null_total_non_taxable_goods",
    "num_null_total_taxable",
    "num_null_total_non_taxable",
    "num_null_total_sales",
    "num_null_total_discounts",
    "num_null_total_voucher",
    "num_null_total_tax",
    # number of documents per document type
    "num_invoices",
    "num_deb_notes",
    "num_credit_notes",
    "num_purchase_docs",
    "num_export_docs",
    # distinct counts and cross-reference counter
    "num_sellers",
    "num_buyers",
    "number_isics",
    "num_issuer_id_no_taxpayer_id",
]

all_files_data_quality_schema = StructType(
    [
        StructField("filename", StringType(), True),
        StructField("min_issued_date", DateType(), True),
        StructField("max_issued_date", DateType(), True),
    ]
    + [StructField(counter_name, DoubleType(), True) for counter_name in _data_quality_counter_names]
)

# Single-column string schemas for the dataset-wide sets of distinct values
unique_issuer_schema = StructType([StructField("issuer_id", StringType(), True)])
unique_receiver_schema = StructType([StructField("receiver_id", StringType(), True)])
unique_activity_issuer_schema = StructType([StructField("activity_issuer", StringType(), True)])

# Empty accumulators; rows are appended with union() while the files are processed
emptyRDD = spark.sparkContext.emptyRDD()
df_all_files_data_quality = spark.createDataFrame(emptyRDD, schema=all_files_data_quality_schema)
df_unique_issuer_id_dataset = spark.createDataFrame(emptyRDD, schema=unique_issuer_schema)
df_unique_receiver_id_dataset = spark.createDataFrame(emptyRDD, schema=unique_receiver_schema)
df_unique_activity_issuer_dataset = spark.createDataFrame(emptyRDD, schema=unique_activity_issuer_schema)

In [ ]:
def get_total_nulls(df, colname) :
    """Count the rows of `df` whose `colname` value is treated as missing.

    A value is treated as missing when it contains the text 'None' or 'NULL'
    (substring match), equals the empty string, is null or - for every column
    type except timestamp and date - is NaN.

    Raises IndexError when `colname` is not listed in `df.dtypes`.
    """
    column_type = [dtype for name, dtype in df.dtypes if name == colname][0]
    column = col(colname)

    is_missing = column.contains('None') | column.contains('NULL') | (column == '') | column.isNull()
    # isnan isn't supported on timestamp and date types
    if column_type not in ("timestamp", "date"):
        is_missing = is_missing | isnan(colname)

    return df.where(is_missing).count()

def qualify_totals(total_value):
    """Return the first field of the first row of `total_value`, or 0 when it is empty.

    `total_value` is a sequence of rows, such as the result of `collect()` on a
    single-column selection.
    """
    return total_value[0][0] if len(total_value) > 0 else 0

In [ ]:
def get_data_quality_of_file(df, fileName, df_taxpayer_profile) :
    """Compute the data quality statistics of a single invoice file.

    Parameters:
        df: DataFrame holding the invoices of one file.
        fileName: name stored in the first position of the returned row.
        df_taxpayer_profile: DataFrame with a `taxpayer_id` column, used to
            cross-reference the issuer ids of the file.

    Returns:
        A list containing one list of 41 values, ordered like the fields of
        `all_files_data_quality_schema`: file name, minimum and maximum issued
        date, number of rows, the missing-value count of each checked column,
        the number of documents per document type, the distinct counts of
        issuer ids, receiver ids and issuer activities, and the number of
        distinct issuer ids without a matching taxpayer id.

    Side effects:
        The distinct issuer ids, receiver ids and issuer activities of the file
        are appended (union) to the global `df_unique_*_dataset` DataFrames.
    """
    global df_unique_issuer_id_dataset
    global df_unique_receiver_id_dataset
    global df_unique_activity_issuer_dataset

    # Invoice columns checked for missing values, in the order of the
    # num_null_* fields of the output row.
    null_checked_columns = (
        "issuer_type", "issuer_id", "activity_issuer",
        "receiver_type", "receiver_id",
        "document_type", "document_id", "issued_date",
        "sales_terms", "credit_term",
        "currency", "exchange_rate_r",
        "payment_method1", "payment_method2", "payment_method3",
        "payment_method4", "payment_method5", "payment_method99",
        "total_taxable_services", "total_non_taxable_services",
        "total_taxable_goods", "total_non_taxable_goods",
        "total_taxable", "total_non_taxable",
        "total_sales", "total_discounts",
        "total_voucher", "total_tax",
    )
    # Document type codes in output order: sale invoices, debit notes,
    # credit notes, purchases and export invoices.
    document_type_codes = ('I', 'D', 'C', 'P', 'X')

    # One Spark job yields the count of every document type; the individual
    # totals are looked up locally and default to 0 for absent types.
    counts_by_document_type = {
        document_type: type_count
        for document_type, type_count in df.groupBy('document_type').count().collect()
    }
    document_type_totals = [counts_by_document_type.get(code, 0) for code in document_type_codes]
    # The groups partition the file (null document types form their own group),
    # so their counts add up to the number of rows.
    total_rows_in_dataset = sum(counts_by_document_type.values())

    # Distinct values are reused for the counts below and for the dataset-wide unions
    df_unique_issuer_id_in_file = df.select('issuer_id').distinct()
    df_unique_receiver_id_in_file = df.select('receiver_id').distinct()
    df_unique_activity_issuer_in_file = df.select('activity_issuer').distinct()

    total_issuer_ids = df_unique_issuer_id_in_file.count()
    total_receiver_ids = df_unique_receiver_id_in_file.count()
    total_ISICs = df_unique_activity_issuer_in_file.count()

    null_totals = [get_total_nulls(df, column_name) for column_name in null_checked_columns]

    # Both ends of the issued-date range come from a single aggregation
    issued_date_min, issued_date_max = df.agg(F.min('issued_date'), F.max('issued_date')).collect()[0]

    # Issuer ids without a taxpayer profile: after the left join the unmatched
    # issuer ids are the rows whose taxpayer_id is null.
    df_cross_reference = df_unique_issuer_id_in_file.join(
        df_taxpayer_profile,
        df_unique_issuer_id_in_file['issuer_id'] == df_taxpayer_profile['taxpayer_id'],
        how='left',
    )
    num_issuer_id_no_taxpayer_id = df_cross_reference.filter(col("taxpayer_id").isNull()).count()

    new_row = [[
        fileName, issued_date_min, issued_date_max, total_rows_in_dataset,
        *null_totals,
        *document_type_totals,
        total_issuer_ids, total_receiver_ids, total_ISICs, num_issuer_id_no_taxpayer_id,
    ]]

    df_unique_issuer_id_dataset = df_unique_issuer_id_dataset.union(df_unique_issuer_id_in_file)
    df_unique_receiver_id_dataset = df_unique_receiver_id_dataset.union(df_unique_receiver_id_in_file)
    df_unique_activity_issuer_dataset = df_unique_activity_issuer_dataset.union(df_unique_activity_issuer_in_file)

    return new_row

## Calculate the data quality of each file provided on a per-file basis

In [ ]:
# Only the taxpayer ids are needed for the cross-reference check
with tracer.span('Reading taxpayer profile from ADLS'):
    df_taxpayer_profile = spark.read.csv(
        f'{taxpayer_profile_schema_applied_path}/tax_payer_profile/',
        sep=data_separator, inferSchema=True, header=True,
    ).select('taxpayer_id')

file_names = mssparkutils.fs.ls(invoice_schema_applied_path)

# Without any input file df_alldata would never be defined and the following
# cells would fail with an unrelated NameError; stop here with a clear message.
if not file_names:
    logger.error(f'No invoice files found in: {invoice_schema_applied_path}')
    raise ValueError(f'No invoice files found in: {invoice_schema_applied_path}')

# Every file contributes one row to df_all_files_data_quality and all of its
# records to df_alldata (the union of every file read).
firstFile = True
for filename in file_names:
    logger.info(f'Caclulating data quality for file: {filename.name}')
    with tracer.span('Reading invoice file from ADLS'):
        df = spark.read.csv(filename.path, sep=data_separator, inferSchema=True, header=True)

    with tracer.span('Calculating data quality for invoice file'):
        file_data_quality_row = get_data_quality_of_file(df, filename.name, df_taxpayer_profile)
        # NOTE(review): the row is converted without an explicit schema, so its column
        # types are inferred and the union is positional - confirm that a file without
        # any issued date (None minimum/maximum) cannot reach this point.
        df_all_files_data_quality = df_all_files_data_quality.union(spark.createDataFrame(file_data_quality_row))

    if firstFile:
        df_alldata = df
        firstFile = False
    else:
        # NOTE(review): union() matches columns by position - assumes every file
        # has the same column order; verify against the ingest step.
        df_alldata = df_alldata.union(df)

## Calculate the aggregate data quality for the entire dataset across all files

In [ ]:
with tracer.span('Calcuate the in and out of range metrics for entire dataset'):
    # Bounds of the period of analysis, supplied as day-month-year strings
    max_date = datetime.strptime(data_quality_max_date, "%d-%m-%Y")
    min_date = datetime.strptime(data_quality_min_date, "%d-%m-%Y")

    # Number of records over all files, taken from the per-file statistics
    total_records = df_all_files_data_quality.select(F.sum('num_recs')).take(1)[0][0]

    # Records issued inside the period of analysis (both bounds included)
    total_records_in_period_of_analysis = df_alldata.where(
        (df_alldata['issued_date'] >= min_date) & (df_alldata['issued_date'] <= max_date)
    ).count()
    # Everything else, which also covers the records whose issued date is missing
    total_records_out_of_period_of_analysis = total_records - total_records_in_period_of_analysis

    # Records issued before the start / after the end of the period of analysis
    total_records_below_range = df_alldata.where(df_alldata['issued_date'] < min_date).count()
    total_records_above_range = df_alldata.where(df_alldata['issued_date'] > max_date).count()

    

In [ ]:
with tracer.span('Calcuating aggregate data quality for entire dataset'):
    # Per-file counters that are summed over all files, in output column order
    summed_counter_names = [
        'num_null_issuer_type',
        'num_null_issuer_id',
        'num_null_activity_issuer',
        'num_null_receiver_type',
        'num_null_receiver_id',
        'num_null_document_type',
        'num_null_document_id',
        'num_null_issued_date',
        'num_null_sales_terms',
        'num_null_credit_term',
        'num_null_currency',
        'num_null_exchange_rate_r',
        'num_null_payment_method1',
        'num_null_payment_method2',
        'num_null_payment_method3',
        'num_null_payment_method4',
        'num_null_payment_method5',
        'num_null_payment_method99',
        'num_null_total_taxable_services',
        'num_null_total_non_taxable_service',
        'num_null_total_taxable_goods',
        'num_null_total_non_taxable_goods',
        'num_null_total_taxable',
        'num_null_total_non_taxable',
        'num_null_total_sales',
        'num_null_total_discounts',
        'num_null_total_voucher',
        'num_null_total_tax',
        'num_invoices',
        'num_deb_notes',
        'num_credit_notes',
        'num_purchase_docs',
        'num_export_docs',
    ]

    # Each distinct count is a Spark job over the values gathered from every file,
    # so it is computed once and reused below.
    num_unique_issuer_id = df_unique_issuer_id_dataset.distinct().count()
    num_unique_receiver_id = df_unique_receiver_id_dataset.distinct().count()
    num_unique_ISICS = df_unique_activity_issuer_dataset.distinct().count()

    # Single-row summary of the whole dataset: literals computed in the previous
    # cells combined with aggregates over the per-file statistics.
    df_dataset_data_quality = df_all_files_data_quality.select(
        F.lit(min_date).alias('min_date'),
        F.lit(max_date).alias('max_date'),
        F.min('min_issued_date').alias('min_issued_date'),
        F.max('max_issued_date').alias('max_issued_date'),
        F.lit(total_records).alias('num_recs'),
        F.lit(total_records_in_period_of_analysis).alias('num_recs_in_period_of_analysis'),
        F.lit(total_records_out_of_period_of_analysis).alias('num_recs_out_period_of_analysis'),
        *[F.sum(counter_name).alias(counter_name) for counter_name in summed_counter_names],
        ((F.lit(total_records_in_period_of_analysis) / F.lit(total_records)) * 100).alias('pct_in_range'),
        ((F.lit(total_records_below_range) / F.lit(total_records)) * 100).alias('pct_below_range'),
        ((F.lit(total_records_above_range) / F.lit(total_records)) * 100).alias('pct_above_range'),
        # NOTE(review): the numerator adds up per-file counts, so an issuer id that is
        # unmatched in several files is counted once per file, while the denominator
        # is distinct over the dataset - confirm this is the intended ratio.
        ((F.sum('num_issuer_id_no_taxpayer_id') / F.lit(num_unique_issuer_id)) * 100).alias('pct_issuer_id_no_taxpayer_id'),
        F.lit(num_unique_issuer_id).alias('num_unique_issuer_id'),
        F.lit(num_unique_receiver_id).alias('num_unique_receiver_id'),
        F.lit(num_unique_ISICS).alias('num_unique_ISICS'),
    )

## Calculate the weighted and scored Quality Indexes for the entire dataset

In [ ]:
def get_scoring_by_percentage_range(percentage) :
    """Map a share of affected records to a score between 0 and 1.

    The callers in this notebook pass `percentage` as a fraction (0.01 is 1%).

    Returns 1 for exactly 0, 0.9 up to and including 0.01, 0.5 up to and
    including 0.02, 0.25 up to and including 0.03, and 0 for anything else
    (larger shares as well as negative values).
    """
    # (inclusive upper bound, score), in ascending order of the bound
    score_bands = ((0.01, 0.9), (0.02, 0.5), (0.03, 0.25))

    if percentage == 0:
        return 1

    if percentage > 0:
        for upper_bound, band_score in score_bands:
            if percentage <= upper_bound:
                return band_score

    return 0

In [ ]:
def _score_out_of_range_share(share):
    """Score the share (fraction, 0-1) of records dated below or above the period of analysis.

    The bands are contiguous; a share that lies exactly on a boundary belongs
    to the better band.
    """
    if share <= 0.01:
        return 1
    if share <= 0.02:
        return 0.9
    if share <= 0.05:
        return 0.5
    return 0.25


def _score_in_range_share(share):
    """Score the share (fraction, 0-1) of records dated inside the period of analysis.

    The bands are contiguous; a share that lies exactly on a boundary belongs
    to the better band.
    """
    if share >= 0.991:
        return 1
    if share >= 0.975:
        return 0.9
    if share >= 0.935:
        return 0.5
    return 0.25


with tracer.span('Calcuating quality indexes for entire dataset'):
    # The aggregate frame holds a single row: fetch it once instead of running
    # one Spark job for every value that is read from it.
    dataset_quality = df_dataset_data_quality.take(1)[0]

    # ---- Sample Quality index (SQi) ----
    # The pct_* columns are percentages (0-100); the scoring works on fractions.
    ds_pct_below_range = dataset_quality['pct_below_range'] / 100
    SQi_below_range_score = _score_out_of_range_share(ds_pct_below_range)
    logger.info(f'SQi_below_range_score: {SQi_below_range_score}')

    ds_pct_above_range = dataset_quality['pct_above_range'] / 100
    SQi_above_range_score = _score_out_of_range_share(ds_pct_above_range)
    logger.info(f'SQi_above_range_score: {SQi_above_range_score}')

    ds_pct_in_range = dataset_quality['pct_in_range'] / 100
    SQi_in_range_score = _score_in_range_share(ds_pct_in_range)
    logger.info(f'SQi_in_range_score: {SQi_in_range_score}')

    SQi_max_weight = SQi_below_range_weight + SQi_above_range_weight + SQi_in_range_weight
    #SQi = Average (Sigma (Parameter wt * score) / Max possible score >> across all defined criteria
    SQi = ((SQi_below_range_weight * SQi_below_range_score) + (SQi_above_range_weight * SQi_above_range_score) +
           (SQi_in_range_weight * SQi_in_range_score)) / SQi_max_weight

    logger.info(f'SQi: {SQi}')

    # ---- Completeness index (Ci) ----
    total_rows_in_dataset = dataset_quality['num_recs']

    # Missing-value counter of each column measured for completeness, with its weight
    completeness_weights = {
        'num_null_issuer_id': Ci_issuer_id_weight,
        'num_null_issued_date': Ci_issued_date_weight,
        'num_null_issuer_type': Ci_issuer_type_weight,
        'num_null_document_type': Ci_document_type_weight,
        'num_null_currency': Ci_currency_weight,
        'num_null_exchange_rate_r': Ci_exchange_rate_weight,
        'num_null_total_taxable': Ci_total_taxable_weight,
        'num_null_total_non_taxable': Ci_total_non_taxable_weight,
        'num_null_total_sales': Ci_total_sales_weight,
        'num_null_total_voucher': Ci_total_voucher_weight,
        'num_null_total_tax': Ci_total_tax_weight,
        'num_null_activity_issuer': Ci_activity_issuer_weight,
        'num_null_document_id': Ci_document_id_weight,
        'num_null_receiver_id': Ci_receiver_id_weight,
        'num_null_total_discounts': Ci_total_discount_weight,
    }

    # Each column is scored on its fraction of missing values and then weighted
    weighted_score_total = sum(
        get_scoring_by_percentage_range(dataset_quality[counter_name] / total_rows_in_dataset) * weight
        for counter_name, weight in completeness_weights.items()
    )
    Ci_max_weight = sum(completeness_weights.values())

    #Ci = Average (Sigma (Parameter wt * score) / Max possible score >> across all defined criteria parameters.
    Ci = weighted_score_total / Ci_max_weight
    logger.info(f'Ci: {Ci}')

    # ---- Cross Reference index (xRefi) ----
    ds_pct_issuer_id_no_taxpayer_id = dataset_quality['pct_issuer_id_no_taxpayer_id'] / 100

    computed_score_issuer_id_no_taxpayer_id = get_scoring_by_percentage_range(ds_pct_issuer_id_no_taxpayer_id)
    weighted_score_issuer_id_no_taxpayer_id = xRefi_issuer_id_no_taxpayer_id_weight * computed_score_issuer_id_no_taxpayer_id

    #xRefi = Average (Sigma (Parameter wt * score) / Max possible score >> across all defined cross reference data sources
    xRefi = weighted_score_issuer_id_no_taxpayer_id / xRefi_issuer_id_no_taxpayer_id_weight
    logger.info(f'xRefi: {xRefi}')

    #Overall Data Quality Index (DQi) is defined as the average of all the three scores : Average (SQi, Ci, xRefi)
    DQi = (SQi + Ci + xRefi) / 3
    logger.info(f'DQi: {DQi}')

    # The four indexes are appended to the dataset summary as constant columns
    df_dataset_data_quality = df_dataset_data_quality.withColumn('SQi', F.lit(SQi)).withColumn('Ci', F.lit(Ci)).withColumn('xRefi', F.lit(xRefi)).withColumn('DQi', F.lit(DQi))

# Write out the statistics
If the notebook receives the "run_time_stamp" parameter from Synapse Ingest, the parameter is used to create a folder in which the statistics are stored; if no parameter is received, the statistics are written to the default output folder.

In [ ]:
with tracer.span('Saving data quality statistics to ADLS'):
    # Each result set is reduced to one partition and replaces any previous output.
    # The folder name is appended directly to statistics_path.
    for _output_folder, _statistics_frame in (
        ('file_data_quality', df_all_files_data_quality),
        ('dataset_data_quality', df_dataset_data_quality),
    ):
        _statistics_frame.repartition(1).write.mode("overwrite").parquet(f'{statistics_path}{_output_folder}')

### Write out SQL table

In [ ]:
# Connection settings of the serverless SQL endpoint
driver= '{ODBC Driver 17 for SQL Server}'
database = 'eiad'


def _keyvault_secret(secret_name):
    """Read one secret through mssparkutils from "keyvault"."""
    return mssparkutils.credentials.getSecretWithLS("keyvault", secret_name)


# Credentials and endpoint are never hard-coded; the secret names are used exactly as given
sql_user_name = _keyvault_secret("SynapseSQLUserName")
sql_user_pwd = _keyvault_secret("SynapseSQLPassword")
serverless_sql_endpoint = _keyvault_secret("SyanpseServerlessSQLEndpoint")

In [ ]:
def generate_schema_string(dataframe):
    """Build the column list of a SQL table definition from a DataFrame schema.

    Every field becomes "[name] type" and the entries are joined with ", ".
    The type is the field's `simpleString()`, except that 'double' becomes
    'float', 'string' becomes 'nvarchar(MAX)' and 'timestamp' becomes
    'datetime2(7)'. A schema without fields yields an empty string.
    """
    sql_type_by_spark_type = {
        'double': 'float',
        'string': 'nvarchar(MAX)',
        'timestamp': 'datetime2(7)',
    }

    column_definitions = []
    for field_name in dataframe.schema.fieldNames():
        spark_type = str(dataframe.schema[field_name].dataType.simpleString())
        # Types without an entry in the mapping are passed through unchanged
        sql_type = sql_type_by_spark_type.get(spark_type, spark_type)
        column_definitions.append("[" + field_name + "] " + sql_type)

    return ", ".join(column_definitions)

In [ ]:
with tracer.span('Creating SQL table for per file data quality'):
    # statistics_path is split on '/'. Element 2 is expected to look like
    # '<container>@<account host>' and elements 3 and 4 to be the first two folders
    # (presumably an abfss:// URL - TODO confirm against the caller).
    path_parts = statistics_path.split('/')
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4] + '_' + 'file_data_quality'
    schema_string = generate_schema_string(df_all_files_data_quality)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # Path relative to the storage container; the folder name is appended directly
    location = "/".join(path_parts[3:]) + 'file_data_quality'
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"

    conn = pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd)
    try:
        # The connection context manager commits on success but does not close
        # the connection, hence the explicit close() in the finally clause.
        with conn:
            with conn.cursor() as cursor:
                try:
                    cursor.execute(drop_table_command)
                except pyodbc.Error as drop_error:
                    # Best effort: the table does not exist on the first run. Other
                    # database errors are logged here and surface on the CREATE below.
                    logger.info(f'Could not drop external table [{table_name}] (it may not exist yet): {drop_error}')
                cursor.execute(df_sql_command)
    finally:
        conn.close()

In [ ]:
with tracer.span('Creating SQL table for full dataset data quality'):
    # statistics_path is split on '/'. Element 2 is expected to look like
    # '<container>@<account host>' and elements 3 and 4 to be the first two folders
    # (presumably an abfss:// URL - TODO confirm against the caller).
    path_parts = statistics_path.split('/')
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4] + '_' + 'dataset_data_quality'
    schema_string = generate_schema_string(df_dataset_data_quality)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # Path relative to the storage container; the folder name is appended directly
    location = "/".join(path_parts[3:]) + 'dataset_data_quality'
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"

    conn = pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd)
    try:
        # The connection context manager commits on success but does not close
        # the connection, hence the explicit close() in the finally clause.
        with conn:
            with conn.cursor() as cursor:
                try:
                    cursor.execute(drop_table_command)
                except pyodbc.Error as drop_error:
                    # Best effort: the table does not exist on the first run. Other
                    # database errors are logged here and surface on the CREATE below.
                    logger.info(f'Could not drop external table [{table_name}] (it may not exist yet): {drop_error}')
                cursor.execute(df_sql_command)
    finally:
        conn.close()