# Calculate uploaded data statistics
Calculates statistics of the uploaded data


- Total number of sales invoices reported 

- Total number of credit notes reported 
 
- Total number of debit notes reported 
 
- Number of issuer_ids in the entire period 
 
- Number of identified receiver_ids in the period 
 
- Number of ISICs (activity) reported in the period 

In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 40
   }
}

In [ ]:
# Notebook parameters. The empty strings are placeholders — presumably they are
# overridden by the calling pipeline at run time (TODO confirm).
batch_id = ''  # only reported in the logging custom dimensions
invoice_schema_applied_path = ''  # folder whose files are listed and analysed one by one
statistics_path = ''  # output prefix for the parquet statistics; also parsed to build the SQL table names
data_separator = ''  # column separator handed to the CSV reader
# NOTE(review): data_encoding is logged but never passed to the CSV reader — confirm whether that is intended.
data_encoding = ''

In [ ]:
import pandas as pd
import datetime
import csv
import time
import pyodbc
from pyspark.sql.types import StructType,StructField, StringType, IntegerType,DoubleType
from pyspark.sql.functions import col, year, month, dayofmonth, isnan, when, count 

In [ ]:
# Initiate logging
# Sets up a logger and a tracer that both export to Azure Application Insights
# through the OpenCensus Azure exporters.
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# The connection string is fetched as a secret rather than hard-coded.
# presumably "keyvault" is the name of the Key Vault linked service — TODO confirm
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
# Switch on the OpenCensus integration for the standard logging module.
config_integration.trace_integrations(['logging'])

# Logger used by the remaining cells; INFO and above are forwarded to the Azure handler.
logger = logging.getLogger(__name__)
logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used for the `with tracer.span(...)` sections further down.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters
# The notebook parameters are attached to the log record as custom dimensions
# (passed through the `extra` argument of the logging call below).
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'statistics_path': statistics_path,
    'data_encoding': data_encoding,
    'data_separator': data_separator,
    'invoice_schema_applied_path': invoice_schema_applied_path,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }
  
# Record that the notebook started, together with the parameters it was started with.
logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Schema of the per-file statistics table: the file name, the analysed month and
# nine numeric columns, all declared as doubles.
schema_uploaded_data_statistics = StructType([ 
    StructField("filename",StringType(),True), 
    StructField("month",StringType(),True), 
    StructField("num_invoices",DoubleType(),True),
    StructField("num_deb_notes",DoubleType(),True),
    StructField("num_credit_notes",DoubleType(),True),
    StructField("num_sellers",DoubleType(),True),
    StructField("num_buyers",DoubleType(),True),
    StructField("number_isics",DoubleType(),True),
    StructField("number_records",DoubleType(),True),
    StructField("num_purchase_docs",DoubleType(),True),
    StructField("num_export_docs",DoubleType(),True)
  ])

# Schema pairing a document type code with the number of documents of that type.
schema_total_documents = StructType([
    StructField('document_type', StringType(), True),
    StructField('total_documents', IntegerType(), True)
])

# Schema of the single-row table holding the distinct issuer, receiver and
# activity counts across all processed files.
schema_total_unique = StructType([
    StructField('unique_issuer_id', IntegerType(), True),
    StructField('unique_receiver_id', IntegerType(), True),
    StructField('unique_activity_id', IntegerType(), True)
])


# Driver-side pandas accumulators: getStatistics adds the distinct values found
# in each file so that the cross-file distinct counts can be computed afterwards.
unique_issuer_id_dataset = pd.DataFrame()
unique_receiver_id_dataset = pd.DataFrame()
unique_activity_id_dataset = pd.DataFrame()

In [ ]:
# Start from empty Spark DataFrames that already carry the target schemas;
# result rows are added to them later with union().
emptyRDD = spark.sparkContext.emptyRDD()
df_uploaded_data_statistics = spark.createDataFrame(emptyRDD, schema=schema_uploaded_data_statistics)

# Same pattern for the single-row table of distinct counts.
emptyRDD = spark.sparkContext.emptyRDD()
df_statistics_uniques = spark.createDataFrame(emptyRDD, schema=schema_total_unique)


In [ ]:
def _is_folder(entry):
    """Tell whether a listing entry is a folder.

    Uses the entry's ``isDir`` flag when it has one (attribute or method).
    Entries without that flag fall back to the previous heuristic of
    treating a size of 0 as "folder".
    """
    is_dir = getattr(entry, 'isDir', None)
    if is_dir is None:
        return entry.size == 0
    if callable(is_dir):
        is_dir = is_dir()
    return bool(is_dir)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Files of ``path`` are yielded first. Folders are then either descended
    into (while ``max_depth`` is greater than 1) or yielded themselves once
    the depth limit is reached.

    Zero-byte files are reported as files. A pure size check would mistake
    them for folders and try to recurse into them.

    :param path: location passed to ``mssparkutils.fs.ls``.
    :param max_depth: number of folder levels to traverse (1 = only ``path``).
    :return: generator of the entries returned by ``mssparkutils.fs.ls``.
    """
    entries = mssparkutils.fs.ls(path)

    # Emit the files immediately and keep the folders for the second phase,
    # so that the "files first, folders afterwards" order is preserved.
    folders = []
    for entry in entries:
        if _is_folder(entry):
            folders.append(entry)
        else:
            yield entry

    if max_depth > 1:
        # Depth budget left: descend into every sub-folder.
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders themselves.
        yield from folders

In [ ]:
def getStatistics(df_uploaded_data_statistics, fileName, fullFilePath) :
    """
    Compute the load statistics of one invoice file and append them as a row.

    The file is read as CSV (header row, inferred schema, separator taken from
    the notebook parameter ``data_separator``). The statistics are:

    - documents per type: 'I' invoices, 'D' debit notes, 'C' credit notes,
      'P' purchase documents, 'X' export documents,
    - distinct ``issuer_id``, ``receiver_id`` and ``activity_issuer`` values,
    - total number of rows.

    Side effect: the distinct values found in this file are added to the
    global pandas accumulators ``unique_issuer_id_dataset``,
    ``unique_receiver_id_dataset`` and ``unique_activity_id_dataset``.

    :param df_uploaded_data_statistics: Spark DataFrame of the rows gathered so
        far; its schema is used for the new row.
    :param fileName: name of the file; it is split on '-' and the parts at
        index 2 and 3 are used as month and year
        (assumes the name has at least four '-'-separated parts — TODO confirm).
    :param fullFilePath: path handed to the CSV reader.
    :return: ``df_uploaded_data_statistics`` with one additional row.
    :raises IndexError: if ``fileName`` has fewer than four '-'-separated parts.
    """
    global unique_issuer_id_dataset
    global unique_receiver_id_dataset
    global unique_activity_id_dataset

    # NOTE(review): data_encoding is a notebook parameter but is not passed to
    # the reader — confirm whether the files are always in the default encoding.
    df = spark.read.csv(fullFilePath, sep=data_separator, inferSchema=True, header=True)

    # One aggregation job yields every per-type count. The result has at most
    # one row per document type, so collecting it to the driver is cheap.
    documents_by_type = {
        row['document_type']: row['count']
        for row in df.groupBy('document_type').count().collect()
    }

    total_number_sale_invoices = documents_by_type.get('I', 0)
    total_number_debit_notes = documents_by_type.get('D', 0)
    total_number_credit_notes = documents_by_type.get('C', 0)
    total_number_purchase = documents_by_type.get('P', 0)
    total_number_export = documents_by_type.get('X', 0)

    # Every row belongs to exactly one group (rows without a document type form
    # their own group), so the group counts add up to the number of rows.
    total_rows_in_dataset = sum(documents_by_type.values())

    # Collect each distinct list once; it provides both the per-file count and
    # the values that feed the cross-file accumulators.
    issuer_rows = df.select('issuer_id').distinct().collect()
    receiver_rows = df.select('receiver_id').distinct().collect()
    activity_rows = df.select('activity_issuer').distinct().collect()

    total_issuer_ids = len(issuer_rows)
    total_receiver_ids = len(receiver_rows)
    total_ISICs = len(activity_rows)

    file_name_splits = fileName.split("-")
    month_number = file_name_splits[2]
    year_of_analysis = file_name_splits[3]
    month_of_analysis = month_number + ' - ' + year_of_analysis

    # The numeric columns of the statistics table are doubles, so the counts
    # are converted explicitly and the row is created with the target schema.
    new_row = [[
        fileName,
        month_of_analysis,
        float(total_number_sale_invoices),
        float(total_number_debit_notes),
        float(total_number_credit_notes),
        float(total_issuer_ids),
        float(total_receiver_ids),
        float(total_ISICs),
        float(total_rows_in_dataset),
        float(total_number_purchase),
        float(total_number_export),
    ]]

    file_statistics = spark.createDataFrame(new_row, schema=df_uploaded_data_statistics.schema)
    df_uploaded_data_statistics = df_uploaded_data_statistics.union(file_statistics)

    # pd.concat replaces DataFrame.append, which no longer exists in pandas 2.x.
    unique_issuer_id_dataset = pd.concat(
        [unique_issuer_id_dataset, pd.DataFrame(issuer_rows)], ignore_index=True)
    unique_receiver_id_dataset = pd.concat(
        [unique_receiver_id_dataset, pd.DataFrame(receiver_rows)], ignore_index=True)
    unique_activity_id_dataset = pd.concat(
        [unique_activity_id_dataset, pd.DataFrame(activity_rows)], ignore_index=True)

    return df_uploaded_data_statistics

In [ ]:
# Process every entry directly under the input folder (no recursion), adding
# one statistics row per file.
file_names = mssparkutils.fs.ls(invoice_schema_applied_path)
for filename in file_names:
    logger.info(f'Calculating data statistics for file: {filename.name}')  
    # Each file is processed inside its own trace span.
    with tracer.span('Calculating data statistics for invoice file'):
        df_uploaded_data_statistics = getStatistics(df_uploaded_data_statistics, filename.name, filename.path)

In [ ]:
# Distinct counts across all processed files. The accumulators are read through
# their column 0; nunique() leaves missing values out of the count.
# NOTE(review): column 0 does not exist if no file was processed — confirm the
# input folder is never empty.
total_unique_issuer_id_dataset = unique_issuer_id_dataset[0].nunique()
total_unique_receiver_id_dataset = unique_receiver_id_dataset[0].nunique()
total_unique_activity_id_dataset = unique_activity_id_dataset[0].nunique()

# Single row holding the three totals, in the column order of schema_total_unique.
new_row = [[total_unique_issuer_id_dataset, total_unique_receiver_id_dataset, total_unique_activity_id_dataset]]
unknown_df_time = spark.createDataFrame(new_row)

# union() matches columns by position, so the row created without an explicit
# schema lines up with the three columns of df_statistics_uniques.
df_statistics_uniques = df_statistics_uniques.union(unknown_df_time)



# Write out the statistics
If the notebook receives the "run_time_stamp" parameter from Synapse Ingest, the parameter is used to create a folder to store the statistics; if no parameter is received, the statistics are written to the default output folder.

In [ ]:
with tracer.span('Writing data statistics to ADLS'):
    #writeout statistics
    # Each table is reduced to a single partition before writing, and any
    # previous output at the target location is overwritten.
    # The folder name is appended to statistics_path without a separator —
    # presumably statistics_path ends with '/' (TODO confirm).
    df_uploaded_data_statistics.repartition(1).write.mode("overwrite").parquet(f'{statistics_path}load_stats')
    df_statistics_uniques.repartition(1).write.mode("overwrite").parquet(f'{statistics_path}load_stats_unique_receiver_issuer')

### Write out SQL table

In [ ]:
# serverless SQL config
database = 'eiad'
driver= '{ODBC Driver 17 for SQL Server}'

sql_user_name = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLUserName")
sql_user_pwd = mssparkutils.credentials.getSecretWithLS("keyvault", "SynapseSQLPassword")
serverless_sql_endpoint = mssparkutils.credentials.getSecretWithLS("keyvault", "SyanpseServerlessSQLEndpoint")

In [ ]:
def generate_schema_string(dataframe):
    """
    Build the column list of a CREATE TABLE statement from a DataFrame schema.

    Every field becomes ``[name] type``. The Spark type names 'double',
    'string' and 'timestamp' are replaced by 'float', 'nvarchar(MAX)' and
    'datetime2(7)'; any other type name is used unchanged.

    :param dataframe: object exposing a Spark-style ``schema``.
    :return: the column definitions joined with ', ' (empty string when the
        schema has no fields).
    """
    sql_type_by_spark_type = {
        'double': 'float',
        'string': 'nvarchar(MAX)',
        'timestamp': 'datetime2(7)',
    }

    column_definitions = []
    for column_name in dataframe.schema.fieldNames():
        spark_type = str(dataframe.schema[column_name].dataType.simpleString())
        sql_type = sql_type_by_spark_type.get(spark_type, spark_type)
        column_definitions.append("[" + column_name + "] " + sql_type)

    return ", ".join(column_definitions)

In [ ]:
with tracer.span('Creating SQL table for invoice load statistics'):
    # Publish the load_stats parquet output as an external table on the
    # serverless SQL endpoint: drop the table if it is there, then create it.
    # The table name and location are derived from the '/'-separated parts of
    # statistics_path.
    # assumes statistics_path has the form '<scheme>://<container>@<account>/<dir>/<dir>/...' — TODO confirm
    path_parts = statistics_path.split('/')
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4] + '_' + 'load_stats'
    schema_string = generate_schema_string(df_uploaded_data_statistics)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # Everything after the '<container>@<account>' part, plus the output folder name.
    location = "/".join(path_parts[3:]) + 'load_stats'
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"
    with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
        with conn.cursor() as cursor:
            try:
                cursor.execute(drop_table_command)
            except pyodbc.Error as drop_error:
                # Best effort: the drop is expected to fail when the table does
                # not exist yet. Only database errors are tolerated, and they
                # are logged instead of being discarded silently.
                logger.info(f"Could not drop external table [{table_name}], continuing with create: {drop_error}")
            cursor.execute(df_sql_command)

In [ ]:
with tracer.span('Creating SQL table for invoice unique statistics'):
    # Publish the load_stats_unique_receiver_issuer parquet output as an
    # external table on the serverless SQL endpoint: drop the table if it is
    # there, then create it. The table name and location are derived from the
    # '/'-separated parts of statistics_path.
    # assumes statistics_path has the form '<scheme>://<container>@<account>/<dir>/<dir>/...' — TODO confirm
    path_parts = statistics_path.split('/')
    table_name = path_parts[3] + '_' + path_parts[2].split('@')[0] + '_' + path_parts[4] + '_' + 'load_stats_unique_receiver_issuer'
    schema_string = generate_schema_string(df_statistics_uniques)
    drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
    # Everything after the '<container>@<account>' part, plus the output folder name.
    location = "/".join(path_parts[3:]) + 'load_stats_unique_receiver_issuer'
    df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"
    with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
        with conn.cursor() as cursor:
            try:
                cursor.execute(drop_table_command)
            except pyodbc.Error as drop_error:
                # Best effort: the drop is expected to fail when the table does
                # not exist yet. Only database errors are tolerated, and they
                # are logged instead of being discarded silently.
                logger.info(f"Could not drop external table [{table_name}], continuing with create: {drop_error}")
            cursor.execute(df_sql_command)