# Change source file to standard schema
Reads the file containing the uploaded data and applies the standard naming schema to its columns. The output is a dataset with the new column names, without any change to the data values.


In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 20
   }
}

In [ ]:
# Notebook parameters. Every value defaults to an empty string; presumably the
# calling pipeline overrides them at run time — TODO confirm.

# Identifier written to the batch_status row and attached to the log records.
batch_id = ''
# Only forwarded to the log dimensions in this notebook.
statistics_path = ''
# Only forwarded to the log dimensions in this notebook; it is not passed to
# the CSV reader here.
data_encoding = ''
# Column separator used both when reading the input CSV and writing the output CSV.
data_separator = ''
# Root folder that is scanned for the invoice files to process.
invoice_uncompressed_path = ''
# Root folder under which the schema-applied CSV output is written.
invoice_schema_applied_path = ''
# Base path under which the batch_status parquet data is appended.
output_container_path = ''

In [ ]:
import datetime
import time
import csv
import pyodbc
from pyspark.sql.functions import col, year, month, dayofmonth, isnan, when, count, current_timestamp
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, TimestampType, FloatType

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# The Application Insights connection string is read from the secret store
# through the "keyvault" linked service.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object on every call, so re-running
# this cell would otherwise stack one more AzureLogHandler each time and every
# record would be exported several times.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used for the per-file spans; AlwaysOnSampler samples every span.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Looked up once and reused for both the log dimensions and the log message.
notebook_name = mssparkutils.runtime.context['notebookname']

# Spool parameters: the run-time parameters are attached to the log record as
# custom dimensions.
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'statistics_path': statistics_path,
    'data_encoding': data_encoding,
    'data_separator': data_separator,
    'invoice_uncompressed_path': invoice_uncompressed_path,
    'invoice_schema_applied_path': invoice_schema_applied_path,
    'output_container_path': output_container_path,
    'notebook_name': notebook_name
} }

logger.info(f"{notebook_name}: INITIALISED", extra=run_time_parameters)

In [ ]:
# Connection settings for the serverless SQL endpoint.
database, driver = 'eiad', '{ODBC Driver 17 for SQL Server}'

# User name, password and endpoint are fetched (in this order) from the secret
# store through the "keyvault" linked service.
# NOTE(review): "SyanpseServerlessSQLEndpoint" looks like a misspelling of
# "Synapse", but it has to match the stored secret name — confirm before renaming.
sql_user_name, sql_user_pwd, serverless_sql_endpoint = (
    mssparkutils.credentials.getSecretWithLS("keyvault", secret_name)
    for secret_name in (
        "SynapseSQLUserName",
        "SynapseSQLPassword",
        "SyanpseServerlessSQLEndpoint",
    )
)

In [ ]:
# Status recorded for this batch. It is defined once so that the stored row and
# the log message below cannot drift apart (the log previously reported
# "Started Data Loading" while the row stored "Starting Data Loading").
initial_status = 'Starting Data Loading'

# Build the single-row DataFrame describing this batch; both timestamp columns
# are filled with the current timestamp.
row = [[f'{batch_id}', initial_status]]
columns = ['batch_id', 'status']
new_batch_status_df = (
    spark.createDataFrame(row, columns)
    .withColumn("date_submitted", current_timestamp())
    .withColumn("update_time_stamp", current_timestamp())
)

# Append the row to the batch status data, which is used in Power BI and the webapp.
new_batch_status_df.write.mode("append").parquet(f'{output_container_path}/batch_status')
logger.info(f'Create new batch status entry of "{initial_status}" for batch id: {batch_id}')

In [ ]:
def generate_schema_string(dataframe):
    """Build the column list of a CREATE EXTERNAL TABLE statement.

    Each field of ``dataframe.schema`` becomes ``[name] type``; the entries are
    joined with ", ". An empty schema yields an empty string.

    Args:
        dataframe: Object exposing ``schema.fields``, where every field has a
            ``name`` and a ``dataType`` with a ``simpleString()`` method
            (a Spark DataFrame).

    Returns:
        The comma-separated column definitions as a single string.
    """
    # Spark type names that need a different spelling on the SQL side; every
    # other type name is passed through unchanged.
    sql_type_overrides = {
        'double': 'float',
        'string': 'nvarchar(MAX)',
        'timestamp': 'datetime2(7)',
    }

    column_definitions = []
    # Iterating the fields directly (instead of looking each one up by name)
    # keeps the right type even when two columns share a name.
    for field in dataframe.schema.fields:
        spark_type = str(field.dataType.simpleString())
        sql_type = sql_type_overrides.get(spark_type, spark_type)
        # A "]" inside a bracket-quoted identifier has to be doubled, otherwise
        # it would terminate the identifier early and corrupt the statement.
        quoted_name = "[" + field.name.replace("]", "]]") + "]"
        column_definitions.append(f"{quoted_name} {sql_type}")
    return ", ".join(column_definitions)

In [ ]:
# (Re)create the external table over the batch status data so that its column
# list matches the schema of the DataFrame that was just written.
table_name = 'batch_status'
schema_string = generate_schema_string(new_batch_status_df)
drop_table_command = f"DROP EXTERNAL TABLE [{table_name}]"
location = 'batch_status'
df_sql_command = f"CREATE EXTERNAL TABLE [{table_name}] ({schema_string}) WITH (LOCATION = '{location}/**', DATA_SOURCE = [output_<<STORAGE_ACCOUNT_NAME>>_dfs_core_windows_net], FILE_FORMAT = [SynapseParquetFormat])"
with pyodbc.connect('DRIVER='+driver+';SERVER=tcp:'+serverless_sql_endpoint+';PORT=1433;DATABASE='+database+';UID='+sql_user_name+';PWD='+ sql_user_pwd) as conn:
    with conn.cursor() as cursor:
        try:
            cursor.execute(drop_table_command)
        except pyodbc.Error as drop_error:
            # Dropping is best effort (the table may simply not exist yet), so
            # database errors are tolerated. Only pyodbc errors are caught, so
            # that interrupts and programming mistakes are no longer hidden,
            # and the reason is logged instead of being discarded.
            logger.info(f'External table [{table_name}] was not dropped before re-creation: {drop_error}')
        cursor.execute(df_sql_command)

In [ ]:
def deep_ls(path: str, max_depth=1):
    """
    Lazily walk `path` and yield the entries found, descending at most
    `max_depth` levels.

    Entries reporting a non-zero size are yielded first, in listing order.
    Entries reporting a size of zero are treated as folders: they are
    descended into while depth remains, and yielded as-is once the depth
    limit is reached.

    NOTE(review): the size check is the only folder test, so an empty file is
    handled like a folder — confirm this is intended.
    """
    entries = mssparkutils.fs.ls(path)

    # Everything that reports a size is emitted straight away.
    yield from (entry for entry in entries if entry.size != 0)

    zero_size_entries = [entry for entry in entries if entry.size == 0]

    # Depth exhausted: hand back the zero-size entries themselves.
    if max_depth <= 1:
        yield from zero_size_entries
        return

    # Otherwise descend into each zero-size entry with one level less to go.
    for entry in zero_size_entries:
        yield from deep_ls(entry.path, max_depth - 1)

In [ ]:
def applySchema(fileName, fullFilePath) :
    """Rewrite one invoice CSV with the standard columns and data types.

    The file is read with a header row using the notebook-level
    ``data_separator``, reduced to the standard columns in their standard
    order, cast to the standard types and written (overwriting any previous
    output) as CSV with a header to
    ``{invoice_schema_applied_path}/{fileName}``.

    Args:
        fileName: Name of the output folder created below
            ``invoice_schema_applied_path``.
        fullFilePath: Full path of the CSV file to read.

    Note:
        The select fails if the input is missing any of the standard columns.
    """
    # Standard column order together with the Spark type of every column.
    standard_schema = [
        ('issuer_type', StringType()),
        ('issuer_id', StringType()),
        ('activity_issuer', StringType()),
        ('receiver_type', StringType()),
        ('receiver_id', StringType()),
        ('document_type', StringType()),
        ('document_id', StringType()),
        ('issued_date', TimestampType()),
        ('sales_terms', StringType()),
        ('credit_term', IntegerType()),
        ('currency', StringType()),
        ('exchange_rate_r', FloatType()),
        ('payment_method1', StringType()),
        ('payment_method2', StringType()),
        ('payment_method3', StringType()),
        ('payment_method4', StringType()),
        ('payment_method5', StringType()),
        ('payment_method99', StringType()),
        ('total_taxable_services', FloatType()),
        ('total_non_taxable_services', FloatType()),
        ('total_taxable_goods', FloatType()),
        ('total_non_taxable_goods', FloatType()),
        ('total_taxable', FloatType()),
        ('total_non_taxable', FloatType()),
        ('total_sales', FloatType()),
        ('total_discounts', FloatType()),
        ('total_voucher', FloatType()),
        ('total_tax', FloatType()),
    ]

    # Read every column as a string. Schema inference would cost an extra pass
    # over the file and could turn identifier columns into numbers (dropping
    # leading zeros, for example) before they are cast back to string, which
    # would change the data values.
    df = spark.read.csv(fullFilePath, sep=data_separator, inferSchema=False, header=True)

    # Select and cast in a single projection instead of one withColumn per column.
    df = df.select(*[col(name).cast(data_type).alias(name) for name, data_type in standard_schema])

    df.write.mode("overwrite").csv(f'{invoice_schema_applied_path}/{fileName}', header=True, sep=data_separator)

In [ ]:
# Walk the uncompressed invoice folder (at most 20 levels deep) and apply the
# standard schema to every entry found, one trace span per entry.
file_names = deep_ls(invoice_uncompressed_path, 20)
for filename in file_names:
    with tracer.span('Applying schema on invoice file'):
        source_name = filename.name
        logger.info(f'Processing file: {source_name}')
        # The output folder is the source name without '.CSV'.
        # NOTE(review): the match is case-sensitive, so '.csv' is kept — confirm.
        target_name = source_name.replace('.CSV', '')
        applySchema(target_name, filename.path)