In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 20
   }
}

In [ ]:
# Notebook parameters. The empty strings are placeholders; presumably the
# calling pipeline overrides them at run time -- TODO confirm.
batch_id = ''  # logged with every run as a custom dimension
taxpayer_profile_schema_applied_path = ''  # prefix the 'tax_payer_profile' CSV input is read from
taxpayer_profile_cleaned_path = ''  # destination the cleaned data is written to as parquet
statistics_path = ''  # only logged in the cells visible here
data_separator = ''  # field separator handed to the CSV reader
data_encoding = ''  # only logged in the cells visible here; not handed to the CSV reader

In [ ]:
import datetime
import csv
import pandas as pd
from datetime import date
from calendar import monthrange
import time
from pyspark.sql.functions import col, year, month, dayofmonth, isnan, when, count
from pyspark.sql.types import StructType,StructField, StringType, IntegerType, DoubleType, FloatType

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Send log records and trace spans to the destination described by the
# "AppInsightsConnectionString" secret.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS(
    "keyvault", "AppInsightsConnectionString"
)
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)
logger.addHandler(
    AzureLogHandler(connection_string=instrumentation_connection_string)
)

# Every span is sampled (AlwaysOnSampler).
tracer = Tracer(
    sampler=AlwaysOnSampler(),
    exporter=AzureExporter(connection_string=instrumentation_connection_string),
)

# Spool parameters: the run-time parameters travel with the log record as
# custom dimensions.
notebook_name = mssparkutils.runtime.context['notebookname']
custom_dimensions = {
    'batch_id': batch_id,
    'statistics_path': statistics_path,
    'data_encoding': data_encoding,
    'data_separator': data_separator,
    'taxpayer_profile_schema_applied_path': taxpayer_profile_schema_applied_path,
    'taxpayer_profile_cleaned_path': taxpayer_profile_cleaned_path,
    'notebook_name': notebook_name,
}
run_time_parameters = {'custom_dimensions': custom_dimensions}

logger.info(f"{notebook_name}: INITIALISED", extra=run_time_parameters)

In [ ]:
def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Files found directly in ``path`` are yielded first. Folders are then
    either descended into (while ``max_depth`` is greater than 1) or, once
    the depth limit is reached, yielded themselves.

    Args:
        path: Location handed to ``mssparkutils.fs.ls``.
        max_depth: Number of folder levels to list; 1 lists only ``path``.

    Yields:
        The entries returned by ``mssparkutils.fs.ls``.
    """

    def _is_directory(entry):
        # Use the entry's own directory flag when it has one, so that
        # zero-byte files (for example job marker files) are not mistaken
        # for folders and descended into. Entries without the flag fall
        # back to the size-based heuristic.
        flag = getattr(entry, "isDir", None)
        if callable(flag):
            flag = flag()
        if flag is None:
            return entry.size == 0
        return bool(flag)

    files, folders = [], []
    for entry in mssparkutils.fs.ls(path):
        (folders if _is_directory(entry) else files).append(entry)

    # Return all files
    yield from files

    # If the max_depth has not been reached, list the content of each
    # subfolder; otherwise return the folders themselves.
    if max_depth > 1:
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        yield from folders

# Tax payer profile cleaning

In [ ]:
with tracer.span('Loading schema applied taxpayer profile file'):
    fullFilePath = taxpayer_profile_schema_applied_path + 'tax_payer_profile'
    # Read every column as plain text. The cleaning steps work on strings and
    # cast all columns to string anyway, so schema inference would only add an
    # extra pass over the file and could rewrite values on the way (for
    # example dropping the leading zeros of codes such as "006").
    df = spark.read.csv(fullFilePath, sep=data_separator, inferSchema=False, header=True)

In [ ]:
# Every column expected in the taxpayer profile, in output order.
col_names = [
    "taxpayer_id",
    "taxpayer_type",
    "fiscal_condition",
    "regime_name",
    "taxpayer_size",
    "main_activity",
    "sec1_activity",
    "sec2_activity",
    "employees_number",
    "legal_reg_date",
    "tax_reg_date",
    "e_inv_enroll_date",
    "reported_assets",
    "total_capital",
    "social_capital",
    "total_assets",
    "total_fixed_assets",
    "total_liabilities",
    "gross_income",
    "net_income",
    "total_vat_sales",
    "credited_einvoicing_value",
    "state",
    "municipality",
    "city",
]

# Columns cleaned as text.
stringtype_col_names = [
    "taxpayer_id",
    "taxpayer_type",
    "fiscal_condition",
    "regime_name",
    "taxpayer_size",
    "main_activity",
    "sec1_activity",
    "sec2_activity",
    "state",
    "municipality",
    "city",
]

# Columns cleaned as dates.
datetype_col_names = [
    "legal_reg_date",
    "tax_reg_date",
    "e_inv_enroll_date",
]

# Columns cleaned as numbers.
numerictype_col_names = [
    "employees_number",
    "total_capital",
    "social_capital",
    "reported_assets",
    "total_assets",
    "total_fixed_assets",
    "total_liabilities",
    "gross_income",
    "net_income",
    "total_vat_sales",
    "credited_einvoicing_value",
]

In [ ]:
# Cast every expected column to string so the values can be cleaned as text
# before any type conversion. Only the columns named in col_names are
# selected, in that order.
string_columns = []
for name in col_names:
    quoted_name = '`{}`'.format(name)
    string_columns.append(col(quoted_name).cast(StringType()).alias(name))
df = df.select(string_columns)

# Invalid values handling

In the input dataset, several numeric, date and string columns contain invalid values such as `#N/A`, `-`, `null` and `NULL`. We clean these values here.

In [ ]:
# Replace the placeholder values that mark missing data. All columns hold
# strings at this stage, so every replacement is written as a string (or None):
#   - string columns  -> null, so the per-column default can be applied in the
#                        next step
#   - numeric columns -> "0"
#   - date columns    -> "31-12-50"
# Values that are already null are left as they are: comparing null with a
# marker never evaluates to true.
invalid_markers = ["", "#N/A", "-", "null", "NULL"]

invalid_value_replacements = {}
invalid_value_replacements.update(dict.fromkeys(stringtype_col_names, None))
invalid_value_replacements.update(dict.fromkeys(numerictype_col_names, "0"))
invalid_value_replacements.update(dict.fromkeys(datetype_col_names, "31-12-50"))

# Apply all replacements in a single projection; chaining one withColumn call
# per column adds a projection to the query plan each time.
cleaned_columns = []
for colname in df.columns:
    if colname in invalid_value_replacements:
        cleaned_columns.append(
            when(col(colname).isin(invalid_markers), invalid_value_replacements[colname])
            .otherwise(col(colname))
            .alias(colname)
        )
    else:
        # Columns without a rule pass through unchanged.
        cleaned_columns.append(col(colname))
df = df.select(cleaned_columns)

In [ ]:
# Log the name and data type of every column.
from pyspark.sql.functions import col,isnan, when, count

# df.dtypes holds (name, type) pairs. They are unpacked into their own names
# because a loop variable called ``col`` would rebind the imported ``col``
# function and break any later code that relies on it.
for column_name, column_type in df.dtypes:
    logger.info(f'{column_name} , {column_type}')


In [ ]:
# Show, for every column, how many values are null or NaN.
from pyspark.sql.functions import col,isnan, when, count

missing_counts = []
for c in df.columns:
    is_missing = isnan(c) | col(c).isNull()
    # when() yields a value only for rows where the condition holds, so
    # count() tallies exactly those rows.
    missing_counts.append(count(when(is_missing, c)).alias(c))

df.select(missing_counts).show(vertical=True)

## Apply the cleansing rules for each column

In [ ]:
# Replace the remaining nulls with the default defined for each column.
#
# Every column holds strings at this point, so each default is written as a
# string: na.fill skips columns whose type does not match the fill value, which
# means a numeric default would leave the nulls of these columns in place.
#
# Only the null-replacement part of each rule is applied here; the rest of each
# rule is kept next to its column for reference.
null_defaults = {
    # taxpayer_type (StringType): replace nulls with "no_taxpayer_type".
    # Replace any value with three or more characters with "no_taxpayer_type".
    "taxpayer_type": "no_taxpayer_type",

    # fiscal_condition (StringType): replace nulls with "no_fiscal_condition".
    # Replace any value with three or more characters with "no_fiscal_condition".
    "fiscal_condition": "no_fiscal_condition",

    # regime_name (StringType): replace nulls with "no_regime_name".
    "regime_name": "no_regime_name",

    # taxpayer_size (StringType): replace nulls with "no_taxpayer_size".
    # Convert long names to the two character codes. Convert any local language
    # name to the English equivalent in two digit codes.
    "taxpayer_size": "no_taxpayer_size",

    # main_activity / sec1_activity / sec2_activity (StringType): replace nulls
    # with code "999999". Replace any value < 3 characters and > 8 characters
    # with code "999999". Replace any value between 4 and 8 characters that is
    # not a numeric string with code "999999".
    "main_activity": "999999",
    "sec1_activity": "999999",
    "sec2_activity": "999999",

    # employees_number (IntegerType): replace nulls with integer 1. Replace any
    # float number or alpha characters with integer 1.
    "employees_number": "1",

    # legal_reg_date / tax_reg_date / e_inv_enroll_date (DateType): replace
    # nulls with 1900-1-1. Replace any value that is not a date with 1900-1-1.
    "legal_reg_date": "1900-1-1",
    "tax_reg_date": "1900-1-1",
    "e_inv_enroll_date": "1900-1-1",

    # reported_assets (IntegerType): replace nulls with value 0. Convert any
    # locale value to the English equivalent. Replace any other value with 0.
    "reported_assets": "0",

    # total_capital ... credited_einvoicing_value (FloatType): replace nulls
    # with float number 1.0. Replace any alpha value with float number 1.0.
    # Convert integers to float.
    "total_capital": "1.0",
    "social_capital": "1.0",
    "total_assets": "1.0",
    "total_fixed_assets": "1.0",
    "total_liabilities": "1.0",
    "gross_income": "1.0",
    "net_income": "1.0",
    "total_vat_sales": "1.0",
    "credited_einvoicing_value": "1.0",

    # state (StringType): replace nulls with code "00". For values in the
    # format "006", "0006", "00006", "000006", etc., keep the first two
    # characters from right to left and remove the rest. Replace all-zero
    # values ("0", "00", "000", etc.) with the code "00".
    "state": "00",

    # municipality (StringType): replace nulls with code "000000". Convert any
    # other code out of the range 000001 to 999999 to "000000".
    "municipality": "000000",

    # city (StringType): replace nulls with code "000000".
    # NOTE(review): the rest of this rule was cut off in the notes it came from.
    "city": "000000",
}

# One fill for all columns; the mapping goes from column name to default.
df = df.na.fill(null_defaults)

In [ ]:
# Verify the cleansing: show the number of null or NaN values that remain in
# each column.
remaining_missing = [
    count(when(col(name).isNull() | isnan(name), name)).alias(name)
    for name in df.columns
]
df.select(remaining_missing).show(vertical=True)

# Write out the clean file to the bronze zone



In [ ]:
with tracer.span('Saving cleaned taxpayer profile file to ADLS'):
    # NOTE(review): output_file_name is assigned but not used by the write
    # below, which targets taxpayer_profile_cleaned_path directly.
    output_file_name = "tax_payer_profile"
    # Overwrite mode replaces whatever already exists at the destination.
    overwrite_writer = df.write.mode("overwrite")
    overwrite_writer.parquet(taxpayer_profile_cleaned_path)