# Change source tax payer profile to standard schema
Reads the file containing the taxpayer profiles and applies the standard naming schema to its columns. The output is a dataset with the new column names, without any change to the data values.


In [ ]:
%%configure -f
{
"conf": {
    "spark.dynamicAllocation.disableIfMinMaxNotSpecified.enabled": true,
    "spark.dynamicAllocation.enabled": true,
    "spark.dynamicAllocation.minExecutors": 2,
    "spark.dynamicAllocation.maxExecutors": 20
   }
}

In [ ]:
# Run-time parameters of the notebook. The empty strings are placeholders;
# presumably the calling pipeline overrides them - TODO confirm.

# Identifier of the current batch; in this notebook it is only attached to log records.
batch_id = ''
# Folder that is scanned (recursively) for the taxpayer profile CSV files to process.
taxpayer_profile_uncompressed_path = ''
# Output prefix; 'tax_payer_profile' is appended to it directly, without a path
# separator, so it is expected to end with '/'.
taxpayer_profile_schema_applied_path = ''
# Only attached to log records in this notebook.
statistics_path = ''
# Field delimiter used both when reading the input CSV and when writing the output CSV.
data_separator = ''
# Only attached to log records in this notebook; the CSV reader is not given an encoding.
data_encoding = ''

In [ ]:
import datetime
import csv
import pandas as pd
from datetime import date
from calendar import monthrange
import time
import pyodbc
from pyspark.sql.functions import col, year, month, dayofmonth, isnan, when, count
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, FloatType

In [ ]:
# Initiate logging
import logging
from opencensus.ext.azure.log_exporter import AzureLogHandler
from opencensus.ext.azure.trace_exporter import AzureExporter
from opencensus.trace import config_integration
from opencensus.trace.samplers import AlwaysOnSampler
from opencensus.trace.tracer import Tracer

# Read the Application Insights connection string through the "keyvault" linked service.
instrumentation_connection_string = mssparkutils.credentials.getSecretWithLS("keyvault", "AppInsightsConnectionString")
# Correlate records emitted through the standard logging module with trace spans.
config_integration.trace_integrations(['logging'])

logger = logging.getLogger(__name__)
# logging.getLogger returns the same logger object for the same name, so
# re-running this cell in one session would stack an additional
# AzureLogHandler on it and export every record once per run of the cell.
# Attach the handler only when none is present yet.
if not any(isinstance(handler, AzureLogHandler) for handler in logger.handlers):
    logger.addHandler(AzureLogHandler(connection_string=instrumentation_connection_string))
logger.setLevel(logging.INFO)

# Tracer used by the processing loop to wrap each file in a span; every span is sampled.
tracer = Tracer(
    exporter=AzureExporter(
        connection_string=instrumentation_connection_string
    ),
    sampler=AlwaysOnSampler()
)

# Spool parameters: the run-time parameters are attached to the start-up log
# record as custom dimensions.
run_time_parameters = {'custom_dimensions': {
    'batch_id': batch_id,
    'statistics_path': statistics_path,
    'data_encoding': data_encoding,
    'data_separator': data_separator,
    'taxpayer_profile_uncompressed_path': taxpayer_profile_uncompressed_path,
    'taxpayer_profile_schema_applied_path': taxpayer_profile_schema_applied_path,
    'notebook_name': mssparkutils.runtime.context['notebookname']
} }

logger.info(f"{mssparkutils.runtime.context['notebookname']}: INITIALISED", extra=run_time_parameters)

In [ ]:
def _is_directory(entry):
    """Tell whether a listing entry returned by mssparkutils.fs.ls is a folder."""
    is_dir = getattr(entry, 'isDir', None)
    if is_dir is None:
        # Fall back to the original size-based heuristic when the entry
        # exposes no isDir flag.
        return entry.size == 0
    return bool(is_dir)


def deep_ls(path: str, max_depth=1):
    """
    List all files and folders in specified path and
    subfolders within maximum recursion depth.

    Files directly under ``path`` are yielded first. Sub-folders are then
    descended into while ``max_depth`` is greater than 1; once the depth
    limit is reached the folders themselves are yielded instead.

    Folders are recognised by the ``isDir`` flag of the listing entries
    rather than by ``size == 0``, so a zero-byte file is yielded as a file
    right away instead of being re-listed as if it were a folder on every
    remaining recursion level.

    :param path: folder to list.
    :param max_depth: number of folder levels to descend, counting ``path`` itself.
    :return: generator of the entries returned by ``mssparkutils.fs.ls``.
    """
    entries = mssparkutils.fs.ls(path)

    # Single pass: emit files immediately, remember folders for later
    folders = []
    for entry in entries:
        if _is_directory(entry):
            folders.append(entry)
        else:
            yield entry

    if max_depth > 1:
        # Depth budget left: walk into each sub-folder
        for folder in folders:
            yield from deep_ls(folder.path, max_depth - 1)
    else:
        # Depth limit reached: report the folders themselves
        yield from folders

In [ ]:
def applySchema(fileName, fullFilePath) :
    """
    Read one taxpayer profile CSV, keep the standard columns, cast them to
    their target types and write the result as CSV.

    The file is read with every column as a string and the target type is
    applied by an explicit cast. Schema inference is deliberately not used:
    it needs an additional pass over the data and can change values before
    the cast is applied (an all-digit ``taxpayer_id`` with leading zeros
    would be inferred as a number and lose the zeros when cast back to string).

    Reads the globals ``spark``, ``data_separator`` and
    ``taxpayer_profile_schema_applied_path``.

    :param fileName: not used by this function; kept for interface compatibility.
    :param fullFilePath: path of the CSV file to read (header row expected).
    :return: None. The output is written to
        ``{taxpayer_profile_schema_applied_path}tax_payer_profile`` in
        overwrite mode, so each call replaces the output of the previous one.
    """
    # Output columns, in output order, with the type each one is cast to.
    # NOTE(review): FloatType is single precision; large monetary amounts may
    # lose digits - confirm whether DoubleType is wanted here.
    column_types = [
        ('taxpayer_id', StringType()),
        ('taxpayer_type', StringType()),
        ('fiscal_condition', StringType()),
        ('regime_name', StringType()),
        ('taxpayer_size', StringType()),
        ('main_activity', StringType()),
        ('sec1_activity', StringType()),
        ('sec2_activity', StringType()),
        ('employees_number', IntegerType()),
        ('legal_reg_date', DateType()),
        ('tax_reg_date', DateType()),
        ('e_inv_enroll_date', DateType()),
        ('reported_assets', IntegerType()),
        ('total_capital', FloatType()),
        ('social_capital', FloatType()),
        ('total_assets', FloatType()),
        ('total_fixed_assets', FloatType()),
        ('total_liabilities', FloatType()),
        ('gross_income', FloatType()),
        ('net_income', FloatType()),
        ('total_vat_sales', FloatType()),
        ('credited_einvoicing_value', FloatType()),
        ('state', StringType()),
        ('municipality', StringType()),
        ('city', StringType()),
    ]

    # NOTE(review): the data_encoding parameter is not passed to the reader,
    # so the input is decoded with Spark's default - confirm this is intended.
    raw = spark.read.csv(fullFilePath, sep=data_separator, inferSchema=False, header=True)

    # One projection that both selects the standard columns and casts them
    typed = raw.select(*[raw[name].cast(dtype).alias(name) for name, dtype in column_types])

    typed.write.mode("overwrite").option("encoding", "UTF-8").csv(
        f'{taxpayer_profile_schema_applied_path}tax_payer_profile',
        header=True,
        sep=data_separator,
    )

In [ ]:
# Walk the uncompressed input folder (up to 20 folder levels deep) and apply
# the schema to every entry that deep_ls yields. deep_ls is a generator, so
# the listing happens lazily while the loop runs.
# NOTE(review): applySchema always writes to the same output location in
# overwrite mode, so with several input files only the last one processed
# remains - confirm that a single input file is expected.
file_names = deep_ls(taxpayer_profile_uncompressed_path,20)
for filename in file_names:  
    # One trace span per processed file
    with tracer.span('Applying schema on taxpayer profile file'):
        logger.info(f'Processing file: {filename.name}')
        # The first argument (name with an upper-case '.CSV' suffix removed) is
        # not used by applySchema; only the full path matters.
        applySchema(filename.name.replace('.CSV',''), filename.path)