In [0]:
# Auto-reload imported modules (here, the .py files under the includes folder) so edits are picked up without a restart.
# For production, decide whether to remove this to avoid surprises (and restart whenever the library changes)
# or keep it so that library changes are picked up automatically.
%load_ext autoreload
%autoreload 2

# To disable autoreload; run %autoreload 0

In [0]:
#IMPORT LIBRARIES
from pyspark.sql import SparkSession
import json
import sys
import importlib
import pyspark.sql.functions as F
from pyspark.sql.types import StructType

from delta.tables import DeltaTable

sys.path.append("/Workspace/BI-OVC")

# This works because each folder contains an __init__.py
from includes import control_functions
from includes import validations
from schema import fin_act_sch


In [0]:
# Create the PySpark session used by the rest of the notebook
# (getOrCreate reuses an already-running session when there is one).
spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# Log the start of this run.
# The identifiers below describe the process step being executed; the first two
# are reused later to look up the step's parameters in the control table.
process_setup_name = "Load BI OVC"
process_setup_step_name = "bronze to silver"
sys_modified_by_name = "NBK - Load Finance - bronze to silver"
source_system_code = "sapbr"

# The returned run id is passed to the validation lookup further down.
process_run_id = control_functions.log_process_run_start(
    process_setup_name,
    process_setup_step_name,
    source_system_code,
    sys_modified_by_name,
)

In [0]:
# Get the configuration of this process step from the control table.
df = control_functions.get_process_setup_parameters(process_setup_name, process_setup_step_name)

# Fetch the first configuration row ONCE. Collecting each column separately
# launched one Spark job per value and could, in principle, read different
# rows for different columns if the frame held more than one row.
setup_row = df.first()
if setup_row is None:
    # Fail with a clear message instead of an IndexError on an empty result.
    raise ValueError(
        f"No process setup parameters found for '{process_setup_name}' / '{process_setup_step_name}'"
    )

# Source side of the step.
sourceSchema = setup_row['process_setup_source_data_definition']
SourceBucket = setup_row['process_setup_source_bucket_name']
SourceBucketFolderKey = setup_row['process_setup_source_bucket_folder_key']

# Target side of the step.
targetSchema = setup_row['process_setup_target_data_definition']
TargetBucket = setup_row['process_setup_target_bucket_name']
TargetBucketFolderKey = setup_row['process_setup_target_bucket_folder_key']

# Table and layer names.
targetTable = setup_row['process_setup_target_table_name']
sourceTable = setup_row['process_setup_source_table_name']
sourceLayer = setup_row['process_setup_source_layer']
targetLayer = setup_row['process_setup_target_layer']

In [0]:
# READ SCHEMAS - from .py
# Looks up the source and target schema objects in the project's schema module,
# keyed by the data-definition names read from the control table.
# NOTE(review): neither variable is referenced again in the visible cells — confirm they are still needed.
src_schema = fin_act_sch.get_schema(sourceSchema)
trgt_schema = fin_act_sch.get_schema(targetSchema)

In [0]:
# Load the current rows from the bronze Delta location and work out,
# per posting month, which file is the latest one.
source_path = "s3://" + SourceBucket + "/" + SourceBucketFolderKey + "/"
df_brz = []

try:
    # Listing the path first makes a missing location fail before the read.
    dbutils.fs.ls(source_path)

    # Read the bronze table.
    df_brz = spark.read.format("delta").load(source_path)

except Exception as e:
    print(f"❌ CAN'T READ DELTA {source_path}: {e}")
    raise

# Posting month (yyyyMM) derived from the yyyyMMdd posting date.
posting_month_id = F.date_format(F.to_date("POSTING_DATE", "yyyyMMdd"), "yyyyMM")

df_brz = (
    df_brz
    # keep only the rows flagged as current
    .where(F.col('DW_CURR_ROW_FLG') == F.lit(True))
    # validity dates are dropped here; silver stamps its own when writing
    .drop('DW_VALID_FROM_DT', 'DW_VALID_TO_DT')
    .withColumn("POSTING_MTH_ID", posting_month_id)
)

# Latest version available per month = highest file name within the month
# (this is a business definition).
df_brz_last_version = (
    df_brz
    .groupBy("POSTING_MTH_ID")
    .agg(F.max("DW_FILE_NAME").alias("DW_FILE_NAME"))
)

#display(df_brz)

In [0]:
# Keep only the bronze rows that belong to the latest file of each posting month.
# The inner join on (file name, posting month) drops every row from older files.
latest_version_keys = ["DW_FILE_NAME", "POSTING_MTH_ID"]
df_brz = df_brz.join(df_brz_last_version, on=latest_version_keys, how="inner")

#display(df_brz)

In [0]:
sys.path.append("/Workspace/BI-OVC")

# This works because each folder contains an __init__.py
from includes import control_functions
from includes import validations
from schema import fin_act_sch


In [0]:
#BS VALIDATIONS

# Get all 'BS' validations that apply to the current process run
# (looked up through the run id returned when the run start was logged).
df_validations = validations.get_object_validation(process_run_id, 'BS')

# Apply the validations to the bronze rows; the helper returns two frames:
# the records that passed and the records that were rejected.
df_validated, df_rejected = validations.business_validation(df_brz, df_validations)

In [0]:
# Preview the validated rows; swap the comment markers to inspect the rejected rows instead.
#display(df_rejected)
display(df_validated)

In [0]:
#WRITE IN SILVER
# Appends the validated rows to the silver Delta location as the new current
# version. When the table already exists, the current rows of every posting
# month present in the bronze extract are closed first (flag set to false,
# valid-to date stamped) so that the appended rows replace them.

if df_validated.limit(1).count() > 0:
    target_path = "s3://" + TargetBucket + "/" + TargetBucketFolderKey + "/"

    # Check that the target path exists; create it when listing fails.
    # `except Exception` instead of a bare `except` so KeyboardInterrupt /
    # SystemExit are not swallowed.
    try:
        dbutils.fs.ls(target_path)
    except Exception:
        dbutils.fs.mkdirs(target_path)

    # Current timestamp shifted from UTC to the "Brazil/East" time zone.
    load_ts = F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")

    # Add the DW columns to the validated data.
    # DataFrames are immutable: withColumn returns a NEW frame, so the result
    # must be kept. Previously the result was discarded in the existing-table
    # branch and the rows were appended without their validity dates.
    new_data = (
        df_validated
        .withColumn('DW_VALID_FROM_DT', load_ts)
        .withColumn('DW_VALID_TO_DT', F.lit(None).cast("timestamp"))
        .withColumn('DW_CURR_ROW_FLG', F.lit(True))
    )

    # When the target is already a Delta table, close the rows being replaced.
    # Otherwise the append below creates the table.
    if DeltaTable.isDeltaTable(spark, target_path):

        delta_table = DeltaTable.forPath(spark, target_path)

        # All current records already loaded in silver, with the posting
        # month id regenerated so it can be matched against the new extract.
        df_silver = (
            spark.read.format("delta").load(target_path)
            .filter(F.col('DW_CURR_ROW_FLG') == F.lit(True))
            .drop('DW_VALID_FROM_DT', 'DW_VALID_TO_DT')
            .withColumn("POSTING_MTH_ID", F.date_format(F.to_date("POSTING_DATE", "yyyyMMdd"), "yyyyMM"))
        )

        # Posting months present both in silver (current rows) and in the
        # bronze extract: these are the months being reprocessed.
        df_silver_reprocess = (
            df_silver.join(
                df_brz_last_version,
                on=["POSTING_MTH_ID"],
                how="inner"
            )
            .select("POSTING_MTH_ID")
            .distinct()
        )

        # Mark the current rows of the reprocessed months as no longer current.
        delta_table.alias("tgt").merge(
            df_silver_reprocess.alias("src"),
            "tgt.POSTING_MTH_ID = src.POSTING_MTH_ID"
        ).whenMatchedUpdate(
            condition="tgt.DW_CURR_ROW_FLG = true",
            set={
                "DW_CURR_ROW_FLG": F.lit(False),
                "DW_VALID_TO_DT": load_ts
            }
        ).execute()

    # Append the new current rows (creates the Delta table on the first load).
    new_data.write.format("delta").mode("append").save(target_path)


In [0]:
#WRITE IN SILVER REJECTED
# Appends the rejected rows to the "-rejected" Delta location as the new
# current version. When the table already exists, all of its current rows
# are closed first (flag set to false, valid-to date stamped).

if df_rejected.limit(1).count() > 0:
    target_path = "s3://" + TargetBucket + "-rejected" + "/" + TargetBucketFolderKey + "/"

    # Check that the target path exists; create it when listing fails.
    # `except Exception` instead of a bare `except` so KeyboardInterrupt /
    # SystemExit are not swallowed.
    try:
        dbutils.fs.ls(target_path)
    except Exception:
        dbutils.fs.mkdirs(target_path)

    # Current timestamp shifted from UTC to the "Brazil/East" time zone.
    load_ts = F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")

    # Add the DW columns to the rejected data.
    # DataFrames are immutable: withColumn returns a NEW frame, so the result
    # must be kept. Previously the result was discarded in the existing-table
    # branch and the rows were appended without their validity dates.
    new_data = (
        df_rejected
        .withColumn('DW_VALID_FROM_DT', load_ts)
        .withColumn('DW_VALID_TO_DT', F.lit(None).cast("timestamp"))
        .withColumn('DW_CURR_ROW_FLG', F.lit(True))
    )

    # When the target is already a Delta table, close its current rows.
    # Otherwise the append below creates the table.
    if DeltaTable.isDeltaTable(spark, target_path):

        delta_table_rejected = DeltaTable.forPath(spark, target_path)

        # Only possible when the existing table already carries the flag column.
        if "DW_CURR_ROW_FLG" in delta_table_rejected.toDF().columns:
            delta_table_rejected.update(
                condition="DW_CURR_ROW_FLG = true",
                set={
                    "DW_CURR_ROW_FLG": "false",
                    "DW_VALID_TO_DT": load_ts
                }
            )

    # Append the new current rows (creates the Delta table on the first load).
    new_data.write.format("delta").mode("append").save(target_path)


In [0]:
#table = "`latam-md-finance`.silver_rejected.tb_fin_variable_cost_act" 
#spark.sql("DROP TABLE IF EXISTS " + table + "")
#spark.sql("CREATE TABLE IF NOT EXISTS " + table + " USING DELTA LOCATION 's3a://latam-md-finance-silver-rejected/tb_fin_variable_cost_act'")