In [0]:
#IMPORT LIBRARIES
from pyspark.sql import SparkSession
import json
import sys
import importlib
import pyspark.sql.functions as F
from pyspark.sql.types import StructType

from delta.tables import DeltaTable


In [0]:
# Create the PySpark session.
# getOrCreate() hands back the already-active session when one exists.

spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# Load the general configuration from the external config notebook.

subject_area = "bi-ovc"

# Run the config notebook and parse the JSON string it returns.
# The second argument (360) is the timeout in seconds.
config_str = dbutils.notebook.run(
    "../config/ibs-config",
    360,
    {"subject_area": subject_area},
)
config = json.loads(config_str)



In [0]:
# Run-level identifiers (neither is referenced by the other visible cells).
env = "dev"
source_system = "SAPBR"

# Base storage paths per layer, looked up in the loaded configuration.
bronze_generic_path = config[subject_area]["bronze"]
silver_generic_path = config[subject_area]["silver"]

# Source side (bronze).
sourceLayer = "bronze"
sourceSchema = "bronze-bi-ovc-sapbr"
sourceFolder = "fin_cost_act/sapbr"
sourceTable = "bi_ovc_sapbr_bronze"

# Target side (silver).
targetLayer = "silver"
targetSchema = "silver-bi-ovc-sapbr"
targetFolder = "fin_cost_act/sapbr"
targetTable = "bi_ovc_sapbr_silver"

In [0]:
# Read the source schema.
# The schema notebook is called with the schema name under the "bi-ovc-sch" key;
# the JSON it returns is parsed and the entry under that same key becomes a StructType.
params_sch = {"bi-ovc-sch": sourceSchema}

try:
    # 360 is the timeout in seconds for the child notebook run.
    all_sch_config_str = dbutils.notebook.run("schema/bi-ovc-schema", 360, params_sch)
    all_sch_config = json.loads(all_sch_config_str)  # load all elements

    src_schema_json = all_sch_config["bi-ovc-sch"]
    src_schema = StructType.fromJson(src_schema_json)
except Exception as exc:
    # Report in the cell output, then re-raise so the run stops here.
    print(f"[ERROR] Execution of bi-ovc-schema notebook failed: {exc}")
    raise


In [0]:
# Read the target schema.
# Same call as for the source schema, but passing the target schema name.
params_sch = {"bi-ovc-sch": targetSchema}

try:
    # 360 is the timeout in seconds for the child notebook run.
    all_sch_config_str = dbutils.notebook.run("schema/bi-ovc-schema", 360, params_sch)
    all_sch_config = json.loads(all_sch_config_str)  # load all elements

    trgt_schema_json = all_sch_config["bi-ovc-sch"]
    trgt_schema = StructType.fromJson(trgt_schema_json)
except Exception as exc:
    # Report in the cell output, then re-raise so the run stops here.
    print(f"[ERROR] Execution of bi-ovc-schema notebook failed: {exc}")
    raise


In [0]:
# Get the current data from Bronze.
source_path = bronze_generic_path + sourceFolder
df_brz = []  # placeholder; rebound to the DataFrame once the load succeeds

try:
    # Check that the path exists in DBFS before reading.
    dbutils.fs.ls(source_path)

    # Read the Delta table.
    df_brz = spark.read.format("delta").load(source_path)
except Exception as exc:
    print(f"xxx Can't read Delta {source_path}: {exc}")
    raise

df_brz = (
    df_brz
    # keep only the current rows
    .where(F.col("DW_CURR_ROW_FLG") == F.lit(True))
    # dropped because silver sets these fields with the new date and time
    .drop("DW_VALID_FROM_DT", "DW_VALID_TO_DT")
    # month id (yyyyMM) derived from POSTING_DATE parsed as yyyyMMdd
    .withColumn(
        "POSTING_MTH_ID",
        F.date_format(F.to_date("POSTING_DATE", "yyyyMMdd"), "yyyyMM"),
    )
)

# Last version available per month: the maximum DW_FILE_NAME of each POSTING_MTH_ID.
# NOTE(review): presumably file names sort chronologically -- confirm.
df_brz_last_version = (
    df_brz
    .groupBy("POSTING_MTH_ID")
    .agg(F.max("DW_FILE_NAME").alias("DW_FILE_NAME"))
)

#display(df_brz)

In [0]:
#display(df_brz_last_version)

In [0]:
# Get the last data available: keep only the rows that belong to the
# latest file of each month.

df_brz = df_brz.join(
    df_brz_last_version,
    on=["DW_FILE_NAME", "POSTING_MTH_ID"],  # join key
    how="inner",
)

#display(df_brz)

In [0]:
# WRITE IN SILVER
#
# Flow:
#   1. Make sure the target folder exists.
#   2. Build the rows to append: the bronze rows plus the DW validity columns.
#   3. If the target is already a Delta table, close out the current silver rows
#      (DW_CURR_ROW_FLG = false, DW_VALID_TO_DT = load timestamp) of every month
#      present in this load.
#   4. Append the new rows (this also creates the table on the first run).
#
# NOTE(review): steps 3 and 4 are two separate writes; if the append fails after
# the merge, the closed-out rows are not restored by this cell.

target_path = silver_generic_path + targetFolder

try:
    dbutils.fs.ls(target_path)
except Exception:
    # Listing failed (presumably the path is missing), so create it.
    # `except Exception` rather than a bare `except`, so that KeyboardInterrupt
    # and SystemExit are no longer swallowed here.
    dbutils.fs.mkdirs(target_path)

# Placeholder; stays an empty list when the target table does not exist yet.
df_slv = []

# Current timestamp shifted from UTC to the "Brazil/East" zone. This is a column
# expression: it is evaluated by each query that uses it, not once here.
load_ts = F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")

# Rows to append, with the DW columns set for a new current record.
# DataFrames are immutable: withColumn returns a NEW DataFrame, so the result
# must be kept and written. Previously the existing-table branch discarded it
# and appended df_brz without DW_VALID_FROM_DT / DW_VALID_TO_DT.
new_data = (
    df_brz
    .withColumn("DW_VALID_FROM_DT", load_ts)
    .withColumn("DW_VALID_TO_DT", F.lit(None).cast("timestamp"))
    .withColumn("DW_CURR_ROW_FLG", F.lit(True))
)

# If the target is an existing Delta table, close out the rows being reprocessed.
if DeltaTable.isDeltaTable(spark, target_path):

    delta_table = DeltaTable.forPath(spark, target_path)

    # Silver records that are currently flagged as current.
    df_slv = (
        spark.read.format("delta").load(target_path)
        .filter(F.col("DW_CURR_ROW_FLG") == F.lit(True))
        .drop("DW_VALID_FROM_DT", "DW_VALID_TO_DT")
        .withColumn(
            "POSTING_MTH_ID",
            F.date_format(F.to_date("POSTING_DATE", "yyyyMMdd"), "yyyyMM"),
        )
    )

    # Months that are about to be reprocessed (present both in silver and in
    # this load); their current rows get flagged as not current.
    df_slv_reprocess = (
        df_slv.join(
            df_brz_last_version,
            on=["POSTING_MTH_ID"],  # join key
            how="inner",
        )
        .select("POSTING_MTH_ID")
        .distinct()
    )

    # Set the records to reprocess to not-current and stamp their end date.
    delta_table.alias("tgt").merge(
        df_slv_reprocess.alias("src"),
        "tgt.POSTING_MTH_ID = src.POSTING_MTH_ID"
    ).whenMatchedUpdate(
        condition="tgt.DW_CURR_ROW_FLG = true",
        set={
            "DW_CURR_ROW_FLG": F.lit(False),
            "DW_VALID_TO_DT": load_ts,
        }
    ).execute()

# Append the new current rows; on the first run this creates the Delta table.
new_data.write.format("delta").mode("append").save(target_path)


In [0]:
#table = "`bi-ovc`" + ".test.bi_ovc_sapbr_silver" 
#spark.sql("DROP TABLE IF EXISTS " + table + "")
#spark.sql("CREATE TABLE IF NOT EXISTS " + table + " USING DELTA LOCATION '" +  silver_generic_path + targetFolder + "'")