In [0]:
# Enables automatic reloading of imported libraries (here, the .py modules under includes). For production, decide whether to remove it to avoid any problems and restart whenever a library changes, or to keep it so that library changes are picked up automatically.
%load_ext autoreload
%autoreload 2

# To disable autoreload; run %autoreload 0

In [0]:
#IMPORT LIBRARIES
from pyspark.sql import SparkSession
import json
import sys
import importlib
import pyspark.sql.functions as F
from pyspark.sql.types import StructType
from datetime import datetime
from delta.tables import DeltaTable

sys.path.append("/Workspace/BI-OVC")

# This works because there is an __init__.py inside includes and inside config
from includes import file_functions 
from includes import control_functions
from schema import fin_act_sch


In [0]:
# Obtain the Spark session through the builder's getOrCreate(), using the
# application name "bi-ovc-test".
spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# LOG START
# Identifiers of this process/step; they are reused by the later cells when
# reading the setup parameters and when updating/closing the run log.
process_setup_name = "Load BI OVC"
process_setup_step_name = "raw to bronze"
sys_modified_by_name = "NBK - Load Finance - raw to bronze"

# The returned run id is passed to every later control_functions logging call.
process_run_id = control_functions.log_process_run_start(
    process_setup_name,
    process_setup_step_name,
    sys_modified_by_name,
)

In [0]:
#GET DATA FROM CONTROL TABLE
df = control_functions.get_process_setup_parameters(process_setup_name,process_setup_step_name)

# Fetch the setup row ONCE. The previous version issued one select().collect()
# per parameter, i.e. one Spark job per value.
setup_row = df.first()
if setup_row is None:
    # Fail with an explicit message instead of an IndexError on collect()[0].
    raise ValueError(
        f"No process setup parameters found for '{process_setup_name}' / '{process_setup_step_name}'"
    )

sourceFileNamePrefix = setup_row['process_setup_source_file_name']
sourceFileExtension = setup_row['process_setup_source_file_extension']
sourceFileDelimiter = setup_row['process_setup_source_file_delimiter']
sourceFileEncoding = setup_row['process_setup_source_file_encoding']
sourceFileNameMask = setup_row['process_setup_source_file_name_mask']
sourceSchema = setup_row['process_setup_source_file_schema']
SourceBucket = setup_row['process_setup_source_bucket_name']

# NOTE(review): the original read 'process_setup_target_bucket_folder_key' into
# SourceBucketFolderKey, which looks like a copy-paste slip from the target line.
# The source column is used when the control table exposes it; otherwise the
# previous behaviour is kept. Confirm the column name against the control table.
source_folder_key_column = 'process_setup_source_bucket_folder_key'
if source_folder_key_column not in df.columns:
    source_folder_key_column = 'process_setup_target_bucket_folder_key'
SourceBucketFolderKey = setup_row[source_folder_key_column]

TargetBucket = setup_row['process_setup_target_bucket_name']
TargetBucketFolderKey = setup_row['process_setup_target_bucket_folder_key']
tableName = setup_row['process_setup_target_table_name']
ArchiveBucket = setup_row['process_setup_archive_bucket_name']
ArchiveBucketFolderKey = setup_row['process_setup_archive_bucket_folder_key']


In [0]:
# READ SCHEMAS - from .py
# Resolve the schema whose name came from the control table (sourceSchema)
# through the project's fin_act_sch module. Later cells read its field names
# to name the columns of the raw DataFrame.
raw_schema = fin_act_sch.get_schema(sourceSchema)

In [0]:
# GET FILE NAMES FROM RAW SOURCE
files_to_process = []

# Folder in the source bucket that is listed for candidate files.
raw_generic_path = "s3://" + SourceBucket + "/" + SourceBucketFolderKey + "/"

files = dbutils.fs.ls(raw_generic_path)


def _is_expected_file(entry):
    """Return True when the entry name has the configured prefix and extension (case-insensitive)."""
    lowered_name = entry.name.lower()
    if not lowered_name.startswith(sourceFileNamePrefix.lower()):
        return False
    return lowered_name.endswith(f".{sourceFileExtension.lower()}")


# Keep only the matching entries, in sorted order.
files_to_process = sorted(filter(_is_expected_file, files))

if len(files_to_process) == 0:
    print("⚠️ NO MATCHING FILES")

In [0]:
# RAW -> BRONZE LOAD
# For every matching raw file: read it, split the rows into accepted/rejected
# according to the validation rules of this run, append each set to its Delta
# table with the DW_* audit columns, and move the file to the archive location.
archive_path = "s3://" + ArchiveBucket + "/" + ArchiveBucketFolderKey
bronze_generic_path = "s3://" + TargetBucket + "/"


def _dw_now():
    """Return a Column with the current timestamp converted from UTC to the Brazil/East zone."""
    return F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")


def _build_validation_condition(validations):
    """Combine the validation rules into one boolean Column.

    validations: iterable of rows exposing 'object_name', 'validation_rule'
        and 'validation_rule_detail'.
    Returns a Column that is true for rows passing every rule. With no rules
    every row passes (previously an empty rule set raised IndexError).
    """
    conditions = []

    for validation in validations:
        col_name = validation['object_name']
        rule_type = validation['validation_rule']
        rule_detail = validation['validation_rule_detail']

        # Format rules accept NULL values; only 'not_null' rejects them.
        if rule_type == 'date_format':
            conditions.append(F.expr(f"try_to_date({col_name}, '{rule_detail}') is not null") | F.col(col_name).isNull())
        elif rule_type == 'numeric_format':
            conditions.append(F.expr(f"try_cast({col_name} as {rule_detail}) is not null") | F.col(col_name).isNull())
        elif rule_type == 'not_null':
            conditions.append(F.col(col_name).isNotNull())
        else:
            # Unknown rule types were silently skipped before; make that visible.
            print(f"⚠️ UNKNOWN VALIDATION RULE '{rule_type}' FOR '{col_name}' - IGNORED")

    if not conditions:
        return F.lit(True)

    combined = conditions[0]
    for condition in conditions[1:]:
        combined = combined & condition
    return combined


def _write_scd_delta(df_to_write, delta_path, file_name, close_current_rows):
    """Append df_to_write to the Delta table at delta_path, adding the DW_* audit columns.

    df_to_write: rows to append (without audit columns).
    delta_path: location of the Delta table.
    file_name: value stored in DW_FILE_NAME.
    close_current_rows: when True, rows already in the table with
        DW_CURR_ROW_FLG = true are closed (flag set to false and DW_VALID_TO_DT
        stamped) before the new rows are appended.
    Raises: re-raises any exception from the Delta update/write after printing it.
    """
    # If the location cannot be listed, create an empty Delta table there with a
    # placeholder schema; the real columns are added by mergeSchema on append.
    try:
        dbutils.fs.ls(delta_path)
    except Exception:
        spark.createDataFrame([], "DW_VALID_FROM_DT timestamp").write.format("delta").save(delta_path)

    try:
        df_enriched = df_to_write.withColumn('DW_VALID_FROM_DT', _dw_now()) \
            .withColumn('DW_VALID_TO_DT', F.lit(None).cast("timestamp")) \
            .withColumn('DW_CURR_ROW_FLG', F.lit(True)) \
            .withColumn('DW_FILE_NAME', F.lit(file_name))

        if DeltaTable.isDeltaTable(spark, delta_path):
            delta_table = DeltaTable.forPath(spark, delta_path)

            # The column check covers a table that was just created with the
            # placeholder schema and therefore has no DW_CURR_ROW_FLG yet.
            if close_current_rows and "DW_CURR_ROW_FLG" in delta_table.toDF().columns:
                delta_table.update(
                    condition="DW_CURR_ROW_FLG = true",
                    set={
                        "DW_CURR_ROW_FLG": "false",
                        "DW_VALID_TO_DT": _dw_now()
                    }
                )
            mode = "append"
        else:
            mode = "overwrite"  # first load

        df_enriched.write.format("delta").mode(mode).option("mergeSchema", "true").save(delta_path)

    except Exception as e:
        print(f"❌ ERROR WRITING DELTA TABLE {delta_path}: {e}")
        raise


source_cnt = 0
target_cnt = 0

# One flag PER TABLE: the rows that were current before this run are closed on
# the first write to that table in this run, so rows appended by earlier files
# of the same run stay current. The previous single flag was never set
# ("== 1" instead of "= 1") and closing was tied to the first file only, so a
# first file without rows for a table left that table's old rows open.
bronze_current_rows_closed = False
rejected_current_rows_closed = False

if files_to_process:
    # Validation rules are looked up with the same arguments for every file, so
    # they are fetched once (assumes they do not change during a run).
    validations = control_functions.get_object_validation(process_run_id, '').collect()
    cond_bronze = _build_validation_condition(validations)

    column_names = [field.name for field in raw_schema.fields]

    bronze_delta_path = bronze_generic_path.rstrip("/") + "/" + tableName
    rejected_delta_path = bronze_generic_path.rstrip("/") + "-rejected" + "/" + tableName

    source_path = raw_generic_path.rstrip("/")
    target_path = archive_path.rstrip("/")

    # mkdirs creates the folder only when it is missing, so no prior ls is needed.
    dbutils.fs.mkdirs(target_path)

    for file in files_to_process:
        #READ SOURCE FILE AND ADD COLUMN NAMES TO DATAFRAME
        file_path = raw_generic_path + file.name

        try:
            # No reader schema is applied: the original passed schema= through
            # options(), which only sets a data-source option. raw_schema is
            # used below for the column names only.
            df_raw = spark.read.options(encoding=sourceFileEncoding, delimiter=sourceFileDelimiter, header=False).csv(file_path)

        except Exception as e:
            print(f"❌ ERROR READING FILE {file_path}: {e}")
            raise

        # Rename columns using schema names
        df_raw = df_raw.toDF(*column_names)

        # Count of source rows before any validation, accumulated across files
        source_cnt += df_raw.count()

        # Rows that pass every validation / rows that fail at least one
        df_bronze = df_raw.filter(cond_bronze)
        df_bronze_rejected = df_raw.filter(~cond_bronze)

        #WRITE IN BRONZE
        # Counted once; reused for the emptiness check and the target total.
        accepted_cnt = df_bronze.count()
        if accepted_cnt > 0:
            _write_scd_delta(df_bronze, bronze_delta_path, file.name, close_current_rows=not bronze_current_rows_closed)
            bronze_current_rows_closed = True
        else:
            print("⚠️ EMPTY DATAFRAME")

        #WRITE IN BRONZE_REJECTED
        if df_bronze_rejected.limit(1).count() > 0:
            _write_scd_delta(df_bronze_rejected, rejected_delta_path, file.name, close_current_rows=not rejected_current_rows_closed)
            rejected_current_rows_closed = True
        else:
            print("DataFrame vacío, no se sobrescribió el bronze.")

        # Count of rows moved to target (not rejected), accumulated across files
        target_cnt += accepted_cnt

        # Always move the file to the archive, keeping its original name
        source_file_full = f"{source_path}/{file.name}"
        target_file_full = f"{target_path}/{file.name}"

        # Copy first and delete afterwards, so a failed copy never loses the source file
        dbutils.fs.cp(source_file_full, target_file_full)
        dbutils.fs.rm(source_file_full)

        print(f"Archivo movido: {source_file_full} → {target_file_full}")

    #LOG SOURCE & TARGET RECORD COUNT
    # Updated once, outside of the loop, with the totals of all files
    control_functions.log_process_run_update_value(process_run_id,sys_modified_by_name,'process_run_source_record_count', source_cnt)

    control_functions.log_process_run_update_value(process_run_id,sys_modified_by_name,'process_run_target_record_count', target_cnt)
else:
    print("⚠️ NO FILES TO PROCESS")


In [0]:
#table = "`latam-md-finance`.bronze_rejected.tb_fin_variable_cost_act" 
#spark.sql("DROP TABLE IF EXISTS " + table + "")
#spark.sql("CREATE TABLE IF NOT EXISTS " + table + " USING DELTA LOCATION 's3a://latam-md-finance-bronze-rejected/tb_fin_variable_cost_act'")

#print(table)

In [0]:
#LOG END
# Record the end of the run identified by process_run_id (the id returned by
# log_process_run_start at the top of the notebook).
control_functions.log_process_run_end(process_run_id,sys_modified_by_name)