In [0]:
# IMPORT LIBRARIES
# Third-party / standard-library dependencies used by the notebook.
from pyspark.sql import SparkSession
import json
import sys
import importlib
import pyspark.sql.functions as F
from pyspark.sql.types import StructType
from datetime import datetime
from delta.tables import DeltaTable

# Make the project packages under the workspace folder importable.
sys.path.append("/Workspace/BI-OVC")
from includes import file_functions # this works because there is an __init__.py inside includes
from config import config # this is config.py from the config folder; it works because an __init__.py exists there
from schema import fin_act_sch


In [0]:
# Obtain the Spark session for this notebook under the "bi-ovc-test" app name.
spark = (
    SparkSession.builder
    .appName("bi-ovc-test")
    .getOrCreate()
)

In [0]:
# GET DATA FROM CONTROL TABLE
# Look up the ingestion metadata (file naming, parsing options, target table)
# for the file identified by config.fin_var_cost_act_file_name.

query = "SELECT * FROM " + config.control_table + " WHERE fileName = '" + config.fin_var_cost_act_file_name + "'"

df = spark.sql(query)

# Fetch all needed columns from ONE row in a single Spark job. Collecting each
# column separately would run the query seven times and, with no ORDER BY,
# would not guarantee that every value comes from the same row.
control_row = df.select(
    'fileName',
    'fileExtension',
    'fileDelimiter',
    'fileEncoding',
    'fileSchema',
    'fileNameMask',
    'tableName',
).first()

# first() returns None on an empty result; fail with an explicit message
# instead of an opaque IndexError.
if control_row is None:
    raise ValueError(
        f"No row found in control table {config.control_table} "
        f"for fileName = '{config.fin_var_cost_act_file_name}'"
    )

# Row is a tuple, so positional unpacking follows the select() order above.
(
    sourceFileNamePrefix,
    sourceFileExtension,
    sourceFileDelimiter,
    sourceFileEncoding,
    sourceSchema,
    sourceFileNameMask,
    tableName,
) = control_row



In [0]:
# READ SCHEMAS - from .py
# Resolve the schema for the raw file through the project's fin_act_sch module,
# keyed by the 'fileSchema' value read from the control table.
# The result is used later through its `.fields` attribute to name the columns
# (presumably a pyspark StructType — confirm against fin_act_sch.get_schema).
raw_schema = fin_act_sch.get_schema(sourceSchema)

In [0]:
# GET FILE NAMES FROM RAW SOURCE
# List the table's folder in the raw bucket and keep only the entries whose
# name matches the control-table prefix and extension (case-insensitive).
files_to_process = []

raw_generic_path = "".join(["s3://", config.raw_bucket, "/", tableName, "/"])

files = dbutils.fs.ls(raw_generic_path)


def _matches_source_pattern(entry_name):
    """Return True when entry_name starts with the expected prefix and ends with '.<extension>'."""
    lowered = entry_name.lower()
    if not lowered.startswith(sourceFileNamePrefix.lower()):
        return False
    return lowered.endswith("." + sourceFileExtension.lower())


for entry in files:
    if _matches_source_pattern(entry.name):
        files_to_process.append(entry)

# Sort so that files are handled in a deterministic order downstream.
files_to_process.sort()

if len(files_to_process) == 0:
    print("No matching files")

In [0]:
# Process every matching raw file in sorted order: load its rows into the
# bronze Delta table (when it has any), then always move the file to archive.
archive_path = "s3://" + config.archive_bucket + "/" + tableName
bronze_generic_path = "s3://" + config.bronze_bucket + "/"

if files_to_process:
    # Loop-invariant paths.
    bronze_delta_path = bronze_generic_path.rstrip("/") + "/" + tableName
    source_path = raw_generic_path.rstrip("/")
    target_path = archive_path.rstrip("/")

    # Rows that were current BEFORE this run must be closed exactly once, by the
    # first file that actually writes. Keying this on the loop position would
    # skip the close-out whenever the first file is empty, and closing on every
    # file would expire rows inserted earlier in this same run.
    previous_rows_closed = False

    for file in files_to_process:

        # READ SOURCE FILE AND ADD COLUMN NAMES TO DATAFRAME
        file_path = raw_generic_path + file.name

        try:
            # NOTE(review): `schema` is passed through options(); the documented
            # way to set a reader schema is DataFrameReader.schema(), so this
            # value is probably not applied as a schema — confirm. Columns get
            # their names from raw_schema via toDF() below. Switching to
            # .schema(raw_schema) would change the column types written to
            # bronze, so it is intentionally left unchanged here.
            df_raw = spark.read.options(
                encoding=sourceFileEncoding,
                delimiter=sourceFileDelimiter,
                header=False,
                schema=raw_schema,
            ).csv(file_path)
        except Exception as e:
            print(f"Error reading file {file_path}: {e}")
            raise

        df_raw = df_raw.toDF(*[field.name for field in raw_schema.fields])
        source_cnt = df_raw.count()

        # WRITE IN BRONZE
        # Reuse the count already computed instead of launching another job.
        if source_cnt > 0:
            try:
                dbutils.fs.ls(bronze_delta_path)
            except Exception:
                # Path not listable: bootstrap the location with an empty,
                # single-column Delta table; the append below widens the schema
                # through mergeSchema.
                df_empty = spark.createDataFrame([], "DW_VALID_FROM_DT timestamp")
                df_empty.write.format("delta").save(bronze_delta_path)

            try:
                # Always enrich the DataFrame with the audit columns before writing.
                df_raw = df_raw.withColumn('DW_VALID_FROM_DT', F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")) \
                    .withColumn('DW_VALID_TO_DT', F.lit(None).cast("timestamp")) \
                    .withColumn('DW_CURR_ROW_FLG', F.lit(True)) \
                    .withColumn('DW_FILE_NAME', F.lit(file.name))

                if DeltaTable.isDeltaTable(spark, bronze_delta_path):
                    delta_table = DeltaTable.forPath(spark, bronze_delta_path)

                    # Expire the rows that were current before this run (only once).
                    if not previous_rows_closed and "DW_CURR_ROW_FLG" in delta_table.toDF().columns:
                        delta_table.update(
                            condition="DW_CURR_ROW_FLG = true",
                            set={
                                "DW_CURR_ROW_FLG": "false",
                                "DW_VALID_TO_DT": F.from_utc_timestamp(F.current_timestamp(), "Brazil/East")
                            }
                        )

                    mode = "append"
                else:
                    mode = "overwrite"  # first load

                # From here on the table holds rows written by this run, so later
                # files must not expire them.
                previous_rows_closed = True

                df_raw.write.format("delta").mode(mode).option("mergeSchema", "true").save(bronze_delta_path)

            except Exception as e:
                print(f"Error writing file {file.name} to {bronze_delta_path}: {e}")
                raise
        else:
            print("DataFrame vacío, no se sobrescribió el bronze.")

        # NOTE(review): the original recounted df_raw (the source DataFrame, not
        # the bronze table), which rescans the file only to obtain the same
        # number; reuse source_cnt. Confirm whether a real target-side count is
        # wanted.
        target_cnt = source_cnt

        # Always move to archive
        # Full path of the source file.
        source_file_full = f"{source_path}/{file.name}"

        # Verify that the source file is still present. A separate name is used
        # so the `files` listing (FileInfo objects) from the previous cell is
        # not overwritten with a list of strings.
        try:
            raw_file_names = [entry.name for entry in dbutils.fs.ls(source_path)]
            if file.name not in raw_file_names:
                print(f"Archivo no encontrado: {source_file_full}")
        except Exception as e:
            print(f"Error al acceder al path origen: {e}")
            raise

        # Create the destination folder if it cannot be listed.
        try:
            dbutils.fs.ls(target_path)
        except Exception:
            dbutils.fs.mkdirs(target_path)

        # The archived file keeps its original name (no timestamp suffix).
        target_file_name = file.name

        # Full destination path.
        target_file_full = f"{target_path}/{target_file_name}"

        # Copy, then delete the source.
        dbutils.fs.cp(source_file_full, target_file_full)
        dbutils.fs.rm(source_file_full)

        print(f"Archivo movido: {source_file_full} → {target_file_full}")
else:
    print("No files to process")
