In [1]:
# Execute the ingestion script inside this kernel before any other cell runs.
# NOTE(review): later cells use `spark` without creating it in this notebook --
# presumably it is defined by this script or by `config`; confirm.
%run file_ingestion.py

Ingestion Process
Checking for updates
                   2024-04-02 08:20:09
no updates available


In [2]:
import os
from datetime import datetime as dt



import pyspark.sql.functions as F
from config import *
from delta import *
from pyspark.sql import Window
from pyspark.sql.types import *

# Session-level settings used by the streaming ingest and the change-data-feed
# reads further down.
spark.sql("set spark.sql.streaming.schemaInference=true")
# The previous statement set the literal key "SQLConf.ADAPTIVE_EXECUTION_ENABLED.key",
# which is the name of the Scala constant rather than a Spark option. The option
# that constant stands for is spark.sql.adaptive.enabled.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
# Tolerate change-data-feed reads whose starting timestamp falls outside the
# committed range. Kept as the last expression so the cell output is unchanged.
spark.sql(
    "set spark.databricks.delta.changeDataFeed.timestampOutOfRange.enabled = true;"
)

DataFrame[key: string, value: string]

In [3]:
# Bootstrap the schema and the bronze Delta table. Both CREATE statements use
# IF NOT EXISTS, so re-running the cell is harmless.
create_schema_ddl = f"CREATE SCHEMA IF NOT EXISTS {schema} "

create_bronze_ddl = f"""
  CREATE EXTERNAL TABLE IF NOT EXISTS {bronze_table} 
  (nombre string,
  apellido string,
  cedula string,
  cargo string,
  salario double,
  gasto double,
  estado string,
  fecha_de_inicio Timestamp,
  fecha_actualizacion Timestamp,
  fecha_consulta timestamp,
  nombre_reporte string,
  institucion string 
  
  ) USING delta
  LOCATION '{bronze_location}'
  TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""

enable_bronze_cdf_ddl = (
    f"ALTER TABLE  {bronze_table}  SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
)

spark.sql(create_schema_ddl)
spark.sql(create_bronze_ddl)
# Set the property explicitly as well: CREATE ... IF NOT EXISTS is a no-op when
# the table is already there, so a pre-existing table would not pick it up.
spark.sql(enable_bronze_cdf_ddl)

24/04/12 16:09:43 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/04/12 16:09:43 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/04/12 16:09:44 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/04/12 16:09:44 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hadoop@127.0.1.1
24/04/12 16:09:45 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
                                                                                

DataFrame[]

In [4]:
# spark.sql returns a lazy DataFrame, so this cell only displays its schema
# (`DataFrame[namespace: string]`); append `.show()` to list the databases.
spark.sql(""" show databases; """)

DataFrame[namespace: string]

In [5]:

def load_raw_into_bronze():
    """Stream the raw parquet reports into the bronze Delta table.

    Reads every parquet file under ``contraloria_source_folder`` with an
    explicit schema, normalises the column names (lower-case, underscores
    instead of spaces, ``cédula`` -> ``cedula``), casts the three date columns
    from string to timestamp, records the source file of each row in
    ``nombre_reporte`` and appends the result to ``bronze_table``.

    The query is started with the ``availableNow`` trigger and this function
    blocks until it terminates. Returns ``None``.
    """
    raw_schema = StructType(
        [
            StructField("Nombre", StringType(), True),
            StructField("Apellido", StringType(), True),
            StructField("Cédula", StringType(), True),
            StructField("Cargo", StringType(), True),
            StructField("Salario", DoubleType(), True),
            StructField("Gasto", DoubleType(), True),
            StructField("Estado", StringType(), True),
            StructField("Fecha de inicio", StringType(), True),
            StructField("Fecha Actualizacion", StringType(), True),
            StructField("Fecha Consulta", StringType(), True),
            StructField("Institucion", StringType(), True),
        ]
    )
    raw_stream = spark.readStream.schema(raw_schema).parquet(
        f"{contraloria_source_folder}"
    )

    # "Fecha de inicio" -> "fecha_de_inicio", and so on for every raw header.
    normalised_columns = [
        F.col(raw_name).alias(raw_name.lower().replace(" ", "_"))
        for raw_name in raw_stream.columns
    ]
    bronze_stream = raw_stream.select(normalised_columns).withColumnRenamed(
        "cédula", "cedula"
    )

    # The dates arrive as strings; the bronze table stores them as timestamps.
    for date_column in ("fecha_de_inicio", "fecha_actualizacion", "fecha_consulta"):
        bronze_stream = bronze_stream.withColumn(
            date_column, F.col(date_column).cast(TimestampType())
        )
    bronze_stream = bronze_stream.withColumn("nombre_reporte", F.input_file_name())

    bronze_query = (
        bronze_stream.writeStream.format("delta")
        .outputMode("append")
        .option("checkpointLocation", "contraloria/planillas/_checkpoint/bronze_planillas")
        .trigger(availableNow=True)
        .toTable(bronze_table)
    )
    bronze_query.awaitTermination()


load_raw_into_bronze()

24/04/12 16:09:49 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [6]:
def create_or_update_silver():
    """Create the silver Delta table from bronze, or merge new bronze rows into it.

    * If a Delta table already exists at ``silver_location``, the bronze change
      data feed is read starting from the timestamp of the latest silver commit,
      de-duplicated, and every row whose key is not yet present in silver is
      inserted (insert-only merge).
    * Otherwise the whole bronze table is de-duplicated and written as a new
      table ``silver_table`` stored at ``silver_location``, partitioned by
      ``institucion``, with the change data feed enabled.

    Returns ``None``.

    NOTE(review): if an earlier run already registered ``silver_table`` at a
    different location, the create branch still raises
    TABLE_OR_VIEW_ALREADY_EXISTS; that stale table has to be dropped once.
    """
    key_columns = [
        "nombre",
        "apellido",
        "cedula",
        "salario",
        "gasto",
        "estado",
        "fecha_de_inicio",
        "institucion",
        "cargo",
    ]

    def deduplicate_bronze(bronze_sdf, key_columns=key_columns):
        """Keep each institution's latest update, one row per business key.

        Rows are first restricted to the maximum ``fecha_actualizacion`` of
        their ``institucion``. Among rows sharing the same ``key_columns`` the
        one with the most recent ``fecha_consulta`` is kept.
        """
        per_institucion = Window.partitionBy(F.col("institucion"))
        # orderBy(...).dropDuplicates(...) does not guarantee which duplicate
        # survives, so the newest row is picked explicitly with row_number().
        newest_first = Window.partitionBy(*key_columns).orderBy(
            F.col("fecha_consulta").desc()
        )
        return (
            bronze_sdf.withColumn(
                "actualizado", F.max(F.col("fecha_actualizacion")).over(per_institucion)
            )
            .where("fecha_actualizacion == actualizado")
            .drop("actualizado")
            .withColumn("_fila", F.row_number().over(newest_first))
            .where(F.col("_fila") == 1)
            .drop("_fila")
        )

    if DeltaTable.isDeltaTable(spark, silver_location):

        sink_table = DeltaTable.forPath(spark, silver_location)
        last_update = (
            sink_table.history().select(F.max(F.col("timestamp"))).collect()[0][0]
        )

        source_sdf = deduplicate_bronze(
            spark.read.format("delta")
            .option("readChangeFeed", "true")
            .option("startingTimestamp", str(last_update))
            .table(bronze_table)
            # Change-feed bookkeeping columns are not part of the silver schema.
            .drop("_change_type", "_commit_version", "_commit_timestamp")
        )

        # `<=>` is NULL-safe equality. With `==`, a row holding NULL in any key
        # column never matches its copy in silver and is inserted on every run.
        merge_condition = " AND ".join(
            f"(source.{column} <=> target.{column})" for column in key_columns
        )

        (
            sink_table.alias("target")
            .merge(source_sdf.alias("source"), merge_condition)
            .whenNotMatchedInsertAll()
            .execute()
        )

    else:

        source_sdf = deduplicate_bronze(spark.read.format("delta").table(bronze_table))

        (
            source_sdf.write.format("delta")
            .partitionBy("institucion")
            # Store the table where the existence check above (and the readers
            # of silver_location) look for it. Without an explicit path the
            # table is created elsewhere, the check stays False, and the next
            # run fails with TABLE_OR_VIEW_ALREADY_EXISTS.
            .option("path", silver_location)
            .saveAsTable(silver_table)
        )
        spark.sql(
            f"ALTER TABLE  {silver_table}  SET TBLPROPERTIES (delta.enableChangeDataFeed = true)"
        )


create_or_update_silver()

AnalysisException: [TABLE_OR_VIEW_ALREADY_EXISTS] Cannot create table or view `planilla_publicas_panama`.`silver` because it already exists.
Choose a different name, drop or replace the existing object, or add the IF NOT EXISTS clause to tolerate pre-existing objects.

In [None]:
# Re-read silver and keep, for every institution, only the rows that belong to
# its most recent `fecha_actualizacion`.
window = Window.partitionBy(F.col("institucion"))
silver_latest_update = F.max(F.col("fecha_actualizacion")).over(window)
silver_all_sdf = spark.read.format("delta").load(silver_location)
silver_sdf = (
    silver_all_sdf.withColumn("actualizado", silver_latest_update)
    .where("fecha_actualizacion == actualizado")
    .drop("actualizado")
)

In [None]:
def create_overwrite_agg_by_institucion(silver_sdf):
    """Rebuild the per-institution aggregate table from ``silver_sdf``.

    Two aggregations are computed and stacked with ``unionByName``:

    * one row per ``institucion`` across all states, tagged ``estado = "TODOS"``;
    * one row per (``institucion``, ``estado``).

    The result overwrites the Delta table at ``agg_by_institucion`` (schema
    included). Returns ``None``.
    """

    def agg_silver_by_institucion(sdf, group_by):
        """Aggregate ``sdf`` by ``group_by`` (a column name or list of names).

        Produces the salary total, the expense total, the number of distinct
        ``cedula`` values and the sum of both totals.
        """
        # Aggregate the frame that was passed in. The previous version ignored
        # `sdf` and always read the enclosing `silver_sdf`.
        return (
            sdf.groupBy(group_by)
            .agg(
                F.sum(F.col("salario")).alias("salarios_totales"),
                F.sum(F.col("gasto")).alias("gastos_totales"),
                F.countDistinct(F.col("cedula")).alias("total_personas"),
            )
            # NOTE(review): this is NULL whenever either total is NULL (a group
            # with no non-null salario or gasto) -- confirm that is intended.
            .withColumn(
                "salario_mas_gasto", F.col("salarios_totales") + F.col("gastos_totales")
            )
        )

    all_states_sdf = agg_silver_by_institucion(silver_sdf, "institucion").withColumn(
        "estado", F.lit("TODOS")
    )
    per_state_sdf = agg_silver_by_institucion(silver_sdf, ["institucion", "estado"])

    (
        all_states_sdf.unionByName(per_state_sdf)
        .write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "True")
        .save(agg_by_institucion)
    )


create_overwrite_agg_by_institucion(silver_sdf)

In [None]:
def latest_report_agg_by_id(silver_sdf):
    """Rebuild the per-person (``cedula``) aggregate table from ``silver_sdf``.

    For every ``cedula`` the output holds the number of distinct jobs and
    institutions, salary and expense totals, the earliest and latest
    ``fecha_de_inicio``, the latest ``fecha_actualizacion``, the list of
    per-row JSON details and ``salario_mas_gasto``. The result overwrites the
    Delta table at ``agg_by_id`` (schema included). Returns ``None``.
    """
    job_id_columns = ["institucion", "estado", "fecha_de_inicio", "salario"]

    # F.concat returns NULL as soon as one input is NULL, and countDistinct
    # skips NULLs, so jobs with a missing field were not counted. Each part is
    # therefore cast to string with NULL replaced by "", and the parts are
    # joined with a separator so adjacent values cannot run together.
    job_id = F.concat_ws(
        "|",
        *[
            F.coalesce(F.col(column).cast("string"), F.lit(""))
            for column in job_id_columns
        ],
    )

    # DataFrameWriter.save() returns None, so the chain is run only for its
    # side effect instead of being bound to a local.
    (
        silver_sdf.select(
            "*",
            F.to_json(
                F.struct(
                    F.col("institucion"),
                    F.col("estado"),
                    F.col("fecha_de_inicio"),
                    F.col("salario"),
                    F.col("gasto"),
                    F.col("fecha_consulta"),
                )
            ).alias("detalle"),
        )
        .withColumn("id_trabajo", job_id)
        .groupBy("cedula")
        .agg(
            F.countDistinct("id_trabajo").alias("trabajos"),
            F.countDistinct("institucion").alias("cantidad_de_instituciones"),
            F.sum(F.col("salario")).alias("salario_total"),
            F.sum(F.col("gasto")).alias("gastos_totales"),
            F.min("fecha_de_inicio").alias("fecha_primer_trabajo_activo"),
            F.max("fecha_de_inicio").alias("fecha_ultimo_trabajo_activo"),
            F.max("fecha_actualizacion").alias("fecha_actualizacion"),
            F.collect_list(F.struct(F.col("detalle"))).alias("detalle"),
        )
        .withColumn(
            "salario_mas_gasto", F.col("salario_total") + F.col("gastos_totales")
        )
        .write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "True")
        .save(agg_by_id)
    )


latest_report_agg_by_id(silver_sdf)