In [13]:
# Execute the file_ingestion.py script through IPython's %run magic.
# NOTE(review): the output recorded for this cell ends with
# "AttributeError: 'NoneType' object has no attribute 'group'" - confirm the
# script runs cleanly on a fresh kernel before relying on the cells below.
%run file_ingestion.py

Ingestion Process
Checking for updates


AttributeError: 'NoneType' object has no attribute 'group'

In [2]:
import os
from datetime import datetime as dt



import pyspark.sql.functions as F

from delta import *
from pyspark.sql import Window
from pyspark.sql.types import *

# Metastore schema (database) that holds the bronze and silver tables.
schema = 'planilla_publicas_panama'
# Folder the streaming ingestion reads the raw parquet files from.
contraloria_source_folder = 'contraloria/planillas/raw_data/'
# Base folder from which every table / aggregate location below is derived.
folder = 'planilla_publicas_panama.db'
bronze_table = f'{schema}.bronze'
bronze_location = f'{folder}/bronze'
silver_location =  f'{folder}/silver'
silver_table = f'{schema}.silver'
# Output paths of the two aggregate Delta datasets written further down.
agg_by_institucion = f'{folder}/agg_by_institucion'
agg_by_id = f'{folder}/summary_by_id'

spark.sql("set spark.sql.streaming.schemaInference=true")
# Adaptive query execution is controlled by `spark.sql.adaptive.enabled`.
# `SQLConf.ADAPTIVE_EXECUTION_ENABLED.key` is only the name of the Scala
# constant holding that key; passing it to SET creates an unrelated key that
# Spark never reads, so the real key is set here instead.
spark.conf.set("spark.sql.adaptive.enabled", "true")
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
# NOTE(review): presumably needed so the change-feed read in the silver update
# tolerates a starting timestamp newer than the last bronze commit - confirm.
spark.sql(
    "set spark.databricks.delta.changeDataFeed.timestampOutOfRange.enabled = true;"
)

DataFrame[key: string, value: string]

In [3]:
def _create_planilla_table(table_name, location):
    """Create an external Delta table with the shared planilla column layout.

    Bronze and silver use exactly the same columns, so the DDL lives in one
    place and the two tables cannot drift apart.

    Args:
        table_name: Fully qualified table name (``schema.table``).
        location: Storage location passed to the ``LOCATION`` clause.

    Returns:
        The DataFrame returned by ``spark.sql`` for the DDL statement.
    """
    return spark.sql(f"""
  CREATE EXTERNAL TABLE IF NOT EXISTS {table_name}
  (nombre string,
  apellido string,
  cedula string,
  cargo string,
  salario double,
  gasto double,
  estado string,
  fecha_de_inicio Timestamp,
  fecha_actualizacion Timestamp,
  fecha_consulta timestamp,
  nombre_reporte string,
  institucion string
  ) USING delta
  LOCATION '{location}'
  TBLPROPERTIES (delta.enableChangeDataFeed = true)
""")


spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema} ")
# The change data feed is enabled on both tables; the silver update reads
# bronze through it.
_create_planilla_table(bronze_table, bronze_location)
_create_planilla_table(silver_table, silver_location)

24/04/14 19:52:23 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/04/14 19:52:23 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/04/14 19:52:24 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/04/14 19:52:24 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hadoop@127.0.1.1


DataFrame[]

In [4]:
# List the available databases and render the result as a pandas DataFrame.
spark.sql(""" show databases; """).toPandas()

Unnamed: 0,namespace
0,default
1,planilla_publicas_panama


In [5]:

def load_raw_into_bronze():
    """Stream the raw parquet files into the bronze Delta table.

    Reads the raw folder as a stream with an explicit schema, converts the
    column names to snake_case, casts the three date columns to timestamps,
    records the originating file name and appends everything to
    ``bronze_table``. Blocks until the streaming query terminates.
    """
    # (column name, type) pairs in the order they appear in the raw files.
    raw_fields = [
        ("Nombre", StringType()),
        ("Apellido", StringType()),
        ("Cédula", StringType()),
        ("Cargo", StringType()),
        ("Salario", DoubleType()),
        ("Gasto", DoubleType()),
        ("Estado", StringType()),
        ("Fecha de inicio", StringType()),
        ("Fecha Actualizacion", StringType()),
        ("Fecha Consulta", StringType()),
        ("Institucion", StringType()),
    ]
    raw_schema = StructType(
        [StructField(name, dtype, True) for name, dtype in raw_fields]
    )
    raw_stream = spark.readStream.schema(raw_schema).parquet(
        f"{contraloria_source_folder}"
    )

    # Lower-case every column and replace spaces with underscores; the accented
    # "cédula" is renamed separately to its ASCII form.
    snake_case_cols = [
        F.col(name).alias(name.lower().replace(" ", "_"))
        for name in raw_stream.columns
    ]
    bronze_stream = raw_stream.select(snake_case_cols).withColumnRenamed(
        "cédula", "cedula"
    )

    # The date columns arrive as strings and are stored as timestamps.
    for ts_col in ("fecha_de_inicio", "fecha_actualizacion", "fecha_consulta"):
        bronze_stream = bronze_stream.withColumn(
            ts_col, F.col(ts_col).cast(TimestampType())
        )

    # Keep track of which source file each row came from.
    bronze_stream = bronze_stream.withColumn("nombre_reporte", F.input_file_name())

    query = (
        bronze_stream.writeStream.format("delta")
        .outputMode("append")
        .option("checkpointLocation", "contraloria/planillas/_checkpoint/bronze_planillas")
        .trigger(availableNow=True)
        .toTable(bronze_table)
    )
    query.awaitTermination()


load_raw_into_bronze()

24/04/14 19:52:25 WARN ObjectStore: Failed to get database global_temp, returning NoSuchObjectException
24/04/14 19:52:27 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


In [6]:
# Count the rows currently stored in the bronze Delta table.
DeltaTable.forName(spark,bronze_table).toDF().count()

                                                                                

1420329

In [7]:
def create_or_update_silver():
    """Merge the bronze rows added since the last silver commit into silver.

    Steps:
      1. Take the newest commit timestamp from the silver table history.
      2. Read the bronze change data feed starting at that timestamp.
      3. Keep, per institution, only rows from its most recent update and
         collapse rows that share all key columns to the one with the newest
         ``fecha_consulta``.
      4. Insert into silver every row whose key is not already present.
    """
    key_columns = [
        "nombre",
        "apellido",
        "cedula",
        "salario",
        "gasto",
        "estado",
        "fecha_de_inicio",
        "institucion",
        "cargo",
    ]

    def deduplicate_bronze(bronze_sdf, key_columns=key_columns):
        """Return the latest, de-duplicated rows of ``bronze_sdf``.

        Args:
            bronze_sdf: DataFrame with the bronze columns.
            key_columns: Columns that together identify one record.
        """
        per_institucion = Window.partitionBy(F.col("institucion"))
        # Sorting before dropDuplicates does not guarantee which duplicate
        # survives, so the newest row per key is selected explicitly.
        newest_first = Window.partitionBy(*key_columns).orderBy(
            F.col("fecha_consulta").desc()
        )
        return (
            bronze_sdf.withColumn(
                "actualizado",
                F.max(F.col("fecha_actualizacion")).over(per_institucion),
            )
            .where("fecha_actualizacion == actualizado")
            .drop("actualizado")
            .withColumn("_row_rank", F.row_number().over(newest_first))
            .where(F.col("_row_rank") == 1)
            .drop("_row_rank")
        )

    sink_table = DeltaTable.forName(spark, silver_table)
    last_update = (
        sink_table.history().select(F.max(F.col("timestamp"))).collect()[0][0]
    )

    source_sdf = deduplicate_bronze(
        spark.read.format("delta")
        .option("readChangeFeed", "true")
        .option("startingTimestamp", str(last_update))
        .table(bronze_table)
    )

    # `<=>` is null-safe equality: with plain `==` a NULL in any key column
    # makes the condition NULL, the row never matches, and it would be
    # inserted again on every run.
    merge_condition = " AND ".join(
        f"(source.{column} <=> target.{column})" for column in key_columns
    )

    (
        sink_table.alias("target")
        .merge(source_sdf.alias("source"), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )


create_or_update_silver()

In [8]:
# Count the rows currently stored in the silver Delta table.
DeltaTable.forName(spark, silver_table).toDF().count()

168633

In [9]:
# Latest snapshot of silver: for every institution keep only the rows whose
# `fecha_actualizacion` equals that institution's most recent update.
# The table is resolved by name, as in the other cells, so this does not
# depend on the working directory or on the default warehouse folder name.
window = Window.partitionBy(F.col("institucion"))
silver_sdf = (
    spark.read.table(silver_table)
    .withColumn("actualizado", F.max(F.col("fecha_actualizacion")).over(window))
    .where("fecha_actualizacion == actualizado")
    .drop("actualizado")
)

In [10]:
def create_overwrite_agg_by_institucion(silver_sdf):
    """Write per-institution payroll totals to the ``agg_by_institucion`` path.

    The output holds one row per (institucion, estado) plus one extra row per
    institution labelled ``estado = "TODOS"`` with the totals across all
    estados. Any existing data and schema at the destination is overwritten.

    Args:
        silver_sdf: DataFrame with at least the columns ``institucion``,
            ``estado``, ``salario``, ``gasto`` and ``cedula``.
    """

    def agg_silver_by_institucion(sdf, group_by):
        """Aggregate salaries, expenses and distinct people of ``sdf`` by ``group_by``."""
        # Aggregate the DataFrame that was passed in rather than the enclosing
        # function's variable, so the helper honours its own parameter.
        return (
            sdf.groupBy(group_by)
            .agg(
                F.sum(F.col("salario")).alias("salarios_totales"),
                F.sum(F.col("gasto")).alias("gastos_totales"),
                F.countDistinct(F.col("cedula")).alias("total_personas"),
            )
            .withColumn(
                "salario_mas_gasto", F.col("salarios_totales") + F.col("gastos_totales")
            )
        )

    totals_all_estados = agg_silver_by_institucion(silver_sdf, "institucion").withColumn(
        "estado", F.lit("TODOS")
    )
    totals_by_estado = agg_silver_by_institucion(silver_sdf, ["institucion", "estado"])

    (
        totals_all_estados.unionByName(totals_by_estado)
        .write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "True")
        .save(agg_by_institucion)
    )


create_overwrite_agg_by_institucion(silver_sdf)

In [11]:
# The aggregate is written as a Delta table, so it is read back as Delta.
# A format-less load uses the session's default source and bypasses the Delta
# transaction log, which can pick up files left over from earlier overwrites.
spark.read.format("delta").load(agg_by_institucion).toPandas()

Unnamed: 0,institucion,salarios_totales,gastos_totales,total_personas,salario_mas_gasto,estado
0,CONTRALORIA GENERAL DE LA REPUBLICA,6.564859e+06,44000.0,3917,6.608859e+06,PERMANENTE
1,CONTRALORIA GENERAL DE LA REPUBLICA,5.242466e+05,800.0,492,5.250466e+05,EVENTUAL
2,FISCALÍA DE CUENTAS,1.859500e+05,8500.0,112,1.944500e+05,PERMANENTE
3,FISCALÍA DE CUENTAS,5.750000e+04,0.0,37,5.750000e+04,EVENTUAL
4,FISCALÍA GENERAL ELECTORAL,3.584950e+05,23500.0,213,3.819950e+05,PERMANENTE
...,...,...,...,...,...,...
177,MINISTERIO DE DESARROLLO SOCIAL,2.621678e+06,23200.0,2808,2.644878e+06,TODOS
178,MINISTERIO DE EDUCACION,1.022351e+08,21500.0,59961,1.022566e+08,TODOS
179,MINISTERIO DE SALUD,4.065927e+07,38000.0,20801,4.069727e+07,TODOS
180,PROCURADURÍA GENERAL DE LA NACIÓN,1.217174e+07,701750.0,7111,1.287349e+07,TODOS


In [12]:
def latest_report_agg_by_id(silver_sdf):
    """Write a per-person (``cedula``) summary to the ``agg_by_id`` path.

    For every ``cedula`` the output contains the number of distinct jobs and
    institutions, salary and expense totals, the earliest and latest
    ``fecha_de_inicio``, the latest ``fecha_actualizacion`` and a list with
    one JSON ``detalle`` entry per source row. Any existing data and schema at
    the destination is overwritten. Returns ``None``.

    Args:
        silver_sdf: DataFrame with the silver columns.
    """
    detalle_json = F.to_json(
        F.struct(
            F.col("institucion"),
            F.col("estado"),
            F.col("fecha_de_inicio"),
            F.col("salario"),
            F.col("gasto"),
            F.col("fecha_consulta"),
        )
    ).alias("detalle")

    # A struct is used as the job identity instead of a concatenated string:
    # concat() yields NULL as soon as one part is NULL, and countDistinct then
    # skips that job entirely. A struct stays non-null and cannot collide
    # across field boundaries.
    job_identity = F.struct(
        F.col("institucion"),
        F.col("estado"),
        F.col("fecha_de_inicio"),
        F.col("salario"),
    )

    (
        silver_sdf.select("*", detalle_json)
        .withColumn("id_trabajo", job_identity)
        .groupBy("cedula")
        .agg(
            F.countDistinct("id_trabajo").alias("trabajos"),
            F.countDistinct("institucion").alias("cantidad_de_instituciones"),
            F.sum(F.col("salario")).alias("salario_total"),
            F.sum(F.col("gasto")).alias("gastos_totales"),
            F.min("fecha_de_inicio").alias("fecha_primer_trabajo_activo"),
            F.max("fecha_de_inicio").alias("fecha_ultimo_trabajo_activo"),
            F.max("fecha_actualizacion").alias("fecha_actualizacion"),
            F.collect_list(F.struct(F.col("detalle"))).alias("detalle"),
        )
        .withColumn(
            "salario_mas_gasto", F.col("salario_total") + F.col("gastos_totales")
        )
        .write.format("delta")
        .mode("overwrite")
        .option("overwriteSchema", "True")
        .save(agg_by_id)
    )


latest_report_agg_by_id(silver_sdf)

                                                                                