In [2]:
from datetime import datetime as dt
import pyspark.sql.functions as F
from pyspark.sql.types import *
from delta import *
from config import *
import os


# Session-level Spark/Delta settings, all applied through spark.conf so the three
# options are set the same way (previously two went through SQL SET strings).

# Allow streaming file sources to infer their schema when none is supplied.
spark.conf.set("spark.sql.streaming.schemaInference", "true")
# Raise the limit on how many fields Spark includes when rendering plans/schemas as text.
spark.conf.set("spark.sql.debug.maxToStringFields", 1000)
# Let change-data-feed reads accept a start timestamp outside the table's commit range
# instead of raising; the silver incremental load relies on this.
spark.conf.set("spark.databricks.delta.changeDataFeed.timestampOutOfRange.enabled", "true")

# Make sure the table directories exist before any table is created or written.
# bronze_location / silver_location are presumably provided by `from config import *`.
os.makedirs(bronze_location, exist_ok=True)
os.makedirs(silver_location, exist_ok=True)

In [3]:
# Create the target schema, then register the bronze Delta table at bronze_location
# with the change data feed enabled. Both statements are no-ops if the objects exist.
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {schema}")

# DDL for the bronze table: the ten source columns plus nombre_reporte.
bronze_table_ddl = f"""
CREATE TABLE IF NOT EXISTS  {schema}.{bronze_table} (
    nombre string, 
    apellido string, 
    cedula string, 
    salario double, 
    gasto double, 
    estado string, 
    fecha_de_inicio date, 
    fecha_actualizacion timestamp, 
    fecha_consulta timestamp, 
    institucion string,
    nombre_reporte string
    )
USING delta
LOCATION '{bronze_location}'
TBLPROPERTIES (delta.enableChangeDataFeed = true)
"""

spark.sql(bronze_table_ddl)

24/02/15 20:19:14 WARN HiveConf: HiveConf of name hive.stats.jdbc.timeout does not exist
24/02/15 20:19:14 WARN HiveConf: HiveConf of name hive.stats.retries.wait does not exist
24/02/15 20:19:15 WARN ObjectStore: Version information not found in metastore. hive.metastore.schema.verification is not enabled so recording the schema version 2.3.0
24/02/15 20:19:15 WARN ObjectStore: setMetaStoreSchemaVersion called but recording version is disabled: version = 2.3.0, comment = Set by MetaStore hadoop@127.0.1.1


DataFrame[]

In [4]:
def load_raw_into_bronze(source_path="s3a://pty-planilla-publica/",
                         checkpoint_location="_checkpoint/bronze_planillas"):
    """Stream raw parquet files into the bronze Delta table and wait for completion.

    The raw files are read with an explicit schema, column names are normalised
    (lower-case, spaces replaced by underscores, ``cédula`` renamed to ``cedula``),
    ``fecha_de_inicio`` is parsed from a ``dd/MM/yyyy`` string into a date, and the
    originating file name is recorded in ``nombre_reporte``. Rows are appended to the
    Delta table at ``bronze_location``.

    Args:
        source_path: Location of the raw parquet files. Defaults to the bucket the
            notebook originally hard-coded.
        checkpoint_location: Streaming checkpoint directory. Defaults to the path the
            notebook originally hard-coded.

    Returns:
        None. Blocks until the ``availableNow`` streaming query terminates.
    """
    # Schema of the raw files; "Fecha de inicio" arrives as text and is parsed below.
    raw_schema = StructType([
        StructField("Nombre", StringType(), True),
        StructField("Apellido", StringType(), True),
        StructField("Cédula", StringType(), True),
        StructField("Salario", DoubleType(), True),
        StructField("Gasto", DoubleType(), True),
        StructField("Estado", StringType(), True),
        StructField("Fecha de inicio", StringType(), True),
        StructField("Fecha Actualizacion", TimestampType(), True),
        StructField("Fecha Consulta", TimestampType(), True),
        StructField("Institucion", StringType(), True),
    ])

    raw_data_sdf = spark.readStream.schema(raw_schema).parquet(source_path)

    # Normalise every raw header to lower-case with underscores instead of spaces.
    normalised_columns = [
        F.col(raw_name).alias(raw_name.lower().replace(' ', '_'))
        for raw_name in raw_data_sdf.columns
    ]

    bronze_sdf = (
        raw_data_sdf
        .select(normalised_columns)
        # The accented header survives lower-casing, so rename it explicitly.
        .withColumnRenamed('cédula', 'cedula')
        .withColumn('fecha_de_inicio', F.to_date(F.col('fecha_de_inicio'), 'dd/MM/yyyy'))
        # Keep track of which input file each row came from.
        .withColumn('nombre_reporte', F.input_file_name())
    )

    # availableNow processes whatever is currently available and then stops,
    # so awaitTermination() returns once the backlog has been written.
    bronze_query = (
        bronze_sdf.writeStream.format('delta')
        .outputMode('append')
        .option("checkpointLocation", checkpoint_location)
        .trigger(availableNow=True)
        .option("path", bronze_location)
        .start()
    )
    bronze_query.awaitTermination()
    
# Run the raw -> bronze ingestion; this blocks until the streaming query finishes.
load_raw_into_bronze()

24/02/15 20:18:00 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-s3a-file-system.properties,hadoop-metrics2.properties
24/02/15 20:18:03 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.
                                                                                

In [5]:

def create_or_update_silver():
    """Create the silver Delta table from bronze, or merge new bronze rows into it.

    First run (no Delta table at ``silver_location``): the whole bronze table is
    de-duplicated and written, partitioned by ``institucion``.

    Later runs: bronze rows committed since the silver table's most recent commit are
    read through the change data feed, de-duplicated, and inserted whenever no silver
    row has the same key columns. Existing silver rows are never updated or deleted.

    Returns:
        None.
    """
    # Function-scope import: the notebook's import cell does not bring Window into scope.
    from pyspark.sql import Window

    key_columns = ['nombre','apellido','cedula','salario','gasto','estado','fecha_de_inicio','institucion']

    # Metadata columns the change data feed adds to every row it returns.
    change_feed_columns = ['_change_type', '_commit_version', '_commit_timestamp']

    def deduplicate_bronze(bronze_table, key_columns=key_columns):
        """Keep, for each combination of key columns, the row with the latest fecha_consulta."""
        # orderBy followed by dropDuplicates does not guarantee which duplicate is kept,
        # so rank the rows explicitly and keep the first of each key group.
        latest_first = (
            Window.partitionBy(*key_columns)
            .orderBy(F.col('fecha_consulta').desc())
        )
        return (
            bronze_table
            .withColumn('_dedup_rank', F.row_number().over(latest_first))
            .filter(F.col('_dedup_rank') == 1)
            .drop('_dedup_rank')
        )

    if not DeltaTable.isDeltaTable(spark, silver_location):
        # Initial load: silver does not exist yet, so build it from all of bronze.
        source_sdf = deduplicate_bronze(spark.read.format('delta').load(bronze_location))
        source_sdf.write.format('delta').partitionBy(['institucion']).save(silver_location)
        return

    sink_table = DeltaTable.forPath(spark, silver_location)
    # The timestamp of silver's latest commit marks where the previous load stopped.
    last_update = sink_table.history().select(F.max(F.col('timestamp'))).collect()[0][0]

    # Only bronze changes committed from last_update onwards are considered. When no
    # bronze commit is that recent, this depends on the session setting
    # spark.databricks.delta.changeDataFeed.timestampOutOfRange.enabled.
    bronze_changes_sdf = (
        spark.read.format('delta')
        .option("readChangeFeed", "true")
        .option("startingTimestamp", str(last_update))
        .load(bronze_location)
        # Strip the feed metadata so the merge source has the same columns as bronze.
        .drop(*change_feed_columns)
    )
    source_sdf = deduplicate_bronze(bronze_changes_sdf)

    # Null-safe equality (<=>): with plain equality a NULL in any key column makes the
    # condition never true, so such rows would be inserted again on every run.
    merge_condition = ' AND '.join(
        f'(source.{column} <=> target.{column})' for column in key_columns
    )

    (
        sink_table.alias('target')
        .merge(source_sdf.alias('source'), merge_condition)
        .whenNotMatchedInsertAll()
        .execute()
    )
        
    


# Build the silver table on the first run, or merge the latest bronze changes into it.
create_or_update_silver()
        

                                                                                

In [8]:
def create_overwrite_agg_by_institucion():
    """Rebuild the aggregate stored at ``agg_by_institucion`` from the silver table.

    Produces one row per (institucion, estado) with the summed salario, the summed
    gasto, the number of distinct cedula values, and the sum of the two totals.
    The Delta data at ``agg_by_institucion`` is overwritten, schema included.
    """
    silver_sdf = spark.read.format('delta').load(silver_location)

    totals_sdf = (
        silver_sdf
        .groupBy('institucion', 'estado')
        .agg(
            F.sum('salario').alias('total_salarios'),
            F.sum('gasto').alias('total_gastos'),
            F.countDistinct('cedula').alias('total_personas'),
        )
        .withColumn('total_salario_+_gasto', F.col('total_salarios') + F.col('total_gastos'))
    )

    # overwriteSchema lets the overwrite also replace the stored table schema.
    aggregate_writer = (
        totals_sdf.write.format('delta')
        .mode('overwrite')
        .option("overwriteSchema", "True")
    )
    aggregate_writer.save(agg_by_institucion)
# Runs the definition from this cell (grouped by institucion and estado).
# NOTE(review): a later cell defines a function with the same name; once that cell
# has been executed, this call would resolve to the later definition instead.
create_overwrite_agg_by_institucion()

In [None]:
def create_overwrite_agg_by_institucion():
    """Aggregate the silver table per (cedula, estado) and overwrite ``agg_by_institucion``.

    NOTE(review): this reuses the name of the institucion-level function defined in an
    earlier cell, so running this cell replaces that function, and it writes to the
    same ``agg_by_institucion`` location. This looks like a copy-paste; it probably
    needs its own name and output path. Confirm the intent before running.
    NOTE(review): ``total_personas`` counts distinct cedula values inside groups that
    are already keyed by cedula, so it carries no extra information here.
    """
    per_cedula_sdf = (
        spark.read.format('delta')
        .load(silver_location)
        .groupBy('cedula', 'estado')
        .agg(
            F.sum('salario').alias('total_salarios'),
            F.sum('gasto').alias('total_gastos'),
            F.countDistinct('cedula').alias('total_personas'),
        )
    )

    with_total_sdf = per_cedula_sdf.withColumn(
        'total_salario_+_gasto',
        F.col('total_salarios') + F.col('total_gastos'),
    )

    # Replace both the data and the schema stored at the output location.
    (
        with_total_sdf.write.format('delta')
        .mode('overwrite')
        .option("overwriteSchema", "True")
        .save(agg_by_institucion)
    )

In [None]:
  # Ad-hoc aggregation of the silver table per (cedula, estado): summed salario,
  # summed gasto, distinct cedula count and the sum of both totals.
  # NOTE(review): this writes to agg_by_institucion, the same location the
  # institucion-level aggregate uses, so executing this cell (e.g. via Run All)
  # replaces that aggregate with per-cedula rows. Confirm the intended output path.
  (
      spark.read.format('delta')
      .load(silver_location)
      .groupBy('cedula', 'estado')
      .agg(
          F.sum('salario').alias('total_salarios'),
          F.sum('gasto').alias('total_gastos'),
          F.countDistinct('cedula').alias('total_personas'),
      )
      .withColumn('total_salario_+_gasto', F.col('total_salarios') + F.col('total_gastos'))
      .write.format('delta')
      .mode('overwrite')
      .option("overwriteSchema", "True")
      .save(agg_by_institucion)
  )

In [9]:
# Shut down the Spark session now that all tables have been written;
# cells run after this point can no longer use `spark`.
spark.stop()