![Verne Academy Summit 2024](https://github.com/javendia/verne-academy-summit-2024/blob/main/header.png?raw=true)

## Paso 1: Ingesta de datos
- Leemos el fichero **Invoices.csv**, especificando el formato (CSV) e indicando que el archivo contiene las cabeceras de las columnas
- Creamos la tabla delta destino en caso de no existir

In [9]:
from pyspark.sql.types import *
from delta.tables import *
from pyspark.sql.functions import regexp_replace

# Read the invoices CSV; the first row of the file supplies the column names.
# No schema is passed to the reader here.
df = spark.read.format('csv').option('header', True).load('Files/wwi/Invoices.csv')

# Destination schema as (column name, Spark type) pairs, in table column order.
invoice_columns = [
    ("InvoiceID", IntegerType()),
    ("InvoiceLineID", IntegerType()),
    ("CustomerID", IntegerType()),
    ("OrderID", IntegerType()),
    ("DeliveryMethodID", IntegerType()),
    ("ContactPersonID", IntegerType()),
    ("AccountsPersonID", IntegerType()),
    ("SalespersonPersonID", IntegerType()),
    ("PackedByPersonID", IntegerType()),
    ("InvoiceDate", DateType()),
    ("CustomerPurchaseOrderNumber", StringType()),
    ("IsCreditNote", BooleanType()),
    ("TotalDryItems", IntegerType()),
    ("TotalChillerItems", IntegerType()),
    ("DeliveryRun", StringType()),
    ("RunPosition", StringType()),
    ("StockItemID", IntegerType()),
    ("Description", StringType()),
    ("PackageTypeID", IntegerType()),
    ("Quantity", IntegerType()),
    ("UnitPrice", DecimalType(18,2)),
    ("TaxRate", DecimalType(18,3)),
    ("TaxAmount", DecimalType(18,2)),
    ("LineProfit", DecimalType(18,2)),
    ("ExtendedPrice", DecimalType(18,2)),
    ("LastUpdated", TimestampType()),
]

# Create the "Invoices" Delta table only if it does not exist yet,
# adding the columns one by one in the order listed above.
table_builder = DeltaTable.createIfNotExists(spark).tableName("Invoices")
for column_name, column_type in invoice_columns:
    table_builder = table_builder.addColumn(column_name, column_type)
table_builder.execute()

# Handle to the Delta table located at the given path.
deltaTable = DeltaTable.forPath(spark, 'Tables/invoices')

StatementMeta(, 36d40039-5cae-4489-b6fa-baf3f97ee1aa, 11, Finished, Available)

## Paso 2: Transformación
- Adecuamos el separador decimal, reemplazando **','** por **'.'** y definiendo el tipo de datos para el destino

In [10]:
# Columns whose text values are normalised, mapped to the decimal type they are cast to.
decimal_columns = {
    'UnitPrice': DecimalType(18,2),
    'TaxRate': DecimalType(18,3),
    'TaxAmount': DecimalType(18,2),
    'LineProfit': DecimalType(18,2),
    'ExtendedPrice': DecimalType(18,2),
}

for column_name, decimal_type in decimal_columns.items():
    # Replace every ',' with '.' in the value, then cast it to the target decimal type.
    normalized = regexp_replace(column_name, ',', '.').cast(decimal_type)
    df = df.withColumn(column_name, normalized)

StatementMeta(, 36d40039-5cae-4489-b6fa-baf3f97ee1aa, 12, Finished, Available)

## Paso 3: Creación de vista temporal
- Creamos la vista temporal **vw_invoices** donde almacenar los datos tratados anteriormente

In [11]:
# Expose the transformed DataFrame to Spark SQL under the name "vw_invoices",
# replacing any temp view that already uses that name.
df.createOrReplaceTempView(name="vw_invoices")

StatementMeta(, 36d40039-5cae-4489-b6fa-baf3f97ee1aa, 13, Finished, Available)

## Paso 4: Instrucción MERGE

- Si los valores del registro para las columnas **InvoiceID** e **InvoiceLineID** existen **(MATCHED)** en la tabla destino y alguna columna difiere del registro existente, actualizamos la fila
- Si los valores del registro para las columnas **InvoiceID** e **InvoiceLineID** no existen **(NOT MATCHED)** en la tabla destino, insertamos una nueva fila
- En caso de que el registro de la tabla destino no exista en el fichero origen, eliminamos esa fila

In [12]:
%%sql

-- Synchronise the invoices table with the vw_invoices temp view.
--   * Rows are paired on (InvoiceID, InvoiceLineID).
--   * MATCHED rows are updated only when at least one data column differs.
--   * Source rows with no match are inserted.
--   * Target rows with no match in the source are deleted.
--
-- Change detection uses the null-safe equality operator (<=>) wrapped in NOT.
-- A plain "<>" evaluates to NULL when either side is NULL, so a value changing
-- to or from NULL would never trigger the update.
MERGE INTO invoices AS target
USING vw_invoices AS source
ON target.InvoiceID = source.InvoiceID AND target.InvoiceLineID = source.InvoiceLineID
WHEN MATCHED AND
(
        NOT (target.CustomerID <=> source.CustomerID)
        OR NOT (target.OrderID <=> source.OrderID)
        OR NOT (target.DeliveryMethodID <=> source.DeliveryMethodID)
        OR NOT (target.ContactPersonID <=> source.ContactPersonID)
        OR NOT (target.AccountsPersonID <=> source.AccountsPersonID)
        OR NOT (target.SalespersonPersonID <=> source.SalespersonPersonID)
        OR NOT (target.PackedByPersonID <=> source.PackedByPersonID)
        OR NOT (target.InvoiceDate <=> source.InvoiceDate)
        OR NOT (target.CustomerPurchaseOrderNumber <=> source.CustomerPurchaseOrderNumber)
        OR NOT (target.IsCreditNote <=> source.IsCreditNote)
        OR NOT (target.TotalDryItems <=> source.TotalDryItems)
        OR NOT (target.TotalChillerItems <=> source.TotalChillerItems)
        OR NOT (target.DeliveryRun <=> source.DeliveryRun)
        OR NOT (target.RunPosition <=> source.RunPosition)
        OR NOT (target.StockItemID <=> source.StockItemID)
        OR NOT (target.Description <=> source.Description)
        OR NOT (target.PackageTypeID <=> source.PackageTypeID)
        OR NOT (target.Quantity <=> source.Quantity)
        OR NOT (target.UnitPrice <=> source.UnitPrice)
        OR NOT (target.TaxRate <=> source.TaxRate)
        OR NOT (target.TaxAmount <=> source.TaxAmount)
        OR NOT (target.LineProfit <=> source.LineProfit)
        OR NOT (target.ExtendedPrice <=> source.ExtendedPrice)
)
THEN
UPDATE SET
        target.CustomerID = source.CustomerID
        ,target.OrderID = source.OrderID
        ,target.DeliveryMethodID = source.DeliveryMethodID
        ,target.ContactPersonID = source.ContactPersonID
        ,target.AccountsPersonID = source.AccountsPersonID
        ,target.SalespersonPersonID = source.SalespersonPersonID
        ,target.PackedByPersonID = source.PackedByPersonID
        ,target.InvoiceDate = source.InvoiceDate
        ,target.CustomerPurchaseOrderNumber = source.CustomerPurchaseOrderNumber
        ,target.IsCreditNote = source.IsCreditNote
        ,target.TotalDryItems = source.TotalDryItems
        ,target.TotalChillerItems = source.TotalChillerItems
        ,target.DeliveryRun = source.DeliveryRun
        ,target.RunPosition = source.RunPosition
        ,target.StockItemID = source.StockItemID
        ,target.Description = source.Description
        ,target.PackageTypeID = source.PackageTypeID
        ,target.Quantity = source.Quantity
        ,target.UnitPrice = source.UnitPrice
        ,target.TaxRate = source.TaxRate
        ,target.TaxAmount = source.TaxAmount
        ,target.LineProfit = source.LineProfit
        ,target.ExtendedPrice = source.ExtendedPrice
        -- Stamp the row with the time of this update.
        ,target.LastUpdated = CURRENT_TIMESTAMP()
WHEN NOT MATCHED THEN
INSERT (InvoiceID, InvoiceLineID, CustomerID, OrderID, DeliveryMethodID, ContactPersonID, AccountsPersonID, SalespersonPersonID, PackedByPersonID,
        InvoiceDate, CustomerPurchaseOrderNumber, IsCreditNote, TotalDryItems, TotalChillerItems, DeliveryRun, RunPosition, 
        StockItemID, Description, PackageTypeID, Quantity, UnitPrice, TaxRate, TaxAmount, LineProfit, ExtendedPrice, LastUpdated)
VALUES (source.InvoiceID, source.InvoiceLineID, source.CustomerID, source.OrderID, source.DeliveryMethodID, source.ContactPersonID, source.AccountsPersonID, source.SalespersonPersonID, source.PackedByPersonID,
        source.InvoiceDate, source.CustomerPurchaseOrderNumber, source.IsCreditNote, source.TotalDryItems, source.TotalChillerItems, source.DeliveryRun, source.RunPosition, 
        source.StockItemID, source.Description, source.PackageTypeID, source.Quantity, source.UnitPrice, source.TaxRate, source.TaxAmount, source.LineProfit, source.ExtendedPrice, CURRENT_TIMESTAMP())
WHEN NOT MATCHED BY SOURCE THEN
DELETE;

StatementMeta(, 36d40039-5cae-4489-b6fa-baf3f97ee1aa, 14, Finished, Available)

<Spark SQL result set with 1 rows and 4 fields>

## Paso 5: Eliminación vista temporal

In [13]:
# Remove the "vw_invoices" temp view from the session catalog now that the merge is done.
spark.catalog.dropTempView(viewName="vw_invoices")

StatementMeta(, 36d40039-5cae-4489-b6fa-baf3f97ee1aa, 15, Finished, Available)

True