#### Load silver table into a dataframe

In [1]:
# Script written by John Uzoma

# Name of the silver-layer source table that feeds the date dimension.
silver_table_name = "lakehouse.dublinweatherforecast_silver"

# Read the whole silver table into a Spark dataframe.
df = spark.read.table(silver_table_name)

StatementMeta(, 3aa2c9c3-3333-4f7f-9ab5-500554390480, 3, Finished, Available)

#### Transform dataframe by extracting date, day, month and year values

In [2]:
# Script written by John Uzoma

from pyspark.sql.functions import dayofmonth, year, date_format

# Create dataframe for dimDate_gold.
# Each output column is derived from the "Date" column and built as a named
# expression first, so the final select reads as a plain column list.
day_column = dayofmonth("Date").alias("Day")
# Format the date with the "MMM" pattern and keep at most its first three characters.
month_column = date_format("Date", "MMM").substr(1, 3).alias("Month")
year_column = year("Date").alias("Year")

df = df.select("Date", day_column, month_column, year_column).orderBy("Date")

StatementMeta(, 3aa2c9c3-3333-4f7f-9ab5-500554390480, 4, Finished, Available)

#### Drop duplicates in Date column

In [4]:
# Script written by John Uzoma

# Columns that identify a row of the date dimension; one row is kept per distinct value.
dedup_columns = ["Date"]

df = df.dropDuplicates(dedup_columns)

StatementMeta(, 3aa2c9c3-3333-4f7f-9ab5-500554390480, 6, Finished, Available)

#### Define schema for gold table

In [4]:
# Script written by John Uzoma

from pyspark.sql.types import DateType, IntegerType, StringType
from delta.tables import DeltaTable

# Define the schema for the dimdate_gold table as (column name, Spark type) pairs,
# in the order the columns are added to the table.
gold_columns = [
    ("Date", DateType()),
    ("Day", IntegerType()),
    ("Month", StringType()),
    ("Year", IntegerType()),
]

# Start a create-if-not-exists builder for the gold table and add each column to it.
gold_builder = DeltaTable.createIfNotExists(spark).tableName("lakehouse.dimdate_gold")
for column_name, column_type in gold_columns:
    gold_builder = gold_builder.addColumn(column_name, column_type)

# Left as a bare final expression so the cell still displays the returned DeltaTable.
gold_builder.execute()

StatementMeta(, b6fd7a62-be72-46fd-ab2e-93c0f9ac7314, 6, Finished, Available)

<delta.tables.DeltaTable at 0x7d208ff99390>

#### Optimize delta table writes

In [5]:
# Script written by John Uzoma

# Session-level write settings, applied in the order listed:
#   1. Enable V-Order
#   2. Enable automatic Delta optimized write
write_settings = {
    "spark.sql.parquet.vorder.enabled": "true",
    "spark.microsoft.delta.optimizeWrite.enabled": "true",
}

for setting_key, setting_value in write_settings.items():
    spark.conf.set(setting_key, setting_value)

StatementMeta(, b6fd7a62-be72-46fd-ab2e-93c0f9ac7314, 7, Finished, Available)

#### Merge dataframe into gold table (upsert operation)

In [6]:
# Script written by John Uzoma
# Upsert into the gold table keyed on the column: Date.
# Rows whose Date already exists in the table are updated; rows with a new Date are inserted.

# Open the table by the same catalog name it was created with, rather than by the
# relative path 'Tables/dimdate_gold'.
# NOTE(review): the relative path presumably resolves against the notebook's default
# lakehouse; looking the table up by name removes that dependency - confirm the
# "lakehouse" catalog name is correct for the target workspace.
deltaTable = DeltaTable.forName(spark, "lakehouse.dimdate_gold")

dfUpdates = df

# Single target-column -> source-expression mapping shared by the update and insert
# clauses, so the two clauses cannot drift apart.
gold_column_mapping = {
    "Date": "updates.Date",
    "Day": "updates.Day",
    "Month": "updates.Month",
    "Year": "updates.Year",
}

(
    deltaTable.alias("gold")
    .merge(dfUpdates.alias("updates"), "gold.Date = updates.Date")
    .whenMatchedUpdate(set=gold_column_mapping)
    .whenNotMatchedInsert(values=gold_column_mapping)
    .execute()
)

StatementMeta(, b6fd7a62-be72-46fd-ab2e-93c0f9ac7314, 8, Finished, Available)