#### Load JSON file to a flattened pySpark dataframe

In [1]:
from pyspark.sql.functions import expr

# Create a function to flatten the dataframe and select required columns
def flatten_df(df_name):
   return df_name.select(
            expr("forecasts.regions[0][1].issued").alias("Issued"),
            expr("forecasts.regions[0][2].today").alias("Today"),
            expr("forecasts.regions[0][3].tonight").alias("Tonight"),
            expr("forecasts.regions[0][4].tomorrow").alias("Tomorrow"),
            expr("forecasts.regions[0][5].outlook").alias("Outlook")
        )

# Define the path to the JSON file
json_file_path = "abfss://6c2dfd82-79ca-43af-b447-84d78a797dd3@onelake.dfs.fabric.microsoft.com/06a9b01e-67c7-4d49-b134-528745660f6b/Files/Bronze/DublinTextForecast.json"

# Load JSON data into a dataframe
df = flatten_df(spark.read.json(json_file_path))

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 3, Finished, Available)

#### Replace nulls and blanks in Outlook column

In [2]:
 from pyspark.sql.functions import when, lit, col
 
 # Update Outlook to "Unknown" if Outlook is null or empty
 df = df.withColumn("Outlook", when((col("Outlook").isNull() | (col("Outlook")=="")),lit("Unknown")).otherwise(col("Outlook")))

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 4, Finished, Available)

#### More transformations

In [4]:
from pyspark.sql.functions import substring, to_date, from_utc_timestamp

# Convert Issued to Irish timestamp
df = df.withColumn("Issued", from_utc_timestamp(col("Issued"), "Europe/Dublin"))

# Add 'Issue_Time' column and convert Issued column to date type
df = df.withColumn("IssueTime", substring(col("Issued"), 12, 19)) \
       .withColumn("Issued", to_date(col("Issued")))

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 6, Finished, Available)

#### Define schema for silver table

In [5]:
from pyspark.sql.types import TimestampType, StringType, DateType
from delta.tables import DeltaTable

DeltaTable.createIfNotExists(spark) \
    .tableName("lakehouse.dublintextforecast_silver") \
    .addColumn("Issued", DateType()) \
    .addColumn("IssueTime", StringType()) \
    .addColumn("Today", StringType()) \
    .addColumn("Tonight", StringType()) \
    .addColumn("Tomorrow", StringType()) \
    .addColumn("Outlook", StringType()) \
    .execute()

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 7, Finished, Available)

<delta.tables.DeltaTable at 0x7ad757807e50>

#### Optimize Delta table writes

In [6]:
 # Enable V-Order
 spark.conf.set("spark.sql.parquet.vorder.enabled", "true")
    
 # Enable automatic Delta optimized write
 spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 8, Finished, Available)

#### Write dataframe to the silver table (overwrite operation)

In [7]:
df.write.format("delta").mode("overwrite").save("Tables/dublintextforecast_silver")
"""
# Update existing records and insert new ones based on a condition defined by the columns Issued
    
deltaTable = DeltaTable.forPath(spark, 'Tables/dublintextforecast_silver')    

dfUpdates = df
    
deltaTable.alias('silver') \
  .merge(
    dfUpdates.alias('updates'),
    'silver.Issued = updates.Issued'
  ) \
   .whenMatchedUpdate(set =
    {
          
    }
  ) \
 .whenNotMatchedInsert(values =
    {
      "Issued": "updates.Issued",
      "Today": "updates.Today",
      "Tonight": "updates.Tonight",
      "Tomorrow": "updates.Tomorrow",
      "Outlook": "updates.Outlook",
      "DateTime_Loaded": "updates.DateTime_Loaded"
    }
  ) \
  .execute()
  """

StatementMeta(, a2971272-ffcd-4646-9ad7-23c1395f22c7, 9, Finished, Available)

'\n# Update existing records and insert new ones based on a condition defined by the columns Issued\n    \ndeltaTable = DeltaTable.forPath(spark, \'Tables/dublintextforecast_silver\')    \n\ndfUpdates = df\n    \ndeltaTable.alias(\'silver\')   .merge(\n    dfUpdates.alias(\'updates\'),\n    \'silver.Issued = updates.Issued\'\n  )    .whenMatchedUpdate(set =\n    {\n          \n    }\n  )  .whenNotMatchedInsert(values =\n    {\n      "Issued": "updates.Issued",\n      "Today": "updates.Today",\n      "Tonight": "updates.Tonight",\n      "Tomorrow": "updates.Tomorrow",\n      "Outlook": "updates.Outlook",\n      "DateTime_Loaded": "updates.DateTime_Loaded"\n    }\n  )   .execute()\n  '