#### Define the DataFrame schema and load the JSON file into it

In [7]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
    
 # Create the schema for the table
# Explicit schema for the warning records: four timestamp fields, the rest
# plain strings. Fields are added in the order they should appear as columns.
df_schema = (
    StructType()
    .add("level", StringType())
    .add("severity", StringType())
    .add("certainty", StringType())
    .add("issued", TimestampType())
    .add("updated", TimestampType())
    .add("onset", TimestampType())
    .add("expiry", TimestampType())
    .add("headline", StringType())
    .add("description", StringType())
    .add("status", StringType())
)

# Location of the bronze-layer JSON file.
json_file_path = "abfss://6c2dfd82-79ca-43af-b447-84d78a797dd3@onelake.dfs.fabric.microsoft.com/06a9b01e-67c7-4d49-b134-528745660f6b/Files/Bronze/DublinWeatherWarning.json"

# Read the file with the explicit schema rather than letting Spark infer one.
df = spark.read.json(json_file_path, schema=df_schema)

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 9, Finished, Available)

#### Rename columns

In [8]:
# Source column name -> capitalised name used from here on.
column_renames = {
    "level": "Level",
    "severity": "Severity",
    "certainty": "Certainty",
    "issued": "Issued",
    "updated": "Updated",
    "onset": "Onset",
    "expiry": "Expiry",
    "headline": "Headline",
    "description": "Description",
    "status": "Status",
}

# Apply the renames one at a time, in the order listed above.
for source_name, target_name in column_renames.items():
    df = df.withColumnRenamed(source_name, target_name)

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 10, Finished, Available)

#### Convert timestamps to Irish time and derive the issue date and time

In [9]:
from pyspark.sql.functions import col, date_format, substring, to_date, from_utc_timestamp

# Timestamp columns to shift from UTC to Irish local time.
timestamp_cols = ["Issued", "Updated", "Onset", "Expiry"]

for col_name in timestamp_cols:
    # from_utc_timestamp interprets the stored value as UTC and renders it as
    # wall-clock time in the Europe/Dublin zone.
    df = df.withColumn(col_name, from_utc_timestamp(col(col_name), "Europe/Dublin"))

# Add the 'IssueTime' column (time of day of the localised Issued value as an
# "HH:mm:ss" string), then reduce Issued itself to a date.
# Order matters: IssueTime must be derived while Issued is still a timestamp.
# date_format states the intended output explicitly instead of slicing the
# implicit string form of the timestamp by character position.
df = df.withColumn("IssueTime", date_format(col("Issued"), "HH:mm:ss")) \
       .withColumn("Issued", to_date(col("Issued")))

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 11, Finished, Available)

#### Create the silver Delta table (if it does not already exist)

In [10]:
from pyspark.sql.types import DateType
from delta.tables import DeltaTable

# Column name / type pairs for the silver table, in table order.
silver_columns = [
    ("Level", StringType()),
    ("Severity", StringType()),
    ("Certainty", StringType()),
    ("Issued", DateType()),
    ("IssueTime", StringType()),
    ("Updated", TimestampType()),
    ("Onset", TimestampType()),
    ("Expiry", TimestampType()),
    ("Headline", StringType()),
    ("Description", StringType()),
    ("Status", StringType()),
]

# createIfNotExists leaves an already-existing table untouched.
# NOTE(review): the write cell targets the path "Tables/dublinweatherwarning_silver";
# confirm that this metastore name resolves to the same table.
silver_table_builder = DeltaTable.createIfNotExists(spark).tableName(
    "lakehouse.dublinweatherwarning_silver"
)
for column_name, column_type in silver_columns:
    silver_table_builder = silver_table_builder.addColumn(column_name, column_type)

# Kept as the final expression so the resulting DeltaTable is shown as cell output.
silver_table_builder.execute()

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 12, Finished, Available)

<delta.tables.DeltaTable at 0x7736831f52a0>

#### Optimize delta table writes

In [11]:
 # Session settings applied before the Delta write.
 write_settings = {
     # Enable V-Order
     "spark.sql.parquet.vorder.enabled": "true",
     # Enable automatic Delta optimized write
     "spark.microsoft.delta.optimizeWrite.enabled": "true",
 }

 for conf_key, conf_value in write_settings.items():
     spark.conf.set(conf_key, conf_value)

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 13, Finished, Available)

#### Write the dataframe to silver table (overwrite operation)

In [12]:
# Configure a Delta writer that replaces whatever is currently stored,
# then persist the DataFrame to the silver table location.
silver_writer = df.write.format("delta").mode("overwrite")
silver_writer.save("Tables/dublinweatherwarning_silver")

StatementMeta(, 2f70bfbb-9d64-43df-a8f9-5ff34dd9dd7c, 14, Finished, Available)