## <u>Notebook by John Uzoma</u>

#### Define the schema for the DataFrame and load the JSON file into it

In [None]:
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
    
 # Create the schema for the table
df_schema = (
    StructType()
    # String-typed fields
    .add("level", StringType())
    .add("severity", StringType())
    .add("certainty", StringType())
    # Timestamp-typed fields
    .add("issued", TimestampType())
    .add("updated", TimestampType())
    .add("onset", TimestampType())
    .add("expiry", TimestampType())
    # Remaining string-typed fields
    .add("headline", StringType())
    .add("description", StringType())
    .add("status", StringType())
)

# Location of the Bronze-layer JSON source file
json_file_path = "Files/Bronze/DublinWeatherWarning.json"

# Read the file using the explicit schema above rather than schema inference
df = spark.read.json(json_file_path, schema=df_schema)

#### Rename columns

In [None]:
# Source (lower-case) column name -> capitalised name used from here on
renamed_columns = {
    "level": "Level",
    "severity": "Severity",
    "certainty": "Certainty",
    "issued": "Issued",
    "updated": "Updated",
    "onset": "Onset",
    "expiry": "Expiry",
    "headline": "Headline",
    "description": "Description",
    "status": "Status",
}

# Apply the renames one at a time, in the order listed above
for old_name, new_name in renamed_columns.items():
    df = df.withColumnRenamed(old_name, new_name)

#### Convert timestamps to Irish local time and derive the issue date and time columns

In [None]:
from pyspark.sql.functions import col, date_format, substring, to_date, from_utc_timestamp

timestamp_cols = ["Issued", "Updated", "Onset", "Expiry"]

# Treat each timestamp column as UTC and shift it to Irish local time (Europe/Dublin)
for col_name in timestamp_cols:
    df = df.withColumn(col_name, from_utc_timestamp(col(col_name), "Europe/Dublin"))

# Add the 'WarningIssueTime' column (HH:mm:ss of the localized Issued timestamp),
# then reduce Issued itself to a date.
# date_format is used instead of substring(col, 12, 19): substring's third argument
# is a length, not an end position, and slicing relied on an implicit
# timestamp-to-string cast whose text can include fractional seconds.
df = df.withColumn("WarningIssueTime", date_format(col("Issued"), "HH:mm:ss")) \
       .withColumn("Issued", to_date(col("Issued")))

#### Create the silver Delta table (if it does not already exist)

In [None]:
from pyspark.sql.types import DateType
from delta.tables import DeltaTable

# Columns of the silver table, in the order they are declared on the table
silver_columns = [
    ("Level", StringType()),
    ("Severity", StringType()),
    ("Certainty", StringType()),
    ("Issued", DateType()),
    ("WarningIssueTime", StringType()),
    ("Updated", TimestampType()),
    ("Onset", TimestampType()),
    ("Expiry", TimestampType()),
    ("Headline", StringType()),
    ("Description", StringType()),
    ("Status", StringType()),
]

# Start a create-if-not-exists builder for the silver table
silver_builder = DeltaTable.createIfNotExists(spark)
silver_builder = silver_builder.tableName("lakehouse.dublinweatherwarning_silver")

# Register every column on the builder
for column_name, column_type in silver_columns:
    silver_builder = silver_builder.addColumn(column_name, column_type)

# Run the builder; left as the final expression so the cell output is unchanged
silver_builder.execute()

#### Optimize delta table writes

In [None]:
 # Session-level settings applied before writing to the Delta table.
# The statements below start at column 0: an indented first statement is only
# accepted when the notebook front end strips leading indentation, and raises
# IndentationError when the cell is run as plain Python.

# Enable V-Order
spark.conf.set("spark.sql.parquet.vorder.enabled", "true")

# Enable automatic Delta optimized write
spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

#### Write the dataframe to silver table (append operation)

In [None]:
# Configure a Delta writer in append mode, so existing rows are kept
silver_writer = df.write.format("delta").mode("append")

# Write the transformed rows to the silver table's storage path
silver_writer.save("Tables/dublinweatherwarning_silver")