## <u>Notebook by John Uzoma</u>

#### Load the JSON file into a flattened PySpark DataFrame

In [15]:
from pyspark.sql.functions import expr

# Create a function to flatten the dataframe and select required columns
def flatten_df(df_name):
    """Project the nested forecast JSON onto five flat, aliased columns.

    Each output column is read from a fixed element of
    ``forecasts.regions[0]`` (elements 1 through 5) and given a capitalised
    alias.

    Args:
        df_name: DataFrame holding the raw forecast JSON; it must expose the
            ``forecasts.regions`` path used by the expressions below.

    Returns:
        A DataFrame with the columns Issued, Today, Tonight, Tomorrow and
        Outlook, in that order.
    """
    # alias -> Spark SQL expression locating the value in the nested structure
    column_sources = {
        "Issued": "forecasts.regions[0][1].issued",
        "Today": "forecasts.regions[0][2].today",
        "Tonight": "forecasts.regions[0][3].tonight",
        "Tomorrow": "forecasts.regions[0][4].tomorrow",
        "Outlook": "forecasts.regions[0][5].outlook",
    }
    flat_columns = [
        expr(source).alias(label) for label, source in column_sources.items()
    ]
    return df_name.select(*flat_columns)

# Relative path of the bronze-layer JSON file holding the Dublin text forecast
json_file_path = "Files/Bronze/DublinTextForecast.json"

# Read the JSON file and immediately flatten it to the forecast columns
df = flatten_df(
    spark.read.format("json").load(json_file_path)
)

StatementMeta(, 7c9309e7-05c9-464e-b3c1-47e7c061f09f, 17, Finished, Available, Finished)

In [8]:
# Disabled cell: the schema definition below is wrapped in a triple-quoted
# string literal, so none of it is executed and df_schema is never defined.
# NOTE(review): the field names here are flat (issued, today, ...), whereas
# flatten_df reads from the nested forecasts.regions path -- confirm the
# schema matches the raw JSON before re-enabling.
"""
from pyspark.sql.types import StructType, StructField, StringType, TimestampType
    
# Create the schema for the table
df_schema = StructType([
    StructField("issued", TimestampType()),
    StructField("today", StringType()),
    StructField("tonight", StringType()),
    StructField("tomorrow", StringType()),
    StructField("outlook", StringType())
])
"""

StatementMeta(, 7c9309e7-05c9-464e-b3c1-47e7c061f09f, 10, Finished, Available, Finished)

In [None]:
# Disabled cell: the code below sits inside a triple-quoted string literal and
# is not executed. It refers to df_schema, whose only definition in this
# notebook is itself inside a string literal, so re-enabling this cell alone
# would fail on an undefined name.
"""
df1 = flatten_df(spark.read.schema(df_schema).json(json_file_path))
display(df1)
"""

#### Replace nulls and blanks in Outlook column

In [2]:
from pyspark.sql.functions import when, lit, col
 
# Substitute "Unknown" wherever Outlook is an empty string or null;
# every other value is kept unchanged.
df = df.withColumn(
    "Outlook",
    when(
        (col("Outlook") == "") | col("Outlook").isNull(),
        lit("Unknown"),
    ).otherwise(col("Outlook")),
)

StatementMeta(, 12adeb9d-887a-4c07-af02-79404c8be723, 4, Finished, Available, Finished)

#### Convert Issued to Irish local time and split it into date and time columns

In [3]:
from pyspark.sql.functions import substring, to_date, from_utc_timestamp

# Treat the Issued value as UTC and render it in the Europe/Dublin time zone,
# then split it into a time-of-day string and a calendar date.
df = (
    df.withColumn("Issued", from_utc_timestamp(col("Issued"), "Europe/Dublin"))
      # substring(str, pos, len) takes a LENGTH as its third argument, not an
      # end position. In the "yyyy-MM-dd HH:mm:ss" rendering of the timestamp
      # the time occupies characters 12-19, i.e. 8 characters starting at 12.
      # The explicit cast makes the timestamp -> string conversion visible.
      .withColumn(
          "TextForecastIssueTime",
          substring(col("Issued").cast("string"), 12, 8),
      )
      # Must come after the time has been extracted: to_date drops the
      # time-of-day part of Issued.
      .withColumn("Issued", to_date(col("Issued")))
)

StatementMeta(, 12adeb9d-887a-4c07-af02-79404c8be723, 5, Finished, Available, Finished)

#### Create the silver Delta table (if it does not already exist) and define its schema

In [4]:
from pyspark.sql.types import TimestampType, StringType, DateType
from delta.tables import DeltaTable

# Create lakehouse.dublintextforecast_silver only when it is missing; an
# existing table is left as it is. The builder result is the cell's last
# expression, so the returned DeltaTable is shown as the cell output.
(
    DeltaTable.createIfNotExists(spark)
    .tableName("lakehouse.dublintextforecast_silver")
    .addColumn("Issued", DateType())
    .addColumn("TextForecastIssueTime", StringType())
    .addColumn("Today", StringType())
    .addColumn("Tonight", StringType())
    .addColumn("Tomorrow", StringType())
    .addColumn("Outlook", StringType())
    .execute()
)

StatementMeta(, 12adeb9d-887a-4c07-af02-79404c8be723, 6, Finished, Available, Finished)

<delta.tables.DeltaTable at 0x7e53905516c0>

#### Optimize Delta table writes

In [5]:
 # Turn on automatic optimized write for Delta in this Spark session
 spark.conf.set("spark.microsoft.delta.optimizeWrite.enabled", "true")

 # Turn on V-Order for Parquet files written in this Spark session
 spark.conf.set("spark.sql.parquet.vorder.enabled", "true")

StatementMeta(, 12adeb9d-887a-4c07-af02-79404c8be723, 7, Finished, Available, Finished)

#### Write dataframe to the silver table (overwrite operation)

In [6]:
# Replace the contents at the silver table location with the transformed dataframe
df.write.save(
    "Tables/dublintextforecast_silver",
    format="delta",
    mode="overwrite",
)

StatementMeta(, 12adeb9d-887a-4c07-af02-79404c8be723, 8, Finished, Available, Finished)