###### *Notebook developed by John Uzoma*

## Load JSON into a PySpark DataFrame

In [4]:
# Load the forecast JSON file into a Spark DataFrame.
try:
    df = spark.read.json("Files/ForecastWeather.json")
except Exception as err:
    # Show the error, then re-raise so the run stops here. Swallowing it would
    # leave `df` undefined (or stale from an earlier run) for the cells below.
    print(err)
    raise

StatementMeta(, 3c1d00e5-339f-48ba-a52c-caee84aeebb7, 6, Finished, Available)

## Data cleaning (with code refactoring using functions)

In [5]:
from pyspark.sql.functions import col, to_timestamp, round, explode, lit

def convert_unix_to_datetime(unix_datetime_col):
    """Convert a date column from unix time to a datetime (timestamp) column.

    Args:
        unix_datetime_col: Column expression holding the unix time value.

    Returns:
        The result of applying ``to_timestamp`` to ``unix_datetime_col``.
    """
    datetime_col = to_timestamp(unix_datetime_col)
    return datetime_col

def temperature_conversion(kelvin_col, to_unit):
    """Convert a Kelvin temperature to Celsius or Fahrenheit, rounded to 2 decimal places.

    Args:
        kelvin_col: Temperature in Kelvin. In this notebook it is a Spark column
            expression, and ``round`` is the one imported from
            ``pyspark.sql.functions``.
        to_unit: Target unit, either ``'celsius'`` or ``'fah'`` (Fahrenheit).

    Returns:
        The converted temperature rounded to 2 decimal places.

    Raises:
        ValueError: If ``to_unit`` is not one of the supported units.
    """
    if to_unit == 'celsius':
        return round(kelvin_col - 273.15, 2)
    if to_unit == 'fah':
        # Convert the column that was passed in rather than a hard-coded
        # column path, so the function works for any Kelvin column.
        return round((kelvin_col * 9/5) - 459.67, 2)
    # Fail loudly instead of returning None, which would only surface later
    # as an unrelated attribute error at the call site.
    raise ValueError(
        f"Unsupported to_unit {to_unit!r}; expected 'celsius' or 'fah'"
    )

# Explode the `list` array column so each array element becomes its own row.
# The result is bound to a new name so `df` keeps the raw load and this cell
# can be re-run without trying to explode an already-exploded column.
exploded_df = df.select(explode(col("list")).alias("list"))

# Flatten the nested JSON structure into top-level columns.
flattened_df = exploded_df.select(
    convert_unix_to_datetime(col("list.dt")).alias("datetime"),
    col("list.main.temp").alias("temperature_kelvin"),
    temperature_conversion(col("list.main.temp"), to_unit="celsius").alias("temperature_celsius"),
    temperature_conversion(col("list.main.temp"), to_unit="fah").alias("temperature_fahrenheit")
)

# Add a constant 'Type' column with the value 'Forecast'.
flattened_df = flattened_df.withColumn("Type", lit("Forecast"))

flattened_df.printSchema()
display(flattened_df)

StatementMeta(, 3c1d00e5-339f-48ba-a52c-caee84aeebb7, 7, Finished, Available)

root
 |-- datetime: timestamp (nullable = true)
 |-- temperature_kelvin: double (nullable = true)
 |-- temperature_celsius: double (nullable = true)
 |-- temperature_fahrenheit: double (nullable = true)
 |-- Type: string (nullable = false)



SynapseWidget(Synapse.DataFrame, c478d2ef-df33-4aa6-b0fc-1b7af8106cb8)

## Load the dataframe into a Lakehouse table
###### I used the append write mode to add these rows to my Lakehouse table while preserving the existing data

In [6]:
# Write the flattened rows to the Delta table in append mode, so rows already
# in the table are kept and every run of this cell adds the rows again.
(
    flattened_df.write
    .format("delta")
    .mode("append")
    .save("Tables/forecast_weather_data")
)

StatementMeta(, 3c1d00e5-339f-48ba-a52c-caee84aeebb7, 8, Finished, Available)