###### *Notebook developed by John Uzoma*

## Load the JSON file into a PySpark DataFrame

In [1]:
try:
    # Load JSON data into a dataframe
    df = spark.read.json("Files/ForecastWeather.json")
except Exception as err:
    # Show the error message, then re-raise: every later cell needs `df`,
    # so continuing after a failed load would either hit a NameError or
    # silently reuse a stale `df` left over from an earlier run.
    print(err)
    raise

StatementMeta(, 159e9009-a802-4bbf-83a2-69c47b7da79c, 3, Finished, Available)

## Data cleaning (with code refactoring using functions)

In [None]:
from pyspark.sql.functions import col, to_timestamp, round, explode, lit

# helper that turns a unix-time column into a timestamp column
def convert_unix_to_datetime(unix_datetime_col):
    """Return ``to_timestamp`` applied to the given column.

    Args:
        unix_datetime_col: Column holding unix time values
            (presumably epoch seconds -- confirm against the JSON source).

    Returns:
        The column expression produced by ``to_timestamp``.
    """
    as_timestamp = to_timestamp(unix_datetime_col)
    return as_timestamp

# function to convert a Kelvin temperature column to Celsius or Fahrenheit, rounded to 2 decimal places
def temperature_conversion(kelvin_col, to_unit):
    """Convert a Kelvin temperature to another unit, rounded to 2 decimals.

    Args:
        kelvin_col: Temperature in Kelvin. In this notebook it is a Spark
            column expression; the arithmetic is applied to whatever is
            passed in, never to a hard-coded column.
        to_unit: Target unit. ``'celsius'`` gives Celsius; ``'fah'`` or
            ``'fahrenheit'`` gives Fahrenheit.

    Returns:
        The converted temperature, rounded to 2 decimal places.

    Raises:
        ValueError: If ``to_unit`` is not one of the supported units
            (previously the function silently returned ``None``).
    """
    # NOTE: in this notebook `round` is the one imported from
    # pyspark.sql.functions, which shadows the Python builtin.
    if to_unit == 'celsius':
        return round(kelvin_col - 273.15, 2)
    if to_unit in ('fah', 'fahrenheit'):
        # F = K * 9/5 - 459.67, computed from the argument that was passed in
        return round((kelvin_col * 9/5) - 459.67, 2)
    raise ValueError(
        f"Unsupported to_unit {to_unit!r}; expected 'celsius', 'fah' or 'fahrenheit'"
    )

# Explode the `list` array column so that each array element becomes its own row.
# The result is bound to a new name instead of rebinding `df`: re-running this
# cell then always starts from the raw frame, rather than trying to explode a
# column that an earlier run already exploded.
exploded_df = df.select(explode(col("list")).alias("list"))

# Single expression for the Kelvin reading, shared by the raw column and both conversions
kelvin_temp = col("list.main.temp")

# Flatten the nested JSON structure into plain columns, then tag every row
# with a constant 'Type' column set to 'Forecast'
flattened_df = exploded_df.select(
    convert_unix_to_datetime(col("list.dt")).alias("datetime"),
    kelvin_temp.alias("temperature_kelvin"),
    temperature_conversion(kelvin_temp, to_unit="celsius").alias("temperature_celsius"),
    temperature_conversion(kelvin_temp, to_unit="fah").alias("temperature_fahrenheit")
).withColumn("Type", lit("Forecast"))

## Load the dataframe into a Lakehouse table
###### The write uses `overwrite` mode, so the new rows replace any data already in the Lakehouse table instead of being appended to it

In [3]:
# Save the flattened forecast in Delta format; "overwrite" mode replaces existing contents
delta_writer = flattened_df.write.format("delta")
delta_writer.mode("overwrite").save("Tables/forecast_weather_data")

StatementMeta(, 159e9009-a802-4bbf-83a2-69c47b7da79c, 5, Finished, Available)