###### *Notebook developed by John Uzoma*

In [1]:
# initialize empty parameter for file path to be passed in from data pipeline
PARAM_file_path = ""

StatementMeta(, 049c1c71-1dae-42aa-86ae-ef7b1e7bf94c, 3, Finished, Available)

## Load from JSON to pySpark dataframe
###### Since I intend to include this notebook as part of a data pipeline that will run automatically on a schedule, I will create a function to retrieve the file path.

In [2]:
from datetime import datetime

def read_in_todays_data():
    '''
    Function that dynamically retrieves the filepath
    Returns a dataframe of today's data
    '''
    try:
        # Load JSON data into a dataframe
        df = spark.read.json(PARAM_file_path)
        return df
    except Exception as err:
        print(err)

df = read_in_todays_data()

StatementMeta(, 049c1c71-1dae-42aa-86ae-ef7b1e7bf94c, 4, Finished, Available)

Can not create a Path from an empty string


## Data cleaning (with code refactoring using functions)

In [None]:
from pyspark.sql.functions import col, to_timestamp, round, lit

# function to convert date column from unix to datetime
def convert_unix_to_datetime(unix_datetime_col):
    return to_timestamp(unix_datetime_col)

# function to convert temperature column from kelvin to celsius and fahrenheit, rounded to 2 decimal places
def temperature_conversion(kelvin_col, to_unit):
    if to_unit == 'celsius':
        return round(kelvin_col - 273.15, 2)
    elif to_unit == 'fah':
        return round((col("main.temp") * 9/5) - 459.67, 2)

# use df.select to flatten the structure of our nested JSON
flattened_df = df.select(
    convert_unix_to_datetime(col("dt")).alias("datetime"),
    col("main.temp").alias("temperature_kelvin"),
    temperature_conversion(col("main.temp"), to_unit="celsius").alias("temperature_celsius"),
    temperature_conversion(col("main.temp"), to_unit="fah").alias("temperature_fahrenheit")
)

# create a new column 'Type' with value 'Historic'
flattened_df = flattened_df.withColumn("Type", lit("Historic"))

## Load the dataframe into a Lakehouse table
###### I used the append method to load the row of data into my Lakehouse table, while preserving existing data.

In [None]:
flattened_df.write.format("delta").mode("append").save("Tables/historic_weather_data")