###### *Notebook developed by John Uzoma*

## Load from JSON into a PySpark DataFrame
###### Since I intend to include this notebook in a data pipeline that runs automatically on a schedule, I build the file path from the current date inside a function, so that each run reads that day's file instead of the same file every time.

In [3]:
from datetime import datetime

def read_in_todays_data(run_date=None):
    '''
    Read the CurrentWeather.json file stored under a date-based folder.

    The file path is built as "Files/<YYYY>/<MM>/<DD>/CurrentWeather.json".

    Parameters:
        run_date: optional date or datetime selecting which day's folder to read.
                  Defaults to the current local date/time (datetime.now()), so
                  calling with no arguments reads today's file. Passing a date
                  allows re-running the notebook for an earlier day.

    Returns:
        The dataframe produced by spark.read.json for that day's file.

    Raises:
        Any exception raised by spark.read.json (e.g. the file for that date
        does not exist). The error is printed together with the path and then
        re-raised, so a scheduled run fails at this step instead of continuing
        with df = None.
    '''
    if run_date is None:
        run_date = datetime.now()
        # e.g. run_date = 2024-03-25 19:58:49.789007

    dt_string = run_date.strftime("%Y/%m/%d")
    # e.g. dt_string = 2024/03/25

    dynamic_file_path = f"Files/{dt_string}/CurrentWeather.json"
    # e.g. dynamic_file_path = "Files/2024/03/25/CurrentWeather.json"

    try:
        # Load JSON data into a dataframe
        return spark.read.json(dynamic_file_path)
    except Exception as err:
        # Report which path failed, then propagate: returning None here would
        # only defer the failure to the first use of the dataframe.
        print(f"Failed to read {dynamic_file_path}: {err}")
        raise

# Read today's weather JSON; `df` is the input to the cleaning step below.
df = read_in_todays_data()

StatementMeta(, , , Waiting, )

## Data cleaning (with code refactoring using functions)

In [4]:
from pyspark.sql.functions import col, to_timestamp, round, lit

# helper: turn a unix-time column into a timestamp column
def convert_unix_to_datetime(unix_datetime_col):
    '''
    Convert a unix-time column to a timestamp column via to_timestamp.

    unix_datetime_col: the column holding the unix time
                       (presumably epoch seconds - confirm against the source data).

    Returns whatever to_timestamp produces for that column.
    '''
    timestamp_col = to_timestamp(unix_datetime_col)
    return timestamp_col

# function to convert temperature column from kelvin to celsius or fahrenheit, rounded to 2 decimal places
def temperature_conversion(kelvin_col, to_unit):
    '''
    Convert a Kelvin temperature to Celsius or Fahrenheit, rounded to 2 decimal places.

    Parameters:
        kelvin_col: the Kelvin temperature to convert (a Spark column in this notebook).
        to_unit:    'celsius', or 'fah' / 'fahrenheit'.

    Returns:
        The converted expression, rounded to 2 decimal places.

    Raises:
        ValueError: if to_unit is not one of the supported units (previously the
                    function fell through and returned None).
    '''
    # NOTE: `round` is the one imported from pyspark.sql.functions above, not the builtin.
    if to_unit == 'celsius':
        return round(kelvin_col - 273.15, 2)
    if to_unit in ('fah', 'fahrenheit'):
        # Use the column that was passed in rather than a hard-coded "main.temp".
        # K * 9/5 - 459.67 is (K - 273.15) * 9/5 + 32 with the constants folded.
        return round((kelvin_col * 9/5) - 459.67, 2)
    raise ValueError(
        f"Unsupported to_unit {to_unit!r}; expected 'celsius', 'fah' or 'fahrenheit'"
    )

# Flatten the nested JSON with a single select, then tag every row with a
# constant 'Type' column whose value is 'Historic'.
flattened_df = (
    df.select(
        convert_unix_to_datetime(col("dt")).alias("datetime"),
        col("main.temp").alias("temperature_kelvin"),
        temperature_conversion(col("main.temp"), to_unit="celsius").alias("temperature_celsius"),
        temperature_conversion(col("main.temp"), to_unit="fah").alias("temperature_fahrenheit"),
    )
    .withColumn("Type", lit("Historic"))
)

# Show the resulting schema, then render the rows.
flattened_df.printSchema()
display(flattened_df)

StatementMeta(, , , Waiting, )

root
 |-- datetime: timestamp (nullable = true)
 |-- temperature_kelvin: double (nullable = true)
 |-- temperature_celsius: double (nullable = true)
 |-- temperature_fahrenheit: double (nullable = true)
 |-- Type: string (nullable = false)



SynapseWidget(Synapse.DataFrame, a1967d6e-130e-4fee-8706-1cbd86b930b5)

## Load the dataframe into a Lakehouse table
###### I use the append write mode to add the new row of data to my Lakehouse table while preserving the existing data.

In [4]:
# Append the flattened rows to the Delta table at Tables/historic_weather_data;
# "append" mode adds to the table rather than replacing its contents.
(
    flattened_df.write
    .format("delta")
    .mode("append")
    .save("Tables/historic_weather_data")
)

StatementMeta(, , , Waiting, )