WeatherData JSON Source File Path: "abfss://bronze@datalakestorageaccountname.dfs.core.windows.net/weather-data/"

- <a href="https://spark.apache.org/docs/latest/api/python/reference/pyspark.sql/api/pyspark.sql.DataFrame.join.html#pyspark.sql.DataFrame.join" target="_blank">**DataFrame Joins**</a>

In [0]:
# Pieces of the ADLS Gen2 location that holds the raw weather JSON files.
weatherDataSourceStorageAccountName = 'lckudadatalakehousedev'
weatherDataSourceLayerName = 'bronze'
weatherDataSourceFolderName = 'weather-data'

# abfss URI layout: abfss://<container>@<account>.dfs.core.windows.net/<folder>
weatherDataSourceFolderPath = (
    f"abfss://{weatherDataSourceLayerName}@{weatherDataSourceStorageAccountName}.dfs.core.windows.net/{weatherDataSourceFolderName}"
)

In [0]:
# Load the JSON files under the bronze weather folder. No schema is supplied,
# so the reader infers it from the data.
weatherDataBronzeDF = spark.read.json(weatherDataSourceFolderPath)

display(weatherDataBronzeDF)

In [0]:
# Import only the functions this notebook uses. A wildcard import of
# pyspark.sql.functions shadows Python built-ins such as sum, min, max and round
# and hides where each name comes from.
from pyspark.sql.functions import col, explode, monotonically_increasing_id

# One output row per element of the daily.time array, alongside the market name
# and coordinates of the source record.
# sequenceId is produced by monotonically_increasing_id(): unique and increasing,
# but not consecutive, and its values depend on how the data is partitioned.
weatherDataDailyDateTransDF = (weatherDataBronzeDF
                          .select(
                          explode("daily.time").alias("weatherDate")
                          ,col("marketName")
                          ,col("latitude").alias("latitude")
                          ,col("longitude").alias("longitude")
                          ,monotonically_increasing_id().alias('sequenceId')
                          ))

display(weatherDataDailyDateTransDF)

In [0]:
# One output row per element of daily.temperature_2m_max, carrying the market
# name, coordinates, a generated sequenceId and the unit string taken from
# daily_units.temperature_2m_max.
weatherDataMaxTemparatureTransDF = weatherDataBronzeDF.select(
    explode("daily.temperature_2m_max").alias("maximumTemparature"),
    "marketName",
    "latitude",
    "longitude",
    monotonically_increasing_id().alias('sequenceId'),
    col("daily_units.temperature_2m_max").alias("unitOfTemparature"),
)

display(weatherDataMaxTemparatureTransDF)

In [0]:
# One output row per element of daily.temperature_2m_min, carrying the market
# name, coordinates and a generated sequenceId.
weatherDataMinTemparatureTransDF = weatherDataBronzeDF.select(
    explode("daily.temperature_2m_min").alias("minimumTemparature"),
    "marketName",
    "latitude",
    "longitude",
    monotonically_increasing_id().alias('sequenceId'),
)

display(weatherDataMinTemparatureTransDF)

In [0]:
# One output row per element of daily.rain_sum, carrying the market name,
# coordinates, a generated sequenceId and the unit string taken from
# daily_units.rain_sum.
weatherDataRainFallTransDF = weatherDataBronzeDF.select(
    explode("daily.rain_sum").alias("rainFall"),
    "marketName",
    "latitude",
    "longitude",
    monotonically_increasing_id().alias('sequenceId'),
    col("daily_units.rain_sum").alias("unitOfRainFall"),
)

display(weatherDataRainFallTransDF)

In [0]:
# Imported here so this cell does not depend on which names earlier cells
# brought into scope.
from pyspark.sql.functions import arrays_zip, col, explode

# Build the silver-layer frame: one row per source record and array position,
# holding the date, max/min temperature and rainfall for that position together
# with the record's market name, coordinates and unit strings.
#
# The parallel arrays under `daily` are zipped element-by-element and exploded
# once, so every measure on a row comes from the same position of the same
# source record. This replaces joining the per-measure DataFrames on sequenceId:
# that key is generated by monotonically_increasing_id(), which is documented as
# non-deterministic (its values depend on partition layout), so independently
# evaluated DataFrames are not guaranteed to number their rows alike and the
# join could pair or drop readings incorrectly.
#
# NOTE(review): assumes the four daily arrays of a record have equal length;
# arrays_zip fills missing positions of a shorter array with nulls.
weatherDataTransDF = (weatherDataBronzeDF
                      # Lift the nested arrays to top-level columns so the zipped
                      # struct's field names are exactly these aliases.
                      .select(col("marketName")
                              ,col("latitude")
                              ,col("longitude")
                              ,col("daily_units.temperature_2m_max").alias("unitOfTemparature")
                              ,col("daily_units.rain_sum").alias("unitOfRainFall")
                              ,col("daily.time").alias("weatherDate")
                              ,col("daily.temperature_2m_max").alias("maximumTemparature")
                              ,col("daily.temperature_2m_min").alias("minimumTemparature")
                              ,col("daily.rain_sum").alias("rainFall"))
                      .withColumn("dailyReading"
                                  ,explode(arrays_zip("weatherDate"
                                                      ,"maximumTemparature"
                                                      ,"minimumTemparature"
                                                      ,"rainFall")))
                      # Same columns, names and order as the previous join-based result.
                      .select(col("marketName")
                              ,col("dailyReading.weatherDate").alias("weatherDate")
                              ,col("unitOfTemparature")
                              ,col("dailyReading.maximumTemparature").alias("maximumTemparature")
                              ,col("dailyReading.minimumTemparature").alias("minimumTemparature")
                              ,col("unitOfRainFall")
                              ,col("dailyReading.rainFall").alias("rainFall")
                              ,col("latitude")
                              ,col("longitude"))

)

In [0]:
# Round-trip the transformed result through pandas: toPandas() collects every
# row to the driver, and createDataFrame() builds a new Spark DataFrame from that
# pandas frame and rebinds it to the same name.
# NOTE(review): the reason for this round-trip is not evident from the notebook;
# presumably it materialises the result before the write. Confirm it is still
# needed, since it requires the whole dataset to fit in driver memory.
pdf = weatherDataTransDF.toPandas()
weatherDataTransDF = spark.createDataFrame(pdf)

In [0]:
# Save the transformed weather data as the silver table, using overwrite mode
# so an existing table's contents are replaced.
weatherDataTransDF.write.saveAsTable(
    "pricing_analytics.silver.weather_data_silver",
    mode="overwrite",
)

In [0]:
# Query the silver table back and print its first rows as a quick check.
(spark
 .sql("SELECT * FROM pricing_analytics.silver.weather_data_silver")
 .show())
