In [0]:
%sql
-- Gold-layer fact table holding daily weather measurements together with a
-- location key and descriptive location/weather attributes.
-- IF NOT EXISTS makes this cell safe to re-run: an existing table is left as-is.
CREATE TABLE IF NOT EXISTS hcmut.gold.fact_vn_weather_daily (
    -- Surrogate key; values are always generated by the table, never supplied by writers.
    cd_fact_vn_weather_daily  BIGINT PRIMARY KEY 
    GENERATED ALWAYS AS IDENTITY,
    dt_date_record TIMESTAMP,
    nr_weather_code FLOAT,
    ds_weather_description STRING,
    ds_location STRING,
    nr_temperature_2m_mean FLOAT,
    nr_temperature_2m_max FLOAT,
    nr_temperature_2m_min FLOAT,
    nr_apparent_temperature_mean FLOAT,
    nr_apparent_temperature_max FLOAT,
    nr_apparent_temperature_min FLOAT,
    nr_wind_speed_10m_max FLOAT,
    nr_wind_gusts_10m_max FLOAT,
    nr_shortwave_radiation_sum FLOAT,
    -- NOTE(review): nr_winddirection_10m_dominant further down differs from this
    -- column only by an underscore; confirm both are really needed.
    nr_wind_direction_10m_dominant FLOAT,
    nr_et0_fao_evapotranspiration FLOAT,
    -- NOTE(review): sunrise/sunset are BIGINT although they carry the dt_ prefix;
    -- presumably epoch values -- confirm unit against the bronze source.
    dt_sunrise BIGINT,
    dt_sunset BIGINT,
    nr_daylight_duration FLOAT,
    nr_sunshine_duration FLOAT,
    nr_precipitation_sum FLOAT,
    nr_rain_sum FLOAT,
    nr_snowfall_sum FLOAT,
    nr_precipitation_hours FLOAT,
    nr_cloud_cover_mean FLOAT,
    nr_cloud_cover_max FLOAT,
    nr_cloud_cover_min FLOAT,
    nr_dew_point_2m_max FLOAT,
    nr_dew_point_2m_min FLOAT,
    nr_dew_point_2m_mean FLOAT,
    nr_et0_fao_evapotranspiration_sum FLOAT,
    nr_relative_humidity_2m_mean FLOAT,
    nr_relative_humidity_2m_max FLOAT,
    nr_relative_humidity_2m_min FLOAT,
    nr_snowfall_water_equivalent_sum FLOAT,
    nr_pressure_msl_mean FLOAT,
    nr_pressure_msl_max FLOAT,
    nr_pressure_msl_min FLOAT,
    nr_wind_speed_10m_min FLOAT,
    nr_wind_gusts_10m_min FLOAT,
    nr_wind_speed_10m_mean FLOAT,
    nr_wind_gusts_10m_mean FLOAT,
    nr_winddirection_10m_dominant FLOAT,
    nr_surface_pressure_min FLOAT,
    nr_surface_pressure_max FLOAT,
    nr_surface_pressure_mean FLOAT,
    nr_wet_bulb_temperature_2m_max FLOAT,
    nr_wet_bulb_temperature_2m_mean FLOAT,
    nr_wet_bulb_temperature_2m_min FLOAT,
    nr_vapour_pressure_deficit_max FLOAT,
    nr_soil_moisture_0_to_100cm_mean FLOAT,
    nr_soil_moisture_0_to_7cm_mean FLOAT,
    nr_soil_moisture_28_to_100cm_mean FLOAT,
    nr_soil_moisture_7_to_28cm_mean FLOAT,
    nr_soil_temperature_0_to_100cm_mean FLOAT,
    nr_soil_temperature_0_to_7cm_mean FLOAT,
    nr_soil_temperature_28_to_100cm_mean FLOAT,
    nr_soil_temperature_7_to_28cm_mean FLOAT,
    cd_location_key INT,
    nr_latitude DOUBLE,
    nr_longitude DOUBLE,
    ds_timezone STRING,
    -- Load timestamp for the gold layer; declared with a column default.
    -- NOTE(review): confirm the default is actually applied when rows arrive via
    -- a DataFrame append that omits this column, rather than being left NULL.
    dt_time_to_gold TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    dt_time_to_bronze TIMESTAMP
) 
-- Declares the Delta column-defaults table feature used by dt_time_to_gold above.
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
from pyspark.sql.functions import lit , col

# Columns shared by the three regional bronze tables. Every regional frame is
# projected onto exactly this list (in this order) before the union.
common_columns = [
    "dt_time_to_bronze",
    "dt_date_record",
    "nr_weather_code",
    "nr_temperature_2m_mean",
    "nr_temperature_2m_max",
    "nr_temperature_2m_min",
    "nr_apparent_temperature_mean",
    "nr_apparent_temperature_max",
    "nr_apparent_temperature_min",
    "nr_wind_speed_10m_max",
    "nr_wind_gusts_10m_max",
    "nr_shortwave_radiation_sum",
    "nr_wind_direction_10m_dominant",
    "nr_et0_fao_evapotranspiration",
    "dt_sunrise",
    "dt_sunset",
    "nr_daylight_duration",
    "nr_sunshine_duration",
    "nr_precipitation_sum",
    "nr_rain_sum",
    "nr_snowfall_sum",
    "nr_precipitation_hours",
    "nr_cloud_cover_mean",
    "nr_cloud_cover_max",
    "nr_cloud_cover_min",
    "nr_dew_point_2m_max",
    "nr_dew_point_2m_min",
    "nr_dew_point_2m_mean",
    "nr_et0_fao_evapotranspiration_sum",
    "nr_relative_humidity_2m_mean",
    "nr_relative_humidity_2m_max",
    "nr_relative_humidity_2m_min",
    "nr_snowfall_water_equivalent_sum",
    "nr_pressure_msl_mean",
    "nr_pressure_msl_max",
    "nr_pressure_msl_min",
    "nr_wind_speed_10m_min",
    "nr_wind_gusts_10m_min",
    "nr_wind_speed_10m_mean",
    "nr_wind_gusts_10m_mean",
    "nr_winddirection_10m_dominant",
    "nr_surface_pressure_min",
    "nr_surface_pressure_max",
    "nr_surface_pressure_mean",
    "nr_wet_bulb_temperature_2m_max",
    "nr_wet_bulb_temperature_2m_mean",
    "nr_wet_bulb_temperature_2m_min",
    "nr_vapour_pressure_deficit_max",
    "nr_soil_moisture_0_to_100cm_mean",
    "nr_soil_moisture_0_to_7cm_mean",
    "nr_soil_moisture_28_to_100cm_mean",
    "nr_soil_moisture_7_to_28cm_mean",
    "nr_soil_temperature_0_to_100cm_mean",
    "nr_soil_temperature_0_to_7cm_mean",
    "nr_soil_temperature_28_to_100cm_mean",
    "nr_soil_temperature_7_to_28cm_mean",
    "cd_location_key",
]


def _load_bronze_daily(table_name, location_key):
    """Read one regional bronze table and prepare it for the union.

    Args:
        table_name: Fully qualified name of the bronze table to read.
        location_key: Integer literal stored in ``cd_location_key`` for every
            row of this table.

    Returns:
        The table's rows with ``cd_location_key`` added, projected onto
        ``common_columns``.
    """
    tagged_df = spark.table(table_name).withColumn(
        "cd_location_key", lit(location_key)
    )
    return tagged_df.select(common_columns)


# One frame per region; the literal is the location key assigned to that region.
bronze_hcm_daily_df = _load_bronze_daily(
    "hcmut.bronze.vn_hcm_weather_data_daily", 1
)
bronze_danang_daily_df = _load_bronze_daily(
    "hcmut.bronze.vn_danang_weather_data_daily", 2
)
bronze_hanoi_daily_df = _load_bronze_daily(
    "hcmut.bronze.vn_hanoi_weather_data_daily", 3
)

# Stack the regions by column name. All three frames were already projected onto
# common_columns, so allowMissingColumns has nothing to fill in here.
hcm_plus_danang_df = bronze_hcm_daily_df.unionByName(
    bronze_danang_daily_df, allowMissingColumns=True
)
bronze_union_df = hcm_plus_danang_df.unionByName(
    bronze_hanoi_daily_df, allowMissingColumns=True
)

# Dimension tables used to enrich the unioned measurements.
dim_weather_desc_df = spark.table("hcmut.gold.dim_weather_code_description")
dim_location_df = spark.table("hcmut.gold.dim_location")

# Left joins keep every bronze row even when a dimension has no match.
with_weather_desc_df = bronze_union_df.join(
    dim_weather_desc_df, on=["nr_weather_code"], how="left"
)
location_match = (
    bronze_union_df["cd_location_key"]
    == dim_location_df["cd_dim_location_description"]
)
joined_df = with_weather_desc_df.join(dim_location_df, on=location_match, how="left")

# Keep one row per (record date, location, weather code), then drop the
# dimension-side join key and rename the description to its fact-table name.
# NOTE(review): no ordering is applied before dropDuplicates, so which of
# several duplicate rows survives is presumably arbitrary -- confirm acceptable.
deduped_df = (
    joined_df
    .dropDuplicates(["dt_date_record", "cd_location_key", "nr_weather_code"])
    .drop("cd_dim_location_description")
    .withColumnRenamed("ds_description", "ds_weather_description")
)

# Visual check of the rows and of the schema about to be written.
display(deduped_df)

display(deduped_df.schema)

In [0]:
# Append only rows whose key is not already present in the fact table.
# A plain append would insert the full bronze history again on every run,
# duplicating fact rows. The key is the same one used by dropDuplicates()
# when deduped_df is built.
fact_table_name = "hcmut.gold.fact_vn_weather_daily"
fact_key_columns = ["dt_date_record", "cd_location_key", "nr_weather_code"]

# Keys already stored in the target table (only the key columns are read).
existing_keys_df = (
    spark.table(fact_table_name)
    .select(*fact_key_columns)
    .distinct()
    .alias("existing")
)
incoming_df = deduped_df.alias("incoming")

# Null-safe equality so a row with a NULL key part (e.g. a missing weather
# code) still matches its already-loaded counterpart instead of being
# re-appended on every run. `col` comes from the notebook's pyspark import.
key_match = None
for key_column in fact_key_columns:
    same_value = col(f"incoming.{key_column}").eqNullSafe(
        col(f"existing.{key_column}")
    )
    key_match = same_value if key_match is None else key_match & same_value

# left_anti keeps only incoming rows that have no match among existing keys.
new_rows_df = incoming_df.join(existing_keys_df, on=key_match, how="left_anti")

new_rows_df.write.format("delta").mode("append").saveAsTable(fact_table_name)