In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas

In [0]:
from pyspark.sql.functions import coalesce, lit, max as spark_max
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry
from pyspark.sql.functions import col

In [0]:
%sql
-- Bronze table for the daily weather records loaded for HCM.
-- Column order is kept exactly as declared originally; the nr_* measurement
-- columns are named after the daily variables requested from the weather API.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_hcm_weather_data_daily (
  -- Surrogate key; the value is always generated by the table itself.
  cd_vn_hcm_weather_data_daily          BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
  -- Defaults to the current timestamp when the insert does not supply a value.
  dt_time_to_bronze                     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  -- Timestamp of the record the measurements belong to.
  dt_date_record                        TIMESTAMP,

  -- Weather code, temperature, wind, radiation and evapotranspiration
  nr_weather_code                       FLOAT,
  nr_temperature_2m_mean                FLOAT,
  nr_temperature_2m_max                 FLOAT,
  nr_temperature_2m_min                 FLOAT,
  nr_apparent_temperature_mean          FLOAT,
  nr_apparent_temperature_max           FLOAT,
  nr_apparent_temperature_min           FLOAT,
  nr_wind_speed_10m_max                 FLOAT,
  nr_wind_gusts_10m_max                 FLOAT,
  nr_shortwave_radiation_sum            FLOAT,
  nr_wind_direction_10m_dominant        FLOAT,
  nr_et0_fao_evapotranspiration         FLOAT,

  -- Sunrise / sunset are stored as 64-bit integers, not TIMESTAMP.
  dt_sunrise                            BIGINT,
  dt_sunset                             BIGINT,
  nr_daylight_duration                  FLOAT,
  nr_sunshine_duration                  FLOAT,

  -- Precipitation, cloud cover, dew point and humidity
  nr_precipitation_sum                  FLOAT,
  nr_rain_sum                           FLOAT,
  nr_snowfall_sum                       FLOAT,
  nr_precipitation_hours                FLOAT,
  nr_cloud_cover_mean                   FLOAT,
  nr_cloud_cover_max                    FLOAT,
  nr_cloud_cover_min                    FLOAT,
  nr_dew_point_2m_max                   FLOAT,
  nr_dew_point_2m_min                   FLOAT,
  nr_dew_point_2m_mean                  FLOAT,
  nr_et0_fao_evapotranspiration_sum     FLOAT,
  nr_relative_humidity_2m_mean          FLOAT,
  nr_relative_humidity_2m_max           FLOAT,
  nr_relative_humidity_2m_min           FLOAT,
  nr_snowfall_water_equivalent_sum      FLOAT,

  -- Pressure and additional wind statistics
  nr_pressure_msl_mean                  FLOAT,
  nr_pressure_msl_max                   FLOAT,
  nr_pressure_msl_min                   FLOAT,
  nr_wind_speed_10m_min                 FLOAT,
  nr_wind_gusts_10m_min                 FLOAT,
  nr_wind_speed_10m_mean                FLOAT,
  nr_wind_gusts_10m_mean                FLOAT,
  nr_winddirection_10m_dominant         FLOAT,
  nr_surface_pressure_min               FLOAT,
  nr_surface_pressure_max               FLOAT,
  nr_surface_pressure_mean              FLOAT,

  -- Wet-bulb temperature and vapour pressure deficit
  nr_wet_bulb_temperature_2m_max        FLOAT,
  nr_wet_bulb_temperature_2m_mean       FLOAT,
  nr_wet_bulb_temperature_2m_min        FLOAT,
  nr_vapour_pressure_deficit_max        FLOAT,

  -- Soil moisture and soil temperature per depth range
  nr_soil_moisture_0_to_100cm_mean      FLOAT,
  nr_soil_moisture_0_to_7cm_mean        FLOAT,
  nr_soil_moisture_28_to_100cm_mean     FLOAT,
  nr_soil_moisture_7_to_28cm_mean       FLOAT,
  nr_soil_temperature_0_to_100cm_mean   FLOAT,
  nr_soil_temperature_0_to_7cm_mean     FLOAT,
  nr_soil_temperature_28_to_100cm_mean  FLOAT,
  nr_soil_temperature_7_to_28cm_mean    FLOAT
)
-- Table feature that lets the DEFAULT clause above be declared.
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
%sql
-- Bronze table for the daily weather records loaded for Da Nang.
-- Column order is kept exactly as declared originally; the nr_* measurement
-- columns are named after the daily variables requested from the weather API.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_danang_weather_data_daily (
  -- Surrogate key; the value is always generated by the table itself.
  cd_vn_danang_weather_data_daily       BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
  -- Defaults to the current timestamp when the insert does not supply a value.
  dt_time_to_bronze                     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  -- Timestamp of the record the measurements belong to.
  dt_date_record                        TIMESTAMP,

  -- Weather code, temperature, wind, radiation and evapotranspiration
  nr_weather_code                       FLOAT,
  nr_temperature_2m_mean                FLOAT,
  nr_temperature_2m_max                 FLOAT,
  nr_temperature_2m_min                 FLOAT,
  nr_apparent_temperature_mean          FLOAT,
  nr_apparent_temperature_max           FLOAT,
  nr_apparent_temperature_min           FLOAT,
  nr_wind_speed_10m_max                 FLOAT,
  nr_wind_gusts_10m_max                 FLOAT,
  nr_shortwave_radiation_sum            FLOAT,
  nr_wind_direction_10m_dominant        FLOAT,
  nr_et0_fao_evapotranspiration         FLOAT,

  -- Sunrise / sunset are stored as 64-bit integers, not TIMESTAMP.
  dt_sunrise                            BIGINT,
  dt_sunset                             BIGINT,
  nr_daylight_duration                  FLOAT,
  nr_sunshine_duration                  FLOAT,

  -- Precipitation, cloud cover, dew point and humidity
  nr_precipitation_sum                  FLOAT,
  nr_rain_sum                           FLOAT,
  nr_snowfall_sum                       FLOAT,
  nr_precipitation_hours                FLOAT,
  nr_cloud_cover_mean                   FLOAT,
  nr_cloud_cover_max                    FLOAT,
  nr_cloud_cover_min                    FLOAT,
  nr_dew_point_2m_max                   FLOAT,
  nr_dew_point_2m_min                   FLOAT,
  nr_dew_point_2m_mean                  FLOAT,
  nr_et0_fao_evapotranspiration_sum     FLOAT,
  nr_relative_humidity_2m_mean          FLOAT,
  nr_relative_humidity_2m_max           FLOAT,
  nr_relative_humidity_2m_min           FLOAT,
  nr_snowfall_water_equivalent_sum      FLOAT,

  -- Pressure and additional wind statistics
  nr_pressure_msl_mean                  FLOAT,
  nr_pressure_msl_max                   FLOAT,
  nr_pressure_msl_min                   FLOAT,
  nr_wind_speed_10m_min                 FLOAT,
  nr_wind_gusts_10m_min                 FLOAT,
  nr_wind_speed_10m_mean                FLOAT,
  nr_wind_gusts_10m_mean                FLOAT,
  nr_winddirection_10m_dominant         FLOAT,
  nr_surface_pressure_min               FLOAT,
  nr_surface_pressure_max               FLOAT,
  nr_surface_pressure_mean              FLOAT,

  -- Wet-bulb temperature and vapour pressure deficit
  nr_wet_bulb_temperature_2m_max        FLOAT,
  nr_wet_bulb_temperature_2m_mean       FLOAT,
  nr_wet_bulb_temperature_2m_min        FLOAT,
  nr_vapour_pressure_deficit_max        FLOAT,

  -- Soil moisture and soil temperature per depth range
  nr_soil_moisture_0_to_100cm_mean      FLOAT,
  nr_soil_moisture_0_to_7cm_mean        FLOAT,
  nr_soil_moisture_28_to_100cm_mean     FLOAT,
  nr_soil_moisture_7_to_28cm_mean       FLOAT,
  nr_soil_temperature_0_to_100cm_mean   FLOAT,
  nr_soil_temperature_0_to_7cm_mean     FLOAT,
  nr_soil_temperature_28_to_100cm_mean  FLOAT,
  nr_soil_temperature_7_to_28cm_mean    FLOAT
)
-- Table feature that lets the DEFAULT clause above be declared.
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
%sql
-- Bronze table for the daily weather records loaded for Ha Noi.
-- Column order is kept exactly as declared originally; the nr_* measurement
-- columns are named after the daily variables requested from the weather API.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_hanoi_weather_data_daily (
  -- Surrogate key; the value is always generated by the table itself.
  cd_vn_hanoi_weather_data_daily        BIGINT PRIMARY KEY GENERATED ALWAYS AS IDENTITY,
  -- Defaults to the current timestamp when the insert does not supply a value.
  dt_time_to_bronze                     TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
  -- Timestamp of the record the measurements belong to.
  dt_date_record                        TIMESTAMP,

  -- Weather code, temperature, wind, radiation and evapotranspiration
  nr_weather_code                       FLOAT,
  nr_temperature_2m_mean                FLOAT,
  nr_temperature_2m_max                 FLOAT,
  nr_temperature_2m_min                 FLOAT,
  nr_apparent_temperature_mean          FLOAT,
  nr_apparent_temperature_max           FLOAT,
  nr_apparent_temperature_min           FLOAT,
  nr_wind_speed_10m_max                 FLOAT,
  nr_wind_gusts_10m_max                 FLOAT,
  nr_shortwave_radiation_sum            FLOAT,
  nr_wind_direction_10m_dominant        FLOAT,
  nr_et0_fao_evapotranspiration         FLOAT,

  -- Sunrise / sunset are stored as 64-bit integers, not TIMESTAMP.
  dt_sunrise                            BIGINT,
  dt_sunset                             BIGINT,
  nr_daylight_duration                  FLOAT,
  nr_sunshine_duration                  FLOAT,

  -- Precipitation, cloud cover, dew point and humidity
  nr_precipitation_sum                  FLOAT,
  nr_rain_sum                           FLOAT,
  nr_snowfall_sum                       FLOAT,
  nr_precipitation_hours                FLOAT,
  nr_cloud_cover_mean                   FLOAT,
  nr_cloud_cover_max                    FLOAT,
  nr_cloud_cover_min                    FLOAT,
  nr_dew_point_2m_max                   FLOAT,
  nr_dew_point_2m_min                   FLOAT,
  nr_dew_point_2m_mean                  FLOAT,
  nr_et0_fao_evapotranspiration_sum     FLOAT,
  nr_relative_humidity_2m_mean          FLOAT,
  nr_relative_humidity_2m_max           FLOAT,
  nr_relative_humidity_2m_min           FLOAT,
  nr_snowfall_water_equivalent_sum      FLOAT,

  -- Pressure and additional wind statistics
  nr_pressure_msl_mean                  FLOAT,
  nr_pressure_msl_max                   FLOAT,
  nr_pressure_msl_min                   FLOAT,
  nr_wind_speed_10m_min                 FLOAT,
  nr_wind_gusts_10m_min                 FLOAT,
  nr_wind_speed_10m_mean                FLOAT,
  nr_wind_gusts_10m_mean                FLOAT,
  nr_winddirection_10m_dominant         FLOAT,
  nr_surface_pressure_min               FLOAT,
  nr_surface_pressure_max               FLOAT,
  nr_surface_pressure_mean              FLOAT,

  -- Wet-bulb temperature and vapour pressure deficit
  nr_wet_bulb_temperature_2m_max        FLOAT,
  nr_wet_bulb_temperature_2m_mean       FLOAT,
  nr_wet_bulb_temperature_2m_min        FLOAT,
  nr_vapour_pressure_deficit_max        FLOAT,

  -- Soil moisture and soil temperature per depth range
  nr_soil_moisture_0_to_100cm_mean      FLOAT,
  nr_soil_moisture_0_to_7cm_mean        FLOAT,
  nr_soil_moisture_28_to_100cm_mean     FLOAT,
  nr_soil_moisture_7_to_28cm_mean       FLOAT,
  nr_soil_temperature_0_to_100cm_mean   FLOAT,
  nr_soil_temperature_0_to_7cm_mean     FLOAT,
  nr_soil_temperature_28_to_100cm_mean  FLOAT,
  nr_soil_temperature_7_to_28cm_mean    FLOAT
)
-- Table feature that lets the DEFAULT clause above be declared.
TBLPROPERTIES ('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

# Fetch the daily weather archive for the HCM coordinates from Open-Meteo and
# expose it as the Spark DataFrame `hcm_daily_df`.
#
# The response carries its variables positionally, in the order they were
# requested. A single ordered name list therefore drives both the request and
# the decoding, so the two can never drift apart.

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Daily variables to request. The order defines the column order of the
# resulting DataFrame (after the leading "date" column).
daily_variable_names = [
    "weather_code",
    "temperature_2m_mean",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "shortwave_radiation_sum",
    "wind_direction_10m_dominant",
    "et0_fao_evapotranspiration",
    "sunrise",
    "sunset",
    "daylight_duration",
    "sunshine_duration",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "cloud_cover_mean",
    "cloud_cover_max",
    "cloud_cover_min",
    "dew_point_2m_max",
    "dew_point_2m_min",
    "dew_point_2m_mean",
    "et0_fao_evapotranspiration_sum",
    "relative_humidity_2m_mean",
    "relative_humidity_2m_max",
    "relative_humidity_2m_min",
    "snowfall_water_equivalent_sum",
    "pressure_msl_mean",
    "pressure_msl_max",
    "pressure_msl_min",
    "wind_speed_10m_min",
    "wind_gusts_10m_min",
    "wind_speed_10m_mean",
    "wind_gusts_10m_mean",
    "winddirection_10m_dominant",
    "surface_pressure_min",
    "surface_pressure_max",
    "surface_pressure_mean",
    "wet_bulb_temperature_2m_max",
    "wet_bulb_temperature_2m_mean",
    "wet_bulb_temperature_2m_min",
    "vapour_pressure_deficit_max",
    "soil_moisture_0_to_100cm_mean",
    "soil_moisture_0_to_7cm_mean",
    "soil_moisture_28_to_100cm_mean",
    "soil_moisture_7_to_28cm_mean",
    "soil_temperature_0_to_100cm_mean",
    "soil_temperature_0_to_7cm_mean",
    "soil_temperature_28_to_100cm_mean",
    "soil_temperature_7_to_28cm_mean",
]

# These variables are decoded with the int64 accessor; all others use the
# default ValuesAsNumpy() accessor.
int64_variable_names = {"sunrise", "sunset"}

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 10.8188,
    "longitude": 106.6519,
    "start_date": "2020-01-01",
    "end_date": pd.Timestamp.today().strftime("%Y-%m-%d"),
    "daily": daily_variable_names,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone: {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

daily = response.Daily()

# Fail loudly instead of silently mis-labelling columns if the response does
# not contain exactly the variables that were requested.
if daily.VariablesLength() != len(daily_variable_names):
    raise ValueError(
        f"Expected {len(daily_variable_names)} daily variables, "
        f"got {daily.VariablesLength()}"
    )

# One timestamp per daily interval; the end bound is exclusive.
daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

# Decode each variable by its position in the request list.
for position, variable_name in enumerate(daily_variable_names):
    variable = daily.Variables(position)
    if variable_name in int64_variable_names:
        daily_data[variable_name] = variable.ValuesInt64AsNumpy()
    else:
        daily_data[variable_name] = variable.ValuesAsNumpy()

daily_dataframe = pd.DataFrame(data = daily_data)
hcm_daily_df = spark.createDataFrame(daily_dataframe)
display(hcm_daily_df)

In [0]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

# Fetch the daily weather archive for the Da Nang coordinates from Open-Meteo
# and expose it as the Spark DataFrame `danang_daily_df`.
#
# The response carries its variables positionally, in the order they were
# requested. A single ordered name list therefore drives both the request and
# the decoding, so the two can never drift apart.

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Daily variables to request. The order defines the column order of the
# resulting DataFrame (after the leading "date" column).
daily_variable_names = [
    "weather_code",
    "temperature_2m_mean",
    "temperature_2m_max",
    "temperature_2m_min",
    "apparent_temperature_mean",
    "apparent_temperature_max",
    "apparent_temperature_min",
    "wind_speed_10m_max",
    "wind_gusts_10m_max",
    "shortwave_radiation_sum",
    "wind_direction_10m_dominant",
    "et0_fao_evapotranspiration",
    "sunrise",
    "sunset",
    "daylight_duration",
    "sunshine_duration",
    "precipitation_sum",
    "rain_sum",
    "snowfall_sum",
    "precipitation_hours",
    "cloud_cover_mean",
    "cloud_cover_max",
    "cloud_cover_min",
    "dew_point_2m_max",
    "dew_point_2m_min",
    "dew_point_2m_mean",
    "et0_fao_evapotranspiration_sum",
    "relative_humidity_2m_mean",
    "relative_humidity_2m_max",
    "relative_humidity_2m_min",
    "snowfall_water_equivalent_sum",
    "pressure_msl_mean",
    "pressure_msl_max",
    "pressure_msl_min",
    "wind_speed_10m_min",
    "wind_gusts_10m_min",
    "wind_speed_10m_mean",
    "wind_gusts_10m_mean",
    "winddirection_10m_dominant",
    "surface_pressure_min",
    "surface_pressure_max",
    "surface_pressure_mean",
    "wet_bulb_temperature_2m_max",
    "wet_bulb_temperature_2m_mean",
    "wet_bulb_temperature_2m_min",
    "vapour_pressure_deficit_max",
    "soil_moisture_0_to_100cm_mean",
    "soil_moisture_0_to_7cm_mean",
    "soil_moisture_28_to_100cm_mean",
    "soil_moisture_7_to_28cm_mean",
    "soil_temperature_0_to_100cm_mean",
    "soil_temperature_0_to_7cm_mean",
    "soil_temperature_28_to_100cm_mean",
    "soil_temperature_7_to_28cm_mean",
]

# These variables are decoded with the int64 accessor; all others use the
# default ValuesAsNumpy() accessor.
int64_variable_names = {"sunrise", "sunset"}

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 16.0439,
    "longitude": 108.1994,
    "start_date": "2020-01-01",
    "end_date": pd.Timestamp.today().strftime("%Y-%m-%d"),
    "daily": daily_variable_names,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone: {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

daily = response.Daily()

# Fail loudly instead of silently mis-labelling columns if the response does
# not contain exactly the variables that were requested.
if daily.VariablesLength() != len(daily_variable_names):
    raise ValueError(
        f"Expected {len(daily_variable_names)} daily variables, "
        f"got {daily.VariablesLength()}"
    )

# One timestamp per daily interval; the end bound is exclusive.
daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

# Decode each variable by its position in the request list.
for position, variable_name in enumerate(daily_variable_names):
    variable = daily.Variables(position)
    if variable_name in int64_variable_names:
        daily_data[variable_name] = variable.ValuesInt64AsNumpy()
    else:
        daily_data[variable_name] = variable.ValuesAsNumpy()

daily_dataframe = pd.DataFrame(data = daily_data)
danang_daily_df = spark.createDataFrame(daily_dataframe)
display(danang_daily_df)

In [0]:
import openmeteo_requests

import pandas as pd
import requests_cache
from retry_requests import retry

# Fetch the daily weather history for the Hanoi coordinates from the Open-Meteo
# archive API and expose it as the Spark DataFrame `hanoi_daily_df`.

# Single source of truth for the requested daily variables. The response returns
# the variables positionally in the order they were requested, so this one list
# drives both the request and the extraction loop below and the two cannot drift.
daily_variables = [
    "weather_code", "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
    "apparent_temperature_mean", "apparent_temperature_max", "apparent_temperature_min",
    "wind_speed_10m_max", "wind_gusts_10m_max", "shortwave_radiation_sum",
    "wind_direction_10m_dominant", "et0_fao_evapotranspiration",
    "sunrise", "sunset", "daylight_duration", "sunshine_duration",
    "precipitation_sum", "rain_sum", "snowfall_sum", "precipitation_hours",
    "cloud_cover_mean", "cloud_cover_max", "cloud_cover_min",
    "dew_point_2m_max", "dew_point_2m_min", "dew_point_2m_mean",
    "et0_fao_evapotranspiration_sum",
    "relative_humidity_2m_mean", "relative_humidity_2m_max", "relative_humidity_2m_min",
    "snowfall_water_equivalent_sum",
    "pressure_msl_mean", "pressure_msl_max", "pressure_msl_min",
    "wind_speed_10m_min", "wind_gusts_10m_min", "wind_speed_10m_mean", "wind_gusts_10m_mean",
    "winddirection_10m_dominant",
    "surface_pressure_min", "surface_pressure_max", "surface_pressure_mean",
    "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean", "wet_bulb_temperature_2m_min",
    "vapour_pressure_deficit_max",
    "soil_moisture_0_to_100cm_mean", "soil_moisture_0_to_7cm_mean",
    "soil_moisture_28_to_100cm_mean", "soil_moisture_7_to_28cm_mean",
    "soil_temperature_0_to_100cm_mean", "soil_temperature_0_to_7cm_mean",
    "soil_temperature_28_to_100cm_mean", "soil_temperature_7_to_28cm_mean",
]

# These variables are read through the 64-bit integer accessor instead of the
# default float accessor (the bronze table stores them as BIGINT).
int64_daily_variables = {"sunrise", "sunset"}

# Setup the Open-Meteo API client with cache and retry on error
# NOTE(review): expire_after = -1 presumably means cached responses never expire — confirm
# this is intended given that end_date moves forward every day.
cache_session = requests_cache.CachedSession('.cache', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 21.2212,
    "longitude": 105.8072,
    "start_date": "2020-01-01",
    "end_date": pd.Timestamp.today().strftime("%Y-%m-%d"),
    "daily": daily_variables,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Process first location. Add a for-loop for multiple locations or weather models
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone: {response.Timezone()}{response.TimezoneAbbreviation()}")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

daily = response.Daily()

# One timestamp per daily interval; the right edge (TimeEnd) is excluded.
daily_data = {"date": pd.date_range(
    start = pd.to_datetime(daily.Time(), unit = "s", utc = True),
    end = pd.to_datetime(daily.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = daily.Interval()),
    inclusive = "left"
)}

# Pull each variable by its position in the request list; column order in the
# resulting frame is "date" followed by the variables in request order.
for position, variable_name in enumerate(daily_variables):
    variable = daily.Variables(position)
    if variable_name in int64_daily_variables:
        daily_data[variable_name] = variable.ValuesInt64AsNumpy()
    else:
        daily_data[variable_name] = variable.ValuesAsNumpy()

daily_dataframe = pd.DataFrame(data = daily_data)
hanoi_daily_df = spark.createDataFrame(daily_dataframe)
display(hanoi_daily_df)

In [0]:
from pyspark.sql.functions import col

# Incrementally append the fetched HCM daily weather rows to the bronze table:
# rename the source columns to the table's naming scheme, keep only rows newer
# than what the table already holds, then append.

# Source columns whose bronze name is not simply "nr_<source name>".
source_to_bronze_columns = {
    "date": "dt_date_record",
    "sunrise": "dt_sunrise",
    "sunset": "dt_sunset",
}
# Every other measurement column is stored under the "nr_" prefix.
source_to_bronze_columns.update({
    name: f"nr_{name}"
    for name in (
        "weather_code", "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
        "apparent_temperature_mean", "apparent_temperature_max", "apparent_temperature_min",
        "wind_speed_10m_max", "wind_gusts_10m_max", "shortwave_radiation_sum",
        "wind_direction_10m_dominant", "et0_fao_evapotranspiration",
        "daylight_duration", "sunshine_duration",
        "precipitation_sum", "rain_sum", "snowfall_sum", "precipitation_hours",
        "cloud_cover_mean", "cloud_cover_max", "cloud_cover_min",
        "dew_point_2m_max", "dew_point_2m_min", "dew_point_2m_mean",
        "et0_fao_evapotranspiration_sum",
        "relative_humidity_2m_mean", "relative_humidity_2m_max", "relative_humidity_2m_min",
        "snowfall_water_equivalent_sum",
        "pressure_msl_mean", "pressure_msl_max", "pressure_msl_min",
        "wind_speed_10m_min", "wind_gusts_10m_min", "wind_speed_10m_mean", "wind_gusts_10m_mean",
        "winddirection_10m_dominant",
        "surface_pressure_min", "surface_pressure_max", "surface_pressure_mean",
        "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean", "wet_bulb_temperature_2m_min",
        "vapour_pressure_deficit_max",
        "soil_moisture_0_to_100cm_mean", "soil_moisture_0_to_7cm_mean",
        "soil_moisture_28_to_100cm_mean", "soil_moisture_7_to_28cm_mean",
        "soil_temperature_0_to_100cm_mean", "soil_temperature_0_to_7cm_mean",
        "soil_temperature_28_to_100cm_mean", "soil_temperature_7_to_28cm_mean",
    )
})

# withColumnRenamed is a no-op for a column that is absent, so the loop is safe
# even if the source frame lacks one of the listed names.
hcm_daily_df_source = hcm_daily_df
for source_name, bronze_name in source_to_bronze_columns.items():
    hcm_daily_df_source = hcm_daily_df_source.withColumnRenamed(source_name, bronze_name)

# Latest record already stored in bronze; None when the table is still empty.
bronze_max_date = (
    spark.table("hcmut.bronze.vn_hcm_weather_data_daily")
    .select(spark_max(col("dt_date_record")).alias("max_date"))
    .collect()[0]["max_date"]
)

# Only filter when there is a stored watermark. The previous fallback literal
# "2020-01-01" combined with the strict ">" dropped every row stamped at or
# before that instant on the initial load, i.e. the first fetched day.
# NOTE(review): rows already in bronze are never refreshed by this append, so
# any incomplete values written for the most recent days stay as first written.
if bronze_max_date is not None:
    hcm_daily_df_source = hcm_daily_df_source.filter(col("dt_date_record") > bronze_max_date)

# Insert data into the target table
hcm_daily_df_source.write.format("delta").mode("append").saveAsTable("hcmut.bronze.vn_hcm_weather_data_daily")

In [0]:
from pyspark.sql.functions import col

# Incrementally append the fetched Hanoi daily weather rows to the bronze table:
# rename the source columns to the table's naming scheme, keep only rows newer
# than what the table already holds, then append.

# Source columns whose bronze name is not simply "nr_<source name>".
source_to_bronze_columns = {
    "date": "dt_date_record",
    "sunrise": "dt_sunrise",
    "sunset": "dt_sunset",
}
# Every other measurement column is stored under the "nr_" prefix.
source_to_bronze_columns.update({
    name: f"nr_{name}"
    for name in (
        "weather_code", "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
        "apparent_temperature_mean", "apparent_temperature_max", "apparent_temperature_min",
        "wind_speed_10m_max", "wind_gusts_10m_max", "shortwave_radiation_sum",
        "wind_direction_10m_dominant", "et0_fao_evapotranspiration",
        "daylight_duration", "sunshine_duration",
        "precipitation_sum", "rain_sum", "snowfall_sum", "precipitation_hours",
        "cloud_cover_mean", "cloud_cover_max", "cloud_cover_min",
        "dew_point_2m_max", "dew_point_2m_min", "dew_point_2m_mean",
        "et0_fao_evapotranspiration_sum",
        "relative_humidity_2m_mean", "relative_humidity_2m_max", "relative_humidity_2m_min",
        "snowfall_water_equivalent_sum",
        "pressure_msl_mean", "pressure_msl_max", "pressure_msl_min",
        "wind_speed_10m_min", "wind_gusts_10m_min", "wind_speed_10m_mean", "wind_gusts_10m_mean",
        "winddirection_10m_dominant",
        "surface_pressure_min", "surface_pressure_max", "surface_pressure_mean",
        "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean", "wet_bulb_temperature_2m_min",
        "vapour_pressure_deficit_max",
        "soil_moisture_0_to_100cm_mean", "soil_moisture_0_to_7cm_mean",
        "soil_moisture_28_to_100cm_mean", "soil_moisture_7_to_28cm_mean",
        "soil_temperature_0_to_100cm_mean", "soil_temperature_0_to_7cm_mean",
        "soil_temperature_28_to_100cm_mean", "soil_temperature_7_to_28cm_mean",
    )
})

# withColumnRenamed is a no-op for a column that is absent, so the loop is safe
# even if the source frame lacks one of the listed names.
hanoi_daily_df_source = hanoi_daily_df
for source_name, bronze_name in source_to_bronze_columns.items():
    hanoi_daily_df_source = hanoi_daily_df_source.withColumnRenamed(source_name, bronze_name)

# Latest record already stored in bronze; None when the table is still empty.
bronze_max_date = (
    spark.table("hcmut.bronze.vn_hanoi_weather_data_daily")
    .select(spark_max(col("dt_date_record")).alias("max_date"))
    .collect()[0]["max_date"]
)

# Only filter when there is a stored watermark. The previous fallback literal
# "2020-01-01" combined with the strict ">" dropped every row stamped at or
# before that instant on the initial load; the fetch starts at 2020-01-01 and
# builds its dates in UTC, so the first fetched day was lost.
# NOTE(review): rows already in bronze are never refreshed by this append, so
# any incomplete values written for the most recent days stay as first written.
if bronze_max_date is not None:
    hanoi_daily_df_source = hanoi_daily_df_source.filter(col("dt_date_record") > bronze_max_date)

# Insert data into the target table
hanoi_daily_df_source.write.format("delta").mode("append").saveAsTable("hcmut.bronze.vn_hanoi_weather_data_daily")

In [0]:
from pyspark.sql.functions import col

# Incrementally append the fetched Danang daily weather rows to the bronze table:
# rename the source columns to the table's naming scheme, keep only rows newer
# than what the table already holds, then append.

# Source columns whose bronze name is not simply "nr_<source name>".
source_to_bronze_columns = {
    "date": "dt_date_record",
    "sunrise": "dt_sunrise",
    "sunset": "dt_sunset",
}
# Every other measurement column is stored under the "nr_" prefix.
source_to_bronze_columns.update({
    name: f"nr_{name}"
    for name in (
        "weather_code", "temperature_2m_mean", "temperature_2m_max", "temperature_2m_min",
        "apparent_temperature_mean", "apparent_temperature_max", "apparent_temperature_min",
        "wind_speed_10m_max", "wind_gusts_10m_max", "shortwave_radiation_sum",
        "wind_direction_10m_dominant", "et0_fao_evapotranspiration",
        "daylight_duration", "sunshine_duration",
        "precipitation_sum", "rain_sum", "snowfall_sum", "precipitation_hours",
        "cloud_cover_mean", "cloud_cover_max", "cloud_cover_min",
        "dew_point_2m_max", "dew_point_2m_min", "dew_point_2m_mean",
        "et0_fao_evapotranspiration_sum",
        "relative_humidity_2m_mean", "relative_humidity_2m_max", "relative_humidity_2m_min",
        "snowfall_water_equivalent_sum",
        "pressure_msl_mean", "pressure_msl_max", "pressure_msl_min",
        "wind_speed_10m_min", "wind_gusts_10m_min", "wind_speed_10m_mean", "wind_gusts_10m_mean",
        "winddirection_10m_dominant",
        "surface_pressure_min", "surface_pressure_max", "surface_pressure_mean",
        "wet_bulb_temperature_2m_max", "wet_bulb_temperature_2m_mean", "wet_bulb_temperature_2m_min",
        "vapour_pressure_deficit_max",
        "soil_moisture_0_to_100cm_mean", "soil_moisture_0_to_7cm_mean",
        "soil_moisture_28_to_100cm_mean", "soil_moisture_7_to_28cm_mean",
        "soil_temperature_0_to_100cm_mean", "soil_temperature_0_to_7cm_mean",
        "soil_temperature_28_to_100cm_mean", "soil_temperature_7_to_28cm_mean",
    )
})

# withColumnRenamed is a no-op for a column that is absent, so the loop is safe
# even if the source frame lacks one of the listed names.
danang_daily_df_source = danang_daily_df
for source_name, bronze_name in source_to_bronze_columns.items():
    danang_daily_df_source = danang_daily_df_source.withColumnRenamed(source_name, bronze_name)

# Latest record already stored in bronze; None when the table is still empty.
bronze_max_date = (
    spark.table("hcmut.bronze.vn_danang_weather_data_daily")
    .select(spark_max(col("dt_date_record")).alias("max_date"))
    .collect()[0]["max_date"]
)

# Only filter when there is a stored watermark. The previous fallback literal
# "2020-01-01" combined with the strict ">" dropped every row stamped at or
# before that instant on the initial load, i.e. the first fetched day.
# NOTE(review): rows already in bronze are never refreshed by this append, so
# any incomplete values written for the most recent days stay as first written.
if bronze_max_date is not None:
    danang_daily_df_source = danang_daily_df_source.filter(col("dt_date_record") > bronze_max_date)

# Insert data into the target table
danang_daily_df_source.write.format("delta").mode("append").saveAsTable("hcmut.bronze.vn_danang_weather_data_daily")