In [0]:
%pip install openmeteo-requests
%pip install requests-cache retry-requests numpy pandas

In [0]:
from pyspark.sql.functions import coalesce, lit, max as spark_max
import openmeteo_requests
import pandas as pd
import requests_cache
from retry_requests import retry
from pyspark.sql.functions import col

In [0]:
%sql
-- Bronze table for the hourly HCM weather records appended by the load cell
-- further down in this notebook. Created only if it does not already exist.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_hcm_weather_data_hourly (
    -- Surrogate key; values are always generated by the table (identity column).
    cd_vn_hcm_weather_data_hourly BIGINT PRIMARY KEY 
    GENERATED ALWAYS AS IDENTITY,
    -- Load timestamp; defaults to the insert time when the writer omits it.
    dt_time_to_bronze TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Timestamp of the hourly observation (the "date" column of the fetched frame).
    dt_date_record TIMESTAMP,
    -- One FLOAT column per requested hourly variable, named "nr_" + variable name.
    nr_temperature_2m FLOAT,
    nr_dew_point_2m FLOAT,
    nr_relative_humidity_2m FLOAT,
    nr_snow_depth FLOAT,
    nr_snowfall FLOAT,
    nr_rain FLOAT,
    nr_precipitation FLOAT,
    nr_apparent_temperature FLOAT,
    nr_weather_code FLOAT,
    nr_pressure_msl FLOAT,
    nr_surface_pressure FLOAT,
    nr_cloud_cover FLOAT,
    nr_cloud_cover_mid FLOAT,
    nr_cloud_cover_low FLOAT,
    nr_cloud_cover_high FLOAT,
    nr_et0_fao_evapotranspiration FLOAT,
    nr_vapour_pressure_deficit FLOAT,
    nr_wind_speed_10m FLOAT,
    nr_wind_direction_10m FLOAT,
    nr_wind_speed_100m FLOAT,
    nr_wind_direction_100m FLOAT,
    nr_wind_gusts_10m FLOAT,
    nr_soil_temperature_0_to_7cm FLOAT,
    nr_soil_temperature_28_to_100cm FLOAT,
    nr_soil_temperature_7_to_28cm FLOAT,
    nr_soil_temperature_100_to_255cm FLOAT,
    nr_soil_moisture_0_to_7cm FLOAT,
    nr_soil_moisture_7_to_28cm FLOAT,
    nr_soil_moisture_28_to_100cm FLOAT,
    nr_soil_moisture_100_to_255cm FLOAT,
    nr_is_day FLOAT,
    nr_sunshine_duration FLOAT
)
-- Enables the Delta column-defaults feature, needed for the DEFAULT clause above.
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
%sql
-- Bronze table for the hourly Hanoi weather records appended by the load cell
-- further down in this notebook. Created only if it does not already exist.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_hanoi_weather_data_hourly (
    -- Surrogate key; values are always generated by the table (identity column).
    -- NOTE(review): the key is suffixed "_daily" although the table is hourly
    -- (the HCM table uses "_hourly"). Renaming would change the schema, so
    -- confirm with downstream consumers before aligning it.
    cd_vn_hanoi_weather_data_daily BIGINT PRIMARY KEY 
    GENERATED ALWAYS AS IDENTITY,
    -- Load timestamp; defaults to the insert time when the writer omits it.
    dt_time_to_bronze TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Timestamp of the hourly observation (the "date" column of the fetched frame).
    dt_date_record TIMESTAMP,
    -- One FLOAT column per requested hourly variable, named "nr_" + variable name.
    nr_temperature_2m FLOAT,
    nr_dew_point_2m FLOAT,
    nr_relative_humidity_2m FLOAT,
    nr_snow_depth FLOAT,
    nr_snowfall FLOAT,
    nr_rain FLOAT,
    nr_precipitation FLOAT,
    nr_apparent_temperature FLOAT,
    nr_weather_code FLOAT,
    nr_pressure_msl FLOAT,
    nr_surface_pressure FLOAT,
    nr_cloud_cover FLOAT,
    nr_cloud_cover_mid FLOAT,
    nr_cloud_cover_low FLOAT,
    nr_cloud_cover_high FLOAT,
    nr_et0_fao_evapotranspiration FLOAT,
    nr_vapour_pressure_deficit FLOAT,
    nr_wind_speed_10m FLOAT,
    nr_wind_direction_10m FLOAT,
    nr_wind_speed_100m FLOAT,
    nr_wind_direction_100m FLOAT,
    nr_wind_gusts_10m FLOAT,
    nr_soil_temperature_0_to_7cm FLOAT,
    nr_soil_temperature_28_to_100cm FLOAT,
    nr_soil_temperature_7_to_28cm FLOAT,
    nr_soil_temperature_100_to_255cm FLOAT,
    nr_soil_moisture_0_to_7cm FLOAT,
    nr_soil_moisture_7_to_28cm FLOAT,
    nr_soil_moisture_28_to_100cm FLOAT,
    nr_soil_moisture_100_to_255cm FLOAT,
    nr_is_day FLOAT,
    nr_sunshine_duration FLOAT
)
-- Enables the Delta column-defaults feature, needed for the DEFAULT clause above.
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
%sql
-- Bronze table for the hourly Da Nang weather records appended by the load cell
-- further down in this notebook. Created only if it does not already exist.
CREATE TABLE IF NOT EXISTS hcmut.bronze.vn_danang_weather_data_hourly (
    -- Surrogate key; values are always generated by the table (identity column).
    -- NOTE(review): the key is suffixed "_daily" although the table is hourly
    -- (the HCM table uses "_hourly"). Renaming would change the schema, so
    -- confirm with downstream consumers before aligning it.
    cd_vn_danang_weather_data_daily BIGINT PRIMARY KEY 
    GENERATED ALWAYS AS IDENTITY,
    -- Load timestamp; defaults to the insert time when the writer omits it.
    dt_time_to_bronze TIMESTAMP DEFAULT CURRENT_TIMESTAMP,
    -- Timestamp of the hourly observation (the "date" column of the fetched frame).
    dt_date_record TIMESTAMP,
    -- One FLOAT column per requested hourly variable, named "nr_" + variable name.
    nr_temperature_2m FLOAT,
    nr_dew_point_2m FLOAT,
    nr_relative_humidity_2m FLOAT,
    nr_snow_depth FLOAT,
    nr_snowfall FLOAT,
    nr_rain FLOAT,
    nr_precipitation FLOAT,
    nr_apparent_temperature FLOAT,
    nr_weather_code FLOAT,
    nr_pressure_msl FLOAT,
    nr_surface_pressure FLOAT,
    nr_cloud_cover FLOAT,
    nr_cloud_cover_mid FLOAT,
    nr_cloud_cover_low FLOAT,
    nr_cloud_cover_high FLOAT,
    nr_et0_fao_evapotranspiration FLOAT,
    nr_vapour_pressure_deficit FLOAT,
    nr_wind_speed_10m FLOAT,
    nr_wind_direction_10m FLOAT,
    nr_wind_speed_100m FLOAT,
    nr_wind_direction_100m FLOAT,
    nr_wind_gusts_10m FLOAT,
    nr_soil_temperature_0_to_7cm FLOAT,
    nr_soil_temperature_28_to_100cm FLOAT,
    nr_soil_temperature_7_to_28cm FLOAT,
    nr_soil_temperature_100_to_255cm FLOAT,
    nr_soil_moisture_0_to_7cm FLOAT,
    nr_soil_moisture_7_to_28cm FLOAT,
    nr_soil_moisture_28_to_100cm FLOAT,
    nr_soil_moisture_100_to_255cm FLOAT,
    nr_is_day FLOAT,
    nr_sunshine_duration FLOAT
)
-- Enables the Delta column-defaults feature, needed for the DEFAULT clause above.
TBLPROPERTIES('delta.feature.allowColumnDefaults' = 'supported')

In [0]:
# Fetch hourly archive weather for the HCM location from Open-Meteo and expose
# it as the Spark DataFrame `hcm_hourly_df` (one "date" column plus one column
# per requested variable).

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(backend = 'memory', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Single source of truth for the requested hourly variables. The response
# returns variables positionally in the requested order, so this list drives
# both the request and the extraction below and the two cannot drift apart.
hourly_variables = [
    "temperature_2m",
    "dew_point_2m",
    "relative_humidity_2m",
    "snow_depth",
    "snowfall",
    "rain",
    "precipitation",
    "apparent_temperature",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_mid",
    "cloud_cover_low",
    "cloud_cover_high",
    "et0_fao_evapotranspiration",
    "vapour_pressure_deficit",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_speed_100m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "soil_temperature_0_to_7cm",
    "soil_temperature_28_to_100cm",
    "soil_temperature_7_to_28cm",
    "soil_temperature_100_to_255cm",
    "soil_moisture_0_to_7cm",
    "soil_moisture_7_to_28cm",
    "soil_moisture_28_to_100cm",
    "soil_moisture_100_to_255cm",
    "is_day",
    "sunshine_duration",
]

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 10.8188,
    "longitude": 106.6519,
    "start_date": "2020-01-01",
    # Always request up to "today" as seen by the driver's clock.
    "end_date": pd.Timestamp.now().strftime("%Y-%m-%d"),
    "hourly": hourly_variables,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()

# Fail loudly if the API returned a different number of variables than
# requested; positional extraction would otherwise mislabel columns.
if hourly.VariablesLength() != len(hourly_variables):
    raise ValueError(
        f"Expected {len(hourly_variables)} hourly variables, "
        f"got {hourly.VariablesLength()}"
    )

# Rebuild the hourly time axis from the response's start/end/interval
# (epoch seconds), excluding the end bound.
hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}

# Variable i in the response corresponds to hourly_variables[i].
for variable_index, variable_name in enumerate(hourly_variables):
    hourly_data[variable_name] = hourly.Variables(variable_index).ValuesAsNumpy()

hourly_dataframe = pd.DataFrame(data = hourly_data)
hcm_hourly_df = spark.createDataFrame(hourly_dataframe)
display(hcm_hourly_df)

In [0]:
# Fetch hourly archive weather for the Da Nang location from Open-Meteo and
# expose it as the Spark DataFrame `danang_hourly_df` (one "date" column plus
# one column per requested variable).

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(backend = 'memory', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Single source of truth for the requested hourly variables. The response
# returns variables positionally in the requested order, so this list drives
# both the request and the extraction below and the two cannot drift apart.
hourly_variables = [
    "temperature_2m",
    "dew_point_2m",
    "relative_humidity_2m",
    "snow_depth",
    "snowfall",
    "rain",
    "precipitation",
    "apparent_temperature",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_mid",
    "cloud_cover_low",
    "cloud_cover_high",
    "et0_fao_evapotranspiration",
    "vapour_pressure_deficit",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_speed_100m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "soil_temperature_0_to_7cm",
    "soil_temperature_28_to_100cm",
    "soil_temperature_7_to_28cm",
    "soil_temperature_100_to_255cm",
    "soil_moisture_0_to_7cm",
    "soil_moisture_7_to_28cm",
    "soil_moisture_28_to_100cm",
    "soil_moisture_100_to_255cm",
    "is_day",
    "sunshine_duration",
]

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 16.0439,
    "longitude": 108.1994,
    "start_date": "2020-01-01",
    # Always request up to "today" as seen by the driver's clock.
    "end_date": pd.Timestamp.now().strftime("%Y-%m-%d"),
    "hourly": hourly_variables,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()

# Fail loudly if the API returned a different number of variables than
# requested; positional extraction would otherwise mislabel columns.
if hourly.VariablesLength() != len(hourly_variables):
    raise ValueError(
        f"Expected {len(hourly_variables)} hourly variables, "
        f"got {hourly.VariablesLength()}"
    )

# Rebuild the hourly time axis from the response's start/end/interval
# (epoch seconds), excluding the end bound.
hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}

# Variable i in the response corresponds to hourly_variables[i].
for variable_index, variable_name in enumerate(hourly_variables):
    hourly_data[variable_name] = hourly.Variables(variable_index).ValuesAsNumpy()

hourly_dataframe = pd.DataFrame(data = hourly_data)
danang_hourly_df = spark.createDataFrame(hourly_dataframe)
display(danang_hourly_df)

In [0]:
# Fetch hourly archive weather for the Hanoi location from Open-Meteo and
# expose it as the Spark DataFrame `hanoi_hourly_df` (one "date" column plus
# one column per requested variable).

# Setup the Open-Meteo API client with cache and retry on error
cache_session = requests_cache.CachedSession(backend = 'memory', expire_after = -1)
retry_session = retry(cache_session, retries = 5, backoff_factor = 0.2)
openmeteo = openmeteo_requests.Client(session = retry_session)

# Single source of truth for the requested hourly variables. The response
# returns variables positionally in the requested order, so this list drives
# both the request and the extraction below and the two cannot drift apart.
hourly_variables = [
    "temperature_2m",
    "dew_point_2m",
    "relative_humidity_2m",
    "snow_depth",
    "snowfall",
    "rain",
    "precipitation",
    "apparent_temperature",
    "weather_code",
    "pressure_msl",
    "surface_pressure",
    "cloud_cover",
    "cloud_cover_mid",
    "cloud_cover_low",
    "cloud_cover_high",
    "et0_fao_evapotranspiration",
    "vapour_pressure_deficit",
    "wind_speed_10m",
    "wind_direction_10m",
    "wind_speed_100m",
    "wind_direction_100m",
    "wind_gusts_10m",
    "soil_temperature_0_to_7cm",
    "soil_temperature_28_to_100cm",
    "soil_temperature_7_to_28cm",
    "soil_temperature_100_to_255cm",
    "soil_moisture_0_to_7cm",
    "soil_moisture_7_to_28cm",
    "soil_moisture_28_to_100cm",
    "soil_moisture_100_to_255cm",
    "is_day",
    "sunshine_duration",
]

url = "https://archive-api.open-meteo.com/v1/archive"
params = {
    "latitude": 21.2212,
    "longitude": 105.8072,
    "start_date": "2020-01-01",
    # Always request up to "today" as seen by the driver's clock.
    "end_date": pd.Timestamp.now().strftime("%Y-%m-%d"),
    "hourly": hourly_variables,
    "timezone": "Asia/Bangkok",
}
responses = openmeteo.weather_api(url, params=params)

# Only one location is requested, so only the first response is processed.
response = responses[0]
print(f"Coordinates: {response.Latitude()}°N {response.Longitude()}°E")
print(f"Elevation: {response.Elevation()} m asl")
print(f"Timezone difference to GMT+0: {response.UtcOffsetSeconds()}s")

hourly = response.Hourly()

# Fail loudly if the API returned a different number of variables than
# requested; positional extraction would otherwise mislabel columns.
if hourly.VariablesLength() != len(hourly_variables):
    raise ValueError(
        f"Expected {len(hourly_variables)} hourly variables, "
        f"got {hourly.VariablesLength()}"
    )

# Rebuild the hourly time axis from the response's start/end/interval
# (epoch seconds), excluding the end bound.
hourly_data = {"date": pd.date_range(
    start = pd.to_datetime(hourly.Time(), unit = "s", utc = True),
    end = pd.to_datetime(hourly.TimeEnd(), unit = "s", utc = True),
    freq = pd.Timedelta(seconds = hourly.Interval()),
    inclusive = "left"
)}

# Variable i in the response corresponds to hourly_variables[i].
for variable_index, variable_name in enumerate(hourly_variables):
    hourly_data[variable_name] = hourly.Variables(variable_index).ValuesAsNumpy()

hourly_dataframe = pd.DataFrame(data = hourly_data)
hanoi_hourly_df = spark.createDataFrame(hourly_dataframe)
display(hanoi_hourly_df)

In [0]:
# Incrementally append the fetched HCM hourly records to the bronze Delta table.
# Only rows newer than the latest dt_date_record already stored are written.
hcm_bronze_table = "hcmut.bronze.vn_hcm_weather_data_hourly"

# Map fetched column names to bronze names: "date" becomes "dt_date_record",
# every weather variable gets the "nr_" prefix used by the table schema.
hcm_hourly_df_source = hcm_hourly_df.select(
    [
        col(source_name).alias(
            "dt_date_record" if source_name == "date" else f"nr_{source_name}"
        )
        for source_name in hcm_hourly_df.columns
    ]
)

# High-water mark of what is already in bronze; None when the table is empty.
# The raw max is collected (no string fallback) so the value keeps its
# timestamp type regardless of Spark's type-coercion settings.
bronze_max_date = (
    spark.table(hcm_bronze_table)
    .agg(spark_max(col("dt_date_record")).alias("max_date"))
    .first()["max_date"]
)

# On an empty table load everything. The previous "2020-01-01T00:00:00Z"
# sentinel combined with a strict ">" dropped any fetched row at or before
# that instant on the first load.
if bronze_max_date is not None:
    hcm_hourly_df_source = hcm_hourly_df_source.filter(
        col("dt_date_record") > lit(bronze_max_date)
    )

# NOTE(review): rows are selected purely by timestamp. If the API returns
# placeholder (NaN) rows for hours not yet available in the archive, they are
# appended here and never refreshed by later runs - confirm whether that occurs.
hcm_hourly_df_source.write.format("delta").mode("append").saveAsTable(hcm_bronze_table)

In [0]:
# Incrementally append the fetched Hanoi hourly records to the bronze Delta table.
# Only rows newer than the latest dt_date_record already stored are written.
hanoi_bronze_table = "hcmut.bronze.vn_hanoi_weather_data_hourly"

# Map fetched column names to bronze names: "date" becomes "dt_date_record",
# every weather variable gets the "nr_" prefix used by the table schema.
hanoi_hourly_df_source = hanoi_hourly_df.select(
    [
        col(source_name).alias(
            "dt_date_record" if source_name == "date" else f"nr_{source_name}"
        )
        for source_name in hanoi_hourly_df.columns
    ]
)

# High-water mark of what is already in bronze; None when the table is empty.
# The raw max is collected (no string fallback) so the value keeps its
# timestamp type regardless of Spark's type-coercion settings.
bronze_max_date = (
    spark.table(hanoi_bronze_table)
    .agg(spark_max(col("dt_date_record")).alias("max_date"))
    .first()["max_date"]
)

# On an empty table load everything. The previous "2020-01-01T00:00:00Z"
# sentinel combined with a strict ">" dropped any fetched row at or before
# that instant on the first load.
if bronze_max_date is not None:
    hanoi_hourly_df_source = hanoi_hourly_df_source.filter(
        col("dt_date_record") > lit(bronze_max_date)
    )

# NOTE(review): rows are selected purely by timestamp. If the API returns
# placeholder (NaN) rows for hours not yet available in the archive, they are
# appended here and never refreshed by later runs - confirm whether that occurs.
hanoi_hourly_df_source.write.format("delta").mode("append").saveAsTable(hanoi_bronze_table)

In [0]:
# Incrementally append the fetched Da Nang hourly records to the bronze Delta table.
# Only rows newer than the latest dt_date_record already stored are written.
danang_bronze_table = "hcmut.bronze.vn_danang_weather_data_hourly"

# Map fetched column names to bronze names: "date" becomes "dt_date_record",
# every weather variable gets the "nr_" prefix used by the table schema.
danang_hourly_df_source = danang_hourly_df.select(
    [
        col(source_name).alias(
            "dt_date_record" if source_name == "date" else f"nr_{source_name}"
        )
        for source_name in danang_hourly_df.columns
    ]
)

# High-water mark of what is already in bronze; None when the table is empty.
# The raw max is collected (no string fallback) so the value keeps its
# timestamp type regardless of Spark's type-coercion settings.
bronze_max_date = (
    spark.table(danang_bronze_table)
    .agg(spark_max(col("dt_date_record")).alias("max_date"))
    .first()["max_date"]
)

# Surface the watermark in the cell output; print() also handles the
# empty-table case where the value is None.
print(f"Current bronze max dt_date_record: {bronze_max_date}")

# On an empty table load everything. The previous "2020-01-01T00:00:00Z"
# sentinel combined with a strict ">" dropped any fetched row at or before
# that instant on the first load.
if bronze_max_date is not None:
    danang_hourly_df_source = danang_hourly_df_source.filter(
        col("dt_date_record") > lit(bronze_max_date)
    )

# NOTE(review): rows are selected purely by timestamp. If the API returns
# placeholder (NaN) rows for hours not yet available in the archive, they are
# appended here and never refreshed by later runs - confirm whether that occurs.
danang_hourly_df_source.write.format("delta").mode("append").saveAsTable(danang_bronze_table)