In [16]:
import xarray as xr
import pandas as pd
from tqdm.auto import tqdm
from pathlib import Path


# Facilities

In [17]:
# read raw facilities data
directory_facilities = "../data/raw/facilities/"
df_facilities_solar_raw = pd.read_parquet(directory_facilities + "facilities_solar.parquet")
df_facilities_wind_raw = pd.read_parquet(directory_facilities + "facilities_wind.parquet")
directory_processed = "../data/processed/"

In [18]:
# define a function to extract first and last date 
def get_first_last_date(df):
    series_first = df.apply(lambda x: [[i['data_first_seen']] for i in x['units'] if i and i['data_first_seen'] is not None],axis=1)
    series_last = df.apply(lambda x: [[i['data_last_seen']] for i in x['units'] if i and i['data_last_seen'] is not None],axis=1)
    df_result = df.copy()
    df_result['date_first'] = series_first
    df_result['date_last'] = series_last
    df_result = df_result[(df_result['date_first'].str.len() > 0) & (df_result['date_last'].str.len() > 0)]    
    return df_result
# define a function to extract year
def extract_year(df):
    series_year_start = df['date_first'].apply(lambda x: [i[0].year for i in x][0])
    series_year_end = df['date_last'].apply(lambda x: [i[0].year for i in x][0])
    df_result = df.copy()
    df_result['year_start'] = series_year_start
    df_result['year_end'] = series_year_end
    df_result['duration_year'] = df_result['year_end'] - df_result['year_start']
    return df_result
# define a function to extract lat & lng
def extract_lat_lng(df):
    df_result = df.copy()
    df_result = df_result.join(df_result['location'].apply(pd.Series))
    return df_result

In [19]:
# extract information
df_facilities_solar_1 = get_first_last_date(df_facilities_solar_raw)
df_facilities_solar_2 = extract_year(df_facilities_solar_1)
df_facilities_solar_3 = extract_lat_lng(df_facilities_solar_2)
df_facilities_wind_1 = get_first_last_date(df_facilities_wind_raw)
df_facilities_wind_2 = extract_year(df_facilities_wind_1)
df_facilities_wind_3 = extract_lat_lng(df_facilities_wind_2)
# 
df_f_solar_selected = df_facilities_solar_3[df_facilities_solar_3['year_end']>=2014]
df_f_wind_selected = df_facilities_wind_3[df_facilities_wind_3['year_end']>=2014]

In [20]:
df_f_location_solar = df_f_solar_selected[['code','lat','lng']].reset_index(drop=True)
df_f_location_wind = df_f_wind_selected[['code','lat','lng']].reset_index(drop=True)

# BARRA_R2

In [21]:
# read raw atomspheric data
directory_wind_speed = "../data/raw/BARRA_R2/wind/sfcWind/"
filename_grid = "sfcWind_AUS-11_ERA5_historical_hres_BOM_BARRA-R2_v1_1hr_201401-201401.nc"

In [22]:
# define a function to find the list of paths
def get_file_paths(feature_type,feature):
    directory_base = "../data/raw/BARRA_R2/{feature_type}/{feature}/"
    filename_base = "{feature}_AUS-11_ERA5_historical_hres_BOM_BARRA-R2_v1_1hr_{ym}-{ym}.nc"
    path_base = directory_base+filename_base
    list_paths = []
    for year in range(2014,2024):
        filenames = [path_base.format(feature_type=feature_type, feature=feature, ym=f"{year}{month:02d}") for month in range(1,13)]
        list_paths += filenames
    return list_paths

# define a function to find the nearest grid point for each farm
def find_grid_location(ds,df):
    for i, location in df.iterrows():
        lat = location['lat']
        lon = location['lng']
        grid_nearest = ds['sfcWind'].sel(lat=lat, lon=lon, method='nearest')
        df.loc[i,'grid_lat'] = grid_nearest['lat'].item()
        df.loc[i,'grid_lon'] = grid_nearest['lon'].item()

# define a function to extract data by farm site
def extract_data_by_site(ds, df_location):
    list_subsets = []
    for i, location in df_location.iterrows():
        subset = ds.sel(lat=location.grid_lat, lon=location.grid_lon)
        subset = subset.assign_coords(code=location.code, lat=location.grid_lat, lon=location.grid_lon)
        list_subsets.append(subset)
    return list_subsets

In [23]:
# get list of filenames for each feature
list_path_sfcwind = get_file_paths("wind","sfcWind")
list_path_rsds = get_file_paths("solar","rsds")

In [24]:
# open one month dataset to extract grid information
ds_grid = xr.open_dataset(directory_wind_speed + filename_grid)
# find the grid points for farms
find_grid_location(ds_grid,df_f_location_wind)
find_grid_location(ds_grid,df_f_location_solar)

In [25]:
# only keep coordinates in grid
df_location_grid_wind = df_f_location_wind.drop(columns=['lat','lng'])
df_location_grid_solar = df_f_location_solar.drop(columns=['lat','lng'])
# rename grid_lat into lat, grid_lon into lon
df_location_grid_wind.rename(columns={'grid_lat':'lat','grid_lon':'lon'},inplace=True)
df_location_grid_solar.rename(columns={'grid_lat':'lat','grid_lon':'lon'},inplace=True)
# save data frames into .parquet files
df_location_grid_wind.to_parquet(directory_processed + "site_wind.parquet")
df_location_grid_solar.to_parquet(directory_processed + "site_solar.parquet")

In [26]:
df_location_grid_solar

Unnamed: 0,code,lat,lon
0,ADP,-35.09,138.53
1,ALDGASF,-23.87,151.07
2,AVLSF,-34.87,146.56
3,BANNSP,-34.65,142.71
4,BARCSF,-23.54,145.35
...,...,...,...
106,WOOLGSF,-26.07,152.39
107,WUNUSF,-36.19,145.35
108,WYASF,-33.77,147.33
109,YARANSF,-27.72,151.51


In [27]:
# define a function to extract all data by farm sites & month
def extract_ds(list_path, df_location):
    list_ = []
    for path in list_path:
        ds = xr.open_dataset(path)
        list_month = extract_data_by_site(ds, df_location)
        list_.append(list_month)
    return list_
# define a function to aggregate all year ranged data by site
def agg_by_site(list_):
    list_sites = list(map(list, zip(*list_)))
    list_result = []
    for site in tqdm(list_sites, total=len(list_sites), desc="Sites", unit="site"):
        site_agg = xr.concat(site, coords='minimal', dim='time')
        df_site = site_agg.to_dataframe()
        list_result.append(df_site)
    return list_result

In [28]:
# extract all data from all .nc files
list_sfcwind = extract_ds(list_path_sfcwind, df_f_location_wind)
list_rsds = extract_ds(list_path_rsds, df_f_location_solar)

In [29]:
# aggregate data from multiple time series for every farm site (sfcWind)
list_sfcwind_by_site = agg_by_site(list_sfcwind)

Sites: 100%|██████████| 101/101 [1:14:16<00:00, 44.13s/site]


In [30]:
# aggregate data from multiple time series for every farm site (rsds)
list_rsds_by_site = agg_by_site(list_rsds)

Sites: 100%|██████████| 111/111 [39:08<00:00, 21.16s/site] 


In [55]:
# define a function to format and save data for each site into .parquet
def save_to_parquet(list_df, columns_to_remove, feature):
    dir_output = Path(directory_processed) / feature
    dir_output.mkdir(parents=True, exist_ok=True)

    for site in list_df:
        code = site['code'].iloc[0]
        site_ = site.reset_index()
        site_ = site_.drop(columns=columns_to_remove)
        site_.to_parquet(f"{dir_output}/{code}.parquet")
# set the columns to be removed
columns_to_remove_sfcwind = ['crs','lon','lat','height','code']
columns_to_remove_rsds = ['crs','lon','lat','code']
# save data of features by site
save_to_parquet(list_sfcwind_by_site, columns_to_remove_sfcwind, 'sfcwind')
save_to_parquet(list_rsds_by_site, columns_to_remove_rsds, 'rsds')