In [1]:
import xarray as xr
import os
import rioxarray as rio
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import geopandas as gpd
from shapely.geometry import mapping
import rasterstats
from rasterio.features import rasterize

### Import boundaries for spatial distribution

In [2]:
# Read the Water Management Area (WMA) polygons used for the spatial aggregation.
shapefile_path = '//gmvstorage.gmv.es/storage/anin/Groundwater/2011 Freshwater Ecosystem Priority Areas 9 Water Management Areas/9WMA.shp'
areas = gpd.read_file(shapefile_path)

# Bring the polygons to EPSG:4326. `set_crs` only *labels* the geometries and
# raises if the file already declares a different CRS, so it is used only when
# the shapefile carries no CRS at all; otherwise the geometries are reprojected.
if areas.crs is None:
    areas = areas.set_crs('EPSG:4326')
else:
    areas = areas.to_crs('EPSG:4326')

# Field of the shapefile attribute table whose values identify the polygon each
# station is assigned to when the SGI is averaged.
grouping_field = 'WMA_NewID'


#### Read SPEI and SPI

In [3]:
fN_spei = "//gmvstorage.gmv.es/storage/anin/SPEI/outputs/SPEI3.nc"
fN_spi = "//gmvstorage.gmv.es/storage/anin/SPI/outputs/SPI.nc"

# Period to keep, chosen to match the available GWL observations.
start_date = "1999-10-01"
end_date = "2022-07-01"
period = slice(start_date, end_date)

# Each index goes through the same three steps: clip to the study period,
# tag the dataset as EPSG:4326 and give the unnamed data variable a proper name.
spei = (
    xr.open_dataset(fN_spei)
    .sel(time=period)
    .rio.write_crs(4326, inplace=True)
    .rename({'__xarray_dataarray_variable__': 'spei'})
)

spi = (
    xr.open_dataset(fN_spi)
    .sel(time=period)
    .rio.write_crs(4326, inplace=True)
    .rename({'__xarray_dataarray_variable__': 'spi'})
)

#### Rasterize the shapefile to be able to perform zonal statistics over rasterized data (ERA5)

In [4]:

# Pair every polygon with the zone id that gets burned into the raster.
# A list (rather than a one-shot generator) keeps `shapes` reusable.
# NOTE: the field used here must be the same one iterated over when the zonal
# means are computed from `eco_regions`.
shapes = list(zip(areas.geometry, areas['WMA_NewID']))

# Burn the polygons onto the SPI grid. Pixels not covered by any polygon receive
# `fill` (0), so 0 must not be used as a zone id. (`default_value` only applies
# to geometries passed without a value, which never happens here.)
eco_regions = rasterize(
    shapes=shapes,
    out_shape=(len(spi.y), len(spi.x)),
    transform=spi.rio.transform(),
    fill=0,
)

#### Compute Zonal Statistics

In [11]:
def arraylist_to_dataframe(array_list):
    """Assemble a list of 1-D arrays into a DataFrame, one column per array.

    Columns are named by position: the first array becomes ``Ecoregion1``,
    the second ``Ecoregion2``, and so on.

    Parameters
    ----------
    array_list : sequence of array-like
        The arrays to place side by side.

    Returns
    -------
    pandas.DataFrame
        Frame whose i-th column holds the i-th array of ``array_list``.
    """
    columns = {
        f'Ecoregion{position}': values
        for position, values in enumerate(array_list, start=1)
    }
    return pd.DataFrame(columns)

In [5]:
eco_spi = []
eco_spei = []

# Both indices are masked with `eco_regions`, which was rasterized on the SPI
# grid. This assumes SPEI shares that grid and that both arrays are laid out
# as (time, y, x) — TODO confirm.
spi_values = spi.spi.data
spei_values = spei.spei.data

for id_basin in areas['WMA_NewID']:
    # Build the zone mask once and reuse it for both indices.
    in_basin = eco_regions == id_basin

    # Mean over the pixels of the zone at every step of the first axis,
    # ignoring NaN pixels. A zone that covers no pixel yields an all-NaN
    # series (numpy emits a RuntimeWarning in that case).
    eco_spi.append(np.nanmean(spi_values[:, in_basin], axis=1))
    eco_spei.append(np.nanmean(spei_values[:, in_basin], axis=1))


In [13]:
# One DataFrame per drought index, with one column per zone.
df_spi, df_spei = (
    arraylist_to_dataframe(zone_means) for zone_means in (eco_spi, eco_spei)
)

TypeError: The parameter "keys" may be a column key, one-dimensional array, or a list containing only valid column keys and one-dimensional arrays.. Received column of type <class 'xarray.core.dataarray.DataArray'>

#### Import GWL stations data

In [None]:
# Station metadata and groundwater-level observations

# Metadata: index by station name, then keep only the first two remaining
# columns (the ones this notebook needs).
metadata_path = "//gmvstorage.gmv.es/storage/anin/Groundwater/processed_data/stations_metadadata22_active.xlsx"
meta_df = pd.read_excel(metadata_path).set_index('Station').iloc[:, 0:2]

# Observations: drop the first column of the sheet, then index by date.
df_path = "//gmvstorage.gmv.es/storage/anin/Groundwater/processed_data/stations_data22_active.xlsx"
df = pd.read_excel(df_path).iloc[:, 1:].set_index('date')

# Datetime version of the index, needed by the plot function.
dates = pd.to_datetime(df.index)

### Compute the SGI for all the stations

In [None]:
# Import the modified SGI function, which includes scaling.
# NOTE(review): `sgi_tools` is presumably a project-local module that must be
# importable from the notebook's working directory — confirm.
from sgi_tools import compute_sgi

# Groundwater index scale factor, passed as the second argument to `compute_sgi`.
# NOTE(review): its unit/meaning is defined inside `sgi_tools` and is not
# visible here — confirm before changing it.
scale = 3

In [None]:
# Compute the SGI of every station column and assemble the result in one go.
# Building the frame from a dict avoids inserting columns one at a time, which
# fragments the DataFrame and triggers pandas' PerformanceWarning when there
# are many stations.
# Assumes `compute_sgi` returns a Series indexed like its input — TODO confirm.
df_sgi = pd.DataFrame(
    {column: compute_sgi(df[column], scale) for column in df.columns}
)

# Keep the time stamps, then replace the index with 0..n-1 and transpose so
# that each row is a station and each column a time-step number.
dates = df_sgi.index.to_pydatetime()
df_sgi = df_sgi.reset_index(drop=True).T

#### Average the SGI values at each time step for each catchment in the shapefile

In [None]:


# Attach the station coordinates from the metadata (joined on the station index).
df_latlon = df_sgi.join(meta_df)

# Turn the stations into a GeoDataFrame of points. `points_from_xy` builds the
# geometries directly from the coordinate columns, so no shapely `Point` import
# is needed. The stations are given the CRS of `areas`: the spatial join below
# is only meaningful if 'Longitude'/'Latitude' are expressed in that CRS —
# TODO confirm.
stations_gdf = gpd.GeoDataFrame(
    df_latlon,
    geometry=gpd.points_from_xy(df_latlon['Longitude'], df_latlon['Latitude']),
    crs=areas.crs,
)

# Spatial join: one output row per (polygon, station contained in it).
# `predicate` replaces the `op` keyword, which was deprecated and later removed.
stations_by_polygon = gpd.sjoin(areas, stations_gdf, how='inner', predicate='contains')

# The time-step columns are the ones whose names are plain numbers
# (they come from the reset, transposed SGI frame).
time_step_columns = [col for col in stations_by_polygon.columns if str(col).isdigit()]

# Average the SGI of the stations of each polygon, time step by time step.
sgi_averages = stations_by_polygon.groupby([grouping_field])[time_step_columns].mean()

