In [2]:
def align_to_target(raster_list, reference_da, default_crs="EPSG:4326"):
    """Reproject every raster in ``raster_list`` onto the grid of ``reference_da``.

    Parameters
    ----------
    raster_list : iterable
        DataArray-like objects exposing the rioxarray ``.rio`` accessor.
    reference_da : DataArray-like
        Raster handed to ``rio.reproject_match`` as the target for every input.
    default_crs : str, optional
        CRS written onto any input whose ``rio.crs`` is falsy (i.e. the raster
        carries no CRS). Defaults to ``"EPSG:4326"``, the value that used to be
        hard-coded, so existing two-argument calls behave exactly as before.

    Returns
    -------
    list
        One reprojected raster per input, in input order. A ``'band'``
        dimension, when present on the reprojected raster, is squeezed out
        and its coordinate dropped.
    """
    aligned_outputs = []
    for da in raster_list:
        # A raster without a CRS cannot be reprojected; assume ``default_crs``.
        if not da.rio.crs:
            da = da.rio.write_crs(default_crs)

        reproj = da.rio.reproject_match(reference_da)
        if 'band' in reproj.dims:
            reproj = reproj.squeeze('band', drop=True)
        aligned_outputs.append(reproj)
    return aligned_outputs

In [3]:
# Load input layers.
# `valid_mask` is both the pixel filter (cells equal to 1 are kept) and the
# target grid that every other layer in this cell is reprojected onto.
valid_mask = rxr.open_rasterio(valid_mask_path).squeeze('band', drop=True)
ecoregion_raw = rxr.open_rasterio(ecoregion_path)


sos_file_paths = natsorted(glob.glob(os.path.join(sos_raster_path, "*.tif")))
if not sos_file_paths:
    # Without this guard the loop below never binds `da` / `layer_name`, and the
    # cell dies on the final `del` with a NameError that hides the real problem.
    raise FileNotFoundError(f"No SOS rasters (*.tif) found in {sos_raster_path!r}")

# Filter SOS Trend by validity mask
sos_raster_list = []
for sos_file in sos_file_paths:
    # The file stem becomes the variable name (and later the DataFrame column).
    layer_name = os.path.splitext(os.path.basename(sos_file))[0]
    da = rxr.open_rasterio(sos_file, chunks=True)
    da = da.rio.reproject_match(valid_mask).squeeze('band', drop=True)
    # Keep only cells flagged 1 in the mask; everything else becomes NaN.
    da = da.where(valid_mask == 1)
    da.name = layer_name
    sos_raster_list.append(da)


# Align Ecoregion raster to the SOS Trend grid
ecoregion = ecoregion_raw.rio.reproject_match(valid_mask)
ecoregion = ecoregion.squeeze('band', drop=True)
ecoregion.name = "ecoregion"

# One Dataset: the ecoregion layer plus one variable per SOS raster.
sos_ecoregion_stack = xr.merge([ecoregion] + sos_raster_list, compat='override')


del ecoregion_raw, sos_file_paths, da, layer_name

In [4]:
# Batch load all environmental driver TIFs.
# Each driver lives in its own sub-folder of `env_basepath`; the file stem is
# used as the layer name and its last '_'-separated token decides whether the
# file is needed for the selected preseason.
variables = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']
raw_driver_list = []
for var in variables:
    var_files = natsorted(glob.glob(os.path.join(env_basepath, var, "*.tif")))
    if not var_files:
        # The yearly aggregation further down indexes columns for every driver,
        # so a missing folder would only surface later as an opaque KeyError.
        raise FileNotFoundError(
            f"No *.tif files found for driver {var!r} in {os.path.join(env_basepath, var)!r}"
        )
    for file_path in var_files:
        layer_name = os.path.splitext(os.path.basename(file_path))[0]
        suffix = layer_name.split('_')[-1]

        if suffix.isdigit():
            # Purely numeric suffix = month number: keep only the requested months.
            if int(suffix) not in months_to_load:
                continue

        elif var == 'AveragedCO2':
            # If Spring: only load the first half ('A')
            # If Winter: load BOTH 'A' and 'B' to ensure Dec (B) and Jan/Feb (A) are available
            if preseason == 'spring' and suffix.endswith('B'):
                continue
            # Note: In 'winter' mode, it will naturally bypass this skip and load both

        da = rxr.open_rasterio(file_path, chunks=True)
        da.name = layer_name
        raw_driver_list.append(da)

if not raw_driver_list:
    # Every file was filtered out: `da` was never bound, so fail with a clear
    # message instead of a NameError on the `del` below.
    raise ValueError(
        f"No driver rasters matched months_to_load={months_to_load!r} / preseason={preseason!r}"
    )
del var_files, layer_name, da

In [5]:
# Reproject every driver raster onto the valid-mask grid and combine the
# results into a single Dataset (one variable per driver layer).
aligned_drivers = xr.merge(
    align_to_target(raw_driver_list, valid_mask),
    compat='override',
)

# Attach the drivers to the ecoregion + SOS stack built earlier.
merged_env_dataset = sos_ecoregion_stack.merge(
    aligned_drivers,
    compat='override',
)

# Only the merged Dataset is used from here on; release the inputs.
del sos_ecoregion_stack, valid_mask, aligned_drivers, raw_driver_list

In [6]:
# Flatten the merged Dataset into a single pandas DataFrame with one column per
# data variable (ecoregion, each SOS layer, each driver layer).
# NOTE(review): rasters opened with chunks=True are lazy, so this is presumably
# where the data is actually read into memory; confirm it fits in RAM.
main_analysis_df = merged_env_dataset.to_dataframe()
# The Dataset is not referenced again in the cells that follow; drop it to free memory.
del merged_env_dataset

In [7]:
# Ecoregion codes retained for the analysis; pixels in any other ecoregion are discarded.
select_ecoregions = [81003, 40115, 40301, 40403, 40401, 81021, 40701, 40501, 40502]

# Keep the matching rows, then move the existing index levels into ordinary columns.
main_analysis_df = (
    main_analysis_df
    .loc[lambda frame: frame['ecoregion'].isin(select_ecoregions)]
    .reset_index()
)

In [8]:
# Every column whose name begins with 'sos' is treated as an SOS layer.
sos_cols = [name for name in main_analysis_df.columns if name.startswith('sos')]

# Discard pixels with no SOS value in any layer (rows with only some layers
# missing are kept), then renumber the rows from 0.
main_analysis_df = (
    main_analysis_df
    .dropna(subset=sos_cols, how='all')
    .reset_index(drop=True)
)

In [9]:
# Row-wise aggregation of the monthly environmental drivers into one preseason
# value per pixel and year (2001-2020).
#
# Reads `preseason` and `months_to_load` defined earlier in the notebook;
# `months_to_load` must hold exactly three month numbers because positions
# 0, 1 and 2 are indexed below.

# How the three monthly layers of each driver are collapsed into one value.
seasonal_stat = {
    'Temperature': 'mean',
    'Precipitation': 'sum',
    'SoilMoisture': 'mean',
    'SolarRadiation': 'sum',
}

aggregated_df = {}
cols_to_drop = []
for year in range(2001, 2021, 1):

    # In any non-spring (winter) preseason the first month is taken from the
    # previous calendar year (December), so it needs its own year label.
    prev_year = year if preseason == 'spring' else year - 1

    # Monthly drivers -------------------------------------------------
    for driver, stat in seasonal_stat.items():
        month_cols = [
            f"{driver}_{prev_year}_{months_to_load[0]}",
            f"{driver}_{year}_{months_to_load[1]}",
            f"{driver}_{year}_{months_to_load[2]}",
        ]
        monthly = main_analysis_df[month_cols]

        if stat == 'sum':
            # A plain sum(axis=1) returns 0.0 for an all-NaN row and a partial
            # total when some months are missing, which would look like a real
            # (very low) seasonal total. Require every month to be present;
            # otherwise the result is NaN.
            aggregated_df[f"{driver}_{year}"] = monthly.sum(axis=1, min_count=len(month_cols))
        else:
            aggregated_df[f"{driver}_{year}"] = monthly.mean(axis=1)

        # The monthly layers are replaced by the seasonal value.
        cols_to_drop.extend(month_cols)

    # Averaged CO2 ----------------------------------------------------
    if preseason == 'spring':
        # Spring uses the single first-half ('A') layer of the same year as is.
        aggregated_df[f"AveragedCO2_{year}"] = main_analysis_df[f"AveragedCO2_{year}A"]
        cols_to_drop.extend([f"AveragedCO2_{year}A"])
    else:
        # Average of previous year's second half and current year's first half
        co2_cols = [f"AveragedCO2_{year}A", f"AveragedCO2_{prev_year}B"]
        aggregated_df[f"AveragedCO2_{year}"] = main_analysis_df[co2_cols].mean(axis=1)
        cols_to_drop.extend(co2_cols)


# Append all seasonal columns in one concat, then remove the monthly inputs.
main_analysis_df = pd.concat([main_analysis_df, pd.DataFrame(aggregated_df, index=main_analysis_df.index)], axis=1)
main_analysis_df = main_analysis_df.drop(columns=list(set(cols_to_drop)))

In [10]:
# Per-pixel partial correlation between SOS and each environmental driver,
# computed for all pixels at once through the precision matrix (the inverse of
# the per-pixel correlation matrix).
# Author's note: the results were cross-checked against pingouin's partial
# correlation and found to agree.

# Define variables and year range (2001-2020)
years = list(range(2001, 2021))
env_vars = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']
all_vars = ['sos'] + env_vars

# Reshape data into a 3D array: (pixels, years, variables).
# SOS columns are named "sos<year>", driver columns "<driver>_<year>"; each list
# entry is the (pixels, years) block of one variable, stacked on a new last axis.
data_list = [main_analysis_df[[f"sos{y}" if v == 'sos' else f"{v}_{y}" for y in years]].values for v in all_vars]
data_3d = np.stack(data_list, axis=2)

# Vectorized standardization along the year axis (sample std, ddof=1).
# NOTE(review): mean/std are not NaN-aware, so a pixel with any missing year in
# any variable presumably ends up with NaN results - confirm this is intended.
mean = data_3d.mean(axis=1, keepdims=True)
std = data_3d.std(axis=1, ddof=1, keepdims=True)
std[std == 0] = 1 # Avoid division by zero (a constant series then has all-zero z-scores)
z_scores = (data_3d - mean) / std

# Calculate correlation matrices (N_pixels, 6, 6): for every pixel n, sum the
# products of z-scores over the year axis i and divide by (n_years - 1).
n_years = len(years)
corr = np.einsum('nij,nik->njk', z_scores, z_scores) / (n_years - 1)

# Vectorized partial correlation using Precision Matrix (inverse correlation matrix)
# We add a small epsilon to the diagonal for stability with constant values
# (a constant series gives an all-zero row/column, which is otherwise singular).
corr[:, np.arange(len(all_vars)), np.arange(len(all_vars))] += 1e-6 
precision = np.linalg.inv(corr)
# Diagonal of each pixel's precision matrix, shape (N_pixels, n_variables).
diag = np.diagonal(precision, axis1=1, axis2=2)

# Extract partial correlations of SOS (index 0) with each driver (indices 1 to 5):
# r(0, i | all other variables) = -P[0, i] / sqrt(P[0, 0] * P[i, i]).
pc_results = {}
for i, var in enumerate(env_vars, start=1):
    pc_results[var] = -precision[:, 0, i] / np.sqrt(diag[:, 0] * diag[:, i])

# Collect the result in a new dataframe with (x, y) coordinates as the index,
# keeping the ecoregion code for the per-ecoregion summary; rounded to 3 decimals.
partial_corr_df = main_analysis_df[['x', 'y', 'ecoregion']].join(pd.DataFrame(pc_results, index=main_analysis_df.index)).set_index(['x', 'y'])
partial_corr_df = partial_corr_df.round(3)
partial_corr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
x,y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
80.058981,28.899926,40701,-0.225,-0.133,0.260,-0.130,-0.474
80.058981,28.893188,40701,0.143,-0.586,0.311,-0.455,-0.110
80.061227,28.884205,40701,-0.242,0.248,-0.394,0.264,-0.077
80.061227,28.852764,40701,0.318,-0.109,0.245,-0.225,-0.789
80.063473,28.917892,40701,-0.090,-0.002,0.228,-0.082,-0.347
...,...,...,...,...,...,...,...
88.177505,26.759689,40701,-0.112,-0.145,0.285,0.048,-0.223
88.177505,26.710282,40701,0.032,-0.316,-0.164,-0.026,0.270
88.179751,26.809097,40115,-0.675,-0.019,-0.111,-0.198,-0.417
88.181997,26.734986,40701,-0.265,-0.133,-0.417,-0.219,-0.523


In [11]:
# Driver columns to summarise.
env_vars = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']

# Mean per-pixel partial correlation of each driver within every ecoregion,
# with the ecoregion code restored as a regular column and values rounded to
# two decimals.
ecoregion_par_corr = (
    partial_corr_df
    .groupby('ecoregion')[env_vars]
    .mean()
    .reset_index()
    .round(2)
)
ecoregion_par_corr


Unnamed: 0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
0,40115,-0.01,-0.09,0.06,-0.04,-0.18
1,40301,0.0,-0.08,0.05,-0.02,-0.27
2,40401,-0.05,-0.17,0.17,-0.03,-0.31
3,40403,-0.02,-0.09,0.03,-0.03,-0.19
4,40501,-0.02,-0.01,0.12,0.02,-0.17
5,40502,-0.03,0.01,-0.07,-0.02,0.2
6,40701,-0.08,-0.09,0.03,-0.02,-0.23
7,81003,0.02,0.02,0.03,-0.02,0.08
8,81021,0.0,0.0,0.02,-0.03,0.17


In [12]:
from scipy.stats import rankdata

# Per-pixel Spearman correlation between SOS and each driver: rank every series
# along the year axis, then take the Pearson correlation of the ranks.

# 1. Rank the data along the year axis (axis=1) to prepare for Spearman Correlation
# data_3d shape: (pixels, years, variables)
# rankdata ranks along `axis` itself, so the whole array is ranked in one
# vectorised call instead of one Python-level call per (pixel, variable) pair
# as np.apply_along_axis would make.
ranked_data = rankdata(data_3d, axis=1)

# 2. Standardize the Ranks (Vectorized), using the sample std (ddof=1)
mean_r = ranked_data.mean(axis=1, keepdims=True)
std_r = ranked_data.std(axis=1, ddof=1, keepdims=True)
std_r[std_r == 0] = 1 # Avoid division by zero
z_scores_r = (ranked_data - mean_r) / std_r

# 3. Calculate Correlation Matrix
# Result shape: (N_pixels, N_vars, N_vars)
n_years = data_3d.shape[1]
spearman_matrix = np.einsum('nij,nik->njk', z_scores_r, z_scores_r) / (n_years - 1)

# 4. Extract Correlation of SOS (index 0) with each Driver (indices 1 to 5)
spearman_results = {}
for i, var in enumerate(env_vars, start=1):
    # Direct correlation between SOS (0) and Driver (i)
    spearman_results[var] = spearman_matrix[:, 0, i]

# 5. Create DataFrame and Aggregate by Ecoregion
spearman_df = pd.DataFrame(spearman_results, index=main_analysis_df.index)
spearman_df = (main_analysis_df[['x', 'y', 'ecoregion']].join(spearman_df))

# Calculate averaged Spearman correlation for each variable by ecoregion
ecoregion_spearman = (spearman_df.groupby('ecoregion')[env_vars].mean().reset_index()).round(2)

# Add aggregate row with mean of absolute values
# (computed from the already-rounded per-ecoregion means above)
new_row_data = ecoregion_spearman[env_vars].abs().mean().round(2)
new_row_data['ecoregion'] = 'avg abs r'
ecoregion_spearman = pd.concat([ecoregion_spearman, pd.DataFrame([new_row_data])], ignore_index=True)

# Persist the summary table; the file name carries the preseason label.
out_dir = (r"../Data/Processed/Correlation_Analysis/Environmental_Factors")
os.makedirs(out_dir, exist_ok = True)
ecoregion_spearman.to_csv(os.path.join(out_dir, "spmn_r_env_sos_" + preseason + ".csv"), index=False)
ecoregion_spearman

Unnamed: 0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
0,40115,0.11,-0.06,0.03,0.0,-0.23
1,40301,0.15,-0.11,-0.03,0.06,-0.28
2,40401,0.22,-0.16,0.04,-0.01,-0.33
3,40403,0.1,-0.13,-0.07,0.06,-0.19
4,40501,0.07,0.01,0.06,0.02,-0.13
5,40502,-0.07,0.02,-0.01,-0.06,0.2
6,40701,0.13,-0.08,0.08,-0.01,-0.29
7,81003,-0.08,0.06,0.02,-0.06,0.18
8,81021,-0.09,0.1,0.04,-0.11,0.21
9,avg abs r,0.11,0.08,0.04,0.04,0.23
