In [1]:
# partial correlation of EOS with
# environmental drivers: 1. temperature, 2. precipitation, 3. solar radiation, 4. soil moisture and 5. atmospheric CO2
# we consider two preseason windows to aggregate environmental factors - spring (months 4-6) and summer (months 7-9)
# CO2 is semiannual: spring uses the same year's first image (A); any other window averages the same year's A and B images

# make a dataframe composite of all required image stacks for further analysis


import glob
import os

import numpy as np
import pandas as pd
import pingouin as pg
import rioxarray as rxr
import xarray as xr
from natsort import natsorted
from scipy.stats import rankdata

# Configuration Paths
env_basepath = r"../Data/Environmental_Drivers/"
eos_raster_path = r"../Data/Scaled_LSP_Rasters/eos/"
valid_mask_path = r"../Data/Processed/Valid_lsp_raster/Valid_lsp_change_eos.tif"
ecoregion_path = r"../Data/Ecoregion_raster/ecoregions_raster.tif"

# Pre-season window over which the monthly drivers are aggregated.
preseason = 'spring'

# Calendar months (1-12) that make up each supported pre-season window.
preseason_months = {
    'summer': [7, 8, 9],
    'spring': [4, 5, 6],
}

# Fail here, with a readable message, instead of leaving `months_to_load`
# undefined and hitting a NameError in a later cell.
if preseason not in preseason_months:
    raise ValueError(
        f"Unknown preseason {preseason!r}; expected one of {sorted(preseason_months)}"
    )
months_to_load = preseason_months[preseason]

In [2]:
def align_to_target(raster_list, reference_da):
    """Reproject each DataArray in ``raster_list`` onto the grid of ``reference_da``.

    A raster that carries no CRS is tagged as EPSG:4326 before reprojection.
    If the reprojected result still has a ``band`` dimension, that dimension
    is squeezed out and dropped.

    Returns a new list with one aligned DataArray per input, in input order.
    """
    def _align_one(raster):
        # Rasters without CRS metadata are labelled as geographic WGS84.
        georeferenced = raster if raster.rio.crs else raster.rio.write_crs("EPSG:4326")
        matched = georeferenced.rio.reproject_match(reference_da)
        return matched.squeeze('band', drop=True) if 'band' in matched.dims else matched

    return [_align_one(raster) for raster in raster_list]

In [3]:
# Load the reference layers. The validity mask doubles as the target grid that
# every other raster in this cell is reprojected onto.
valid_mask = rxr.open_rasterio(valid_mask_path).squeeze('band', drop=True)
ecoregion_raw = rxr.open_rasterio(ecoregion_path)

eos_file_paths = natsorted(glob.glob(os.path.join(eos_raster_path, "*.tif")))
# An empty glob (wrong working directory, missing data) would otherwise only
# show up as a NameError at the `del` below; fail with a clear message instead.
if not eos_file_paths:
    raise FileNotFoundError(f"No EOS rasters (*.tif) found in {eos_raster_path!r}")

# Put each EOS layer on the mask grid and keep only pixels flagged valid (mask == 1);
# all other pixels become NaN. Each layer is named after its file (without extension).
eos_raster_list = []
for eos_file in eos_file_paths:
    layer_name = os.path.splitext(os.path.basename(eos_file))[0]
    da = rxr.open_rasterio(eos_file, chunks=True)
    da = da.rio.reproject_match(valid_mask).squeeze('band', drop=True)
    da = da.where(valid_mask == 1)
    da.name = layer_name
    eos_raster_list.append(da)


# Align the ecoregion raster to the same grid as the EOS layers
ecoregion = ecoregion_raw.rio.reproject_match(valid_mask)
ecoregion = ecoregion.squeeze('band', drop=True)
ecoregion.name = "ecoregion"

# One Dataset holding the ecoregion codes plus one variable per EOS layer
eos_ecoregion_stack = xr.merge([ecoregion] + eos_raster_list, compat='override')


# Drop loop temporaries and the raw ecoregion raster from the notebook namespace
del ecoregion_raw, eos_file_paths, da, layer_name

In [4]:
# Batch load the environmental driver TIFs as lazily-opened DataArrays.
# Only layers needed for the configured `preseason` window are kept:
#   * names whose last "_"-separated token is a number are treated as monthly
#     layers and kept only when that month is in `months_to_load`;
#   * AveragedCO2 layers end in "A" or "B" (semiannual images).
variables = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']
raw_driver_list = []
for var in variables:
    var_files = natsorted(glob.glob(os.path.join(env_basepath, var, "*.tif")))
    # Every driver is required by the aggregation step; a missing/empty folder
    # should stop here rather than fail later with a KeyError or NameError.
    if not var_files:
        raise FileNotFoundError(
            f"No '{var}' rasters (*.tif) found under {os.path.join(env_basepath, var)!r}"
        )
    for file_path in var_files:
        layer_name = os.path.splitext(os.path.basename(file_path))[0]
        suffix = layer_name.split('_')[-1]

        if suffix.isdigit():
            # Monthly layer: skip months outside the selected window
            if int(suffix) not in months_to_load:
                continue

        elif var == 'AveragedCO2':
            # Spring: only the first-half image ('A') is needed.
            # Any other window: both 'A' and 'B' are loaded, because the
            # aggregation step averages the two images of the same year.
            if preseason == 'spring' and suffix.endswith('B'):
                continue

        # NOTE(review): opened without masked=True, so nodata cells keep their
        # stored fill value instead of becoming NaN - confirm the driver rasters'
        # nodata cannot leak into the seasonal means/sums computed later.
        da = rxr.open_rasterio(file_path, chunks=True)
        da.name = layer_name
        raw_driver_list.append(da)
del var_files, layer_name, da

In [5]:
# Reproject every driver layer onto the validity-mask grid and combine them
# into a single Dataset (one variable per layer).
aligned_drivers = xr.merge(
    align_to_target(raw_driver_list, reference_da=valid_mask),
    compat='override',
)

# Attach the drivers to the ecoregion + EOS stack
merged_env_dataset = eos_ecoregion_stack.merge(aligned_drivers, compat='override')

# The intermediates are no longer needed once everything lives in one Dataset
del eos_ecoregion_stack, valid_mask, aligned_drivers, raw_driver_list

In [6]:
# Flatten the merged Dataset into a table. The following cells call
# reset_index() on it and then read 'x', 'y' and 'ecoregion' as columns, so each
# row corresponds to one grid cell and each layer becomes a column.
main_analysis_df = merged_env_dataset.to_dataframe()
# The cells shown after this one work on the DataFrame only; drop the Dataset name.
del merged_env_dataset

In [7]:
# Ecoregion codes kept for the analysis; rows from any other ecoregion are discarded.
select_ecoregions = [81003, 40115, 40301, 40403, 40401, 40166, 81021, 40701, 40501, 40502, 40120]
main_analysis_df = (
    main_analysis_df
    .loc[lambda frame: frame['ecoregion'].isin(select_ecoregions)]
    .reset_index()  # moves the existing index levels into ordinary columns
)

In [8]:
# Columns holding the EOS layers (names beginning with 'eos')
eos_cols = [name for name in main_analysis_df.columns if name.startswith('eos')]

# Discard pixels that have no EOS value in any layer, then renumber the rows
main_analysis_df = (
    main_analysis_df
    .dropna(subset=eos_cols, how='all')
    .reset_index(drop=True)
)

In [9]:
# Row-wise aggregation of the environmental drivers: for every year the monthly
# layers of the selected pre-season window are collapsed into one value per pixel.
#
# Both supported windows are read from the same calendar year as the EOS layer,
# so the monthly columns are simply "<Driver>_<year>_<month>" for each month in
# `months_to_load`.

# How each monthly driver is reduced over the window.
# min_count=1 keeps a pixel NaN when all of its months are missing; pandas'
# default sum() would silently report a seasonal total of 0 for such a pixel.
season_reducers = {
    "Temperature": lambda monthly: monthly.mean(axis=1),
    "Precipitation": lambda monthly: monthly.sum(axis=1, min_count=1),
    "SoilMoisture": lambda monthly: monthly.mean(axis=1),
    "SolarRadiation": lambda monthly: monthly.sum(axis=1, min_count=1),
}

aggregated_df = {}
cols_to_drop = []
for year in range(2001, 2021):

    # Monthly drivers ---------------------------------------
    for driver, reduce_season in season_reducers.items():
        monthly_cols = [f"{driver}_{year}_{month}" for month in months_to_load]
        aggregated_df[f"{driver}_{year}"] = reduce_season(main_analysis_df[monthly_cols])
        # the monthly inputs are removed once the seasonal value exists
        cols_to_drop.extend(monthly_cols)

    # Averaged CO2 (semiannual images A / B) ----------------
    if preseason == 'spring':
        # Spring: the first-half image of the same year is used as-is
        co2_cols = [f"AveragedCO2_{year}A"]
        aggregated_df[f"AveragedCO2_{year}"] = main_analysis_df[co2_cols[0]]
    else:
        # Any other window: mean of the same year's first- and second-half images.
        # NOTE(review): the notebook header originally stated that only the
        # first-half image is used for both windows - confirm which is intended.
        co2_cols = [f"AveragedCO2_{year}A", f"AveragedCO2_{year}B"]
        aggregated_df[f"AveragedCO2_{year}"] = main_analysis_df[co2_cols].mean(axis=1)
    cols_to_drop.extend(co2_cols)


# Append the seasonal columns (year-major order) and drop their monthly inputs
main_analysis_df = pd.concat(
    [main_analysis_df, pd.DataFrame(aggregated_df, index=main_analysis_df.index)],
    axis=1,
).drop(columns=cols_to_drop)

In [10]:
# Per-pixel partial correlation of EOS with each environmental driver, computed
# for all pixels at once from the precision matrix (inverse of the correlation
# matrix): r_ij|rest = -P_ij / sqrt(P_ii * P_jj).
# Author's note: the result has been cross-checked against pingouin's partial
# correlation.

# Define variables and year range (2001-2020)
years = list(range(2001, 2021))
env_vars = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']
all_vars = ['eos'] + env_vars

# Reshape data into a 3D array: (pixels, years, variables)
# EOS columns are named "eos<year>", driver columns "<Driver>_<year>".
data_list = [main_analysis_df[[f"eos{y}" if v == 'eos' else f"{v}_{y}" for y in years]].values for v in all_vars]
data_3d = np.stack(data_list, axis=2)

# Vectorized standardization along the year axis (sample std, ddof=1).
# NOTE(review): plain mean/std are used, so a NaN in any year makes that
# pixel/variable series NaN after standardization - confirm this is intended
# for pixels that only have part of the EOS record.
mean = data_3d.mean(axis=1, keepdims=True)
std = data_3d.std(axis=1, ddof=1, keepdims=True)
std[std == 0] = 1 # Avoid division by zero (constant series end up as all-zero z-scores)
z_scores = (data_3d - mean) / std

# Calculate correlation matrices (N_pixels, 6, 6): sum over years of z_j * z_k / (n - 1)
n_years = len(years)
corr = np.einsum('nij,nik->njk', z_scores, z_scores) / (n_years - 1)

# Vectorized partial correlation using Precision Matrix (inverse correlation matrix)
# We add a small epsilon to the diagonal for stability with constant values
# (a constant series contributes an all-zero row/column, which is otherwise singular)
corr[:, np.arange(len(all_vars)), np.arange(len(all_vars))] += 1e-6 
precision = np.linalg.inv(corr)
diag = np.diagonal(precision, axis1=1, axis2=2)

# Extract partial correlations of EOS (index 0) with each driver (indices 1 to 5)
pc_results = {}
for i, var in enumerate(env_vars, start=1):
    pc_results[var] = -precision[:, 0, i] / np.sqrt(diag[:, 0] * diag[:, i])

# Collect the result in a new dataframe with (x, y) coordinates as the index,
# keeping the ecoregion code alongside the five coefficients (rounded to 3 decimals)
partial_corr_df = main_analysis_df[['x', 'y', 'ecoregion']].join(pd.DataFrame(pc_results, index=main_analysis_df.index)).set_index(['x', 'y'])
partial_corr_df = partial_corr_df.round(3)
partial_corr_df

Unnamed: 0_level_0,Unnamed: 1_level_0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
x,y,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
80.058981,28.902171,40701,0.142,-0.236,0.254,0.329,0.348
80.061227,28.913400,40701,-0.133,0.264,-0.224,0.140,0.551
80.061227,28.904417,40701,0.034,0.112,-0.150,0.096,0.462
80.061227,28.888697,40701,-0.278,0.008,-0.398,-0.019,0.125
80.063473,28.913400,40701,-0.057,0.187,-0.156,0.089,0.427
...,...,...,...,...,...,...,...
88.181997,26.730494,40701,0.171,0.306,0.402,0.316,0.563
88.181997,26.714774,40701,0.222,-0.185,0.478,-0.312,0.524
88.184243,26.739477,40701,-0.159,-0.041,-0.242,-0.089,0.259
88.184243,26.717019,40701,0.266,0.437,-0.188,0.281,0.514


In [11]:
env_vars = ['Temperature', 'Precipitation', 'SoilMoisture', 'SolarRadiation', 'AveragedCO2']

# Mean partial correlation per ecoregion for every driver, rounded to 2 decimals
ecoregion_par_corr = (
    partial_corr_df
    .groupby('ecoregion')[env_vars]
    .mean()
    .reset_index()
    .round(2)
)
ecoregion_par_corr


Unnamed: 0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
0,40115,0.09,0.02,0.0,0.09,0.02
1,40301,0.05,0.01,0.0,0.1,-0.0
2,40401,0.15,0.03,-0.01,0.05,-0.02
3,40403,0.04,-0.05,-0.0,0.09,-0.04
4,40501,0.03,-0.02,0.0,0.01,-0.01
5,40502,-0.0,-0.04,-0.01,0.05,0.22
6,40701,0.05,0.01,-0.0,0.06,0.17
7,81003,0.04,-0.03,0.04,0.02,0.01
8,81021,0.01,-0.08,0.02,0.04,0.27


In [12]:
# Per-pixel Spearman rank correlation of EOS with each driver, averaged by
# ecoregion and written to CSV. `rankdata` is imported in the notebook's import cell.

# 1. Rank the data along the year axis (axis=1) to prepare for Spearman Correlation
# data_3d shape: (pixels, years, variables)
# rankdata's own `axis` argument ranks every series in one vectorized call
# instead of looping over slices through np.apply_along_axis.
ranked_data = rankdata(data_3d, axis=1)

# A series with a missing year has no well-defined ranking. How rankdata treats
# NaN depends on the SciPy version (older releases rank NaN as the largest
# value), so such series are explicitly set to NaN; their correlations then
# come out as NaN and are skipped by the ecoregion mean below.
has_gap = np.isnan(data_3d).any(axis=1, keepdims=True)
ranked_data = np.where(has_gap, np.nan, ranked_data)

# 2. Standardize the Ranks (Vectorized)
mean_r = ranked_data.mean(axis=1, keepdims=True)
std_r = ranked_data.std(axis=1, ddof=1, keepdims=True)
std_r[std_r == 0] = 1 # Avoid division by zero
z_scores_r = (ranked_data - mean_r) / std_r

# 3. Calculate Correlation Matrix (Pearson correlation of the ranks)
# Result shape: (N_pixels, N_vars, N_vars)
n_years = data_3d.shape[1]
spearman_matrix = np.einsum('nij,nik->njk', z_scores_r, z_scores_r) / (n_years - 1)

# 4. Extract Correlation of EOS (index 0) with each Driver (indices 1 to 5)
spearman_results = {}
for i, var in enumerate(env_vars, start=1):
    # Direct correlation between EOS (0) and Driver (i)
    spearman_results[var] = spearman_matrix[:, 0, i]

# 5. Create DataFrame and Aggregate by Ecoregion
spearman_df = pd.DataFrame(spearman_results, index=main_analysis_df.index)
spearman_df = (main_analysis_df[['x', 'y', 'ecoregion']].join(spearman_df))

# Calculate averaged Spearman correlation for each variable by ecoregion
ecoregion_spearman = (spearman_df.groupby('ecoregion')[env_vars].mean().reset_index()).round(2)

# Add aggregate row with mean of absolute values (taken over the ecoregion means)
new_row_data = ecoregion_spearman[env_vars].abs().mean().round(2)
new_row_data['ecoregion'] = 'avg abs r'
ecoregion_spearman = pd.concat([ecoregion_spearman, pd.DataFrame([new_row_data])], ignore_index=True)

# Persist the summary table; the file name carries the pre-season window
out_dir = (r"../Data/Processed/Correlation_Analysis/Environmental_Factors")
os.makedirs(out_dir, exist_ok = True)
ecoregion_spearman.to_csv(os.path.join(out_dir, "spmn_r_env_eos_" + preseason + ".csv"), index=False)
ecoregion_spearman

Unnamed: 0,ecoregion,Temperature,Precipitation,SoilMoisture,SolarRadiation,AveragedCO2
0,40115,0.15,-0.09,-0.07,0.15,-0.01
1,40301,0.07,-0.08,-0.05,0.12,-0.01
2,40401,0.13,-0.06,-0.05,0.11,-0.03
3,40403,0.07,-0.14,-0.07,0.14,-0.06
4,40501,0.05,-0.04,-0.03,0.05,-0.06
5,40502,0.01,-0.02,0.0,0.05,0.2
6,40701,0.06,-0.1,-0.13,0.08,0.15
7,81003,0.03,-0.03,0.04,0.04,-0.12
8,81021,0.02,-0.07,0.06,0.06,0.25
9,avg abs r,0.07,0.07,0.06,0.09,0.1
