In [1]:
from pathlib import Path
import pandas as pd

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler

from skbio.stats.distance import permanova
from skbio import DistanceMatrix

In [2]:
# Load the per-site metrics, index them by site, and discard the columns
# that are not used in the analysis below.
site_metrics = (
    pd.read_csv("../csvs/site_representative_metrics.csv")
    .set_index('site')
    .drop(columns=['year_estab', 'years_since_dist', 'elev_mean', 'slope_mean', 'forest_type'])
)

# Dropping AGG sites as internal variation would be too high
site_metrics = site_metrics.loc[site_metrics['site_type'] != 'AGG']

site_metrics

Unnamed: 0_level_0,site_type,mean__chm,max__chm,sd__chm,cv__chm,mean__crr,mean__fhd,mean__veg_height_cv,cv__veg_height_median,mean__veg_height_kurt,...,mean__groundstorey_capture,mean__understorey_capture,mean__midstorey_capture,mean__upperstorey_capture,sd__groundstorey_capture,sd__understorey_capture,sd__midstorey_capture,sd__upperstorey_capture,mean__canopy_cover_gt1m,sd__canopy_cover_gt1m
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
EPO_O_04,EPO,38.239886,52.3364,13.330358,0.348752,0.563695,2.177094,0.62075,0.700603,0.187544,...,0.22234,0.206624,0.717144,0.379593,0.311173,0.239067,0.225,0.25725,0.868069,0.125496
EPO_O_13,EPO,27.299484,38.0424,6.769295,0.246819,0.590778,1.972776,0.661154,0.55802,0.207591,...,0.344311,0.267187,0.0,0.561324,0.286719,0.248157,0.0,0.235708,0.685449,0.203709
EPO_Y_10,EPO,25.902798,32.9526,5.477748,0.214736,0.611869,2.086827,0.640253,0.414246,0.399777,...,0.557769,0.118237,0.0,0.606747,0.286587,0.172723,0.0,0.195211,0.652079,0.193605
EPO_Y_15,EPO,19.754897,26.893,4.028354,0.204997,0.534772,2.09369,0.693868,0.58122,-0.691528,...,0.559714,0.589193,0.0,0.441649,0.336339,0.237508,0.0,0.202039,0.774936,0.144095
EPO_Y_36,EPO,14.125612,20.8292,4.098733,0.295683,0.520444,1.622504,0.758431,0.700694,-0.234955,...,0.710741,0.431444,0.0,0.401629,0.318927,0.285077,0.0,0.21059,0.684135,0.197953
EPO_Y_37,EPO,19.546503,29.8392,5.400514,0.284503,0.518785,2.201588,0.601493,0.572872,-0.231919,...,0.793763,0.456935,0.717013,0.348014,0.277015,0.271176,0.193685,0.235817,0.900535,0.104405
EPY_O_01,EPY,24.30072,30.3114,3.348425,0.137875,0.732646,2.10299,0.394931,0.236165,2.615524,...,0.247772,0.187969,0.0,0.65927,0.288067,0.184009,0.0,0.176026,0.723886,0.161273
EPY_O_07,EPY,19.011238,26.9406,3.608839,0.193783,0.623439,2.326841,0.459757,0.338139,-0.267701,...,0.133134,0.124674,0.0,0.682322,0.212306,0.124088,0.0,0.129612,0.723917,0.120105
EPY_O_09,EPY,16.115101,22.355,3.817097,0.236906,0.740153,1.59749,0.37994,0.352142,1.68882,...,0.140091,0.234508,0.0,0.498449,0.218427,0.212327,0.0,0.235802,0.624741,0.190139
EPY_Y_03,EPY,8.354218,15.072,3.496191,0.420746,0.670645,1.355165,0.390761,0.53999,0.368993,...,0.212537,0.296513,0.0,0.302007,0.337732,0.286428,0.0,0.264537,0.533897,0.255072


# Full Permanova

Start with a PERMANOVA across all groups and all metrics

In [3]:
# X holds every remaining metric column; y holds the site_type label of each site.
X = site_metrics.drop(columns='site_type')
y = site_metrics.loc[:, 'site_type']

In [4]:
# Standardise the metrics before computing distances, so that columns on
# different numeric scales contribute comparably to the Euclidean distance.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Pairwise Euclidean distances between sites, wrapped in a scikit-bio
# DistanceMatrix whose ids are the site names (the index of y).
distances = pdist(X_scaled, metric='euclidean')
distance_matrix = DistanceMatrix(squareform(distances), ids=y.index)

# PERMANOVA p-values come from random permutations of the group labels.
# Seed NumPy's global RNG so that Restart & Run All reproduces the same p-values.
# NOTE(review): this relies on permanova drawing from NumPy's global random
# state when no explicit seed is given -- confirm for the installed scikit-bio
# version (newer releases also accept a `seed=` argument).
np.random.seed(42)

result = permanova(distance_matrix, y, permutations=9999)
result

method name               PERMANOVA
test statistic name        pseudo-F
sample size                      57
number of groups                 10
test statistic             6.234324
p-value                      0.0001
number of permutations         9999
Name: PERMANOVA results, dtype: object

The full PERMANOVA is highly significant (pseudo-F = 6.23, p = 0.0001 with 9999 permutations), i.e. p < 0.001.

## Pairwise PERMANOVA

In [5]:
from itertools import combinations

def pair_wise_permanova(site_pair) -> pd.Series:
    """Run a two-group PERMANOVA for one pair of site types.

    Reads the notebook-level ``y`` (site_type labels) and ``distance_matrix``.

    Args:
        site_pair: two site_type labels ``(a, b)``.

    Returns:
        The PERMANOVA result Series, with its ``name`` set to ``(a, b)``.
    """
    first_type, second_type = site_pair

    # Keep only the sites that belong to one of the two types.
    pair_labels = y[y.isin([first_type, second_type])]
    pair_distances = distance_matrix.filter(pair_labels.index)

    pair_result = permanova(pair_distances, pair_labels, permutations=9999)
    pair_result.name = (first_type, second_type)
    return pair_result

# Every unordered pair of distinct site types gets its own PERMANOVA.
unique_sites = np.unique(y)
pair_combos = list(combinations(unique_sites, 2))

# One result Series per pair; each becomes a row of the summary table.
results = [pair_wise_permanova(site_pair) for site_pair in pair_combos]
permanova_df = pd.DataFrame(results)

def get_p_val_group(p_val: float) -> str:
    """Map a p-value to a human-readable significance label.

    Thresholds are checked from strictest to loosest (0.001, 0.01, 0.05),
    each with a strict ``<`` comparison, so a value exactly on a threshold
    falls into the next, less significant bucket.

    Args:
        p_val: the p-value to classify. A NaN compares false against every
            threshold and therefore ends up in the last bucket.

    Returns:
        One of the four label strings.
    """
    if p_val < 0.001:
        return 'highly significant (p < 0.001)'
    if p_val < 0.01:
        return 'very significant (p < 0.01)'
    if p_val < 0.05:
        return 'significant (p < 0.05)'
    # The cut-off tested above is 0.05; the label previously said 0.5.
    return 'not significant (p >= 0.05)'

# Attach the significance label that corresponds to each pairwise p-value.
permanova_df['p_val_group'] = permanova_df['p-value'].map(get_p_val_group)
permanova_df

Unnamed: 0,Unnamed: 1,method name,test statistic name,sample size,number of groups,test statistic,p-value,number of permutations,p_val_group
EPO,EPY,PERMANOVA,pseudo-F,12,2,3.454328,0.0027,9999,very significant (p < 0.01)
EPO,NRM,PERMANOVA,pseudo-F,12,2,3.484436,0.0099,9999,very significant (p < 0.01)
EPO,NRO,PERMANOVA,pseudo-F,12,2,4.244638,0.0081,9999,very significant (p < 0.01)
EPO,NRY,PERMANOVA,pseudo-F,12,2,3.765525,0.002,9999,very significant (p < 0.01)
EPO,PPO,PERMANOVA,pseudo-F,12,2,4.00753,0.0114,9999,significant (p < 0.05)
EPO,PPY,PERMANOVA,pseudo-F,12,2,3.802077,0.0025,9999,very significant (p < 0.01)
EPO,ULM,PERMANOVA,pseudo-F,12,2,8.162522,0.0022,9999,very significant (p < 0.01)
EPO,ULO,PERMANOVA,pseudo-F,11,2,2.730503,0.0346,9999,significant (p < 0.05)
EPO,ULY,PERMANOVA,pseudo-F,10,2,8.392784,0.0051,9999,very significant (p < 0.01)
EPY,NRM,PERMANOVA,pseudo-F,12,2,5.837909,0.002,9999,very significant (p < 0.01)


In [6]:
# Turn the two index levels into named columns, then write the table to disk.
(
    permanova_df
    .reset_index()
    .rename(columns={"level_0": "site_type_a", "level_1": "site_type_b"})
    .to_csv("../csvs/pairwise-permanova-result.csv")
)

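The pairwise PERMANOVA results are saved to `../csvs/pairwise-permanova-result.csv`. Note that these pairwise p-values are not corrected for multiple comparisons. **TODO:** summarise which site-type pairs differ.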

## Per metric analysis

- Kruskal-Wallis test to compare each metric among multiple groups
- Post hoc Dunn's test for the pairwise comparisons

In [7]:
from scipy.stats import kruskal
import pandas as pd

# Kruskal-Wallis test per metric: are the site types different for this metric?
kruskal_results = []

for metric in X.columns:
    # One array of values per site type, skipping any type with no observations.
    samples = []
    for site_type in np.unique(y):
        values = X.loc[y == site_type, metric].values
        if len(values) > 0:
            samples.append(values)

    # The test needs at least two groups to compare.
    if len(samples) < 2:
        continue

    statistic, p_value = kruskal(*samples)
    kruskal_results.append({
        'metric': metric,
        'statistic': statistic,
        'p_value': p_value,
        'significant': 'Yes' if p_value < 0.05 else 'No',
        "p_val_group": get_p_val_group(p_value)
    })

# Tabulate the results, smallest p-value first.
kruskal_df = pd.DataFrame(kruskal_results).sort_values('p_value')
print(f"Kruskal-Wallis test results for {len(kruskal_df)} metrics:")
kruskal_df

Kruskal-Wallis test results for 22 metrics:


Unnamed: 0,metric,statistic,p_value,significant,p_val_group
2,sd__chm,45.740956,6.726837e-07,Yes,highly significant (p < 0.001)
1,max__chm,44.810224,1.000304e-06,Yes,highly significant (p < 0.001)
14,mean__midstorey_capture,36.358579,3.423837e-05,Yes,highly significant (p < 0.001)
18,sd__midstorey_capture,36.34682,3.440359e-05,Yes,highly significant (p < 0.001)
0,mean__chm,36.018875,3.9342e-05,Yes,highly significant (p < 0.001)
15,mean__upperstorey_capture,34.4902,7.326634e-05,Yes,highly significant (p < 0.001)
20,mean__canopy_cover_gt1m,32.127465,0.0001892736,Yes,highly significant (p < 0.001)
21,sd__canopy_cover_gt1m,30.48899,0.0003620922,Yes,highly significant (p < 0.001)
8,mean__veg_height_kurt,30.397157,0.0003754066,Yes,highly significant (p < 0.001)
5,mean__fhd,30.379189,0.0003780671,Yes,highly significant (p < 0.001)


In [15]:
import scikit_posthocs as sp

# Dunn's post hoc test (no p_adjust passed) for every metric that the
# Kruskal-Wallis step marked as significant.
results = {}
pairwise_rows = []

significant_metrics = kruskal_df.loc[kruskal_df['significant'] == 'Yes', 'metric']

for metric in significant_metrics:
    dunn_matrix = sp.posthoc_dunn(site_metrics, val_col=metric, group_col='site_type')
    results[metric] = dunn_matrix

    # The matrix is symmetric, so pair each row only with the columns after
    # it: this visits the upper triangle and skips the diagonal.
    for row_pos, type_a in enumerate(dunn_matrix.index):
        for type_b in dunn_matrix.columns[row_pos + 1:]:
            pair_p = dunn_matrix.loc[type_a, type_b]
            pairwise_rows.append({
                "site_type_a": type_a,
                "site_type_b": type_b,
                "metric": metric,
                "p_val": pair_p,
                "p_val_group": get_p_val_group(pair_p)
            })

flattenend_results = pd.DataFrame(pairwise_rows)
flattenend_results.to_csv("../csvs/dunss_test_pairwise_per_metric.csv")
results['cv__chm']

Unnamed: 0,EPO,EPY,NRM,NRO,NRY,PPO,PPY,ULM,ULO,ULY
EPO,1.0,0.651131,0.767487,0.767487,0.051427,0.454548,0.519898,0.01111,0.537319,0.003811
EPY,0.651131,1.0,0.454548,0.454548,0.13473,0.230122,0.848281,0.036885,0.85266,0.012813
NRM,0.767487,0.454548,1.0,1.0,0.024861,0.651131,0.347646,0.004584,0.368772,0.001589
NRO,0.767487,0.454548,1.0,1.0,0.024861,0.651131,0.347646,0.004584,0.368772,0.001589
NRY,0.051427,0.13473,0.024861,0.024861,1.0,0.007023,0.192098,0.554302,0.214836,0.249678
PPO,0.454548,0.230122,0.651131,0.651131,0.007023,1.0,0.164117,0.001012,0.183544,0.000368
PPY,0.519898,0.848281,0.347646,0.347646,0.192098,0.164117,1.0,0.057997,0.997354,0.020459
ULM,0.01111,0.036885,0.004584,0.004584,0.554302,0.001012,0.057997,1.0,0.071202,0.533788
ULO,0.537319,0.85266,0.368772,0.368772,0.214836,0.183544,0.997354,0.071202,1.0,0.025925
ULY,0.003811,0.012813,0.001589,0.001589,0.249678,0.000368,0.020459,0.533788,0.025925,1.0


In [16]:
# Keep only the pairwise comparisons whose p-value is below 0.05.
is_significant = flattenend_results['p_val'].lt(0.05)
significant_results = flattenend_results.loc[is_significant]
significant_results

Unnamed: 0,site_type_a,site_type_b,metric,p_val,p_val_group
0,EPO,EPY,sd__chm,0.049380,significant (p < 0.05)
3,EPO,NRY,sd__chm,0.007023,very significant (p < 0.01)
5,EPO,PPY,sd__chm,0.003480,very significant (p < 0.01)
10,EPY,NRO,sd__chm,0.029705,significant (p < 0.05)
14,EPY,ULM,sd__chm,0.000161,highly significant (p < 0.001)
...,...,...,...,...,...
925,NRO,PPO,mean__understorey_capture,0.032391,significant (p < 0.05)
930,NRY,PPO,mean__understorey_capture,0.016767,significant (p < 0.05)
931,NRY,PPY,mean__understorey_capture,0.027776,significant (p < 0.05)
936,PPO,ULM,mean__understorey_capture,0.001075,very significant (p < 0.01)


In [10]:
from cliffs_delta import cliffs_delta

# Effect size (Cliff's delta) for each significant pairwise comparison
sig_res_with_cliffs = []

pair_columns = ['metric', 'site_type_a', 'site_type_b', 'p_val', 'p_val_group']
pair_records = significant_results[pair_columns].itertuples(index=False, name=None)

for metric, site_a, site_b, p_val, p_val_group in pair_records:
    # Raw metric values of the two site types being compared
    values_a = X.loc[y == site_a, metric].values
    values_b = X.loc[y == site_b, metric].values

    delta, magnitude = cliffs_delta(values_a, values_b)

    sig_res_with_cliffs.append({
        'metric': metric,
        'site_type_a': site_a,
        'site_type_b': site_b,
        'p_val': p_val,
        'p_val_group': p_val_group,
        'cliffs_delta': delta,
        'effect_size': magnitude,
        'mean_a': np.mean(values_a),
        'mean_b': np.mean(values_b),
        'n_a': len(values_a),
        'n_b': len(values_b)
    })

cliffs_delta_df = pd.DataFrame(sig_res_with_cliffs)

# Order by the magnitude of the effect, strongest first
cliffs_delta_df['abs_cliffs_delta'] = cliffs_delta_df['cliffs_delta'].abs()
cliffs_delta_df = cliffs_delta_df.sort_values('abs_cliffs_delta', ascending=False)

# The saved file keeps the helper column; the displayed table hides it
cliffs_delta_df.to_csv("../csvs/significant_dunns_results_with_cliffs_delta.csv")
cliffs_delta_df.drop(columns='abs_cliffs_delta')

Unnamed: 0,metric,site_type_a,site_type_b,p_val,p_val_group,cliffs_delta,effect_size,mean_a,mean_b,n_a,n_b
0,sd__chm,EPO,EPY,0.049380,significant (p < 0.05),1.000000,large,6.517501,3.070476,6,6
174,mean__veg_height_cv,EPO,PPY,0.000840,highly significant (p < 0.001),1.000000,large,0.662658,0.396539,6,6
161,mean__fhd,NRO,NRY,0.001374,very significant (p < 0.01),1.000000,large,2.133752,1.432872,6,6
163,mean__fhd,NRO,ULY,0.005109,very significant (p < 0.01),1.000000,large,2.133752,1.458746,6,4
164,mean__fhd,NRY,PPO,0.006665,very significant (p < 0.01),-1.000000,large,1.432872,2.080042,6,6
...,...,...,...,...,...,...,...,...,...,...,...
278,sd__crr,PPY,ULM,0.036885,significant (p < 0.05),-0.555556,large,0.127794,0.161118,6,6
310,mean__understorey_capture,NRY,PPO,0.016767,significant (p < 0.05),0.555556,large,0.491063,0.225652,6,6
156,mean__fhd,EPY,NRO,0.047403,significant (p < 0.05),-0.555556,large,1.683919,2.133752,6,6
229,mean__crr,EPY,NRY,0.047403,significant (p < 0.05),0.500000,large,0.631968,0.533691,6,6


In [17]:
import scikit_posthocs as sp

# Dunn's post hoc test again, this time with p_adjust='holm', for every
# metric that the Kruskal-Wallis step marked as significant.
results = {}
holm_rows = []

kruskal_hits = kruskal_df.loc[kruskal_df['significant'] == 'Yes', 'metric']

for metric in kruskal_hits:
    holm_matrix = sp.posthoc_dunn(site_metrics, val_col=metric, group_col='site_type', p_adjust='holm')
    results[metric] = holm_matrix

    # Symmetric matrix: walking only the columns to the right of each row
    # covers the upper triangle without the diagonal.
    for offset, group_a in enumerate(holm_matrix.index):
        for group_b in holm_matrix.columns[offset + 1:]:
            adjusted_p = holm_matrix.loc[group_a, group_b]
            holm_rows.append({
                "site_type_a": group_a,
                "site_type_b": group_b,
                "metric": metric,
                "p_val": adjusted_p,
                "p_val_group": get_p_val_group(adjusted_p)
            })

flattenend_results = pd.DataFrame(holm_rows)
flattenend_results.to_csv("../csvs/dunss_test_pairwise_per_metric_holm.csv")
results['cv__chm']

Unnamed: 0,EPO,EPY,NRM,NRO,NRY,PPO,PPY,ULM,ULO,ULY
EPO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.411062,1.0,0.156256
EPY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.461255
NRM,1.0,1.0,1.0,1.0,0.845258,1.0,1.0,0.183366,1.0,0.068346
NRO,1.0,1.0,1.0,1.0,0.845258,1.0,1.0,0.183366,1.0,0.068346
NRY,1.0,1.0,0.845258,0.845258,1.0,0.266872,1.0,1.0,1.0,1.0
PPO,1.0,1.0,1.0,1.0,0.266872,1.0,1.0,0.044542,1.0,0.016544
PPY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.716067
ULM,0.411062,1.0,0.183366,0.183366,1.0,0.044542,1.0,1.0,1.0,1.0
ULO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.845258
ULY,0.156256,0.461255,0.068346,0.068346,1.0,0.016544,0.716067,1.0,0.845258,1.0


In [18]:
# Keep only the Holm-adjusted comparisons whose p-value is below 0.05.
below_alpha = flattenend_results['p_val'].lt(0.05)
significant_results = flattenend_results.loc[below_alpha]
significant_results

Unnamed: 0,site_type_a,site_type_b,metric,p_val,p_val_group
14,EPY,ULM,sd__chm,0.006746,very significant (p < 0.01)
32,NRY,ULM,sd__chm,0.000293,highly significant (p < 0.001)
33,NRY,ULO,sd__chm,0.036834,significant (p < 0.05)
34,NRY,ULY,sd__chm,0.011870,significant (p < 0.05)
39,PPY,ULM,sd__chm,0.000101,highly significant (p < 0.001)
...,...,...,...,...,...
788,NRM,ULY,sd__crr,0.015590,significant (p < 0.05)
833,NRM,ULY,sd__vci,0.045075,significant (p < 0.05)
869,EPY,ULM,sd__understorey_capture,0.036563,significant (p < 0.05)
914,EPY,ULM,mean__understorey_capture,0.015783,significant (p < 0.05)


In [19]:

# Effect size (Cliff's delta) for each comparison in significant_results
sig_res_with_cliffs = []

wanted = ['metric', 'site_type_a', 'site_type_b', 'p_val', 'p_val_group']

for metric, site_a, site_b, p_val, p_val_group in significant_results[wanted].itertuples(index=False, name=None):
    # Raw metric values of the two site types being compared
    sample_a = X.loc[y == site_a, metric].values
    sample_b = X.loc[y == site_b, metric].values

    delta, magnitude = cliffs_delta(sample_a, sample_b)

    sig_res_with_cliffs.append({
        'metric': metric,
        'site_type_a': site_a,
        'site_type_b': site_b,
        'p_val': p_val,
        'p_val_group': p_val_group,
        'cliffs_delta': delta,
        'effect_size': magnitude,
        'mean_a': np.mean(sample_a),
        'mean_b': np.mean(sample_b),
        'n_a': len(sample_a),
        'n_b': len(sample_b)
    })

cliffs_delta_df = pd.DataFrame(sig_res_with_cliffs)

# Order by the magnitude of the effect, strongest first
cliffs_delta_df['abs_cliffs_delta'] = cliffs_delta_df['cliffs_delta'].abs()
cliffs_delta_df = cliffs_delta_df.sort_values('abs_cliffs_delta', ascending=False)

# The saved file keeps the helper column; the displayed table hides it
cliffs_delta_df.to_csv("../csvs/significant_dunns_results_with_cliffs_delta_holm.csv")
cliffs_delta_df.drop(columns='abs_cliffs_delta')

Unnamed: 0,metric,site_type_a,site_type_b,p_val,p_val_group,cliffs_delta,effect_size,mean_a,mean_b,n_a,n_b
0,sd__chm,EPY,ULM,0.006746,very significant (p < 0.01),-1.000000,large,3.070476,15.568905,6,6
46,cv__veg_height_median,EPY,ULY,0.040921,significant (p < 0.05),-1.000000,large,0.470111,1.439095,6,4
31,mean__canopy_cover_gt1m,EPY,NRO,0.010207,significant (p < 0.05),-1.000000,large,0.592883,0.908020,6,6
1,sd__chm,NRY,ULM,0.000293,highly significant (p < 0.001),-1.000000,large,2.206086,15.568905,6,6
34,mean__canopy_cover_gt1m,NRO,ULY,0.013938,significant (p < 0.05),1.000000,large,0.908020,0.510249,6,4
...,...,...,...,...,...,...,...,...,...,...,...
33,mean__canopy_cover_gt1m,NRO,NRY,0.039014,significant (p < 0.05),0.944444,large,0.908020,0.611700,6,6
62,sd__understorey_capture,EPY,ULM,0.036563,significant (p < 0.05),-0.888889,large,0.167024,0.300363,6,6
64,mean__understorey_capture,PPO,ULM,0.047284,significant (p < 0.05),-0.888889,large,0.225652,0.590667,6,6
48,cv__veg_height_median,PPO,ULM,0.038450,significant (p < 0.05),-0.833333,large,0.336437,0.672758,6,6
