In [25]:
from pathlib import Path
import pandas as pd

import numpy as np
from scipy.spatial.distance import pdist, squareform
from sklearn.preprocessing import StandardScaler

from skbio.stats.distance import permanova
from skbio import DistanceMatrix

In [26]:
# Read the per-site metrics, index the table by site ID, and derive each
# site's type from the first three characters of that ID.
site_metrics = (
    pd.read_csv("../csvs/site_representative_metrics.csv")
    .set_index('site')
    .assign(site_type=lambda frame: frame.index.str[:3])
)
# Optional: discard the non-metric descriptor columns
# site_metrics = site_metrics.drop(columns=['year_estab', 'years_since_dist', 'elev_mean', 'slope_mean', 'forest_type'])

# Optional: drop AGG sites, as their internal variation would be too high
# site_metrics = site_metrics[site_metrics['site_type'] != 'AGG']

site_metrics

Unnamed: 0_level_0,mean__chm,max__chm,mean__veg_height_q30,mean__canopy_cover_gt1m,mean__fhd,mean__veg_height_cv,mean__vci,mean__crr,mean__veg_height_kurt,sd__chm,cv__chm,sd__canopy_cover_gt1m,sd__vci,sd__fhd,sd__crr,sd__veg_height_skew,site_type
site,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
EPO_O_04,38.239886,52.3364,11.945576,0.868069,2.177094,0.62075,0.546357,0.563695,0.187544,13.330358,0.348752,0.125496,0.114871,0.457687,0.167676,1.283775,EPO
EPO_O_13,27.299484,38.0424,11.010423,0.685449,1.972776,0.661154,0.536631,0.590778,0.207591,6.769295,0.246819,0.203709,0.141558,0.520719,0.175375,1.208187,EPO
EPO_Y_10,25.902798,32.9526,12.544017,0.652079,2.086827,0.640253,0.588872,0.611869,0.399777,5.477748,0.214736,0.193605,0.152311,0.538408,0.150632,1.099697,EPO
EPO_Y_15,19.754897,26.893,5.932556,0.774936,2.09369,0.693868,0.6244,0.534772,-0.691528,4.028354,0.204997,0.144095,0.103413,0.346978,0.142027,0.93035,EPO
EPO_Y_36,14.125612,20.8292,4.015971,0.684135,1.622504,0.758431,0.523706,0.520444,-0.234955,4.098733,0.295683,0.197953,0.139617,0.430635,0.160847,1.122658,EPO
EPO_Y_37,19.546503,29.8392,6.103131,0.900535,2.201588,0.601493,0.638667,0.518785,-0.231919,5.400514,0.284503,0.104405,0.10292,0.354001,0.13194,0.901474,EPO
EPY_O_01,24.30072,30.3114,16.589948,0.723886,2.10299,0.394931,0.608015,0.732646,2.615524,3.348425,0.137875,0.161273,0.132946,0.459745,0.128598,1.109719,EPY
EPY_O_07,19.011238,26.9406,9.426508,0.723917,2.326841,0.459757,0.694907,0.623439,-0.267701,3.608839,0.193783,0.120105,0.117699,0.392896,0.115948,0.733684,EPY
EPY_O_09,16.115101,22.355,11.017379,0.624741,1.59749,0.37994,0.500202,0.740153,1.68882,3.817097,0.236906,0.190139,0.134313,0.429018,0.153667,1.239037,EPY
EPY_Y_03,8.354218,15.072,4.985573,0.533897,1.355165,0.390761,0.480407,0.670645,0.368993,3.496191,0.420746,0.255072,0.196516,0.55403,0.165312,1.027402,EPY


# Full Permanova

Start with a PERMANOVA across all groups and all metrics

In [27]:
# X holds every metric column; y holds the site-type label of each site.
X = site_metrics.drop(columns='site_type')
y = site_metrics['site_type']

X.shape

(57, 16)

In [28]:
def get_p_val_group(p_val: float) -> str:
    """Translate a p-value into a human-readable significance label.

    Parameters
    ----------
    p_val : float
        The p-value to classify.

    Returns
    -------
    str
        One of four labels, chosen by the first threshold the value falls
        under: ``< 0.001``, ``< 0.01``, ``< 0.05``, or otherwise
        "not significant". A value that compares False against every
        threshold (e.g. NaN) also ends up in the "not significant" bucket.
    """
    if p_val < 0.001:
        significant = 'highly significant (p < 0.001)'
    elif p_val < 0.01:
        significant = 'very significant (p < 0.01)'
    elif p_val < 0.05:
        significant = 'significant (p < 0.05)'
    else:
        # This branch is reached for p >= 0.05, so the label must state 0.05
        # (the previous text said 0.5, which misreported the cut-off).
        significant = 'not significant (p >= 0.05)'
    return significant

In [29]:
from scipy.stats import shapiro

# Shapiro-Wilk test applied down each metric column (axis=0).
stat, p = shapiro(X, axis=0)

# One row per metric: its p-value and whether it falls below 0.05.
shapiro_p_vals = pd.Series(p, index=X.columns, name='p_val').to_frame()
shapiro_p_vals['significant'] = shapiro_p_vals['p_val'].lt(0.05)
shapiro_p_vals

Unnamed: 0,p_val,significant
mean__chm,0.125858,False
max__chm,0.062499,False
mean__veg_height_q30,0.103337,False
mean__canopy_cover_gt1m,4e-06,True
mean__fhd,0.000111,True
mean__veg_height_cv,0.087825,False
mean__vci,0.130403,False
mean__crr,0.507277,False
mean__veg_height_kurt,0.000758,True
sd__chm,4e-06,True


In [30]:
# Seed NumPy's global RNG so the permutation p-value is repeatable on re-run.
# NOTE(review): assumes the installed scikit-bio draws (or seeds) its
# permutations from NumPy's global random state — confirm for your version.
np.random.seed(42)

# Standardise every metric so that no single metric dominates the distances.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Pairwise Euclidean distances between sites in the standardised metric space,
# wrapped in a DistanceMatrix labelled with the site IDs from y's index.
distances = pdist(X_scaled, metric='euclidean')
distance_matrix = DistanceMatrix(squareform(distances), ids=y.index)

# PERMANOVA across all site types at once.
result = permanova(distance_matrix, y, permutations=9999)

# Double-quoted f-string: reusing the same quote character inside the
# replacement field is a SyntaxError on Python versions before 3.12.
print(f"Full PERMANOVA is {get_p_val_group(result['p-value'])}")

result

Full PERMANOVA is highly significant (p < 0.001)


method name               PERMANOVA
test statistic name        pseudo-F
sample size                      57
number of groups                 10
test statistic             6.597396
p-value                      0.0001
number of permutations         9999
Name: PERMANOVA results, dtype: object

## Pairwise PERMANOVA

In [31]:
from itertools import combinations

def pair_wise_permanova(site_pair) -> pd.Series:
    """Run PERMANOVA restricted to the sites of two site types.

    Relies on the notebook-level ``y`` (site-type label per site) and
    ``distance_matrix`` (distances between all sites).

    Parameters
    ----------
    site_pair
        A 2-item iterable ``(site_a, site_b)`` of site-type labels.

    Returns
    -------
    pd.Series
        The result returned by ``permanova`` for the two groups, with its
        ``name`` set to the ``(site_a, site_b)`` tuple.
    """
    site_a, site_b = site_pair

    # Keep only the sites that belong to either of the two site types
    in_pair = y.eq(site_a) | y.eq(site_b)
    pair_labels = y[in_pair]
    pair_distances = distance_matrix.filter(pair_labels.index)

    pair_result = permanova(pair_distances, pair_labels, permutations=9999)
    pair_result.name = (site_a, site_b)
    return pair_result

# Seed NumPy's global RNG so the pairwise permutation p-values are repeatable;
# without it, p-values close to a threshold can change label between runs.
# NOTE(review): assumes the installed scikit-bio draws (or seeds) its
# permutations from NumPy's global random state — confirm for your version.
np.random.seed(42)

# Every unordered pair of site types
unique_sites = np.unique(y)
pair_combos = list(combinations(unique_sites, 2))

# Build a list rather than a lazy map: a map object is exhausted after the
# DataFrame is built, leaving `results` empty for any later inspection.
results = [pair_wise_permanova(pair) for pair in pair_combos]
permanova_df = pd.DataFrame(results)

# Label each (unadjusted) pairwise p-value with its significance bucket
permanova_df['p_val_group'] = permanova_df['p-value'].apply(get_p_val_group)
permanova_df

Unnamed: 0,Unnamed: 1,method name,test statistic name,sample size,number of groups,test statistic,p-value,number of permutations,p_val_group
EPO,EPY,PERMANOVA,pseudo-F,12,2,2.648831,0.05,9999,not significant (p >= 0.5)
EPO,NRM,PERMANOVA,pseudo-F,12,2,6.871281,0.0024,9999,very significant (p < 0.01)
EPO,NRO,PERMANOVA,pseudo-F,12,2,2.94894,0.0535,9999,not significant (p >= 0.5)
EPO,NRY,PERMANOVA,pseudo-F,12,2,7.200585,0.0018,9999,very significant (p < 0.01)
EPO,PPO,PERMANOVA,pseudo-F,12,2,4.121017,0.0117,9999,significant (p < 0.05)
EPO,PPY,PERMANOVA,pseudo-F,12,2,4.342937,0.0022,9999,very significant (p < 0.01)
EPO,ULM,PERMANOVA,pseudo-F,12,2,7.653592,0.007,9999,very significant (p < 0.01)
EPO,ULO,PERMANOVA,pseudo-F,11,2,3.178991,0.0216,9999,significant (p < 0.05)
EPO,ULY,PERMANOVA,pseudo-F,10,2,10.773883,0.0048,9999,very significant (p < 0.01)
EPY,NRM,PERMANOVA,pseudo-F,12,2,4.567798,0.0088,9999,very significant (p < 0.01)


In [23]:
from statsmodels.stats.multitest import multipletests, multitest_methods_names

# Adjust the pairwise PERMANOVA p-values with method='fdr_bh'; the adjusted
# p-values are the second element of the tuple multipletests returns.
fdr_bh_output = multipletests(permanova_df['p-value'], method='fdr_bh')
permanova_df['p-val-fdr_bh'] = fdr_bh_output[1]

# NOTE(review): 'bg' in the column name below looks like a typo for 'bh';
# it is left unchanged because this frame is exported to CSV further down.
permanova_df['p-val-fdr_bg_group'] = permanova_df['p-val-fdr_bh'].map(get_p_val_group)

In [24]:
# Move the two index levels into regular columns, give them meaningful names,
# and write the pairwise PERMANOVA table to disk.
pairwise_export = permanova_df.reset_index()
pairwise_export = pairwise_export.rename(
    columns={"level_0": "site_type_a", "level_1": "site_type_b"}
)
pairwise_export.to_csv("../csvs/pairwise-permanova-result.csv")

Pairwise permanova is .. 

## Per metric analysis

- Kruskal–Wallis test to test for differences among multiple groups
- Post hoc Dunn's test for the pairwise comparisons

In [23]:
from scipy.stats import kruskal
import pandas as pd

# Kruskal-Wallis test per metric, comparing its values across site types
site_types = np.unique(y)
kruskal_results = []

for metric in X.columns:
    # One array of values per site type, skipping site types with no values
    per_type_values = (X.loc[y == site_type, metric].values for site_type in site_types)
    groups = [values for values in per_type_values if len(values) > 0]

    # A comparison needs at least two groups
    if len(groups) < 2:
        continue

    statistic, p_value = kruskal(*groups)

    kruskal_results.append(dict(
        metric=metric,
        statistic=statistic,
        p_value=p_value,
        significant='Yes' if p_value < 0.05 else 'No',
        p_val_group=get_p_val_group(p_value),
    ))

# One row per tested metric
kruskal_df = pd.DataFrame(kruskal_results)
# kruskal_df = kruskal_df.sort_values('p_value')  # Sort by p-value
print(f"Kruskal-Wallis test results for {len(kruskal_df)} metrics:")
kruskal_df

Kruskal-Wallis test results for 16 metrics:


Unnamed: 0,metric,statistic,p_value,significant,p_val_group
0,mean__chm,36.018875,3.9342e-05,Yes,highly significant (p < 0.001)
1,max__chm,44.810224,1.000304e-06,Yes,highly significant (p < 0.001)
2,mean__veg_height_q30,33.599335,0.0001049767,Yes,highly significant (p < 0.001)
3,mean__canopy_cover_gt1m,32.127465,0.0001892736,Yes,highly significant (p < 0.001)
4,mean__fhd,30.379189,0.0003780671,Yes,highly significant (p < 0.001)
5,mean__veg_height_cv,30.305082,0.000389237,Yes,highly significant (p < 0.001)
6,mean__vci,26.262553,0.001850314,Yes,very significant (p < 0.01)
7,mean__crr,27.337992,0.001229534,Yes,very significant (p < 0.01)
8,mean__veg_height_kurt,30.397157,0.0003754066,Yes,highly significant (p < 0.001)
9,sd__chm,45.740956,6.726837e-07,Yes,highly significant (p < 0.001)


In [24]:
import scikit_posthocs as sp

# Dunn's post hoc test (no p_adjust passed) for every metric that the
# Kruskal-Wallis step flagged as significant.
results = {}
pairwise_rows = []
significant_metrics = kruskal_df.loc[kruskal_df['significant'] == 'Yes', 'metric']

for metric in significant_metrics:
    res = sp.posthoc_dunn(site_metrics, val_col=metric, group_col='site_type')
    results[metric] = res

    # Walk the strict upper triangle only: the matrix is symmetric, so this
    # skips the diagonal and avoids listing each pair twice.
    row_pos, col_pos = np.triu_indices(len(res.index), k=1, m=len(res.columns))
    for i_idx, j_idx in zip(row_pos, col_pos):
        i, j = res.index[i_idx], res.columns[j_idx]
        pair_p = res.loc[i, j]
        pairwise_rows.append({
            "site_type_a": i,
            "site_type_b": j,
            "metric": metric,
            "p_val": pair_p,
            "p_val_group": get_p_val_group(pair_p),
        })

# Long-format table: one row per (site type pair, metric)
flattenend_results = pd.DataFrame(pairwise_rows)
flattenend_results.to_csv("../csvs/dunss_test_pairwise_per_metric.csv")
results['cv__chm']

Unnamed: 0,EPO,EPY,NRM,NRO,NRY,PPO,PPY,ULM,ULO,ULY
EPO,1.0,0.651131,0.767487,0.767487,0.051427,0.454548,0.519898,0.01111,0.537319,0.003811
EPY,0.651131,1.0,0.454548,0.454548,0.13473,0.230122,0.848281,0.036885,0.85266,0.012813
NRM,0.767487,0.454548,1.0,1.0,0.024861,0.651131,0.347646,0.004584,0.368772,0.001589
NRO,0.767487,0.454548,1.0,1.0,0.024861,0.651131,0.347646,0.004584,0.368772,0.001589
NRY,0.051427,0.13473,0.024861,0.024861,1.0,0.007023,0.192098,0.554302,0.214836,0.249678
PPO,0.454548,0.230122,0.651131,0.651131,0.007023,1.0,0.164117,0.001012,0.183544,0.000368
PPY,0.519898,0.848281,0.347646,0.347646,0.192098,0.164117,1.0,0.057997,0.997354,0.020459
ULM,0.01111,0.036885,0.004584,0.004584,0.554302,0.001012,0.057997,1.0,0.071202,0.533788
ULO,0.537319,0.85266,0.368772,0.368772,0.214836,0.183544,0.997354,0.071202,1.0,0.025925
ULY,0.003811,0.012813,0.001589,0.001589,0.249678,0.000368,0.020459,0.533788,0.025925,1.0


In [25]:
# Keep only the pairwise comparisons whose p-value is below 0.05
is_significant = flattenend_results['p_val'].lt(0.05)
significant_results = flattenend_results.loc[is_significant]
significant_results

Unnamed: 0,site_type_a,site_type_b,metric,p_val,p_val_group
3,EPO,NRY,mean__chm,0.003110,very significant (p < 0.01)
5,EPO,PPY,mean__chm,0.005109,very significant (p < 0.01)
10,EPY,NRO,mean__chm,0.008635,very significant (p < 0.01)
14,EPY,ULM,mean__chm,0.006665,very significant (p < 0.01)
15,EPY,ULO,mean__chm,0.012749,significant (p < 0.05)
...,...,...,...,...,...
709,NRY,ULY,sd__veg_height_skew,0.000326,highly significant (p < 0.001)
713,PPO,ULY,sd__veg_height_skew,0.020459,significant (p < 0.05)
714,PPY,ULM,sd__veg_height_skew,0.011110,significant (p < 0.05)
715,PPY,ULO,sd__veg_height_skew,0.043754,significant (p < 0.05)


In [26]:
from cliffs_delta import cliffs_delta

# Attach a Cliff's Delta effect size to every significant pairwise result
sig_res_with_cliffs = []

for row in significant_results.itertuples(index=False):
    # Metric values of the two site types being compared
    group_a_data = X.loc[y == row.site_type_a, row.metric].values
    group_b_data = X.loc[y == row.site_type_b, row.metric].values

    delta, magnitude = cliffs_delta(group_a_data, group_b_data)

    sig_res_with_cliffs.append(dict(
        metric=row.metric,
        site_type_a=row.site_type_a,
        site_type_b=row.site_type_b,
        p_val=row.p_val,
        p_val_group=row.p_val_group,
        cliffs_delta=delta,
        effect_size=magnitude,
        mean_a=np.mean(group_a_data),
        mean_b=np.mean(group_b_data),
        n_a=len(group_a_data),
        n_b=len(group_b_data),
    ))

cliffs_delta_df = pd.DataFrame(sig_res_with_cliffs)

# Order by effect magnitude, largest first, via a helper column
cliffs_delta_df['abs_cliffs_delta'] = cliffs_delta_df['cliffs_delta'].abs()
cliffs_delta_df = cliffs_delta_df.sort_values(by='abs_cliffs_delta', ascending=False)

# The helper column is written to the CSV but hidden from the displayed table
cliffs_delta_df.to_csv("../csvs/significant_dunns_results_with_cliffs_delta.csv")
cliffs_delta_df.drop(columns='abs_cliffs_delta')

Unnamed: 0,metric,site_type_a,site_type_b,p_val,p_val_group,cliffs_delta,effect_size,mean_a,mean_b,n_a,n_b
0,mean__chm,EPO,NRY,0.003110,very significant (p < 0.01),1.000000,large,24.144863,5.689086,6,6
139,sd__chm,EPO,EPY,0.049380,significant (p < 0.05),1.000000,large,6.517501,3.070476,6,6
122,mean__veg_height_kurt,EPO,ULM,0.041865,significant (p < 0.05),-1.000000,large,-0.060582,0.707808,6,6
123,mean__veg_height_kurt,EPO,ULY,0.026643,significant (p < 0.05),-1.000000,large,-0.060582,0.896139,6,4
127,mean__veg_height_kurt,NRM,ULM,0.002338,very significant (p < 0.01),-1.000000,large,-0.400871,0.707808,6,6
...,...,...,...,...,...,...,...,...,...,...,...
218,sd__crr,NRO,ULM,0.049380,significant (p < 0.05),-0.611111,large,0.131742,0.161118,6,6
224,sd__crr,PPY,ULM,0.036885,significant (p < 0.05),-0.555556,large,0.127794,0.161118,6,6
67,mean__fhd,EPY,NRO,0.047403,significant (p < 0.05),-0.555556,large,1.683919,2.133752,6,6
111,mean__crr,EPY,NRY,0.047403,significant (p < 0.05),0.500000,large,0.631968,0.533691,6,6


In [None]:
import scikit_posthocs as sp

# Dunn's post hoc test with p_adjust='holm' for every metric that the
# Kruskal-Wallis step flagged as significant.
results = {}
pairwise_rows = []
significant_metrics = kruskal_df.loc[kruskal_df['significant'] == 'Yes', 'metric']

for metric in significant_metrics:
    res = sp.posthoc_dunn(site_metrics, val_col=metric, group_col='site_type', p_adjust='holm')
    results[metric] = res

    # Walk the strict upper triangle only: the matrix is symmetric, so this
    # skips the diagonal and avoids listing each pair twice.
    row_pos, col_pos = np.triu_indices(len(res.index), k=1, m=len(res.columns))
    for i_idx, j_idx in zip(row_pos, col_pos):
        i, j = res.index[i_idx], res.columns[j_idx]
        pair_p = res.loc[i, j]
        pairwise_rows.append({
            "site_type_a": i,
            "site_type_b": j,
            "metric": metric,
            "p_val": pair_p,
            "p_val_group": get_p_val_group(pair_p),
        })

# Long-format table: one row per (site type pair, metric)
flattenend_results = pd.DataFrame(pairwise_rows)
flattenend_results.to_csv("../csvs/dunss_test_pairwise_per_metric_holm.csv")
results['cv__chm']

Unnamed: 0,EPO,EPY,NRM,NRO,NRY,PPO,PPY,ULM,ULO,ULY
EPO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.411062,1.0,0.156256
EPY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.461255
NRM,1.0,1.0,1.0,1.0,0.845258,1.0,1.0,0.183366,1.0,0.068346
NRO,1.0,1.0,1.0,1.0,0.845258,1.0,1.0,0.183366,1.0,0.068346
NRY,1.0,1.0,0.845258,0.845258,1.0,0.266872,1.0,1.0,1.0,1.0
PPO,1.0,1.0,1.0,1.0,0.266872,1.0,1.0,0.044542,1.0,0.016544
PPY,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.716067
ULM,0.411062,1.0,0.183366,0.183366,1.0,0.044542,1.0,1.0,1.0,1.0
ULO,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.845258
ULY,0.156256,0.461255,0.068346,0.068346,1.0,0.016544,0.716067,1.0,0.845258,1.0


In [28]:
# Keep only the pairwise comparisons whose p-value is below 0.05
is_significant = flattenend_results['p_val'].lt(0.05)
significant_results = flattenend_results.loc[is_significant]
significant_results

Unnamed: 0,site_type_a,site_type_b,metric,p_val,p_val_group
24,NRO,NRY,mean__chm,0.007067,very significant (p < 0.01)
26,NRO,PPY,mean__chm,0.012492,significant (p < 0.05)
32,NRY,ULM,mean__chm,0.005081,very significant (p < 0.01)
33,NRY,ULO,mean__chm,0.013812,significant (p < 0.05)
39,PPY,ULM,mean__chm,0.009108,very significant (p < 0.01)
40,PPY,ULO,mean__chm,0.023623,significant (p < 0.05)
59,EPY,ULM,max__chm,0.004855,very significant (p < 0.01)
69,NRO,NRY,max__chm,0.02387,significant (p < 0.05)
71,NRO,PPY,max__chm,0.017001,significant (p < 0.05)
77,NRY,ULM,max__chm,0.000229,highly significant (p < 0.001)


In [29]:

# Attach a Cliff's Delta effect size to every significant pairwise result
sig_res_with_cliffs = []

for row in significant_results.itertuples(index=False):
    # Metric values of the two site types being compared
    group_a_data = X.loc[y == row.site_type_a, row.metric].values
    group_b_data = X.loc[y == row.site_type_b, row.metric].values

    delta, magnitude = cliffs_delta(group_a_data, group_b_data)

    sig_res_with_cliffs.append(dict(
        metric=row.metric,
        site_type_a=row.site_type_a,
        site_type_b=row.site_type_b,
        p_val=row.p_val,
        p_val_group=row.p_val_group,
        cliffs_delta=delta,
        effect_size=magnitude,
        mean_a=np.mean(group_a_data),
        mean_b=np.mean(group_b_data),
        n_a=len(group_a_data),
        n_b=len(group_b_data),
    ))

cliffs_delta_df = pd.DataFrame(sig_res_with_cliffs)

# Order by effect magnitude, largest first, via a helper column
cliffs_delta_df['abs_cliffs_delta'] = cliffs_delta_df['cliffs_delta'].abs()
cliffs_delta_df = cliffs_delta_df.sort_values(by='abs_cliffs_delta', ascending=False)

# The helper column is written to the CSV but hidden from the displayed table
cliffs_delta_df.to_csv("../csvs/significant_dunns_results_with_cliffs_delta_holm.csv")
cliffs_delta_df.drop(columns='abs_cliffs_delta')

Unnamed: 0,metric,site_type_a,site_type_b,p_val,p_val_group,cliffs_delta,effect_size,mean_a,mean_b,n_a,n_b
0,mean__chm,NRO,NRY,0.007067,very significant (p < 0.01),1.0,large,27.406963,5.689086,6,6
35,sd__chm,PPY,ULM,0.000101,highly significant (p < 0.001),-1.0,large,1.921938,15.568905,6,6
25,mean__veg_height_cv,PPY,ULY,0.008059,very significant (p < 0.01),-1.0,large,0.396539,0.859929,6,4
26,mean__vci,NRM,ULY,0.005033,very significant (p < 0.01),1.0,large,0.619787,0.379076,6,4
27,mean__vci,PPO,ULY,0.003734,very significant (p < 0.01),1.0,large,0.616954,0.379076,6,4
28,mean__crr,PPO,ULY,0.003173,very significant (p < 0.01),1.0,large,0.689391,0.447429,6,4
29,mean__crr,ULO,ULY,0.023571,significant (p < 0.05),1.0,large,0.646718,0.447429,5,4
31,sd__chm,EPY,ULM,0.006746,very significant (p < 0.01),-1.0,large,3.070476,15.568905,6,6
32,sd__chm,NRY,ULM,0.000293,highly significant (p < 0.001),-1.0,large,2.206086,15.568905,6,6
33,sd__chm,NRY,ULO,0.036834,significant (p < 0.05),-1.0,large,2.206086,8.278877,6,5
