In [1]:
Summary
•	PERMANOVA analysis notebook that filters transects to the old reserve, excludes designated taxa, and builds a species-by-transect matrix for habitat comparisons
•	Implemented a log₂(x + 1) + 1 abundance transformation, which keeps zero counts defined (a count of 0 maps to 1) and stabilises variance before analysis
•	Introduced PERMANOVA using Bray–Curtis distances with 999 permutations to test species composition differences across habitats
•	Implemented a pure-Python PERMANOVA (NumPy/SciPy only) to avoid the scikit-bio dependency

SyntaxError: invalid character '•' (U+2022) (3883255795.py, line 2)

In [2]:
import pandas as pd
import numpy as np
from scipy.spatial.distance import pdist, squareform

def bray_curtis_dm(data):
    """Return the square Bray–Curtis distance matrix between the rows of ``data``.

    Parameters
    ----------
    data : array-like
        One observation per row, as accepted by ``scipy.spatial.distance.pdist``.

    Returns
    -------
    numpy.ndarray
        The condensed ``pdist`` result expanded by ``squareform`` into a
        square matrix with one row/column per input row.
    """
    condensed = pdist(data, metric='braycurtis')
    square = squareform(condensed)
    return square

def _within_group_ss(dist_sq, labels, n_groups):
    """Sum over groups of (squared distances inside the group's block) / group size."""
    total = 0.0
    for g in range(n_groups):
        members = np.flatnonzero(labels == g)
        # Groups with a single member are skipped, as they have no pairs.
        if members.size > 1:
            total += dist_sq[np.ix_(members, members)].sum() / members.size
    return total


def _pseudo_f(dist_sq, labels, n_groups, tss, df_between, df_within):
    """Pseudo-F ratio (between / within mean squares) for one labelling."""
    wss = _within_group_ss(dist_sq, labels, n_groups)
    bss = tss - wss
    return (bss / df_between) / (wss / df_within)


def permanova(distance_matrix, grouping, permutations=999, random_state=None):
    """Permutational multivariate analysis of variance on a distance matrix.

    The pseudo-F statistic is computed from squared distances, then the group
    labels are shuffled ``permutations`` times to build a null distribution.

    Parameters
    ----------
    distance_matrix : array-like, shape (n, n)
        Square matrix of pairwise distances between the ``n`` samples.
    grouping : array-like, length n
        Group label of each sample, in the same order as the matrix rows.
    permutations : int, default 999
        Number of random label permutations; must be non-negative.
    random_state : optional
        Passed unchanged to ``numpy.random.default_rng``; give a fixed value
        for a reproducible p-value.

    Returns
    -------
    dict
        Keys ``'test_statistic'`` (pseudo-F), ``'p_value'``, ``'df_between'``,
        ``'df_within'`` and ``'permutations'``.

    Raises
    ------
    ValueError
        If the matrix is not square, ``grouping`` does not have one label per
        sample, ``permutations`` is negative, there are fewer than two groups,
        or every sample is in its own group (no within-group variation).
    """
    distance_matrix = np.asarray(distance_matrix, dtype=float)
    if distance_matrix.ndim != 2 or distance_matrix.shape[0] != distance_matrix.shape[1]:
        raise ValueError("distance_matrix must be a square 2-D array")
    n = distance_matrix.shape[0]

    grouping = np.asarray(grouping)
    if grouping.ndim != 1 or grouping.shape[0] != n:
        raise ValueError(
            "grouping must be 1-D with one label per row of distance_matrix"
        )
    if permutations < 0:
        raise ValueError("permutations must be non-negative")

    groups, group_indices = np.unique(grouping, return_inverse=True)
    n_groups = len(groups)
    if n_groups < 2:
        raise ValueError("grouping must contain at least two distinct groups")
    if n_groups == n:
        raise ValueError(
            "every sample is in its own group; within-group variation is undefined"
        )

    dist_sq = distance_matrix ** 2
    # The full matrix (both triangles) is summed here and in the within-group
    # blocks, so for a symmetric matrix each sum of squares carries the same
    # factor of two, which cancels in the F ratio.
    tss = dist_sq.sum() / n
    df_between = n_groups - 1
    df_within = n - n_groups

    f_stat = _pseudo_f(dist_sq, group_indices, n_groups, tss, df_between, df_within)

    rng = np.random.default_rng(random_state)
    perm_ge = 0
    for _ in range(permutations):
        shuffled = rng.permutation(group_indices)
        f_perm = _pseudo_f(dist_sq, shuffled, n_groups, tss, df_between, df_within)
        # Relabelings of the same partition should tie with the observed value,
        # but floating-point summation order can differ in the last bits, so
        # near-equal statistics are counted as ties.
        if f_perm >= f_stat or np.isclose(f_perm, f_stat, rtol=1e-12, atol=0.0):
            perm_ge += 1

    # The +1 terms count the observed labelling as one member of the null set.
    p_val = (perm_ge + 1) / (permutations + 1)
    return {
        'test_statistic': f_stat,
        'p_value': p_val,
        'df_between': df_between,
        'df_within': df_within,
        'permutations': permutations
    }


In [3]:
# Load the pickled transect and occurrence tables.
# NOTE(review): unpickling can execute arbitrary code — only load trusted files.
df_transects = pd.read_pickle('../data/pkl/df_transects.pkl')
df_occurrences_with_taxon = pd.read_pickle('../data/pkl/df_occurrences_with_taxon.pkl')

# Derive the survey year from each transect's start time.
df_transects['Year'] = pd.to_datetime(df_transects['start_time']).dt.year

# Drop every 2008 transect; from 2024 keep only the 'shrubs closed' habitat.
mask_not_2008 = df_transects['Year'].ne(2008)
mask_keep_2024 = (
    df_transects['Year'].ne(2024)
    | df_transects['Pre: Transect physical habitat'].eq('shrubs closed')
)
df_filtered_transects = df_transects.loc[mask_not_2008 & mask_keep_2024].copy()

# Restrict to transects flagged as being on the old reserve.
df_old_reserve_transects = df_filtered_transects.loc[
    df_filtered_transects['Pre: On old reserve?'].eq('Yes')
].copy()
valid_transect_uids = df_old_reserve_transects['UID']

# Keep occurrences on those transects, minus the excluded taxon labels
# (the two 'Aves' size classes and the empty label).
exclude = ['Aves (medium)', 'Aves (small)', '']
df_occ_old_reserve = df_occurrences_with_taxon.loc[
    df_occurrences_with_taxon['UID'].isin(valid_transect_uids)
    & ~df_occurrences_with_taxon['Taxon Label'].isin(exclude)
].copy()


In [4]:
# Species-by-transect matrix: one row per transect UID, one column per taxon
# label, each cell the count of non-null occurrence IDs (0 where a taxon was
# not recorded on that transect).
df_matrix = pd.pivot_table(
    df_occ_old_reserve,
    values='ID',
    index='UID',
    columns='Taxon Label',
    aggfunc='count',
    fill_value=0,
)

# Habitat label per transect, reordered to match the matrix rows.
# NOTE(review): transects with no remaining occurrences do not appear in
# df_matrix and so are absent here too — confirm that is intended.
habitats = (
    df_old_reserve_transects
    .set_index('UID')['Pre: Transect physical habitat']
    .loc[df_matrix.index]
)


In [5]:
# Transform abundances: log2(x + 1) + 1
# Each count x becomes log2(x + 1) + 1, so a zero count maps to 1 rather than 0.
# NOTE(review): absent taxa therefore carry a value of 1 into the distance
# calculation — confirm this is the intended transformation.
df_log = np.log2(df_matrix.add(1)).add(1)


In [10]:
# Run PERMANOVA (npMANOVA) on Bray–Curtis distances between the transformed transects
dm = bray_curtis_dm(df_log.values)
# A fixed seed makes the permutation p-value identical on every re-run
# (Restart Kernel -> Run All); without it each run draws new permutations.
permanova_results = permanova(
    dm, grouping=habitats.values, permutations=999, random_state=42
)

# Convert to a single-row dataframe, rounded to 3 decimals for display
permanova_df = pd.DataFrame([permanova_results]).round(3)
permanova_df


Unnamed: 0,test_statistic,p_value,df_between,df_within,permutations
0,5.321,0.001,3,60,999
