### SIMPER Analysis

Custom Python implementation of SIMPER to identify species contributing to community differences between habitat groups.

In [None]:
import pandas as pd
import numpy as np
import itertools


def bray_curtis_contributions(x, y):
    """Return the Bray-Curtis dissimilarity of two samples and its per-taxon parts.

    Parameters
    ----------
    x, y : array-like
        Abundance vectors of equal length (one entry per taxon). Inputs are
        converted to float arrays, so lists and integer arrays are accepted.

    Returns
    -------
    bc : float
        ``sum(|x - y|) / sum(x + y)``, or ``0.0`` when ``sum(x + y)`` is zero.
    contrib : numpy.ndarray
        Float array where element *i* is ``|x[i] - y[i]| / sum(x + y)``; the
        elements add up to ``bc``. All zeros when ``sum(x + y)`` is zero.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    diff = np.abs(x - y)
    denom_sum = (x + y).sum()
    if denom_sum == 0:
        # Two empty samples: define the dissimilarity as 0 instead of 0/0.
        # zeros_like(diff) is float, matching the dtype of the normal path.
        return 0.0, np.zeros_like(diff)
    bc = diff.sum() / denom_sum
    contrib = diff / denom_sum
    return bc, contrib


def simper(df, groups):
    """Run a pairwise SIMPER analysis on a sample-by-taxon table.

    For every pair of distinct group labels, the Bray-Curtis dissimilarity and
    its per-taxon parts are averaged over all (sample in group 1, sample in
    group 2) combinations, and each taxon's share of the average dissimilarity
    is reported as a percentage.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, one column per taxon.
    groups : array-like
        Group label for each row of ``df``, in row order.

    Returns
    -------
    dict
        Maps ``(g1, g2)`` to a DataFrame sorted by descending
        ``'% Contribution'`` with the columns ``'Taxon'``,
        ``'% Contribution'``, ``'Group 1'``, ``'Group 2'``,
        ``'Overall Bray-Curtis'``, ``'Mean abundance in Group 1'``,
        ``'Mean abundance in Group 2'``, ``'Delta(G1-G2)'`` and
        ``'Cumulative % Contribution'``.

    Raises
    ------
    ValueError
        If ``groups`` does not have one label per row of ``df``.
    """
    groups = np.asarray(groups)
    if len(groups) != len(df):
        raise ValueError(
            f"groups has {len(groups)} labels but df has {len(df)} rows"
        )

    taxa = df.columns
    result = {}
    for g1, g2 in itertools.combinations(np.unique(groups), 2):
        data1 = df[groups == g1]
        data2 = df[groups == g2]

        count = len(data1) * len(data2)
        if count == 0:
            # A label that matches no rows (e.g. NaN, which never compares
            # equal to itself) leaves nothing to average; skip the pair
            # instead of dividing by zero.
            continue

        contrib_sum = np.zeros(len(taxa))
        bc_sum = 0.0
        rows2 = data2.to_numpy()
        for row1 in data1.to_numpy():
            for row2 in rows2:
                bc, contrib = bray_curtis_contributions(row1, row2)
                bc_sum += bc
                contrib_sum += contrib

        mean_contrib = contrib_sum / count
        overall_bc = bc_sum / count
        if overall_bc > 0:
            percent = (mean_contrib / overall_bc) * 100
        else:
            # Every cross-group pair is identical: no dissimilarity to
            # apportion, so report 0 % rather than NaN from 0/0.
            percent = np.zeros(len(taxa))

        mean1 = data1.mean(axis=0)
        mean2 = data2.mean(axis=0)
        delta = mean1 - mean2
        df_pair = pd.DataFrame({
            'Taxon': taxa,
            '% Contribution': percent,
            'Group 1': g1,
            'Group 2': g2,
            'Overall Bray-Curtis': overall_bc,
            'Mean abundance in Group 1': mean1.to_numpy(),
            'Mean abundance in Group 2': mean2.to_numpy(),
            'Delta(G1-G2)': delta.to_numpy(),
        })
        df_pair = df_pair.sort_values(by='% Contribution', ascending=False)
        # Cumulative share is computed after sorting so it runs from the
        # largest contributor downwards.
        df_pair['Cumulative % Contribution'] = df_pair['% Contribution'].cumsum()
        result[(g1, g2)] = df_pair.reset_index(drop=True)
    return result

In [None]:
# Read the cached transect and occurrence tables from disk.
df_transects = pd.read_pickle('../data/pkl/df_transects.pkl')
df_occurrences_with_taxon = pd.read_pickle('../data/pkl/df_occurrences_with_taxon.pkl')

# Transect filters: drop every 2008 transect, and keep a 2024 transect only
# when its habitat is 'shrubs closed' (other years pass this second filter).
df_transects['Year'] = pd.to_datetime(df_transects['start_time']).dt.year
mask_not_2008 = df_transects['Year'].ne(2008)
mask_keep_2024 = (
    df_transects['Year'].ne(2024)
    | df_transects['Pre: Transect physical habitat'].eq('shrubs closed')
)
df_filtered_transects = df_transects.loc[mask_not_2008 & mask_keep_2024].copy()

# Restrict to transects flagged as being on the old reserve.
df_old_reserve_transects = df_filtered_transects.loc[
    df_filtered_transects['Pre: On old reserve?'].eq('Yes')
].copy()
valid_transect_uids = df_old_reserve_transects['UID']

# Occurrences on the retained transects, minus the excluded taxon labels.
exclude = ['Aves (medium)', 'Aves (small)', '']
df_occ_old_reserve = df_occurrences_with_taxon.loc[
    df_occurrences_with_taxon['UID'].isin(valid_transect_uids)
    & ~df_occurrences_with_taxon['Taxon Label'].isin(exclude)
].copy()

# Transect-by-taxon table: each cell counts the 'ID' values recorded for that
# taxon on that transect; combinations with no records are filled with 0.
df_matrix = pd.pivot_table(
    df_occ_old_reserve,
    values='ID',
    index='UID',
    columns='Taxon Label',
    aggfunc='count',
    fill_value=0,
)

# Habitat label per transect, ordered to match the rows of df_matrix.
habitats = (
    df_old_reserve_transects
    .set_index('UID')['Pre: Transect physical habitat']
    .loc[df_matrix.index]
)

def log2_plus1_anderson(df):
    """Apply the transform ``log2(x) + 1`` for ``x > 0`` and ``0`` otherwise.

    Vectorised, so it does not depend on ``DataFrame.map`` (pandas >= 2.1)
    and always returns floating-point values.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric table (e.g. counts). Cells that are not greater than zero,
        including NaN, map to 0.

    Returns
    -------
    pandas.DataFrame
        Same shape, index and columns as ``df``, holding the transformed values.
    """
    positive = df > 0
    # Substitute 1 for non-positive cells before the log so log2 never sees
    # 0 or a negative number (no -inf / NaN, no runtime warning); those cells
    # are overwritten with 0 on the next line.
    logged = np.log2(df.where(positive, 1)) + 1
    return logged.where(positive, 0.0)

# Transform the transect-by-taxon table before computing dissimilarities.
df_log = log2_plus1_anderson(df_matrix)

# Pairwise SIMPER across habitat labels (one label per row of df_log).
simper_results = simper(df_log, groups=habitats.values)

# Show the first rows of the table for the first group pair in the results.
first_pair = next(iter(simper_results.keys()))
simper_results[first_pair].head()
