In [3]:
#!pip install scipy seaborn scikit-learn

In [None]:
# Goal: Indicator species analysis with effort- and detection-corrected densities
# Each step below is commented to describe the workflow clearly.

In [1]:
# Step 1: Load and filter data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the pickled data generated in previous notebooks
df_transects = pd.read_pickle('../data/pkl/df_transects.pkl')
df_occurrences = pd.read_pickle('../data/pkl/df_occurrences_with_taxon.pkl')

# Filter transects excluding year 2008 and most 2024 transects (keep 2024 'shrubs closed')
# and keep only those on the old reserve
df_transects['Year'] = pd.to_datetime(df_transects['start_time']).dt.year
mask_not_2008 = df_transects['Year'] != 2008
mask_keep_2024 = ~((df_transects['Year'] == 2024) & (df_transects['Pre: Transect physical habitat'] != 'shrubs closed'))
df_filtered_transects = df_transects[mask_not_2008 & mask_keep_2024].copy()
df_filtered_transects = df_filtered_transects[df_filtered_transects['Pre: On old reserve?'] == 'Yes'].copy()

# Standardize column names for ease of merging
rename_transects = {'UID': 'TransectID', 'Pre: Transect physical habitat': 'Habitat'}
df_filtered_transects = df_filtered_transects.rename(columns=rename_transects)

# Prepare occurrences: standardize fields and restrict to filtered transects
rename_occ = {'UID': 'TransectID', 'Taxon Label': 'Taxon'}
df_occurrences = df_occurrences.rename(columns=rename_occ)
df_occurrences = df_occurrences[df_occurrences['TransectID'].isin(df_filtered_transects['TransectID'])]

# Remove broad or unwanted taxa
excluded_species = ['ostrich', 'Aves (medium)', 'Aves (small)']
df_occurrences = df_occurrences[~df_occurrences['Taxon'].isin(excluded_species)]

# Attach habitat and transect-length info to occurrences
df_occurrences = df_occurrences.merge(
    df_filtered_transects[['TransectID', 'Habitat', 'distance_km']],
    on='TransectID',
    how='left',
)


In [2]:
# Step 2: Define survey width and effective strip width (ESW)
# Observers walked side by side covering a 50 m strip, so recorded distances
# are not suitable for fitting a detection function. We assume perfect detection
# within this strip, yielding an ESW equal to half the strip width.

TRANSECT_WIDTH_M = 50  # total search width in meters
ESW_m = TRANSECT_WIDTH_M / 2
print(f"Assumed ESW (m): {ESW_m}")

Assumed ESW (m): 25.0


In [3]:
# Step 3: Compute effort-corrected densities
# Use transect-specific lengths and the fixed strip width to derive searched area.
# Density for a taxon on a transect is Count / searched area.

df_filtered_transects['TransectLength_m'] = df_filtered_transects['distance_km'] * 1000
df_filtered_transects['SearchedArea_m2'] = df_filtered_transects['TransectLength_m'] * TRANSECT_WIDTH_M

# Count occurrences per transect, habitat and taxon
transect_taxon_counts = (
    df_occurrences.groupby(['TransectID', 'Habitat', 'Taxon']).size().reset_index(name='Count')
)

# Merge with searched area and compute density
transect_taxon_counts = transect_taxon_counts.merge(
    df_filtered_transects[['TransectID', 'SearchedArea_m2']],
    on='TransectID',
    how='left'
)
transect_taxon_counts['Density_per_m2'] = (
    transect_taxon_counts['Count'] / transect_taxon_counts['SearchedArea_m2']
)

transect_taxon_counts.head()


Unnamed: 0,TransectID,Habitat,Taxon,Count,SearchedArea_m2,Density_per_m2
0,271687,grass closed,Mammalia indet,1,45171.255589,2.2e-05
1,271687,grass closed,Thompson's gazelle,1,45171.255589,2.2e-05
2,271687,grass closed,zebra,3,45171.255589,6.6e-05
3,274498,grass closed,hartebeest,2,31499.227591,6.3e-05
4,274498,grass closed,warthog,1,31499.227591,3.2e-05


In [4]:
# Step 4: Create habitat × taxon density matrix
# We aggregate densities by habitat to get average detection-corrected density per taxon.

df_density_matrix = (
    transect_taxon_counts.groupby(["Habitat", "Taxon"]).agg(
        Density_per_m2=("Density_per_m2", "mean")
    ).reset_index()
)

# Pivot to matrix form: rows = habitats, columns = taxa, values = density

df_matrix = df_density_matrix.pivot(
    index="Habitat", columns="Taxon", values="Density_per_m2"
).fillna(0)

df_matrix.head()

Taxon,Bovidae (large),Bovidae (medium),Bovidae (small),Grant's gazelle,Mammalia indet,Rhinocerotidae,Thompson's gazelle,black rhinoceros,buffalo,cow (domestic),...,giraffe,hare,hartebeest,impala,reedbuck,spotted hyaena,ungulate,warthog,waterbuck,zebra
Habitat,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
grass closed,2.4e-05,3.6e-05,3.2e-05,2e-05,6.3e-05,0.0,4.9e-05,2e-05,2.7e-05,2.7e-05,...,8e-05,2e-05,3.5e-05,2e-05,0.0,0.0,0.0,4.3e-05,2e-05,9.8e-05
shrubs closed,4e-05,2.7e-05,3.2e-05,2e-05,8e-05,0.0,0.0,0.0,6.1e-05,0.0,...,3.5e-05,0.0,0.0,4e-05,0.0,0.0,2e-05,2e-05,3e-05,4.2e-05
shrubs open,2.8e-05,2.8e-05,4e-05,2e-05,8.3e-05,2e-05,3.3e-05,0.0,3.5e-05,2e-05,...,3.1e-05,0.0,2e-05,2.9e-05,0.0,2e-05,0.0,4.1e-05,2.5e-05,6.6e-05
trees closed,0.0,2e-05,2e-05,0.0,1.7e-05,0.0,0.0,0.0,0.0,0.0,...,2.8e-05,2e-05,0.0,2e-05,2e-05,0.0,0.0,2e-05,2.7e-05,4e-05


In [5]:
# Step 5: Indicator Species Analysis on detection-corrected densities
# Apply the Dufrêne-Legendre IndVal method with permutation-based p-values.

# Convert to presence/absence if required by the analysis
# Here we keep densities, but you could binarize by replacing >0 with 1.

def compute_indval(df, groups, n_permutations=999, random_state=0):
    """Compute IndVal and permutation-based p-values for each species.

    Parameters
    ----------
    df : pandas.DataFrame
        Species densities with habitats as index and species as columns.
    groups : list-like
        Group label for each row in `df` (e.g., habitat names).
    n_permutations : int
        Number of random permutations to derive p-values.
    random_state : int
        Seed for the random number generator.
    """
    rng = np.random.default_rng(random_state)
    groups = np.asarray(groups)
    group_names = np.unique(groups)
    results = []

    for species in df.columns:
        species_data = df[species].to_numpy()
        indvals = []
        for g in group_names:
            mask = groups == g
            A = species_data[mask].mean()  # specificity
            B = (species_data[mask] > 0).mean()  # fidelity
            indvals.append(A * B)
        max_indval = float(np.max(indvals))
        best_group = group_names[int(np.argmax(indvals))]

        # Permutation test for p-value
        permuted_max = []
        for _ in range(n_permutations):
            shuffled = rng.permutation(groups)
            perm_indvals = []
            for g in group_names:
                mask = shuffled == g
                A = species_data[mask].mean()
                B = (species_data[mask] > 0).mean()
                perm_indvals.append(A * B)
            permuted_max.append(np.max(perm_indvals))
        permuted_max = np.asarray(permuted_max)
        p_val = (np.sum(permuted_max > max_indval) + 1) / (n_permutations + 1) #Use > instead of >= in the permutation comparison (less conservative, but be explicit about it)

        results.append({"Species": species, "Habitat": best_group, "IndVal": max_indval, "p_value": p_val})

    return pd.DataFrame(results)

# Group labels correspond to the habitat names
habitats = df_matrix.index.to_list()

df_indicators = compute_indval(df_matrix, habitats)

# Sort and inspect the top indicator species
df_indicators.sort_values("IndVal", ascending=False).head(10)


Unnamed: 0,Species,Habitat,IndVal,p_value
21,zebra,grass closed,9.8e-05,0.001
4,Mammalia indet,shrubs open,8.3e-05,0.001
12,giraffe,grass closed,8e-05,0.001
8,buffalo,shrubs closed,6.1e-05,0.001
6,Thompson's gazelle,grass closed,4.9e-05,0.001
19,warthog,grass closed,4.3e-05,0.001
2,Bovidae (small),shrubs open,4e-05,0.001
15,impala,shrubs closed,4e-05,0.001
0,Bovidae (large),shrubs closed,4e-05,0.001
11,elephant,trees closed,4e-05,0.001
