In [None]:
#!pip install scipy seaborn scikit-learn

In [None]:
# Goal: Indicator species analysis with effort- and detection-corrected densities
# Each step below is commented to describe the workflow clearly.

In [None]:
# Step 1: Load data
import pandas as pd
import numpy as np
from scipy.optimize import minimize
from math import erf, sqrt, pi
import seaborn as sns
import matplotlib.pyplot as plt

# Load the pickled data generated in previous notebooks.
# df_transects: one row per transect with metadata such as habitat and length
#   (the cells below read its "TransectID" and "Habitat" columns).
# df_occurrences: one row per bone occurrence with taxon, transect ID and
#   perpendicular distance (the cells below read its "TransectID", "Taxon",
#   "Habitat" and "PerpendicularDistance_m" columns).
# NOTE: unpickling can execute arbitrary code embedded in the file, so only load
# pickles that were produced by a trusted notebook in this project.

df_transects = pd.read_pickle("../data/pkl/df_transects.pkl")
df_occurrences = pd.read_pickle("../data/pkl/df_occurrences_with_taxon.pkl")

In [None]:
# Step 2: Estimate effective strip width (ESW) using line-transect distance sampling
# We fit a half-normal detection function, g(x) = exp(-x^2 / (2 * sigma^2)), to the
# perpendicular distances for each habitat.
# The ESW is the integral of g(x) from 0 to the truncation distance (25 m here).
# It is a half-width: Step 3 multiplies it by 2 to cover both sides of the line.

TRUNCATION_DISTANCE = 25  # meters

# Negative log-likelihood for half-normal detection function

def _neg_loglik(sigma, distances, w):
    """Negative log-likelihood of a truncated half-normal detection function.

    The detection function is g(x) = exp(-x^2 / (2 * sigma^2)). Conditional on
    detection within [0, w], an observed distance x has density f(x) = g(x) / mu,
    where mu = sigma * sqrt(pi / 2) * erf(w / (sqrt(2) * sigma)) is the integral
    of g from 0 to w.

    Parameters
    ----------
    sigma : sequence of float
        One-element parameter vector (the form scipy.optimize.minimize passes).
    distances : array-like of float
        Observed perpendicular distances.
    w : float
        Truncation distance, i.e. the upper limit of the normalising integral.

    Returns
    -------
    float
        The negative log-likelihood, or np.inf when sigma is not positive.
    """
    sigma = sigma[0]
    if sigma <= 0:
        return np.inf
    distances = np.asarray(distances, dtype=float)
    # Normalising constant: integral of g(x) over [0, w].
    mu = sigma * sqrt(pi / 2) * erf(w / (sqrt(2) * sigma))
    # Evaluate in log space: log f(x) = -x^2 / (2 sigma^2) - log(mu).
    # Computing exp(...) first underflows to 0 for small sigma, which turns the
    # log into -inf (with a RuntimeWarning) and the objective into +inf.
    return np.sum(distances ** 2) / (2 * sigma ** 2) + distances.size * np.log(mu)

# Function to estimate ESW for a set of distances

def estimate_esw(distances, w=TRUNCATION_DISTANCE):
    """Estimate the effective strip half-width (ESW) from perpendicular distances.

    Fits the half-normal scale parameter sigma by minimising `_neg_loglik` and
    returns the integral of the fitted detection function from 0 to `w`.

    Parameters
    ----------
    distances : array-like of float
        Perpendicular distances of the detections. The sign is ignored; values
        that are non-finite or larger than `w` are discarded before fitting,
        because the likelihood is normalised over [0, w] only.
    w : float, optional
        Truncation distance. Defaults to TRUNCATION_DISTANCE.

    Returns
    -------
    float
        The estimated ESW, in the same units as `distances`.

    Raises
    ------
    ValueError
        If no usable distance remains after truncation.
    """
    import warnings  # local import keeps this cell self-contained

    distances = np.abs(np.asarray(distances, dtype=float))
    distances = distances[np.isfinite(distances) & (distances <= w)]
    if distances.size == 0:
        # Without data the optimiser would just hand back the start value.
        raise ValueError(
            "estimate_esw needs at least one finite distance within the truncation distance"
        )

    # sigma is searched in (0, 2w]; w / 2 is only a starting guess.
    result = minimize(_neg_loglik, x0=[w / 2], args=(distances, w), bounds=[(1e-3, w * 2)])
    if not result.success:
        warnings.warn(
            f"ESW optimisation did not converge: {result.message}",
            RuntimeWarning,
            stacklevel=2,
        )
    sigma = result.x[0]
    esw = sigma * sqrt(pi / 2) * erf(w / (sqrt(2) * sigma))
    return esw

# Apply to each habitat
# One ESW per habitat, fitted on that habitat's perpendicular distances.
habitat_esw = pd.Series(
    {
        habitat_name: estimate_esw(habitat_rows["PerpendicularDistance_m"].values)
        for habitat_name, habitat_rows in df_occurrences.groupby("Habitat")
    },
    name="ESW_m",
)
print(habitat_esw)

In [None]:
# Step 3: Compute detection- and effort-corrected densities
# For each transect the searched area is 2 * ESW * transect length
# (ESW is a half-width, so the factor 2 covers both sides of the line).
# Density for a taxon on a transect is then Count / searched area.
# NOTE(review): Count is the number of occurrence rows per transect and taxon;
# confirm that this is the intended MNI.

TRANSECT_LENGTH_M = 1000  # 1 km transects

# Attach the habitat-level ESW to every transect. Columns left over from an
# earlier run of this cell are dropped first, otherwise a re-run would create
# ESW_m_x / ESW_m_y and the area computation below would raise a KeyError.
# The merge is an inner join: transects whose habitat has no ESW are dropped.
df_transects = df_transects.drop(
    columns=["ESW_m", "SearchedArea_m2"], errors="ignore"
).merge(habitat_esw, left_on="Habitat", right_index=True)
df_transects["SearchedArea_m2"] = 2 * df_transects["ESW_m"] * TRANSECT_LENGTH_M

# Count occurrences per transect and taxon
observed_counts = df_occurrences.groupby(["TransectID", "Taxon"]).size()

# Expand to the full transect x taxon grid so that a taxon not found on a
# transect gets an explicit Count of 0. Without these zero rows the habitat
# means computed later would only average transects where the taxon was found.
transect_taxon_grid = pd.MultiIndex.from_product(
    [df_transects["TransectID"].unique(), df_occurrences["Taxon"].dropna().unique()],
    names=["TransectID", "Taxon"],
)
transect_taxon_counts = observed_counts.reindex(
    transect_taxon_grid, fill_value=0
).reset_index(name="Count")

# Merge with searched area and compute density
transect_taxon_counts = transect_taxon_counts.merge(
    df_transects[["TransectID", "Habitat", "SearchedArea_m2"]],
    on="TransectID",
)
transect_taxon_counts["Density_per_m2"] = (
    transect_taxon_counts["Count"] / transect_taxon_counts["SearchedArea_m2"]
)

transect_taxon_counts.head()

In [None]:
# Step 4: Create habitat × taxon density matrix
# Average the per-transect densities within each habitat, one value per taxon.

mean_density = transect_taxon_counts.groupby(["Habitat", "Taxon"])["Density_per_m2"].mean()
df_density_matrix = mean_density.reset_index()

# Reshape to matrix form: rows = habitats, columns = taxa, values = density.
# Habitat/taxon combinations without a value are filled with 0.

df_matrix = (
    df_density_matrix.set_index(["Habitat", "Taxon"])["Density_per_m2"]
    .unstack("Taxon")
    .fillna(0)
)

df_matrix.head()

In [None]:
# Step 5: Indicator Species Analysis on detection-corrected densities
# We reuse the IndVal approach, but now apply it to density values.

# Convert to presence/absence if required by the analysis
# Here we keep densities, but you could binarize by replacing >0 with 1.

def compute_indval(df, groups):
    """Compute the indicator value (IndVal) of every column of `df`.

    For each species (column) and each group:

    * specificity A = mean abundance in the group / sum of the group means,
    * fidelity    B = share of the group's rows where the species is > 0,
    * IndVal        = A * B, on a 0-1 scale.

    The group with the highest IndVal is reported for each species.

    Parameters
    ----------
    df : pandas.DataFrame
        Rows are sites, columns are species, values are abundances or densities.
    groups : sequence
        One group label per row of `df`, in row order.

    Returns
    -------
    pandas.DataFrame
        One row per species with columns "Species", "Habitat" (the best group)
        and "IndVal". Ties go to the first group in sorted order.

    Raises
    ------
    ValueError
        If `groups` does not have one label per row of `df`.
    """
    labels = np.asarray(groups)
    if labels.shape[0] != len(df):
        raise ValueError("groups must contain exactly one label per row of df")

    group_names = sorted(set(groups))
    # The row masks do not depend on the species, so build them once.
    masks = [labels == g for g in group_names]

    results = []
    for species in df.columns:
        species_data = df[species]
        # A group whose values are all missing counts as mean abundance 0.
        group_means = np.nan_to_num(
            np.array([species_data[mask].mean() for mask in masks], dtype=float)
        )
        fidelity = np.array([(species_data[mask] > 0).mean() for mask in masks], dtype=float)

        # Specificity is the group's share of the summed group means. Using the
        # raw mean instead would make the value depend on the species' overall
        # abundance and not comparable between species.
        total = group_means.sum()
        if total > 0:
            specificity = group_means / total
        else:
            specificity = np.zeros_like(group_means)  # species absent everywhere

        indvals = specificity * fidelity
        best = int(np.argmax(indvals))  # first maximum, like list.index(max(...))
        results.append(
            {"Species": species, "Habitat": group_names[best], "IndVal": float(indvals[best])}
        )
    return pd.DataFrame(results)

# Group labels: one habitat label per transect (row) of the matrix built below.
# The fidelity term of IndVal is the share of sites in a group where the taxon
# is present, so it needs several sites per habitat. The habitat-level
# df_matrix has a single row per habitat, which would reduce fidelity to 0 or 1.
# Only transects that appear in transect_taxon_counts are included here.
df_transect_matrix = transect_taxon_counts.pivot(
    index="TransectID", columns="Taxon", values="Density_per_m2"
).fillna(0)

transect_habitat = (
    transect_taxon_counts.drop_duplicates("TransectID").set_index("TransectID")["Habitat"]
)
habitats = transect_habitat.reindex(df_transect_matrix.index).to_list()

df_indicators = compute_indval(df_transect_matrix, habitats)

# Sort and inspect the top indicator species

df_indicators.sort_values("IndVal", ascending=False).head(10)