In [None]:
#!pip install scipy seaborn scikit-learn

In [None]:
# Goal: Indicator species analysis with effort- and detection-corrected densities
# Each step below is commented to describe the workflow clearly.

In [None]:
# Step 1: Load data
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# Load the pickled tables produced by earlier notebooks.
# - df_transects: one row per transect (TransectID, Habitat, distance_km, ...)
# - df_occurrences: one row per bone occurrence (TransectID, Taxon, ...)

df_transects = pd.read_pickle("../data/pkl/df_transects.pkl")
df_occurrences = pd.read_pickle("../data/pkl/df_occurrences_with_taxon.pkl")

# Attach habitat and transect-length info to every occurrence.
# validate="many_to_one" makes pandas raise a MergeError if a TransectID appears
# more than once in df_transects; without the check, duplicated transect rows
# would silently duplicate occurrences and inflate any counts derived from them.
df_occurrences = df_occurrences.merge(
    df_transects[["TransectID", "Habitat", "distance_km"]],
    on="TransectID",
    how="left",
    validate="many_to_one",
)

In [None]:
# Step 2: Fix the strip geometry used for the density estimates
# The observers walked abreast and searched a 50 m wide strip, so the recorded
# distances cannot be used to fit a detection function. Detection is instead
# taken to be perfect inside the strip, which makes the effective strip
# width (ESW) half of the total strip width.

TRANSECT_WIDTH_M = 50  # full width of the searched strip, in meters
ESW_m = 0.5 * TRANSECT_WIDTH_M  # half of the total strip width
print(f"Assumed ESW (m): {ESW_m}")

In [None]:
# Step 3: Effort-corrected densities per transect and taxon
# Searched area of a transect = its length (m) x the fixed strip width (m).
# Density of a taxon on a transect = number of occurrences / searched area.

df_transects["TransectLength_m"] = df_transects["distance_km"].mul(1000)  # km -> m
df_transects["SearchedArea_m2"] = df_transects["TransectLength_m"].mul(TRANSECT_WIDTH_M)

# Tally occurrences for each (transect, habitat, taxon) combination, look up the
# searched area of the transect, and divide the count by that area.
# The table is built from occurrence rows, so (for non-categorical keys) a taxon
# that was not found on a transect simply has no row for that transect.
transect_taxon_counts = (
    df_occurrences
    .groupby(["TransectID", "Habitat", "Taxon"])
    .size()
    .rename("Count")
    .reset_index()
    .merge(
        df_transects[["TransectID", "SearchedArea_m2"]],
        how="left",
        on="TransectID",
    )
    .assign(Density_per_m2=lambda counts: counts["Count"] / counts["SearchedArea_m2"])
)

transect_taxon_counts.head()

In [None]:
# Step 4: Create habitat × taxon density matrix
# Average the per-transect, effort-corrected densities within each habitat
# (perfect detection inside the strip is assumed, so no further correction).
#
# transect_taxon_counts only has rows for taxa that were actually found on a
# transect, so a plain groupby-mean would ignore every transect on which the
# taxon was absent and overstate its mean density. Instead, sum the observed
# densities and divide by the number of transects surveyed in the habitat,
# which treats "not found on this transect" as a density of 0.
# NOTE(review): assumes every transect listed in df_transects was searched and
# has a positive length; drop transects without usable effort beforehand.

transects_per_habitat = (
    df_transects.groupby("Habitat")["TransectID"]
    .nunique()
    .rename("n_transects")
    .reset_index()
)

df_density_matrix = (
    transect_taxon_counts.groupby(["Habitat", "Taxon"])
    .agg(Density_per_m2=("Density_per_m2", "sum"))
    .reset_index()
    .merge(transects_per_habitat, on="Habitat", how="left")
)
df_density_matrix["Density_per_m2"] = (
    df_density_matrix["Density_per_m2"] / df_density_matrix["n_transects"]
)
# Keep the same three columns the downstream pivot expects.
df_density_matrix = df_density_matrix[["Habitat", "Taxon", "Density_per_m2"]]

# Pivot to matrix form: rows = habitats, columns = taxa, values = mean density.
# A taxon never recorded in a habitat has no row above and is filled with 0.

df_matrix = df_density_matrix.pivot(
    index="Habitat", columns="Taxon", values="Density_per_m2"
).fillna(0)

df_matrix.head()

In [None]:
# Step 5: Indicator Species Analysis on detection-corrected densities
# We reuse the IndVal approach, but now apply it to density values.

# Convert to presence/absence if required by the analysis
# Here we keep densities, but you could binarize by replacing >0 with 1.

def compute_indval(df, groups):
    """Compute the Dufrêne–Legendre indicator value (IndVal) of every species.

    For each species and each group the indicator value is ``A * B`` where

    * ``A`` (specificity) is the mean value of the species in the group divided
      by the sum of its group means over all groups, and
    * ``B`` (fidelity) is the fraction of the group's rows in which the species
      is present (value > 0).

    Because ``A`` is a relative quantity, the result does not depend on the
    unit of the input values and lies in [0, 1] for non-negative input.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per site and one column per species; values are abundances
        or densities.
    groups : sequence
        Group label of each row of ``df``, in row order.

    Returns
    -------
    pandas.DataFrame
        One row per species with columns ``Species``, ``Habitat`` (the group
        with the highest indicator value; the first one in sorted order when
        several groups tie) and ``IndVal`` (that highest value).

    Raises
    ------
    ValueError
        If ``groups`` does not provide exactly one label per row of ``df``.
    """
    if len(groups) != len(df):
        raise ValueError(
            f"groups has {len(groups)} labels but df has {len(df)} rows"
        )

    labels = np.array(groups)
    group_names = sorted(set(groups))
    # The row masks depend only on the labels, so build them once instead of
    # once per species.
    masks = [labels == g for g in group_names]

    results = []
    for species in df.columns:
        species_data = df[species]
        group_means = [species_data[mask].mean() for mask in masks]
        fidelity = [(species_data[mask] > 0).mean() for mask in masks]

        total = sum(group_means)
        if total > 0:
            # Specificity is the group's share of the summed group means.
            indvals = [
                (mean / total) * fid for mean, fid in zip(group_means, fidelity)
            ]
        else:
            # Species absent everywhere: no specificity, avoid dividing by 0.
            indvals = [0.0] * len(group_names)

        max_indval = max(indvals)
        best_group = group_names[indvals.index(max_indval)]
        results.append(
            {"Species": species, "Habitat": best_group, "IndVal": max_indval}
        )
    return pd.DataFrame(results, columns=["Species", "Habitat", "IndVal"])

# One group label per row of df_matrix: the habitat name held in its index.
# df_matrix has a single row per habitat, so each group consists of exactly one
# row and the presence share within a group can only be 0 or 1.
habitats = df_matrix.index.tolist()

df_indicators = compute_indval(df=df_matrix, groups=habitats)

# Rank species by indicator value and show the ten strongest

df_indicators.sort_values(by="IndVal", ascending=False).iloc[:10]