# An exploration into the variation of confidence by site, habitat or species.

Does detection confidence vary systematically by site, habitat, or species dominance?  
If so, this should be taken into account in subsequent analysis.

# Setup System Path

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd


# The project root (.../audiomoth) is the parent of the current working directory.
PROJECT_ROOT = Path.cwd().resolve().parent

# Put the project root at the front of sys.path so the `src` package can be imported.
sys.path.insert(0, str(PROJECT_ROOT))

# Sample workbook loaded by the normalisation cell.
EXCEL_PATH = PROJECT_ROOT.joinpath("data_raw", "audiomoth_sample.xlsx")

# Widen pandas' display limits so wide frames are easier to inspect.
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

## Basic normalisation
Standardise column names and parse timestamps if present.


In [None]:
import src.audio_moth_schema as audio_moth_schema
import src.normaliser as normaliser

# Load the sheets available in the AudioMoth sample workbook.
# NOTE(review): get_excel_sheets is a project helper; presumably it returns one
# entry per worksheet — confirm in src/normaliser.
sheets = normaliser.get_excel_sheets(EXCEL_PATH)

# Flatten all the sheets into a single DataFrame
sample_df = normaliser.flatten_data(sheets)


# Combine the separate "date" and "time" columns. output_col="time" means the
# combined value presumably replaces the original "time" column; the hour-of-day
# cell reads that column through the `.dt` accessor, so it must be datetime-like.
sample_df = normaliser.combine_date_and_time(
    sample_df, date_col="date", time_col="time", output_col="time"
)


# Validate the frame against the AudioMoth schema and keep the returned frame.
# NOTE(review): whether validate() also coerces dtypes depends on how the schema
# is declared — confirm in src/audio_moth_schema.
sample_df = audio_moth_schema.AudioMothSchema.validate(sample_df)

# Show the frame's shape as the cell output (use sample_df.head() to preview rows instead).
sample_df.shape

## Site-level confidence distributions


In [None]:
# Summary statistics of detection confidence, one row per site.
confidence_by_site = sample_df["confidence"].groupby(sample_df["site"]).describe()

# Last expression in the cell, so the notebook renders the table.
confidence_by_site


Detection confidence was consistent across sites, with similar distributions and median values, indicating that observed differences in species richness and dominance are unlikely to be driven by site-level variation in model confidence.  
Note: A minimum-confidence filter has already been applied to the data, removing detections with confidence below 0.7, so every distribution shown here is truncated at that threshold.

## Visualise confidence by site


In [None]:
import matplotlib.pyplot as plt

# Create the axes up front and hand them to pandas. With `by=` and no `ax`,
# DataFrame.boxplot opens its own figure, so the figure from a bare
# plt.figure() call stayed empty and was rendered as a blank extra output.
fig, ax = plt.subplots()
sample_df.boxplot(column="confidence", by="site", ax=ax)
ax.set_title("Detection confidence by site")
# pandas adds an automatic "Boxplot grouped by ..." suptitle; blank it so only
# the axes title is shown.
fig.suptitle("")
ax.set_ylabel("Confidence")
plt.show()

## Dominant vs non-dominant species confidence

In [None]:
# Count detections for every (site, species) pair, then keep, for each site,
# the species with the most detections. idxmax returns the (site, common_name)
# index tuple of the largest count (the first one on ties); element 1 of that
# tuple is the species name.
detections_per_species = sample_df.groupby(["site", "common_name"]).size()
dominant_species = (
    detections_per_species.groupby(level="site")
    .idxmax()
    .map(lambda site_and_name: site_and_name[1])
)

# Flag each detection whose species is the dominant one at its site.
# Looking the site up with Series.map and comparing whole columns replaces a
# per-row DataFrame.apply(axis=1) lambda, which ran Python code for every row
# and raised KeyError for any row whose site had no entry in dominant_species;
# such rows are now flagged False instead.
# The object cast keeps the comparison valid even if the schema stores these
# columns with a categorical dtype.
site_dominant = sample_df["site"].map(dominant_species).astype(object)
sample_df["is_dominant"] = sample_df["common_name"] == site_dominant

# Compare the confidence distribution of dominant vs non-dominant detections.
sample_df.groupby("is_dominant")["confidence"].describe()

Dominant species were not associated with higher detection confidence; in fact, non-dominant species exhibited slightly higher average confidence. This suggests that observed dominance patterns are driven by behavioural or ecological factors rather than model bias.

## Does detection confidence vary systematically with hour of day?

In [None]:
# Hour-of-day component of each detection timestamp.
sample_df["hour"] = sample_df["time"].dt.hour  # type: ignore

# Average detection confidence within each hour.
hourly_confidence = sample_df["confidence"].groupby(sample_df["hour"]).mean()

# Draw the hourly means as a line on an explicitly created axes.
fig, ax = plt.subplots()
hourly_confidence.plot(ax=ax)
ax.set_xlabel("Hour of day")
ax.set_ylabel("Mean detection confidence")
ax.set_title("Mean detection confidence by hour of day")
plt.show()