# An exploration into the variation of confidence by site, habitat or species.

Does detection confidence vary systematically by site, habitat, or species dominance?  
If so, this should be taken into account in subsequent analyses.

# Setup System Path

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd


# The project root (.../audiomoth) is the parent of the current working directory
PROJECT_ROOT = Path.cwd().resolve().parent

# Put the project root first on sys.path so the `src` package can be imported
sys.path.insert(0, str(PROJECT_ROOT))

EXCEL_PATH = PROJECT_ROOT.joinpath("data_raw", "helman_tor_audiomoth_data.xlsx")

# Widen pandas' display limits so more columns/rows are visible while exploring
for option_name, option_value in (
    ("display.max_columns", 50),
    ("display.width", 120),
):
    pd.set_option(option_name, option_value)

## Import Excel Data

In [None]:
import src.normaliser as normaliser

# Get all the excel sheets available in the audiomoth data.
# Later cells index the result by sheet name (e.g. sheets["Overview"],
# sheets["All Data"]), so it is used as a name -> sheet mapping.
sheets = normaliser.get_excel_sheets(EXCEL_PATH)

## Basic normalisation
Standardise column names and parse timestamps if present.


In [None]:
import src.audio_moth_schema as audio_moth_schema

# Each sheet that stores date and time separately gets a single timestamp
# column before the sheets are merged:
#   sheet name -> (date column, time column, combined output column)
timestamp_specs = {
    "Overview": ("deployment_date", "deployment_time", "deployment_timestamp"),
    "All Data": ("date", "time", "detection_timestamp"),
}

for sheet_name, (date_col, time_col, output_col) in timestamp_specs.items():
    sheets[sheet_name] = normaliser.combine_date_and_time(
        sheets[sheet_name],
        date_col=date_col,
        time_col=time_col,
        output_col=output_col,
    )

# Merge all the sheets into one flat DataFrame
df = normaliser.flatten_data(sheets)

## Validate Data

In [None]:
# Run the flattened frame through the AudioMoth schema (validation plus type
# conversion, per the schema module) and keep the validated result as `df`
schema = audio_moth_schema.AudioMothSchema
df = schema.validate(df)

# Sanity check: dimensions first, then a preview of the leading rows
print(df.shape)
df.head()

## Site-level confidence distributions


In [None]:
# Summary statistics of detection confidence, one row per site
confidence_by_site = (
    df.groupby("site")["confidence"]
    .describe()
)

confidence_by_site


Detection confidence was consistent across sites, with similar distributions and median values, indicating that observed differences in species richness and dominance are unlikely to be driven by site-level variation in model confidence.  
Note: A minimum-confidence filter has already been applied to the data, removing detections with confidence below 0.7.

## Visualise confidence by site


In [None]:
import matplotlib.pyplot as plt

# Create the figure and axes explicitly and hand the axes to pandas.
# With `by=...`, DataFrame.boxplot opens its own figure when no `ax` is given,
# so a preceding bare plt.figure() is left empty and renders as a blank output.
fig, ax = plt.subplots()
df.boxplot(column="confidence", by="site", ax=ax)

# pandas titles the axes with the column name and adds a
# "Boxplot grouped by site" super-title; replace the former, blank the latter.
ax.set_title("Detection confidence by site")
fig.suptitle("")
ax.set_ylabel("Confidence")
ax.tick_params(axis="x", labelrotation=45)

# Caption placed below the axes (negative y is in figure coordinates)
fig.text(
    0.5,
    -0.15,
    "Figure 1. Boxplot showing the distribution of detection confidence scores for each recording site.",
    ha="center",
    fontsize=9,
)
plt.show()

## Dominant vs non-dominant species confidence

In [None]:
# Number of detections for every (site, species) pair
detections_per_site_species = df.groupby(["site", "common_name"]).size()

# For each site, idxmax returns the (site, common_name) index tuple with the
# highest count; keep only the species name -> Series indexed by site.
dominant_species = (
    detections_per_site_species
    .groupby("site")
    .idxmax()
    .apply(lambda site_species: site_species[1])
)

# Flag detections of their own site's dominant species.
# Mapping site -> dominant species and comparing column-wise replaces a
# Python-level row loop (df.apply(axis=1)) and does not raise KeyError for rows
# whose site is missing from the lookup (those rows are simply flagged False).
# Both sides are cast to object so the comparison is a plain element-wise
# equality whatever dtype the schema assigned to the columns.
dominant_at_row_site = df["site"].map(dominant_species).astype(object)
df["is_dominant"] = df["common_name"].astype(object).eq(dominant_at_row_site)

df.groupby("is_dominant")["confidence"].describe()

Dominant species were not associated with higher detection confidence; in fact, non-dominant species exhibited slightly higher average confidence. This suggests that observed dominance patterns are driven by behavioural or ecological factors rather than model bias.

## Does detection confidence vary systematically with hour of day?

In [None]:
# Hour-of-day component of each detection timestamp
df["hour"] = df["detection_timestamp"].dt.hour  # type: ignore

# Average confidence for each hour
hourly_confidence = df.groupby("hour")["confidence"].mean()

fig, ax = plt.subplots()
hourly_confidence.plot(ax=ax)
ax.set_xlabel("Hour of day")
ax.set_ylabel("Mean detection confidence")
ax.set_title("Mean detection confidence by hour of day")

# Caption placed below the axes (negative y is in figure coordinates)
fig.text(
    0.5,
    -0.15,
    "Figure 2. Mean detection confidence for all sites combined, by hour of day.",
    ha="center",
    fontsize=9,
)
plt.show()

Detection confidence shows a weak but consistent diel pattern, with higher confidence during nocturnal, dawn and dusk periods and a modest reduction during daytime hours. The magnitude of this effect is small, indicating that time of day should be accounted for in downstream modelling but is unlikely to dominate site-level patterns.

Since the pattern is consistent, we can divide the data into diel period bins (night, dawn, day and dusk).

In [None]:
def assign_diel_period(hour: int) -> str:
    """Map an hour of the day to a coarse diel period label.

    Bins (24-hour clock, inclusive):
        night: 22-23 and 0-3
        dawn:  4-6
        day:   7-17
        dusk:  18-21

    Args:
        hour: Hour of day, expected in the range 0-23.

    Returns:
        One of "night", "dawn", "day" or "dusk".

    Raises:
        ValueError: If ``hour`` is outside 0-23. This includes NaN, which
            arises when the source timestamp is missing.
    """
    # NaN fails every comparison, so without this guard a missing hour would
    # fall through to the last branch and be silently labelled "dusk";
    # likewise values such as -1 or 24 would be labelled "night".
    if not 0 <= hour <= 23:
        raise ValueError(f"hour must be in the range 0-23, got {hour!r}")

    if hour >= 22 or hour <= 3:
        return "night"
    if hour <= 6:  # 4-6
        return "dawn"
    if hour <= 17:  # 7-17
        return "day"
    return "dusk"  # 18-21


# Label every detection with its diel period ...
diel_labels = df["hour"].apply(assign_diel_period)
df["diel_period"] = diel_labels

# ... then summarise confidence within each period
df.groupby("diel_period")["confidence"].describe()

In [None]:
# Mean confidence per diel period, arranged in chronological order
period_order = ["night", "dawn", "day", "dusk"]
mean_confidence_by_period = (
    df.groupby("diel_period")["confidence"].mean().reindex(period_order)
)

fig, ax = plt.subplots()
mean_confidence_by_period.plot(kind="bar", ax=ax)

ax.set_ylabel("Mean detection confidence")
ax.set_xlabel("Diel period")
ax.set_title("Mean detection confidence by diel period")
# Truncated y-axis (stated in the caption) so small differences are visible
ax.set_ylim(0.85, 0.91)

caption = (
    "Figure 3. Mean detection confidence by diel period.\n"
    "Detection confidence is slightly higher during dawn and dusk compared to daytime hours.\n"
    "The y-axis is truncated to highlight small differences between periods."
)
# Caption placed below the axes (negative y is in figure coordinates)
fig.text(0.5, -0.15, caption, ha="center", fontsize=9)

fig.tight_layout()
plt.show()