# Species Detection Exploration
## This will cover:


- Overall top detected species  

- Top detected species by device  

- Overall top detected species by week  

- Top detected species by week for each device  

- Overall top detected species by month  

- Top detected species by month for each device


Most tables show the top 10 species (the overall table shows the top 20). To change this, edit the `TOP_N` variable at the top of the relevant cell.

## Setup System Path And Get Data

In [None]:
import sys
import os
from pathlib import Path
import pandas as pd


# The project root (.../audiomoth) is the parent of the current working
# directory, so the notebook is expected to run from a folder one level below it.
PROJECT_ROOT = Path(os.getcwd()).resolve().parent

# Add project root to sys.path so `src` is importable
sys.path.insert(0, str(PROJECT_ROOT))

# Processed analysis table that the cells below read from.
# (Previously this line also bound `out_dir` to the parquet *file* path via a
# chained assignment; `out_dir` is now only set by the cells that write output.)
PROCESSED_DATA_PATH = PROJECT_ROOT / "data_processed" / "analysis_df.parquet"
analysis_df = pd.read_parquet(PROCESSED_DATA_PATH)

# Let pandas display more columns and wider lines while exploring
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

## Top detected species


In [None]:
# Column of analysis_df used to identify the species of each row.
SPECIES_COL = "common_name"

# How many species to keep in the overall ranking.
TOP_N = 20

# Count rows per species, order from most to least detected, keep the top N,
# and turn the result into a two-column frame (species, "detections").
top_species_overall = analysis_df.groupby(SPECIES_COL).size()
top_species_overall = top_species_overall.sort_values(ascending=False).head(TOP_N)
top_species_overall = top_species_overall.rename("detections").reset_index()

# Percentage is relative to every row in analysis_df, not just the top N.
total = len(analysis_df)
top_species_overall["%"] = (
    top_species_overall["detections"].div(total).mul(100).round(2)
)
top_species_overall

In [None]:
# Write the overall ranking to <project root>/outputs, creating the folder if needed.
out_dir = Path(PROJECT_ROOT) / "outputs"
out_dir.mkdir(parents=True, exist_ok=True)

# index=False: the index is only the 0..N-1 positions left by reset_index(),
# so writing it would add a meaningless unnamed column. This also matches the
# other long-format CSV exports in this notebook.
top_species_overall.to_csv(out_dir / "overall_top_species.csv", index=False)

## Top Detected Species By Device

In [None]:
TOP_N = 10

# Count rows for every (device, species) pair, then order each device's
# species from most to least detected.
top_species_per_device = analysis_df.groupby(["device", SPECIES_COL]).size()
top_species_per_device = top_species_per_device.rename("detections").reset_index()
top_species_per_device = top_species_per_device.sort_values(
    by=["device", "detections"], ascending=[True, False]
)

# Rank species inside each device. method="first" breaks ties by row order,
# so every species in a device gets a distinct rank.
top_species_per_device = top_species_per_device.assign(
    rank_within_device=top_species_per_device.groupby("device")["detections"].rank(
        method="first", ascending=False
    )
)

# Drop everything ranked below the top N for its device.
top_species_per_device = top_species_per_device[
    top_species_per_device["rank_within_device"] <= TOP_N
]

# Wide view for reading: one row per device, one column per rank, species
# name in each cell.
top_species_per_device_wide = pd.pivot_table(
    top_species_per_device,
    values=SPECIES_COL,
    index="device",
    columns="rank_within_device",
    aggfunc="first",
)

top_species_per_device_wide

In [None]:
# Save the long-format per-device ranking under <project root>/outputs,
# creating the folder first if it does not exist yet.
out_dir = Path(PROJECT_ROOT, "outputs")
out_dir.mkdir(exist_ok=True, parents=True)

top_species_per_device.to_csv(out_dir.joinpath("device_top_species.csv"), index=False)

## Top Detected Species Per Week

### Overall

In [None]:
TOP_N = 10

# Count rows for every (week, species) pair, ordered week by week with the
# most detected species first.
species_by_week_overall = analysis_df.groupby(["week", SPECIES_COL]).size()
species_by_week_overall = species_by_week_overall.rename("detections").reset_index()
species_by_week_overall = species_by_week_overall.sort_values(
    by=["week", "detections"], ascending=[True, False]
)

# Rank species inside each week; method="first" breaks ties by row order.
species_by_week_overall = species_by_week_overall.assign(
    rank_within_week=species_by_week_overall.groupby("week")["detections"].rank(
        method="first", ascending=False
    )
)

# Keep only the top N species of each week.
species_by_week_overall = species_by_week_overall[
    species_by_week_overall["rank_within_week"] <= TOP_N
]

# Wide view for reading: one row per week, one column per rank.
species_by_week_overall_wide = pd.pivot_table(
    species_by_week_overall,
    values=SPECIES_COL,
    index="week",
    columns="rank_within_week",
    aggfunc="first",
)
species_by_week_overall_wide

In [None]:
# Save the long-format weekly ranking under <project root>/outputs,
# creating the folder first if it does not exist yet.
out_dir = Path(PROJECT_ROOT, "outputs")
out_dir.mkdir(exist_ok=True, parents=True)

species_by_week_overall.to_csv(
    out_dir.joinpath("overall_top_species_by_week.csv"), index=False
)

### Per Device

In [None]:
TOP_N = 10

device_week_species = (
    analysis_df.groupby(["device", "week", SPECIES_COL])
    .size()
    .rename("detections")
    .reset_index()
    .sort_values(["device", "week", "detections"], ascending=[True, True, False])
    .reset_index(drop=True)
)

device_week_species["rank_within_device"] = device_week_species.groupby(
    ["device", "week"]
)["detections"].rank(method="first", ascending=False)

device_week_species = device_week_species.loc[
    device_week_species["rank_within_device"] <= TOP_N
]

# Pivot to wide format for easier viewing
device_week_species_wide = device_week_species.pivot_table(
    index=["device", "week"],
    columns="rank_within_device",
    values=SPECIES_COL,
    aggfunc="first",
)
device_week_species_wide

In [None]:
# Save the long-format per-device weekly ranking under <project root>/outputs,
# creating the folder first if it does not exist yet.
out_dir = Path(PROJECT_ROOT, "outputs")
out_dir.mkdir(exist_ok=True, parents=True)

device_week_species.to_csv(
    out_dir.joinpath("device_top_species_by_week.csv"), index=False
)

#### Save a separate CSV file for each device

In [None]:
def device_week_table(
    df: pd.DataFrame,
    device: str,
    rank_col: str = "rank_within_device",
    species_col: "str | None" = None,
) -> pd.DataFrame:
    """Build a week-by-rank table of species names for a single device.

    Args:
        df: Long-format ranking with a "device" column, a "week" column, a
            species-name column and a rank column (e.g. device_week_species).
        device: Value of the "device" column whose rows are kept.
        rank_col: Name of the column holding each species' rank. Defaults to
            "rank_within_device", the rank column of device_week_species.
        species_col: Name of the species column. Falls back to the
            notebook-level SPECIES_COL when omitted.

    Returns:
        A DataFrame indexed by week (ascending) with one column per rank,
        labelled "1", "2", ..., holding the species name at that rank.
        Cells are missing where a week has fewer ranked species.
    """
    if species_col is None:
        species_col = SPECIES_COL

    subset = df.loc[df["device"] == device]

    # A scalar `values` keeps the columns as a flat index of rank values.
    wide = subset.pivot_table(
        index="week", columns=rank_col, values=species_col, aggfunc="first"
    )

    # Ranks may arrive as floats (1.0, 2.0, ...); label the columns "1", "2", ...
    wide.columns = [str(int(rank)) for rank in wide.columns]

    return wide.sort_index()

In [None]:
# One weekly wide table per device, keyed by device name, in the order the
# devices first appear in device_week_species.
device_tables = {
    name: device_week_table(device_week_species, name)
    for name in pd.unique(device_week_species["device"])
}

# Preview a single device's table.
device_tables["CWT2"]

### Save to csv file

In [None]:
# Per-device tables go into their own sub-folder of <project root>/outputs.
out_dir = Path(PROJECT_ROOT, "outputs", "device_week_tables")
out_dir.mkdir(exist_ok=True, parents=True)

# One CSV per device; the index (week) is written as the first column.
for device, table in device_tables.items():
    table.to_csv(out_dir.joinpath(f"{device}_top_species_by_week.csv"))

## Top Detected Species Per Month

### Overall

In [None]:
TOP_N = 10

# Count rows for every (month, species) pair, ordered month by month with the
# most detected species first.
species_by_month_overall = analysis_df.groupby(["month", SPECIES_COL]).size()
species_by_month_overall = species_by_month_overall.rename("detections").reset_index()
species_by_month_overall = species_by_month_overall.sort_values(
    by=["month", "detections"], ascending=[True, False]
)

# Rank species inside each month; method="first" breaks ties by row order.
species_by_month_overall = species_by_month_overall.assign(
    rank_within_month=species_by_month_overall.groupby("month")["detections"].rank(
        method="first", ascending=False
    )
)

# Keep only the top N species of each month.
species_by_month_overall = species_by_month_overall[
    species_by_month_overall["rank_within_month"] <= TOP_N
]

#### Table

In [None]:
# Wide view for reading: one row per month, one column per rank, species name
# in each cell.
species_by_month_overall_wide = pd.pivot_table(
    species_by_month_overall,
    values=SPECIES_COL,
    index="month",
    columns="rank_within_month",
    aggfunc="first",
)
species_by_month_overall_wide

#### Save to csv file

In [None]:
# Save the long-format monthly ranking under <project root>/outputs,
# creating the folder first if it does not exist yet.
out_dir = Path(PROJECT_ROOT, "outputs")
out_dir.mkdir(exist_ok=True, parents=True)

species_by_month_overall.to_csv(
    out_dir.joinpath("overall_top_species_by_month.csv"), index=False
)

### Per Device

In [None]:
TOP_N = 10

# Count rows for every (device, month, species) triple, ordered by device and
# month with the most detected species first, then renumber the rows.
device_month_species = analysis_df.groupby(["device", "month", SPECIES_COL]).size()
device_month_species = device_month_species.rename("detections").reset_index()
device_month_species = device_month_species.sort_values(
    by=["device", "month", "detections"], ascending=[True, True, False]
)
device_month_species = device_month_species.reset_index(drop=True)

# Rank species inside each (device, month) group; method="first" breaks ties
# by row order. The column keeps the name "rank_within_device".
device_month_species = device_month_species.assign(
    rank_within_device=device_month_species.groupby(["device", "month"])[
        "detections"
    ].rank(method="first", ascending=False)
)

# Keep only the top N species of each (device, month) group.
device_month_species = device_month_species[
    device_month_species["rank_within_device"] <= TOP_N
]

# Wide view for reading: one row per (device, month), one column per rank.
device_month_species_wide = pd.pivot_table(
    device_month_species,
    values=SPECIES_COL,
    index=["device", "month"],
    columns="rank_within_device",
    aggfunc="first",
)
device_month_species_wide.head(10)

In [None]:
# Save the long-format per-device monthly ranking under <project root>/outputs,
# creating the folder first if it does not exist yet.
out_dir = Path(PROJECT_ROOT, "outputs")
out_dir.mkdir(exist_ok=True, parents=True)

device_month_species.to_csv(
    out_dir.joinpath("device_top_species_by_month.csv"), index=False
)