# Distance spotted: early seasons vs. latest season

For each habitat, this notebook compares the mean `Pre: Distance spotted` of the pooled early seasons (2003, 2007, 2008, 2011, 2018, 2019) with that of the latest season (2024), using Welch's t-test (unequal variances).


In [None]:

import pandas as pd
from pathlib import Path
from scipy.stats import ttest_ind, mannwhitneyu

# Seasons to compare: the pooled "early" seasons versus the single latest season.
early_years = [2003, 2007, 2008, 2011, 2018, 2019]
latest_year = 2024

# Location of the pickled inputs, relative to this notebook.
DATA_DIR = Path('../data/pkl')

# Load data.
# NOTE(review): unpickling can execute arbitrary code, so only point these
# reads at trusted, project-generated files.
df_transects = pd.read_pickle(DATA_DIR / 'df_transects.pkl')
df_occurrences = pd.read_pickle(DATA_DIR / 'df_occurrences.pkl')

# Attach the transect attributes (habitat, reserve flag, start time) to every
# occurrence. validate='many_to_one' makes pandas raise MergeError if a
# transect UID appears more than once, instead of silently duplicating
# occurrence rows and inflating the sample sizes of the per-habitat tests.
df = df_occurrences.merge(
    df_transects[['UID', 'Pre: Transect physical habitat', 'Pre: On old reserve?', 'start_time']],
    how='left',
    left_on='TransectUID',
    right_on='UID',
    validate='many_to_one',
)

# Entries that cannot be parsed as numbers become NaN (errors='coerce').
df['Pre: Distance spotted'] = pd.to_numeric(df['Pre: Distance spotted'], errors='coerce')
# Season year is taken from the transect start time; occurrences without a
# matching transect (left join) end up with a missing Year.
df['Year'] = pd.to_datetime(df['start_time']).dt.year

# No rows are filtered at this stage: df_cleaned is the same object as df.
df_cleaned = df


In [None]:

# Per-habitat comparison of `Pre: Distance spotted` between the pooled early
# seasons and the latest season, using Welch's t-test (equal_var=False).

# Welch's t-test needs a variance estimate from each group, which requires at
# least two observations; with a single observation the statistic and p-value
# are NaN and `p < 0.05` would misleadingly read as "not significant".
MIN_GROUP_SIZE = 2

results = []

for habitat in df_cleaned['Pre: Transect physical habitat'].dropna().unique():
    df_habitat = df_cleaned[df_cleaned['Pre: Transect physical habitat'] == habitat]

    # Distances for each period; missing / non-numeric distances are dropped.
    group_early = df_habitat[df_habitat['Year'].isin(early_years)]['Pre: Distance spotted'].dropna()
    group_latest = df_habitat[df_habitat['Year'] == latest_year]['Pre: Distance spotted'].dropna()

    if len(group_early) < MIN_GROUP_SIZE or len(group_latest) < MIN_GROUP_SIZE:
        print(
            f"⚠️ Skipping {habitat}: fewer than {MIN_GROUP_SIZE} observations in one of the groups "
            f"(early n={len(group_early)}, {latest_year} n={len(group_latest)})"
        )
        continue

    # The latest season is passed first, so a positive t-statistic means the
    # latest-season mean is larger than the early-season mean.
    stat, p = ttest_ind(group_latest, group_early, equal_var=False)

    results.append({
        "Habitat": habitat,
        "N (early)": len(group_early),
        f"N ({latest_year})": len(group_latest),
        "Mean (early)": group_early.mean(),
        f"Mean ({latest_year})": group_latest.mean(),
        "T-stat": stat,
        "p-value": p,
        # Evaluated on the unrounded p-value; no multiple-comparison
        # correction is applied across habitats.
        "Significant (p<0.05)": p < 0.05
    })

# Display results (numeric columns rounded to 3 decimals for readability)
comparison_by_habitat = pd.DataFrame(results).round(3)
comparison_by_habitat


In [None]:

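# Optional: if the t-test assumptions are not met (e.g. small, strongly skewed samples),
# use the Mann–Whitney U test instead. It compares the two distributions by rank rather
# than their means. Replace the `ttest_ind` line in the loop above with: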
# stat, p = mannwhitneyu(group_latest, group_early, alternative='two-sided')
