# Pairwise Field Season Comparison
This notebook compares the mean of `Pre: Distance spotted` between consecutive field seasons using Welch's t-test (which does not assume equal variances).

In [None]:
from dataclasses import dataclass
from typing import List

import pandas as pd
from scipy.stats import ttest_ind

@dataclass
class SeasonComparison:
    """Outcome of one Welch's t-test between a pair of field seasons.

    Instances are built positionally, so the field order below is part of
    the contract.
    """

    # Labels of the two seasons being compared (A is the earlier one).
    season_a: int
    season_b: int

    # Mean of the compared value within each season.
    mean_a: float
    mean_b: float

    # Test statistic and its p-value.
    t_stat: float
    p_value: float

    # True when the p-value is below the chosen significance level.
    significant: bool

def compare_consecutive_seasons(
    df: pd.DataFrame,
    *,
    season_col: str = "Year",
    value_col: str = "Pre: Distance spotted",
    alpha: float = 0.05,
) -> pd.DataFrame:
    """Compare each field season with the next one using Welch's t-test.

    Seasons are the distinct non-null values of ``season_col``, taken in
    sorted order. For every adjacent pair (A, B) the non-null values of
    ``value_col`` are compared with ``ttest_ind(..., equal_var=False)``.

    Args:
        df: Observations, one row per record.
        season_col: Column holding the season label of each row.
        value_col: Column holding the numeric value to compare.
        alpha: Significance level; a pair is flagged when ``p < alpha``.

    Returns:
        A DataFrame with one row per compared pair and the columns
        "Field Season A", "Field Season B", "Mean A", "Mean B", "T-stat",
        "p-value" and "Significant (p<{alpha})". Values are rounded to
        4 decimals, so very small p-values show up as 0.0. The columns are
        present even when no pair could be compared.

        The t-statistic is computed as B versus A: it is positive when the
        mean of season B is larger than the mean of season A.

    Pairs in which either season has no non-null values are skipped and a
    message is printed.
    """
    # The label reflects the alpha actually used; with the default alpha it
    # is exactly "Significant (p<0.05)".
    significance_label = f"Significant (p<{alpha})"
    columns = [
        "Field Season A",
        "Field Season B",
        "Mean A",
        "Mean B",
        "T-stat",
        "p-value",
        significance_label,
    ]

    field_seasons = sorted(df[season_col].dropna().unique())
    # Extract each season's values once; every season (except the first and
    # last) takes part in two comparisons.
    samples = [
        (season, df.loc[df[season_col] == season, value_col].dropna())
        for season in field_seasons
    ]

    results: List[SeasonComparison] = []
    for (season_a, group_a), (season_b, group_b) in zip(samples, samples[1:]):
        # NOTE(review): a season with a single observation passes this check;
        # Welch's test presumably yields NaN in that case -- confirm whether
        # such pairs should be skipped as well.
        if group_a.empty or group_b.empty:
            print(f"Skipping comparison {season_a} vs {season_b} due to missing data")
            continue
        # B first, so the sign of the statistic follows the change from A to B.
        stat, p = ttest_ind(group_b, group_a, equal_var=False)
        results.append(
            SeasonComparison(
                season_a,
                season_b,
                float(group_a.mean()),
                float(group_b.mean()),
                float(stat),
                float(p),
                bool(p < alpha),
            )
        )

    rows = [
        {
            "Field Season A": r.season_a,
            "Field Season B": r.season_b,
            "Mean A": r.mean_a,
            "Mean B": r.mean_b,
            "T-stat": r.t_stat,
            "p-value": r.p_value,
            significance_label: r.significant,
        }
        for r in results
    ]
    # Passing `columns` keeps the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=columns).round(4)


In [None]:
import pandas as pd
from pathlib import Path
from scipy.stats import ttest_ind, mannwhitneyu

# Field seasons of interest: the earlier survey years and the most recent one.
# (Neither name is referenced again within this cell.)
early_years = [2003, 2007, 2008, 2011, 2018, 2019]
latest_year = 2024

# Read the pickled transect and occurrence tables.
# NOTE: unpickling can execute arbitrary code -- only load trusted files.
df_transects = pd.read_pickle("../data/pkl/df_transects.pkl")
df_occurrences = pd.read_pickle("../data/pkl/df_occurrences.pkl")

# Attach transect-level attributes to every occurrence. A left join keeps
# occurrences whose transect is not found; their transect columns are null.
df = pd.merge(
    df_occurrences,
    df_transects[
        [
            "UID",
            "Pre: Transect physical habitat",
            "Pre: On old reserve?",
            "start_time",
        ]
    ],
    left_on="TransectUID",
    right_on="UID",
    how="left",
)

# Coerce the distance to numeric (unparseable entries become NaN) and derive
# the season year from the transect start time.
df = df.assign(
    **{
        "Pre: Distance spotted": pd.to_numeric(
            df["Pre: Distance spotted"], errors="coerce"
        )
    },
    Year=pd.to_datetime(df["start_time"]).dt.year,
)

# No further cleaning is applied here; `df_cleaned` is an alias of `df`.
df_cleaned = df

comparison_df = compare_consecutive_seasons(df_cleaned)
comparison_df


Unnamed: 0,Field Season A,Field Season B,Mean A,Mean B,T-stat,p-value,Significant (p<0.05)
0,2003,2007,3.3069,8.1525,3.2803,0.0016,True
1,2007,2008,8.1525,9.3173,0.6153,0.5394,False
2,2008,2011,9.3173,3.0897,-4.7234,0.0,True
3,2011,2018,3.0897,2.3884,-1.873,0.0619,False
4,2018,2019,2.3884,1.6491,-2.6972,0.0073,True
5,2019,2024,1.6491,0.8955,-2.4959,0.0142,True
