In [None]:
# Cell 1: Setup and Species Selection
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
import statsmodels.api as sm
from pathlib import Path
import sys

# Load the raw observation workbook and clean it with the project's data_io helpers.
print("Loading cleaned data...")

# Make the project's src/ directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
src_dir = '/Applications/Documents/app-stats/birds-biodiversity/src'
if src_dir not in sys.path:
    sys.path.append(src_dir)
from data_io import load_excel_data, clean_observations

data_path = Path('/Applications/Documents/app-stats/birds-biodiversity/data/raw/Observations 2012-2025.xlsx')
data_dict = load_excel_data(data_path)
# Only the 'observations' entry of the loaded mapping is cleaned and analysed here.
df_clean = clean_observations(data_dict['observations'])

# Identify interesting species
def identify_interesting_species(df_clean, top_n=10):
    """Pick a short list of species worth a detailed look.

    Three rankings are computed from ``df_clean`` and printed:

    * total ``individual_count`` per species (abundance),
    * standard deviation of the yearly totals per species (variability),
    * number of years with a positive yearly total per species (consistency).

    Args:
        df_clean: DataFrame with ``species_name``, ``year`` and
            ``individual_count`` columns.
        top_n: Number of rows shown in each printed ranking table.

    Returns:
        A list of at most four unique species names, in this order of
        preference: most abundant, most variable, sixth most abundant,
        most consistent. Picks whose rank position does not exist (e.g. the
        sixth most abundant species when fewer than six are present) are
        skipped instead of raising ``IndexError``.
    """
    # Total individuals per species, largest first.
    species_abundance = (
        df_clean.groupby('species_name')['individual_count']
        .sum()
        .sort_values(ascending=False)
    )

    # Year x species table of yearly totals; built once and shared by the
    # variability and consistency rankings. Missing year/species
    # combinations are NaN.
    yearly_counts = (
        df_clean.groupby(['year', 'species_name'])['individual_count']
        .sum()
        .unstack()
    )

    # Year-to-year spread of the yearly totals (NaN cells are ignored by std).
    species_variation = yearly_counts.std().sort_values(ascending=False)

    # Number of years in which the species was recorded (NaN > 0 is False).
    species_consistency = (yearly_counts > 0).sum().sort_values(ascending=False)

    print(f"Top {top_n} Most Abundant Species:")
    print(species_abundance.head(top_n))

    print(f"\nTop {top_n} Most Variable Species:")
    print(species_variation.head(top_n))

    print(f"\nTop {top_n} Most Consistent Species:")
    print(species_consistency.head(top_n))

    # (ranking, position) pairs, in order of preference.
    ranked_picks = [
        (species_abundance, 0),    # Most abundant
        (species_variation, 0),    # Most variable
        (species_abundance, 5),    # Moderately abundant but interesting
        (species_consistency, 0),  # Most consistent
    ]
    # Skip positions that do not exist so small datasets do not crash.
    candidates = [
        ranking.index[position]
        for ranking, position in ranked_picks
        if len(ranking) > position
    ]

    # dict.fromkeys removes duplicates while preserving first-seen order.
    interesting_species = list(dict.fromkeys(candidates))[:4]

    print(f"\nSelected species for detailed analysis: {interesting_species}")
    return interesting_species

# Run the selection on the cleaned observations. The call prints the three
# ranking tables and the chosen names; the returned list is kept at module
# level (presumably for use by later cells — confirm against the notebook).
interesting_species = identify_interesting_species(df_clean)

Loading cleaned data...
Loading data from /Applications/Documents/app-stats/birds-biodiversity/data/raw/Observations 2012-2025.xlsx
Loaded 86 species
Loaded 651 GPS points
Converting count columns to numeric
Loaded 114497 observation records
Data Cleaning
Starting with 114497 records
⚠ Found 5 negative wind values - setting to NaN
Removed 4 records with zero/negative counts
Final dataset: 114493 records
  Years: 2014.0 - 2025.0
  Unique species: 102
  Unique transects: 72
  Unique observers: 42
Top 10 Most Abundant Species:
species_name
Quiscale merle                114356.000000
Sucrier à ventre jaune         90260.000000
Elénie siffleuse               61802.000000
Tourterelle à queue carrée     60901.000000
Sporophile rougegorge          60209.666667
Sporophile cici                43824.000000
Tyran gris                     40408.000000
Saltator gros-bec              31566.000000
Viréo à moustaches             28069.000000
Merle à lunettes               27682.000000
Name: individual_