In [None]:
import sys
from pathlib import Path

sys.path.append('../src')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from data_io import load_excel_data, clean_observations, get_annual_summary

# Plotting defaults and output location.
# matplotlib renamed its bundled seaborn style sheets to 'seaborn-v0_8-*' in 3.6;
# fall back to the pre-3.6 name so the notebook also runs on older installs.
PLOT_STYLE = 'seaborn-v0_8-darkgrid'
if PLOT_STYLE not in plt.style.available:
    PLOT_STYLE = 'seaborn-darkgrid'
plt.style.use(PLOT_STYLE)
sns.set_palette("husl")

# The plotting cells save PNGs into ../figures; create the folder up front so
# plt.savefig does not fail on a fresh checkout where it does not exist yet.
FIG_DIR = Path('../figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)

In [None]:
# Read the raw workbook once, then pull the three tables out of the returned
# container by key.
raw_workbook = '../data/raw/Observations 2012-2025.xlsx'
data = load_excel_data(raw_workbook)
df_obs, df_species, df_gps = data['observations'], data['species'], data['gps']

In [None]:
# Quick structural check of the raw observations table: dimensions, column
# labels, then the first rows (shown as the cell's output).
print(f"Observations shape: {df_obs.shape}")
print("\nColumn names:", df_obs.columns.tolist(), sep="\n")
print("\nFirst few rows:")
df_obs.head()

In [None]:
# Clean the raw observations.
# clean_observations is imported from data_io; its cleaning rules are not
# visible in this notebook. The result is bound to a new name, df_clean, which
# the cells below read from.
# NOTE(review): confirm clean_observations does not modify df_obs in place.
df_clean = clean_observations(df_obs)

In [None]:
# Build the per-year summary.
# get_annual_summary is imported from data_io; how it aggregates is not shown
# here. The bare name on the last line makes the notebook display the result.
annual_summary = get_annual_summary(df_clean)
annual_summary

In [None]:
# Basic Dataset Overview
# Headline figures for the cleaned observations, printed section by section.
# The three columns that are read repeatedly are looked up once.
date_col = df_clean['date']
year_col = df_clean['year']
count_col = df_clean['individual_count']

print("Basic Dataset Overview")

# --- when ---
print("\nTemporal Coverage:")
first_day = date_col.min().strftime('%Y-%m-%d')
last_day = date_col.max().strftime('%Y-%m-%d')
print(f"  First observation: {first_day}")
print(f"  Last observation:  {last_day}")
print(f"  Total years:       {year_col.nunique()}")
print(f"  Year range:        {int(year_col.min())} - {int(year_col.max())}")

# --- where ---
print("\nSpatial Coverage:")
n_transects = df_clean['transect_name'].nunique()
n_points = df_clean['point_number'].nunique()
print(f"  Unique transects:  {n_transects}")
print(f"  Unique points:     {n_points} (1-10 per transect)")

# --- what ---
print("\nSpecies Diversity:")
n_species = df_clean['species_name'].nunique()
print(f"  Unique species:    {n_species}")
print(f"  Total individuals: {count_col.sum():,.0f}")

# --- who / how much ---
print("\nSampling Effort:")
n_observers = df_clean['observer_name'].nunique()
print(f"  Total observations: {len(df_clean):,}")
print(f"  Unique observers:   {n_observers}")

# --- distribution of the per-row count ---
print("\nCount Statistics:")
print(f"  Mean birds/observation:   {count_col.mean():.1f}")
print(f"  Median birds/observation: {count_col.median():.0f}")
print(f"  Max birds/observation:    {count_col.max():.0f}")

In [None]:
# Missing-value analysis, restricted to the columns the analysis relies on.
# Every figure printed here, including the summary, is computed from df_clean
# rather than asserted.
print("Missing Value Analysis (essential columns only)\n")

# Columns the downstream analysis cannot do without
essential_cols = [
    'observer_name', 'transect_name', 'species_name', 'date', 'year',
    'individual_count', 'point_number', 'visit_number'
]

# Survey-condition columns: useful context, but not required
optional_cols = [
    'wind', 'rain', 'visibility', 'cloud_cover_raw', 'start_time'
]

n_rows = len(df_clean)
# Missing cells per column, and the same as a percentage of all rows
# (kept at 0.0 for an empty frame so nothing below divides by zero).
missing_count = df_clean.isnull().sum()
missing_pct = missing_count / n_rows * 100 if n_rows else missing_count * 0.0

print("1. Essential columns (must have for analysis):")
absent_essential = []
for col in essential_cols:
    if col not in df_clean.columns:
        # Report the gap instead of aborting the whole report with a KeyError
        absent_essential.append(col)
        print(f"{col:20s}: COLUMN NOT FOUND")
        continue
    missing = missing_count[col]
    pct = missing_pct[col]
    status = "OK" if missing == 0 else f"{missing:,} missing ({pct:.2f}%)"
    print(f"{col:20s}: {status}")

print("\n2. Optional columns (nice to have but not critical):")
present_optional = [col for col in optional_cols if col in df_clean.columns]
for col in present_optional:
    print(f"   {col:20s}: {missing_count[col]:,} missing ({missing_pct[col]:.2f}%)")

print("\n3. Distance Band Columns (Unnamed: 13-20):")
# str() keeps the membership test safe if any column label is not a string
distance_cols = [col for col in df_clean.columns if 'Unnamed:' in str(col)]
if distance_cols:
    # All columns share the same row count, so the mean of the per-column
    # percentages is the overall share of empty cells.
    distance_empty_pct = missing_pct[distance_cols].mean()
    print(f"{len(distance_cols)} distance band columns, {distance_empty_pct:.1f}% of cells empty")
    print("This is expected/normal and we will not use these columns in our analysis")
else:
    print("No 'Unnamed:' distance band columns found")

print("\nSummary:")
present_essential = [col for col in essential_cols if col not in absent_essential]
if absent_essential:
    print(f"- Essential columns absent from df_clean: {', '.join(absent_essential)}")
if present_essential:
    worst_col = missing_pct[present_essential].idxmax()
    print(f"- Essential columns: at most {missing_pct[worst_col]:.2f}% missing ({worst_col})")
if present_optional:
    worst_col = missing_pct[present_optional].idxmax()
    print(f"- Optional columns: at most {missing_pct[worst_col]:.2f}% missing ({worst_col})")
print("- Distance bands are ignored (sparse by design)")
if absent_essential:
    print("- Resolve the absent essential columns before running the biodiversity analysis")
else:
    print("- All essential columns are present; check the percentages above before analysis")

In [None]:
# Observations per year (simple table)
print("Observations Per Year\n")

# One row per year: rows with an observation_id, distinct species, distinct
# transects, and the summed individual counts.
annual_counts = df_clean.groupby('year').agg({
    'observation_id': 'count',
    'species_name': 'nunique',
    'transect_name': 'nunique',
    'individual_count': 'sum'
}).round(0)

annual_counts.columns = ['N_Obs', 'N_Species', 'N_Transects', 'Total_Birds']
print(annual_counts)

# Years flagged as partial; show the first and last month seen in each.
partial_years = [2014, 2025]
print(f"Note: {' and '.join(str(y) for y in partial_years)} appear to be partial years")
for partial_year in partial_years:
    year_dates = df_clean.loc[df_clean['year'] == partial_year, 'date'].dropna()
    if year_dates.empty:
        # min()/max() of an empty selection is NaT, and NaT.strftime raises,
        # so report the absence instead of crashing the cell.
        print(f"{partial_year}: no dated observations")
        continue
    print(f"{partial_year}: {year_dates.min().strftime('%B')} - {year_dates.max().strftime('%B')}")

In [None]:
# Simple temporal coverage plot: observations per year, partial years highlighted.
# matplotlib and seaborn are already imported in the notebook's first cell, so
# they are not imported again here.
fig, ax = plt.subplots(figsize=(12, 6))

years = annual_counts.index.values
obs_counts = annual_counts['N_Obs'].values

# Colour partial years differently
colors = ['orange' if year in [2014, 2025] else 'blue' for year in years]

ax.bar(years, obs_counts, color=colors, edgecolor='black', alpha=0.8)
ax.set_xlabel('Year', fontweight='bold', fontsize=12)
ax.set_ylabel('Number of Observations', fontweight='bold', fontsize=12)
ax.set_title('Observation Effort Over Time', fontweight='bold', fontsize=14)
ax.grid(axis='y', alpha=0.3)

# Mark partial years. The label is offset in points rather than by a fixed
# number of observations, so it sits just above the bar at any count scale.
for year in [2014, 2025]:
    if year in annual_counts.index:
        ax.annotate('Partial\nYear',
                    xy=(year, annual_counts.loc[year, 'N_Obs']),
                    xytext=(0, 4), textcoords='offset points',
                    ha='center', va='bottom',
                    fontsize=9, color='darkred', fontweight='bold')

plt.tight_layout()
plt.savefig('../figures/eda_temporal_coverage.png', dpi=300, bbox_inches='tight')
plt.show()

print("Saved: ../figures/eda_temporal_coverage.png")

### Species: simple summaries and plots

Below we summarise species-level totals and richness over time to understand which species dominate counts and how diversity changes annually.

In [None]:
# Top 20 species by total individuals
species_totals = (
    df_clean.groupby('species_name')['individual_count']
            .sum()
            .sort_values(ascending=False)
)

top_n = 20
# Reverse so the most abundant species ends up at the top of the horizontal bars
top_species = species_totals.head(top_n).iloc[::-1]

# Take the year span for the axis label from the data instead of hard-coding it,
# so the label cannot drift from what df_clean actually covers.
first_year = int(df_clean['year'].min())
last_year = int(df_clean['year'].max())

fig, ax = plt.subplots(figsize=(10, 8))
ax.barh(top_species.index,
        top_species.values,
        color=sns.color_palette(None, top_n))
ax.set_xlabel(f'Total individuals ({first_year}-{last_year})', fontweight='bold')
ax.set_ylabel('Species', fontweight='bold')
ax.set_title(f'Top {top_n} species by abundance', fontweight='bold')
plt.tight_layout()
plt.savefig('../figures/eda_species_top20.png', dpi=300, bbox_inches='tight')
plt.show()
print('Saved: ../figures/eda_species_top20.png')


In [None]:
# Annual species richness: number of distinct species recorded in each year
richness_by_year = df_clean.groupby('year')['species_name'].nunique().sort_index()

richness_png = '../figures/eda_species_richness_by_year.png'

fig, ax = plt.subplots(figsize=(12, 6))
ax.plot(
    richness_by_year.index.astype(int),
    richness_by_year.values,
    marker='o',
    linewidth=2,
    color='tab:green',
)
ax.set_xlabel('Year', fontweight='bold')
ax.set_ylabel('Unique species', fontweight='bold')
ax.set_title('Species richness by year', fontweight='bold')
ax.grid(axis='y', alpha=0.3)

# Overlay an orange marker and a label on the partial years, when they occur
for year in [2014, 2025]:
    if year not in richness_by_year.index:
        continue
    year_richness = richness_by_year.loc[year]
    ax.scatter([year], [year_richness], color='orange', s=60, zorder=3)
    ax.text(year, year_richness + 1, 'Partial', ha='center', fontsize=9, color='darkred')

plt.tight_layout()
plt.savefig(richness_png, dpi=300, bbox_inches='tight')
plt.show()
print(f'Saved: {richness_png}')


### Observer effort: simple summaries and plots

We assess how sampling effort is distributed across observers and over time.


In [None]:
# Rank observers by how many observation rows they contributed
obs_by_observer = (
    df_clean.groupby('observer_name')['observation_id'].count()
    .sort_values(ascending=False)
)

n = 15
# Flip the top-n slice so the busiest observer is drawn as the top bar
busiest = obs_by_observer.head(n).iloc[::-1]
observers_png = '../figures/eda_observers_top15_obs.png'

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(busiest.index, busiest.values, color=sns.color_palette('deep', n))
ax.set_xlabel('Number of observations', fontweight='bold')
ax.set_ylabel('Observer', fontweight='bold')
ax.set_title(f'Top {n} observers by observation count', fontweight='bold')
plt.tight_layout()
plt.savefig(observers_png, dpi=300, bbox_inches='tight')
plt.show()
print(f'Saved: {observers_png}')


In [None]:
# Rank observers by spatial coverage, i.e. the number of distinct transects
# each one appears on
transects_per_observer = df_clean.groupby('observer_name')['transect_name'].nunique()
transects_by_observer = transects_per_observer.sort_values(ascending=False)

m = 15
# Reverse the leading m entries so the widest-ranging observer is the top bar
widest_ranging = transects_by_observer.head(m).iloc[::-1]
coverage_png = '../figures/eda_observers_top15_transects.png'

fig, ax = plt.subplots(figsize=(10, 7))
ax.barh(
    widest_ranging.index,
    widest_ranging.values,
    color=sns.color_palette('husl', m),
)
ax.set_xlabel('Unique transects', fontweight='bold')
ax.set_ylabel('Observer', fontweight='bold')
ax.set_title(f'Top {m} observers by spatial coverage', fontweight='bold')
plt.tight_layout()
plt.savefig(coverage_png, dpi=300, bbox_inches='tight')
plt.show()
print(f'Saved: {coverage_png}')
