# Notebook 8: Spatial-Temporal Patterns

## Purpose
Visualize geographic and temporal patterns in the PathWild elk data.

## Key Questions
- Is spatial coverage uniform or clustered?
- Do elk show seasonal migration patterns?
- Are there spatial hotspots for elk presence?
- Do features show expected seasonal cycles?

## Key Observations to Look For
- **Spatial Clustering**: Elk observations should cluster in known habitat areas
- **Seasonal Migration**: Elevation changes in winter (lower) vs summer (higher)
- **NDVI Patterns**: Should correlate with vegetation zones
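- **Temporal Autocorrelation**: Strong autocorrelation means a random train/test split can place near-duplicate observations on both sides, so it should inform the splitting strategy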

In [None]:
# Setup
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scipy.ndimage import gaussian_filter
import os
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Reproducibility and global plot styling
np.random.seed(42)
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 8), 'font.size': 10})

# Candidate project roots, checked in order: the current directory, its parent
# (when running from notebooks/), and the grandparent (when running from a subdirectory).
possible_roots = [Path('.'), Path('..'), Path('../..')]

# Use the data directory of the first candidate that contains data/features;
# when none does, fall back to ../data.
data_root = next(
    (
        candidate / 'data'
        for candidate in possible_roots
        if (candidate / 'data' / 'features').exists()
    ),
    Path('../data'),
)

# Output folders live under the data directory and are created on demand.
figures_dir, reports_dir = data_root / 'figures', data_root / 'reports'
for out_dir in (figures_dir, reports_dir):
    out_dir.mkdir(parents=True, exist_ok=True)

print('✓ Setup complete')
print(f'  Output directory: {data_root.absolute()}')

## 1. Load Data and Detect Columns

In [None]:
# Load data
# Read from the data directory resolved by the setup cell (data_root) instead of a
# path relative to the current working directory, so loading also works when the
# notebook is run from notebooks/ or a subdirectory.
df = pd.read_csv(data_root / 'features' / 'complete_context.csv')

# Detect key columns by substring / exact-name matching on the lower-cased name.
# Every matching column overwrites the previous hit, so the LAST match in column
# order wins.
# NOTE(review): substring matching is loose (e.g. 'time' also matches names such as
# 'time_of_day'); confirm the printed selections below against the real schema.
timestamp_col = None
lat_col = None
lon_col = None
presence_col = None
ndvi_col = None
elevation_col = None

for col in df.columns:
    name = col.lower()
    if any(token in name for token in ['timestamp', 'date', 'time']):
        timestamp_col = col
    if 'lat' in name and 'lon' not in name:
        lat_col = col
    if 'lon' in name and 'lat' not in name:
        lon_col = col
    if name in ['presence', 'target', 'label', 'is_presence']:
        presence_col = col
    if 'ndvi' in name:
        ndvi_col = col
    if 'elev' in name or 'altitude' in name:
        elevation_col = col

# Parse timestamps (unparseable values become NaT) and derive calendar fields
# that later cells group by.
if timestamp_col:
    df[timestamp_col] = pd.to_datetime(df[timestamp_col], errors='coerce')
    df['month'] = df[timestamp_col].dt.month
    df['year'] = df[timestamp_col].dt.year

print(f'Dataset shape: {df.shape}')
print(f'Timestamp: {timestamp_col}')
print(f'Latitude: {lat_col}')
print(f'Longitude: {lon_col}')
print(f'Presence: {presence_col}')
print(f'NDVI: {ndvi_col}')
print(f'Elevation: {elevation_col}')

# Numeric columns (includes the derived month/year) used by the plotting cells.
numeric_cols = df.select_dtypes(include=[np.number]).columns.tolist()

## 2. Static Spatial Visualization

In [None]:
# Spatial coverage: scatter of GPS points, coloured by presence when that column exists
if not (lat_col and lon_col):
    print('⚠ Cannot create spatial map without lat/lon columns')
else:
    plt.figure(figsize=(14, 10))

    # Plot at most 20,000 randomly chosen rows (fixed seed for repeatability)
    sample_size = min(20000, len(df))
    df_sample = df.sample(n=sample_size, random_state=42)

    # Marker settings shared by both branches
    marker_style = dict(alpha=0.3, s=5, edgecolors='none')

    if presence_col:
        from matplotlib.patches import Patch

        # Presence -> blue, absence -> red (integer and boolean encodings)
        palette = {1: 'blue', 0: 'red', True: 'blue', False: 'red'}
        colors = df_sample[presence_col].map(palette)
        plt.scatter(df_sample[lon_col], df_sample[lat_col], c=colors, **marker_style)

        # Legend counts come from the full dataset, not the plotted sample
        n_presence = (df[presence_col] == 1).sum()
        n_absence = (df[presence_col] == 0).sum()
        legend_elements = [
            Patch(facecolor='blue', alpha=0.5, label=f'Presence (n={n_presence:,})'),
            Patch(facecolor='red', alpha=0.5, label=f'Absence (n={n_absence:,})'),
        ]
        plt.legend(handles=legend_elements, loc='upper right', fontsize=12)
    else:
        plt.scatter(df_sample[lon_col], df_sample[lat_col], color='blue', **marker_style)

    plt.xlabel('Longitude', fontsize=12)
    plt.ylabel('Latitude', fontsize=12)
    plt.title(f'Spatial Coverage Map (n={sample_size:,} points)', fontsize=14, pad=20)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(figures_dir / 'spatial_coverage.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved spatial coverage map')

## 3. NDVI Spatial Heatmap

In [None]:
# Create NDVI spatial heatmap (mean NDVI per hexagonal bin)
if ndvi_col and lat_col and lon_col:
    plt.figure(figsize=(14, 10))

    # Keep only rows where longitude, latitude and NDVI are all present
    df_ndvi = df[[lon_col, lat_col, ndvi_col]].dropna()

    # Subsample for plotting only; the summary statistics below are computed on
    # the full set of valid rows so that "Overall" really means overall.
    df_ndvi_plot = df_ndvi
    if len(df_ndvi_plot) > 50000:
        df_ndvi_plot = df_ndvi_plot.sample(n=50000, random_state=42)

    # Hexbin: colour each cell by the mean NDVI of the points falling in it;
    # mincnt=1 leaves cells without any point blank.
    hexbin = plt.hexbin(
        df_ndvi_plot[lon_col],
        df_ndvi_plot[lat_col],
        C=df_ndvi_plot[ndvi_col],
        gridsize=50,
        cmap='RdYlGn',
        reduce_C_function=np.mean,
        mincnt=1
    )

    plt.colorbar(hexbin, label='Mean NDVI')
    plt.xlabel('Longitude', fontsize=12)
    plt.ylabel('Latitude', fontsize=12)
    plt.title('NDVI Spatial Distribution (Hexbin)', fontsize=14, pad=20)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(figures_dir / 'ndvi_spatial_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved NDVI spatial heatmap')

    # Summary statistics over every valid NDVI row (not the plotting subsample)
    print('\nNDVI statistics by region:')
    print(f'  Overall mean: {df_ndvi[ndvi_col].mean():.3f}')
    print(f'  Overall std: {df_ndvi[ndvi_col].std():.3f}')
else:
    print('⚠ Cannot create NDVI heatmap without NDVI and lat/lon columns')

## 4. Temporal Line Plots

In [None]:
# Plot temporal patterns (daily mean plus 7-day rolling mean) for up to 12 numeric features
if timestamp_col:
    # Select features for temporal analysis (exclude month, year, lat, lon)
    excluded = {'month', 'year', lat_col, lon_col}
    temporal_features = [col for col in numeric_cols if col not in excluded][:12]

    if not temporal_features:
        # plt.subplots cannot build a grid with zero rows, so stop early
        print('⚠ No numeric features available for temporal plots')
    else:
        n_cols = 3
        n_rows = (len(temporal_features) + n_cols - 1) // n_cols

        # squeeze=False always returns a 2-D array, so flattening is safe for any grid shape
        fig, axes = plt.subplots(n_rows, n_cols, figsize=(15, 5*n_rows), squeeze=False)
        axes = axes.flatten()

        # Calendar date of every row; computed once because it does not depend on the feature
        day_key = df[timestamp_col].dt.date

        for ax, col in zip(axes, temporal_features):
            # Daily mean, indexed by a (timezone-naive) DatetimeIndex at midnight
            daily_mean = df.groupby(day_key)[col].mean()
            daily_mean.index = pd.to_datetime(daily_mean.index)

            ax.plot(daily_mean.index, daily_mean.values, alpha=0.3, color='gray', linewidth=0.5)

            if not daily_mean.empty:
                # Reindex to one row per calendar day so that a window of 7 rows is a
                # true 7-day window even when some days have no observations;
                # min_periods=1 averages whatever days are available in the window.
                rolling = (
                    daily_mean.asfreq('D')
                    .rolling(window=7, center=True, min_periods=1)
                    .mean()
                )
                ax.plot(rolling.index, rolling.values, color='blue', linewidth=2, label='7-day avg')

            ax.set_xlabel('Date', fontsize=9)
            ax.set_ylabel(col, fontsize=9)
            ax.set_title(col, fontsize=10)
            ax.legend(fontsize=8)
            ax.grid(alpha=0.3)
            ax.tick_params(axis='x', rotation=45, labelsize=8)

        # Hide the unused cells of the grid
        for unused_ax in axes[len(temporal_features):]:
            unused_ax.axis('off')

        plt.suptitle('Temporal Patterns (Daily Means with 7-day Rolling Average)', fontsize=16, y=1.00)
        plt.tight_layout()
        plt.savefig(figures_dir / 'temporal_timeseries.png', dpi=300, bbox_inches='tight')
        plt.show()

        print('✓ Saved temporal timeseries')
else:
    print('⚠ Cannot create temporal plots without timestamp column')

## 5. Month vs Feature Heatmap

In [None]:
# Create heatmap showing seasonal patterns (month x feature, each feature scaled to 0-1)
if timestamp_col and 'month' in df.columns:
    from sklearn.preprocessing import MinMaxScaler

    # Select features for heatmap (up to 20 numeric columns, excluding time/position fields)
    heatmap_features = [col for col in numeric_cols
                        if col not in ['month', 'year', lat_col, lon_col]][:20]

    # Monthly means: one row per month present in the data, one column per feature
    monthly_data = df.groupby('month')[heatmap_features].mean()

    # Normalize each feature to 0-1 across months. MinMaxScaler scales column-wise,
    # so it must be fitted on the (month x feature) table directly; fitting on the
    # transpose would instead rescale every month across unrelated features.
    scaler = MinMaxScaler()
    monthly_normalized = pd.DataFrame(
        scaler.fit_transform(monthly_data),
        index=monthly_data.index,
        columns=monthly_data.columns
    )

    # Label only the months that actually occur, so ticks stay aligned when the
    # data does not cover all twelve months.
    month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                   'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    month_labels = [month_names[int(m) - 1] for m in monthly_normalized.index]

    plt.figure(figsize=(14, 10))
    sns.heatmap(
        monthly_normalized.T,
        cmap='RdYlGn',
        cbar_kws={'label': 'Normalized Value (0-1)'},
        xticklabels=month_labels,
        yticklabels=monthly_normalized.columns
    )
    plt.xlabel('Month', fontsize=12)
    plt.ylabel('Feature', fontsize=12)
    plt.title('Seasonal Heatmap (Normalized Monthly Means)', fontsize=14, pad=20)
    plt.tight_layout()
    plt.savefig(figures_dir / 'seasonal_heatmap.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved seasonal heatmap')
else:
    print('⚠ Cannot create seasonal heatmap without timestamp column')

## 6. Spatial Clustering Analysis

In [None]:
# Side-by-side hexbin count maps of presence and (pseudo-)absence locations
if not (lat_col and lon_col and presence_col):
    print('⚠ Cannot create density plots without lat/lon and presence columns')
else:
    # Coordinates of presence (== 1) and absence (== 0) rows, without missing values
    coord_cols = [lat_col, lon_col]
    presence_df = df.loc[df[presence_col] == 1, coord_cols].dropna()
    absence_df = df.loc[df[presence_col] == 0, coord_cols].dropna()

    fig, axes = plt.subplots(1, 2, figsize=(16, 7))

    # One entry per panel: target axis, points to bin, colormap, title prefix
    panels = (
        (axes[0], presence_df, 'Blues', 'Elk Presence Density'),
        (axes[1], absence_df, 'Reds', 'Pseudo-Absence Density'),
    )

    for ax, points, cmap_name, heading in panels:
        # A panel with no points is left untouched
        if len(points) == 0:
            continue

        # Bin at most 10,000 points; the title still reports the full count
        sample_size = min(10000, len(points))
        subset = points.sample(n=sample_size, random_state=42)

        ax.hexbin(
            subset[lon_col],
            subset[lat_col],
            gridsize=40,
            cmap=cmap_name,
            mincnt=1
        )
        ax.set_xlabel('Longitude', fontsize=12)
        ax.set_ylabel('Latitude', fontsize=12)
        ax.set_title(f'{heading} (n={len(points):,})', fontsize=13)
        ax.grid(alpha=0.3)

    plt.tight_layout()
    plt.savefig(figures_dir / 'elk_density_contours.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved density contours')

## 7. Elevation vs Time (Migration Pattern)

In [None]:
# Analyze elevation changes over time (scatter + monthly trend + winter/summer comparison)
if elevation_col and timestamp_col:
    from matplotlib.lines import Line2D
    from matplotlib.patches import Patch

    # Sample for visualization
    sample_size = min(50000, len(df))
    df_sample = df.sample(n=sample_size, random_state=42)

    # Migration is a statement about where the animals are, so the trend line and
    # the seasonal statistics use presence rows only when a presence column exists.
    # Fall back to all rows if that filter selects nothing.
    elk_df = df
    trend_label = 'Monthly Mean'
    if presence_col:
        presence_rows = df[df[presence_col] == 1]
        if len(presence_rows) > 0:
            elk_df = presence_rows
            trend_label = 'Monthly Mean (presence)'

    plt.figure(figsize=(14, 8))
    legend_elements = []

    # Color by presence if available
    if presence_col:
        colors = df_sample[presence_col].map({1: 'blue', 0: 'red', True: 'blue', False: 'red'})
        plt.scatter(
            df_sample[timestamp_col],
            df_sample[elevation_col],
            c=colors,
            alpha=0.2,
            s=10,
            edgecolors='none'
        )
        legend_elements.extend([
            Patch(facecolor='blue', alpha=0.5, label='Presence'),
            Patch(facecolor='red', alpha=0.5, label='Absence')
        ])
    else:
        plt.scatter(
            df_sample[timestamp_col],
            df_sample[elevation_col],
            alpha=0.2,
            s=10,
            color='blue',
            edgecolors='none'
        )

    # Monthly trend line: mean elevation per actual calendar month (year-month),
    # plotted at that month's start. Grouping by month-of-year and pairing the
    # result with the first 12 month-starts of the dataset would misplace the
    # points whenever the data does not begin in January or spans several years.
    timed = elk_df.dropna(subset=[timestamp_col, elevation_col])
    if not timed.empty:
        monthly_elev = (
            timed.groupby(pd.Grouper(key=timestamp_col, freq='MS'))[elevation_col]
            .mean()
            .dropna()  # months without observations are skipped
        )
        if not monthly_elev.empty:
            plt.plot(monthly_elev.index, monthly_elev.values,
                     color='black', linewidth=3, label=trend_label, alpha=0.7)
            # The legend is built from explicit handles, so add the line explicitly
            legend_elements.append(
                Line2D([0], [0], color='black', linewidth=3, alpha=0.7, label=trend_label)
            )

    if legend_elements:
        plt.legend(handles=legend_elements, loc='upper right')

    plt.xlabel('Date', fontsize=12)
    # NOTE(review): the axis label, printed units and the 500 threshold below assume
    # elevation is stored in feet - confirm against the feature pipeline.
    plt.ylabel('Elevation (ft)', fontsize=12)
    plt.title('Elevation vs Time (Potential Migration Pattern)', fontsize=14, pad=20)
    plt.grid(alpha=0.3)
    plt.tight_layout()
    plt.savefig(figures_dir / 'elevation_vs_time.png', dpi=300, bbox_inches='tight')
    plt.show()

    print('✓ Saved elevation vs time plot')

    # Analyze seasonal elevation patterns
    if 'month' in df.columns:
        print('\nSeasonal elevation statistics:')
        seasonal_elev = elk_df.groupby('month')[elevation_col].agg(['mean', 'std', 'count'])
        print(seasonal_elev)

        # Winter = Dec-Feb, summer = Jun-Aug
        winter_elev = elk_df.loc[elk_df['month'].isin([12, 1, 2]), elevation_col].mean()
        summer_elev = elk_df.loc[elk_df['month'].isin([6, 7, 8]), elevation_col].mean()
        elev_diff = summer_elev - winter_elev

        print(f'\nWinter (Dec-Feb) mean elevation: {winter_elev:.1f} ft')
        print(f'Summer (Jun-Aug) mean elevation: {summer_elev:.1f} ft')
        print(f'Seasonal difference: {elev_diff:.1f} ft')

        if pd.isna(elev_diff):
            # One of the seasons has no data; NaN must not be reported as "limited evidence"
            print('⚠ Cannot assess seasonal migration: winter or summer observations are missing')
        elif elev_diff > 500:
            print('✓ Evidence of seasonal migration (elk move to higher elevations in summer)')
        else:
            print('⚠ Limited evidence of seasonal migration')
else:
    print('⚠ Cannot analyze elevation patterns without elevation and timestamp columns')

## 8. Temporal Autocorrelation

In [None]:
# Analyze temporal autocorrelation (ACF / PACF of key features in timestamp order)
if timestamp_col:
    # Select a few key features for autocorrelation analysis
    acf_features = [col for col in [ndvi_col, elevation_col, 'temperature', 'precipitation']
                   if col in df.columns][:4]

    if len(acf_features) > 0:
        fig = None
        try:
            from statsmodels.graphics.tsaplots import plot_acf, plot_pacf

            # squeeze=False keeps axes 2-D even when only one feature is plotted
            fig, axes = plt.subplots(len(acf_features), 2,
                                     figsize=(14, 4*len(acf_features)), squeeze=False)

            # Sort once; a stable sort keeps rows with equal timestamps in their
            # original order so the result is reproducible.
            df_sorted = df.sort_values(timestamp_col, kind='mergesort')

            for idx, col in enumerate(acf_features):
                # Values in timestamp order, missing values removed.
                # NOTE(review): lags therefore count rows, not time units; if rows
                # are irregularly spaced or mix several animals, consider
                # aggregating to a regular series first.
                ts_data = df_sorted[col].dropna()

                # Limit to reasonable size for ACF
                if len(ts_data) > 10000:
                    ts_data = ts_data.iloc[:10000]

                # PACF is only defined for lags below half the sample size, so cap
                # the lag count for short series instead of letting the call raise.
                n_lags = min(40, len(ts_data) // 2 - 1)
                if n_lags < 1:
                    for ax in axes[idx]:
                        ax.set_title(f'{col} - insufficient data', fontsize=11)
                        ax.axis('off')
                    continue

                # ACF plot
                plot_acf(ts_data, lags=n_lags, ax=axes[idx, 0], alpha=0.05)
                axes[idx, 0].set_title(f'{col} - Autocorrelation', fontsize=11)
                axes[idx, 0].grid(alpha=0.3)

                # PACF plot
                plot_pacf(ts_data, lags=n_lags, ax=axes[idx, 1], alpha=0.05)
                axes[idx, 1].set_title(f'{col} - Partial Autocorrelation', fontsize=11)
                axes[idx, 1].grid(alpha=0.3)

            plt.tight_layout()
            plt.savefig(figures_dir / 'temporal_autocorrelation.png', dpi=300, bbox_inches='tight')
            plt.show()

            print('✓ Saved temporal autocorrelation plots')
            print('\nInterpretation:')
            print('- Significant lags indicate temporal dependence')
            print('- May need to account for this in train/test splitting')
            print('- Consider time-based cross-validation')
        except ImportError:
            print('⚠ statsmodels not available for ACF/PACF plots')
        except Exception as e:
            # Best-effort cell: report the problem and discard the half-built figure
            if fig is not None:
                plt.close(fig)
            print(f'⚠ Could not create ACF/PACF plots: {e}')
    else:
        print('⚠ No suitable features found for autocorrelation analysis')
else:
    print('⚠ Cannot analyze autocorrelation without timestamp column')

## Summary

This notebook analyzed spatial and temporal patterns:

1. **Spatial Coverage**: Visualized GPS point distribution
2. **NDVI Spatial Patterns**: Identified vegetation zones
3. **Temporal Patterns**: Examined feature changes over time
4. **Seasonal Heatmap**: Revealed seasonal cycles across features
5. **Spatial Clustering**: Identified elk presence hotspots
6. **Migration Patterns**: Analyzed elevation changes by season
7. **Temporal Autocorrelation**: Assessed time-series dependencies

**Key Items to Review**:
- Review spatial clustering to validate pseudo-absence generation
- Check for seasonal migration patterns in elevation data
- Consider temporal autocorrelation for model validation strategy

**Next Steps**:
- Proceed to Notebook 09 for feature correlation analysis