# Exploring National Elk Refuge GPS Collar Data (2006-2015)

This notebook explores the National Elk Refuge GPS collar dataset, which is valuable for learning general elk behavior patterns and provides a large sample for model training.

**Dataset Info:**
- **Location:** National Elk Refuge, Jackson, Wyoming
- **Coverage:** 17 adult female elk, 2006-2015
- **Data:** GPS locations, timestamps, migration patterns
- **Use Case:** General elk behavior patterns, seasonal timing, long time series
- **Note:** ~200 miles from Area 048, but provides valuable general patterns

**Download:** https://data.usgs.gov/datacatalog/data/USGS:5a9f2782e4b0b1c392e502ea

In [None]:
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
from shapely.geometry import Point

# Raw-data locations, given relative to the notebook's working directory
DATA_DIR = Path("../data/raw")
REFUGE_DIR = DATA_DIR.joinpath("elk_national_refuge")

# Banner, then where we are looking and whether it is there
print("=" * 60, "NATIONAL ELK REFUGE DATASET", "=" * 60, sep="\n")
print(f"\nData directory: {REFUGE_DIR}")
print(f"Directory exists: {REFUGE_DIR.exists()}")

if not REFUGE_DIR.exists():
    # Nothing has been downloaded yet: show how to obtain the dataset
    for line in (
        "\n⚠️  Directory doesn't exist yet!",
        "📥 Download instructions:",
        "   1. Visit: https://data.usgs.gov/datacatalog/data/USGS:5a9f2782e4b0b1c392e502ea",
        "   2. Download the dataset (CSV or shapefile format)",
        "   3. Extract to: data/raw/elk_national_refuge/",
    ):
        print(line)
else:
    # List at most the first ten entries of the data directory
    files = list(REFUGE_DIR.glob("*"))
    print(f"\nFiles found: {len(files)}")
    for f in files[:10]:
        print("  - " + f.name)

## Step 1: Load the Data

The dataset may be distributed as a CSV file (GPS points) or as a shapefile. We look for a shapefile first and fall back to a CSV if none is found.

In [None]:
# Find candidate data files. The matches are sorted because glob order is
# filesystem-dependent; sorting makes the "first file" choice reproducible.
csv_files = sorted(REFUGE_DIR.glob("*.csv"))
shp_files = sorted(REFUGE_DIR.glob("*.shp"))

# Defaults describe the "nothing usable was loaded" outcome; the branches
# below overwrite them only on success.
gdf = None
data_type = None

if shp_files:
    # A shapefile is preferred over CSV when both are present
    print(f"Loading shapefile: {shp_files[0].name}")
    if len(shp_files) > 1:
        print(f"  Note: {len(shp_files) - 1} other shapefile(s) ignored")
    gdf = gpd.read_file(shp_files[0])
    data_type = "shapefile"
elif csv_files:
    print(f"Loading CSV: {csv_files[0].name}")
    if len(csv_files) > 1:
        print(f"  Note: {len(csv_files) - 1} other CSV file(s) ignored")
    df = pd.read_csv(csv_files[0])

    # Auto-detect lat/lon columns: take the first column whose name contains
    # 'lat' / 'lon' (the latter also matches 'long' and 'longitude').
    # str() keeps this working if a column label is not a string.
    lat_col = next((col for col in df.columns if 'lat' in str(col).lower()), None)
    lon_col = next((col for col in df.columns if 'lon' in str(col).lower()), None)

    if lat_col is not None and lon_col is not None:
        print(f"  Found coordinates: {lat_col}, {lon_col}")
        # Coordinates in the CSV are taken to be WGS84 longitude/latitude
        gdf = gpd.GeoDataFrame(
            df,
            geometry=gpd.points_from_xy(df[lon_col], df[lat_col]),
            crs='EPSG:4326'
        )
        data_type = "csv_points"
    else:
        print(f"  ⚠️  Columns: {list(df.columns)}")
        print("  Please update the notebook to specify lat/lon column names.")
else:
    print("⚠️  No data files found!")

if gdf is not None:
    print(f"\n✓ Data loaded: {data_type}, Shape: {gdf.shape}, CRS: {gdf.crs}")

## Step 2: Inspect Dataset Structure

In [None]:
if gdf is not None:
    # Section banner
    rule = "=" * 60
    print(rule)
    print("DATASET STRUCTURE")
    print(rule)

    # Overall dimensions and column names
    print(f"\nShape: {gdf.shape}")
    print(f"Columns: {list(gdf.columns)}")

    # Preview of the rows, then the per-column dtypes
    for heading, table in (
        ("\nFirst few rows:", gdf.head()),
        ("\nData types:", gdf.dtypes),
    ):
        print(heading)
        print(table)

    # Per-column null counts; only columns with at least one null are listed
    print("\nMissing values:")
    null_counts = gdf.isnull().sum()
    incomplete = null_counts[null_counts > 0]
    if incomplete.empty:
        print("  ✓ No missing values!")
    else:
        for col, count in incomplete.items():
            print(f"  {col}: {count} ({count/len(gdf)*100:.1f}%)")

## Step 3: Extract Coordinates and Analyze Spatial Coverage

In [None]:
# Reference coordinates used for Area 048 (decimal degrees)
area_048_lat, area_048_lon = 41.835, -106.425


def haversine_distance(lat1, lon1, lat2, lon2):
    """Return the great-circle distance in kilometres between two points.

    All four arguments are in decimal degrees. Each may be a scalar or an
    array-like; NumPy broadcasting applies, so a whole coordinate column can
    be measured against a single reference point in one call.
    """
    R = 6371  # Earth radius in km
    lat1, lon1, lat2, lon2 = (
        np.radians(np.asarray(v, dtype=float)) for v in (lat1, lon1, lat2, lon2)
    )
    dlat = lat2 - lat1
    dlon = lon2 - lon1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    # Rounding can push `a` marginally outside [0, 1]; clip so the square
    # roots below stay real.
    a = np.clip(a, 0.0, 1.0)
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    return R * c


# Defined unconditionally so that later cells can test
# `gdf_wgs84 is not None` even when no data was loaded.
gdf_wgs84 = None

if gdf is not None:
    has_latlon = 'latitude' in gdf.columns and 'longitude' in gdf.columns

    if gdf.crs is None:
        # Without a CRS the geometry cannot be reprojected. Existing
        # latitude/longitude columns can still be used as they are.
        if has_latlon:
            gdf_wgs84 = gdf
        else:
            print("⚠️  Dataset has no CRS; cannot derive latitude/longitude.")
            print("   Assign one first, e.g. gdf = gdf.set_crs('EPSG:<code>').")
    else:
        # Reproject only when the data is not already in WGS84
        gdf_wgs84 = gdf.to_crs('EPSG:4326') if gdf.crs != 'EPSG:4326' else gdf
        if not has_latlon:
            # Derive the columns from the (now WGS84) point geometry
            gdf_wgs84['latitude'] = gdf_wgs84.geometry.y
            gdf_wgs84['longitude'] = gdf_wgs84.geometry.x

if gdf_wgs84 is not None:
    print("=" * 60)
    print("SPATIAL COVERAGE")
    print("=" * 60)
    print(f"\nLatitude: {gdf_wgs84['latitude'].min():.4f}° to {gdf_wgs84['latitude'].max():.4f}°")
    print(f"Longitude: {gdf_wgs84['longitude'].min():.4f}° to {gdf_wgs84['longitude'].max():.4f}°")

    # Distance to Area 048, computed for the whole column in one vectorised
    # call instead of a Python-level function call per row.
    gdf_wgs84['distance_to_area_048_km'] = haversine_distance(
        gdf_wgs84['latitude'], gdf_wgs84['longitude'], area_048_lat, area_048_lon
    )

    distances = gdf_wgs84['distance_to_area_048_km']
    n_points = len(gdf_wgs84)
    within_200 = int((distances <= 200).sum())
    # Guard the percentage against an empty dataset
    within_200_pct = within_200 / n_points * 100 if n_points else 0.0

    print("\nProximity to Area 048:")
    print(f"  Min distance: {distances.min():.2f} km")
    print(f"  Max distance: {distances.max():.2f} km")
    print(f"  Avg distance: {distances.mean():.2f} km")
    print(f"  Points within 200km: {within_200} ({within_200_pct:.1f}%)")
    print("\n⚠️  Note: National Elk Refuge is ~200 miles from Area 048.")
    print("   This data is valuable for general patterns, not geographic specificity.")

## Step 4: Analyze Temporal Patterns

In [None]:
import calendar  # stdlib; turns month numbers into month names

if gdf_wgs84 is not None:
    # Use the first column whose name mentions a date or a time.
    # str() keeps this working if a column label is not a string.
    date_col = next(
        (
            col for col in gdf_wgs84.columns
            if 'date' in str(col).lower() or 'time' in str(col).lower()
        ),
        None,
    )

    if date_col is None:
        print("⚠️  No date column found")
    else:
        # Only the parsing itself is guarded, so an error in the reporting
        # code below is not misreported as a date-parsing failure.
        try:
            parsed = pd.to_datetime(gdf_wgs84[date_col])
            years = parsed.dt.year
            months = parsed.dt.month
        except (ValueError, TypeError, OverflowError, AttributeError) as e:
            print(f"⚠️  Could not parse dates: {e}")
        else:
            # Written only after all three succeeded, so the frame never ends
            # up with 'date' but without 'year'/'month'.
            gdf_wgs84['date'] = parsed
            gdf_wgs84['year'] = years
            gdf_wgs84['month'] = months

            total = len(gdf_wgs84)

            print("=" * 60)
            print("TEMPORAL ANALYSIS")
            print("=" * 60)
            print(f"\nDate range: {gdf_wgs84['date'].min()} to {gdf_wgs84['date'].max()}")

            print("\nYear distribution:")
            for year, count in gdf_wgs84['year'].value_counts().sort_index().items():
                print(f"  {int(year)}: {count:,} points ({count/total*100:.1f}%)")

            print("\nMonth distribution:")
            for month, count in gdf_wgs84['month'].value_counts().sort_index().items():
                # Missing dates make the month column float (e.g. 10.0), so
                # convert to int before looking up the name.
                month_name = calendar.month_name[int(month)]
                print(f"  {month_name}: {count:,} points ({count/total*100:.1f}%)")

            # October analysis
            october_points = gdf_wgs84[gdf_wgs84['month'] == 10]
            october_pct = len(october_points) / total * 100 if total else 0.0
            print(f"\n🎯 October data: {len(october_points):,} points ({october_pct:.1f}%)")

## Step 5: Prepare Data for PathWild Integration

In [None]:
def _is_exportable_dtype(dtype):
    """Return True for numeric, object and pandas string dtypes.

    A literal whitelist of 'int64'/'float64'/'object' would silently drop
    columns stored as int32, float32, bool, nullable integers or pandas'
    dedicated string dtype; the pandas type predicates cover those as well.
    Geometry and datetime dtypes are still excluded.
    """
    return (
        pd.api.types.is_numeric_dtype(dtype)
        or pd.api.types.is_object_dtype(dtype)
        or isinstance(dtype, pd.StringDtype)
    )


if gdf_wgs84 is not None:
    # Create PathWild-ready dataset with the core coordinate columns
    pathwild_data = pd.DataFrame({
        'latitude': gdf_wgs84['latitude'],
        'longitude': gdf_wgs84['longitude'],
        'distance_to_area_048_km': gdf_wgs84['distance_to_area_048_km']
    })

    # Add temporal info if the temporal-analysis cell produced it
    if 'date' in gdf_wgs84.columns:
        pathwild_data['date'] = gdf_wgs84['date']
        pathwild_data['year'] = gdf_wgs84['year']
        pathwild_data['month'] = gdf_wgs84['month']

    # Carry over every remaining attribute column with an exportable dtype
    extra_cols = [
        col for col in gdf_wgs84.columns
        if col not in pathwild_data.columns
        and col != 'geometry'
        and _is_exportable_dtype(gdf_wgs84[col].dtype)
    ]
    for col in extra_cols:
        pathwild_data[col] = gdf_wgs84[col]

    print("=" * 60)
    print("PATHWILD-READY DATASET")
    print("=" * 60)
    print(f"\nShape: {pathwild_data.shape}")
    print(f"Columns: {list(pathwild_data.columns)}")
    print("\nFirst few rows:")
    print(pathwild_data.head())

    # Save to CSV, creating the output directory if needed
    output_file = Path("../data/processed/national_refuge_points.csv")
    output_file.parent.mkdir(parents=True, exist_ok=True)
    pathwild_data.to_csv(output_file, index=False)
    print(f"\n✓ Saved to {output_file}")

## Step 6: Summary and Next Steps

In [None]:
if gdf_wgs84 is not None:
    # Pull out the series that the report reads from
    lat = gdf_wgs84['latitude']
    lon = gdf_wgs84['longitude']
    mean_distance_km = gdf_wgs84['distance_to_area_048_km'].mean()
    banner = "=" * 60

    # The report is assembled as a list of lines and printed in order:
    # headline figures first, then the fixed insights and next steps.
    report_lines = [
        banner,
        "NATIONAL ELK REFUGE DATASET SUMMARY",
        banner,
        f"\nTotal GPS points: {len(gdf_wgs84):,}",
        "\nGeographic coverage:",
        f"  Latitude: {lat.min():.4f}° to {lat.max():.4f}°",
        f"  Longitude: {lon.min():.4f}° to {lon.max():.4f}°",
        "\nProximity to Area 048:",
        f"  Average distance: {mean_distance_km:.2f} km",
        "\n📋 Key Insights:",
        "  ✓ Large sample size for general elk behavior patterns",
        "  ✓ Long time series (2006-2015)",
        "  ✓ Useful for understanding seasonal timing",
        "  ⚠️  Geographic distance from Area 048 (~200 miles)",
        "  → Best used for general patterns, not geographic specificity",
        "\nNext steps:",
        "  1. Combine with South Bighorn data for hybrid training",
        "  2. Use for general elk behavior patterns",
        "  3. Integrate with DataContextBuilder to add environmental features",
        "  4. Create training dataset with positive examples (GPS points)",
        "  5. Generate negative examples (random points)",
        "  6. Train XGBoost model with weighted combination of datasets",
    ]
    for line in report_lines:
        print(line)