# Part A: Load Competition Data (CSV/Excel)
## Weather Emergency Prediction - Load Existing Files

**Use this notebook when you have competition data files.**

This notebook handles:
- ✅ Load CSV/Excel files with geodata
- ✅ Auto-detect latitude/longitude columns
- ✅ Create GeoDataFrames from existing data
- ✅ Spatial operations and feature engineering
- ✅ Data validation and cleaning
- ✅ Export processed data

In [None]:
# Install required packages
!pip install pandas numpy geopandas shapely folium matplotlib seaborn plotly scikit-learn openpyxl

In [None]:
import html
from datetime import datetime
from pathlib import Path

import folium
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
from shapely.geometry import Point, Polygon

# Confirm the imports worked and record the library versions in the output.
print(
    "‚úÖ All libraries imported successfully",
    f"GeoPandas version: {gpd.__version__}",
    f"Pandas version: {pd.__version__}",
    sep="\n",
)

## 1. Load Competition Data from CSV/Excel

This function loads a CSV or Excel file, auto-detects its latitude/longitude columns (unless you name them), and returns a GeoDataFrame of points.

In [None]:
def _pick_coordinate_column(columns, exact_names, stem):
    """Pick the column most likely to hold a coordinate, or None if nothing matches.

    Candidates are ranked so that an unrelated column which merely contains the
    stem (e.g. 'relative_humidity' contains 'lat', 'cyclone_flag' contains
    'lon') never wins over a real coordinate column:

      0. the whole name is one of ``exact_names``
      1. a word of the name (split on '_', '-' or space) starts with ``stem``
      2. the stem appears anywhere in the name (the loosest rule)

    Ties within a rank are broken by column order.
    """
    ranked = []
    for position, column in enumerate(columns):
        # str(): column labels are not guaranteed to be strings.
        label = str(column).strip().lower()
        words = label.replace('-', '_').replace(' ', '_').split('_')
        if label in exact_names:
            rank = 0
        elif any(word.startswith(stem) for word in words):
            rank = 1
        elif stem in label:
            rank = 2
        else:
            continue
        ranked.append((rank, position, column))
    # `position` is unique, so the tuple comparison never reaches `column`.
    return min(ranked)[2] if ranked else None


def load_competition_data(filepath, lat_col=None, lon_col=None, date_col='date'):
    """
    Load competition CSV/Excel file with geodata.

    Args:
        filepath: Path to CSV or Excel file (str or pathlib.Path). The
            extension is matched case-insensitively.
        lat_col: Name of latitude column (auto-detected if None)
        lon_col: Name of longitude column (auto-detected if None)
        date_col: Name of date column; parsed to datetime when present,
            with unparseable values becoming NaT

    Returns:
        GeoDataFrame (EPSG:4326) with one Point per row. Rows whose
        coordinates are missing or non-numeric are dropped.

    Raises:
        ValueError: If the file is neither CSV nor Excel, if coordinate
            columns cannot be auto-detected, or if the named coordinate
            columns are not in the file.
    """
    print(f"üìÇ Loading data from: {filepath}")
    print("=" * 70)

    # Load file based on extension. Going through str().lower() accepts
    # pathlib.Path objects and upper-case extensions such as 'DATA.CSV'.
    lowered_name = str(filepath).lower()
    if lowered_name.endswith('.csv'):
        df = pd.read_csv(filepath)
    elif lowered_name.endswith(('.xlsx', '.xls')):
        df = pd.read_excel(filepath)
    else:
        raise ValueError("File must be CSV or Excel format")

    print(f"‚úÖ Loaded {len(df):,} rows, {len(df.columns)} columns")
    print(f"\nColumns found: {list(df.columns)}")

    # Auto-detect only the column(s) the caller did not name, so an explicit
    # lat_col or lon_col is never overwritten.
    if lat_col is None or lon_col is None:
        print(f"\nüîç Auto-detecting geodata columns...")

        if lat_col is None:
            lat_col = _pick_coordinate_column(df.columns, ('lat', 'latitude'), 'lat')
        if lon_col is None:
            lon_col = _pick_coordinate_column(
                df.columns, ('lon', 'lng', 'long', 'longitude'), 'lon'
            )

        if lat_col is None or lon_col is None:
            raise ValueError("Could not find latitude/longitude columns. Please specify manually.")
        print(f"   ‚úÖ Found geodata: {lat_col}, {lon_col}")

    # Check if columns exist
    if lat_col not in df.columns or lon_col not in df.columns:
        raise ValueError(f"Columns not found: {lat_col}, {lon_col}")

    # Parse date column if exists
    if date_col in df.columns:
        df[date_col] = pd.to_datetime(df[date_col], errors='coerce')
        print(f"   ‚úÖ Parsed date column: {date_col}")
        print(f"   üìÖ Date range: {df[date_col].min()} to {df[date_col].max()}")

    # Remove rows with invalid coordinates. Coercing first turns text such as
    # 'n/a' into NaN so those rows are dropped instead of crashing later.
    initial_len = len(df)
    df[lat_col] = pd.to_numeric(df[lat_col], errors='coerce')
    df[lon_col] = pd.to_numeric(df[lon_col], errors='coerce')
    df = df.dropna(subset=[lat_col, lon_col])
    if len(df) < initial_len:
        print(f"   üóëÔ∏è Removed {initial_len - len(df)} rows with missing coordinates")

    # Values outside the lat/lon ranges usually mean swapped or mis-detected
    # columns; report them but leave the decision to the user.
    out_of_range = (df[lat_col].abs() > 90) | (df[lon_col].abs() > 180)
    if out_of_range.any():
        print(f"   WARNING: {int(out_of_range.sum())} rows have coordinates outside "
              f"the valid latitude/longitude range")

    # Create Point geometries (vectorised; x = longitude, y = latitude)
    geometry = gpd.points_from_xy(df[lon_col], df[lat_col])
    gdf = gpd.GeoDataFrame(df, geometry=geometry, crs='EPSG:4326')

    print(f"\n‚úÖ Created GeoDataFrame:")
    print(f"   Total points: {len(gdf):,}")
    print(f"   Unique locations: {gdf.geometry.nunique()}")
    print(f"   Coordinate bounds: {gdf.total_bounds}")
    print(f"   CRS: {gdf.crs}")
    print("=" * 70)

    return gdf

print("‚úÖ Data loader function ready")

## 2. Load Your Competition Files

**Option 1:** Specify file path directly

**Option 2:** Upload file (for Google Colab)

In [None]:
# OPTION 1: read the dataset straight from disk.
# Point `filepath` at your own competition file before running.
weather_gdf = load_competition_data(filepath='sample_weather.xlsx')

# Preview the leading rows (rendered as this cell's output)
print("\nüìä Data Preview:")
weather_gdf.head(5)

In [None]:
# OPTION 2: Upload file (Google Colab)
# Uncomment to use:

# from google.colab import files
# uploaded = files.upload()
# filename = list(uploaded.keys())[0]
# weather_gdf = load_competition_data(filename)

## 3. Data Validation & Statistics

In [None]:
# Report banner
print("=" * 70, "DATA VALIDATION REPORT", "=" * 70, sep="\n")

# Size of the loaded dataset
print("\nüìä Basic Statistics:")
print(f"   Total records: {len(weather_gdf):,}")
print(f"   Total columns: {weather_gdf.shape[1]}")
print(f"   Memory usage: {weather_gdf.memory_usage(deep=True).sum() / (1024 * 1024):.2f} MB")

# Spatial extent of the points
print("\nüìç Geographic Coverage:")
print(f"   Unique locations: {weather_gdf.geometry.nunique()}")
print(f"   Bounds (lon_min, lat_min, lon_max, lat_max): {weather_gdf.total_bounds}")

# Per-column count of missing values; only columns with gaps are listed
print("\nüîç Data Quality:")
print("   Missing values per column:")
missing = weather_gdf.isna().sum()
if missing.any():
    print(missing.loc[missing.gt(0)])
else:
    print("   ‚úÖ No missing values")

# Descriptive statistics for every numeric column
print("\nüìà Numeric Columns Summary:")
numeric_cols = weather_gdf.select_dtypes(include='number').columns
print(weather_gdf[numeric_cols].describe())

## 4. Spatial Operations

In [None]:
def _haversine_km(lat_deg, lon_deg, center_lat_deg, center_lon_deg):
    """Great-circle distance in km from each (lat, lon) pair to one center point."""
    earth_radius_km = 6371.0088  # mean Earth radius
    lat = np.radians(lat_deg)
    center_lat = np.radians(center_lat_deg)
    half_dlat = (lat - center_lat) / 2
    half_dlon = (np.radians(lon_deg) - np.radians(center_lon_deg)) / 2
    a = np.sin(half_dlat) ** 2 + np.cos(lat) * np.cos(center_lat) * np.sin(half_dlon) ** 2
    # Clamp to [0, 1]: rounding can push `a` marginally outside the domain of arcsin/sqrt.
    a = np.minimum(np.maximum(a, 0.0), 1.0)
    return 2 * earth_radius_km * np.arcsin(np.sqrt(a))


def _zscore(values):
    """Standardise a Series; return the centred values when the spread is ~0 or undefined."""
    centered = values - values.mean()
    spread = values.std()
    # A single row gives std = NaN and constant values give std ~ 0; dividing
    # by either would fill the column with NaN/inf or amplified rounding noise.
    if not np.isfinite(spread) or np.isclose(spread, 0.0):
        return centered
    return centered / spread


def add_spatial_features(gdf):
    """Add spatial features to GeoDataFrame.

    Adds the columns ``distance_to_center_km``, ``lat``, ``lon``, ``lat_norm``
    and ``lon_norm`` to a copy of ``gdf``; the input is not modified.

    The distance is the great-circle (haversine) distance to the mean
    latitude/longitude of all points. Measuring in Web Mercator (EPSG:3857)
    instead would overstate distances by roughly 1/cos(latitude).

    Args:
        gdf: GeoDataFrame of point geometries. If its CRS is set and is not
            EPSG:4326, coordinates are reprojected to EPSG:4326 for the
            calculations. A missing CRS is treated as lon/lat degrees.

    Returns:
        A new GeoDataFrame with the extra feature columns.
    """
    print("üó∫Ô∏è Adding spatial features...")

    gdf = gdf.copy()

    # Work in geographic coordinates so that x/y really are lon/lat degrees.
    geometry = gdf.geometry
    if gdf.crs is not None and gdf.crs != 'EPSG:4326':
        geometry = geometry.to_crs('EPSG:4326')
    lat = geometry.y
    lon = geometry.x

    # Calculate center point
    center_lat = lat.mean()
    center_lon = lon.mean()
    print(f"   Center: ({center_lat:.4f}, {center_lon:.4f})")

    # Distance to center
    gdf['distance_to_center_km'] = _haversine_km(lat, lon, center_lat, center_lon)

    # Extract coordinates
    gdf['lat'] = lat
    gdf['lon'] = lon

    # Normalize coordinates
    gdf['lat_norm'] = _zscore(gdf['lat'])
    gdf['lon_norm'] = _zscore(gdf['lon'])

    print(f"   ‚úÖ Added: distance_to_center_km, lat, lon, lat_norm, lon_norm")
    print(f"   Distance range: {gdf['distance_to_center_km'].min():.2f} - {gdf['distance_to_center_km'].max():.2f} km")

    return gdf

# Rebind weather_gdf to the frame returned by add_spatial_features (which works
# on a copy), then show the first rows as this cell's output.
weather_gdf = add_spatial_features(weather_gdf)
weather_gdf.head()

## 5. Temporal Features

In [None]:
def add_temporal_features(gdf, date_col='date'):
    """Add temporal features to GeoDataFrame.

    Adds ``year``, ``month``, ``day``, ``day_of_year``, ``week_of_year``
    (ISO week), ``season`` and sine/cosine encodings of month and day of year
    to a copy of ``gdf``.

    Args:
        gdf: Frame containing the date column.
        date_col: Name of the date column. If it is not already a datetime
            column it is parsed with ``pd.to_datetime(errors='coerce')``.

    Returns:
        A new frame with the extra columns, or ``gdf`` itself (unchanged) when
        ``date_col`` is not present. Rows with a missing date (NaT) get NaN in
        every derived column.
    """
    print("üìÖ Adding temporal features...")

    if date_col not in gdf.columns:
        print(f"   ‚ö†Ô∏è Date column '{date_col}' not found. Skipping.")
        return gdf

    gdf = gdf.copy()

    # The .dt accessor only works on datetime columns; parse anything else.
    if not pd.api.types.is_datetime64_any_dtype(gdf[date_col]):
        gdf[date_col] = pd.to_datetime(gdf[date_col], errors='coerce')
    dates = gdf[date_col].dt

    gdf['year'] = dates.year
    gdf['month'] = dates.month
    gdf['day'] = dates.day
    gdf['day_of_year'] = dates.dayofyear

    # isocalendar() yields a nullable integer column; casting it to int raises
    # as soon as one date is NaT, so fall back to float (NaN) in that case.
    iso_week = dates.isocalendar().week
    if iso_week.isna().any():
        gdf['week_of_year'] = iso_week.astype('float64')
    else:
        gdf['week_of_year'] = iso_week.astype(int)

    # 1 = Dec-Feb, 2 = Mar-May, 3 = Jun-Aug, 4 = Sep-Nov
    gdf['season'] = (gdf['month'] % 12 // 3 + 1)

    # Cyclical encoding
    gdf['month_sin'] = np.sin(2 * np.pi * gdf['month'] / 12)
    gdf['month_cos'] = np.cos(2 * np.pi * gdf['month'] / 12)
    # Use the real length of each year so day 366 of a leap year closes the
    # cycle instead of overshooting it.
    days_in_year = np.where(dates.is_leap_year, 366, 365)
    gdf['day_sin'] = np.sin(2 * np.pi * gdf['day_of_year'] / days_in_year)
    gdf['day_cos'] = np.cos(2 * np.pi * gdf['day_of_year'] / days_in_year)

    print(f"   ‚úÖ Added: year, month, day, season, cyclical encodings")

    return gdf

# Rebind weather_gdf to the result of add_temporal_features (default date
# column 'date'; the frame comes back unchanged if that column is absent),
# then show the first rows as this cell's output.
weather_gdf = add_temporal_features(weather_gdf)
weather_gdf.head()

## 6. Visualize Data on Map

In [None]:
def create_map(gdf, max_points=100, popup_cols=None):
    """Create interactive map of data points.

    Args:
        gdf: GeoDataFrame of point geometries; x is used as longitude and y as
            latitude.
        max_points: Maximum number of markers. Larger inputs are randomly
            sampled down to this many rows (fixed seed, so reproducible).
        popup_cols: Column names to show in each marker popup. Columns absent
            from the data are skipped. Defaults to
            ['date', 'temperature', 'precipitation', 'district_id', 'district'].

    Returns:
        folium.Map centered on the mean coordinate of all points in ``gdf``.

    Raises:
        ValueError: If ``gdf`` has no rows (there is no center to show).
    """
    print(f"üó∫Ô∏è Creating interactive map...")

    if len(gdf) == 0:
        raise ValueError("Cannot create a map from an empty GeoDataFrame")

    if popup_cols is None:
        popup_cols = ['date', 'temperature', 'precipitation', 'district_id', 'district']

    # Sample data if too many points
    if len(gdf) > max_points:
        gdf_sample = gdf.sample(max_points, random_state=42)
        print(f"   Showing {max_points} random points (out of {len(gdf):,})")
    else:
        gdf_sample = gdf

    # Calculate center (from the full data, not just the sample)
    center_lat = gdf.geometry.y.mean()
    center_lon = gdf.geometry.x.mean()

    # Create map
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=10,
        tiles='CartoDB positron'
    )

    # Resolve once which popup columns actually exist.
    shown_cols = [col for col in popup_cols if col in gdf_sample.columns]

    # Add points
    for idx, row in gdf_sample.iterrows():
        # Popups are rendered as HTML, so escape data values: a stray '<' or
        # '&' in the data must not break or inject markup.
        popup_lines = [f"<b>Point {html.escape(str(idx))}</b>"]
        popup_lines += [
            f"{html.escape(str(col))}: {html.escape(str(row[col]))}"
            for col in shown_cols
        ]
        popup_text = "<br>".join(popup_lines) + "<br>"

        folium.CircleMarker(
            location=[row.geometry.y, row.geometry.x],
            radius=5,
            popup=popup_text,
            color='blue',
            fill=True,
            fillOpacity=0.6
        ).add_to(m)

    print(f"   ‚úÖ Map created")
    return m

# Build the map with at most 100 markers; leaving map_obj as the last
# expression makes it this cell's displayed output.
map_obj = create_map(weather_gdf, max_points=100)
map_obj

## 7. Export Processed Data

In [None]:
print("üíæ Exporting processed data...")
print("=" * 70)

# Export as GeoJSON (with geometry)
weather_gdf.to_file('processed_data.geojson', driver='GeoJSON')
print("‚úÖ processed_data.geojson")

# Export as CSV (without geometry)
weather_df = weather_gdf.drop('geometry', axis=1)
weather_df.to_csv('processed_data.csv', index=False)
print(f"‚úÖ processed_data.csv ({len(weather_df):,} rows)")

# Export feature list
with open('feature_list.txt', 'w') as f:
    f.write("FEATURES LIST\n")
    f.write("=" * 50 + "\n\n")
    for i, col in enumerate(weather_df.columns, 1):
        f.write(f"{i:3d}. {col}\n")
print(f"‚úÖ feature_list.txt ({len(weather_df.columns)} features)")

print("=" * 70)
print("\n‚úÖ All files exported successfully!")

## 8. Summary

In [None]:
# Report banner
print("=" * 70, "FINAL SUMMARY", "=" * 70, sep="\n")

# Size of the processed dataset
print("\nüìä Processed Data:")
print(f"   Total records: {len(weather_gdf):,}")
print(f"   Total features: {weather_gdf.shape[1]}")
print(f"   Unique locations: {weather_gdf.geometry.nunique()}")

# Time span, reported only when a 'date' column is present
if 'date' in weather_gdf:
    print("\nüìÖ Temporal Coverage:")
    print(f"   Date range: {weather_gdf['date'].min()} to {weather_gdf['date'].max()}")
    print(f"   Total days: {(weather_gdf['date'].max() - weather_gdf['date'].min()).days}")

# Spatial extent and the largest distance from the center point
print("\nüìç Geographic Coverage:")
print(f"   Latitude range: {weather_gdf.geometry.y.min():.4f} to {weather_gdf.geometry.y.max():.4f}")
print(f"   Longitude range: {weather_gdf.geometry.x.min():.4f} to {weather_gdf.geometry.x.max():.4f}")
print(f"   Distance spread: {weather_gdf['distance_to_center_km'].max():.2f} km")

# Engineered features; the temporal line appears only if those columns exist
print("\nüîß Features Added:")
print("   Spatial: lat, lon, lat_norm, lon_norm, distance_to_center_km")
if 'month_sin' in weather_gdf:
    print("   Temporal: year, month, day, season, cyclical encodings")

# Files written by the export step
print(
    "\nüìÅ Output Files:",
    "   ‚Ä¢ processed_data.geojson (with geometry)",
    "   ‚Ä¢ processed_data.csv (tabular data)",
    "   ‚Ä¢ feature_list.txt (feature names)",
    sep="\n",
)

# Closing banner
print(
    "\n" + "=" * 70,
    "‚úÖ PART A COMPLETED - Ready for Model Training!",
    "=" * 70,
    sep="\n",
)