# Part A: Data Processing with GeoPandas
## Weather Emergency Prediction - Geospatial Approach

This notebook focuses on **geospatial data processing** using GeoPandas:
- ✅ Load geodata (latitude/longitude)
- ✅ Create GeoDataFrames with geometry
- ✅ Spatial operations (buffers, distances, intersections)
- ✅ District/region assignment
- ✅ Spatial aggregation
- ✅ Interactive maps with Folium
- ✅ Feature engineering with spatial context

In [None]:
# Install required packages (run once per environment). The leading "!" is a
# notebook shell escape: the line is handed to the system shell, not to Python.
!pip install pandas numpy geopandas shapely folium matplotlib seaborn plotly scikit-learn contextily

In [None]:
import pandas as pd
import numpy as np
import geopandas as gpd
from shapely.geometry import Point, Polygon, LineString
from shapely.ops import nearest_points
import folium
from folium import plugins
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from datetime import datetime, timedelta
from sklearn.preprocessing import StandardScaler

# Seed NumPy's global random generator so the np.random draws made later in
# this notebook are reproducible from run to run
np.random.seed(42)

# Confirm the import cell ran to completion and report the GeoPandas version
print("‚úÖ All libraries imported successfully")
print(f"GeoPandas version: {gpd.__version__}")

## 1. Define Geographic Districts (8 Districts)

We'll create 8 districts with:
- Center coordinates (lat/lon)
- Approximate boundaries (polygons)
- Metadata (population, area)

In [None]:
# Define 8 districts with coordinates.
# Each entry is keyed by district id and holds the district center
# (center_lat / center_lon, decimal degrees), a Russian display name,
# a population count and an area in square kilometres.
DISTRICTS = {
    "District_1": {
        "center_lat": 47.2220,
        "center_lon": 39.7180,
        "name_ru": "Район 1",
        "population": 150000,
        "area_km2": 43.5
    },
    "District_2": {
        "center_lat": 47.2580,
        "center_lon": 39.7850,
        "name_ru": "Район 2",
        "population": 97000,
        "area_km2": 65.3
    },
    "District_3": {
        "center_lat": 47.2750,
        "center_lon": 39.7320,
        "name_ru": "Район 3",
        "population": 207000,
        "area_km2": 45.2
    },
    "District_4": {
        "center_lat": 47.2180,
        "center_lon": 39.6420,
        "name_ru": "Район 4",
        "population": 180000,
        "area_km2": 52.8
    },
    "District_5": {
        "center_lat": 47.1980,
        "center_lon": 39.7680,
        "name_ru": "Район 5",
        "population": 165000,
        "area_km2": 48.7
    },
    "District_6": {
        "center_lat": 47.2420,
        "center_lon": 39.6850,
        "name_ru": "Район 6",
        "population": 135000,
        "area_km2": 41.3
    },
    "District_7": {
        "center_lat": 47.2640,
        "center_lon": 39.7180,
        "name_ru": "Район 7",
        "population": 123000,
        "area_km2": 38.9
    },
    "District_8": {
        "center_lat": 47.2380,
        "center_lon": 39.7420,
        "name_ru": "Район 8",
        "population": 175000,
        "area_km2": 47.1
    }
}

# Echo what was defined so the cell output doubles as a quick sanity check
print(f"✅ Defined {len(DISTRICTS)} districts")
for district_id, info in DISTRICTS.items():
    print(f"   {district_id}: ({info['center_lat']:.4f}, {info['center_lon']:.4f})")

## 2. Create Districts GeoDataFrame

Convert district data into a GeoPandas GeoDataFrame with Point geometries.

In [None]:
def create_districts_geodataframe(districts_dict):
    """Build a point GeoDataFrame (EPSG:4326) with one row per district.

    Each value of ``districts_dict`` must provide ``name_ru``, ``population``,
    ``area_km2``, ``center_lat`` and ``center_lon``. The dictionary key is
    stored in the ``district_id`` column and the center becomes the geometry.
    A short summary (row count, CRS, bounds) is printed before returning.
    """
    records = [
        {
            'district_id': key,
            'name_ru': attrs['name_ru'],
            'population': attrs['population'],
            'area_km2': attrs['area_km2'],
            # shapely points take (x, y), i.e. (longitude, latitude)
            'geometry': Point(attrs['center_lon'], attrs['center_lat']),
        }
        for key, attrs in districts_dict.items()
    ]

    districts = gpd.GeoDataFrame(records, crs='EPSG:4326')

    print(f"‚úÖ Created GeoDataFrame with {len(districts)} districts")
    print(f"   CRS: {districts.crs}")
    print(f"   Bounds: {districts.total_bounds}")

    return districts

districts_gdf = create_districts_geodataframe(DISTRICTS)
districts_gdf.head()

## 3. Create District Polygons (Voronoi-like)

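Create approximate polygon boundaries by buffering each district center with a fixed-radius circle. Unlike a true Voronoi tessellation, neighbouring buffers may overlap.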

In [None]:
def create_district_polygons(districts_gdf, buffer_km=5):
    """Create circular polygon buffers around district centers.

    The buffer is computed in a local UTM projection estimated from the data.
    Web Mercator (EPSG:3857) is not used because its "metres" are stretched by
    roughly 1/cos(latitude): at about 47°N a nominal 5 km buffer would cover
    only about 3.4 km on the ground.

    Args:
        districts_gdf: GeoDataFrame of district centers with a CRS set.
        buffer_km: Buffer radius in kilometres.

    Returns:
        A new GeoDataFrame with the same attribute columns whose geometry is
        the buffer polygon of each input geometry, expressed in EPSG:4326.
        The input frame is not modified.
    """

    # Project to a metric CRS centred on the data so buffer distances are true metres
    metric_crs = districts_gdf.estimate_utm_crs()
    gdf_proj = districts_gdf.to_crs(metric_crs)

    # Create buffers (in meters)
    gdf_proj['geometry'] = gdf_proj.geometry.buffer(buffer_km * 1000)

    # Convert back to WGS84
    gdf_polygons = gdf_proj.to_crs('EPSG:4326')

    print(f"✅ Created polygon buffers ({buffer_km} km radius)")
    return gdf_polygons

districts_polygons = create_district_polygons(districts_gdf, buffer_km=5)
districts_polygons.head()

## 4. Generate Weather Data with Geodata

Generate synthetic weather data for each district over 30 years.

In [None]:
def generate_weather_geodata(districts_dict, start_year=2015, num_years=30):
    """Generate synthetic daily weather records for every district.

    One record is produced per district per calendar day, from 1 January of
    ``start_year`` up to and including 31 December of the last year. Leap days
    are included, so the range always covers whole calendar years.

    Values are drawn from NumPy's global random generator; seed it beforehand
    for reproducible output. Each district receives a single temperature
    offset that stays fixed for the whole series (its "microclimate"). Daily
    noise is added on top of a sinusoidal seasonal curve.

    Args:
        districts_dict: Mapping of district id to a dict holding at least
            ``center_lat`` and ``center_lon``.
        start_year: First calendar year to generate.
        num_years: Number of calendar years to generate.

    Returns:
        A pandas DataFrame with the columns ``date``, ``district_id``,
        ``latitude``, ``longitude``, ``temperature``, ``precipitation``,
        ``humidity``, ``wind_speed`` and ``pressure``. Numeric weather values
        are rounded to one decimal. The frame is empty, but still has these
        columns, when no days or no districts are requested.
    """

    print(f"🌤️ Generating {num_years} years of weather data for {len(districts_dict)} districts...")

    columns = [
        'date', 'district_id', 'latitude', 'longitude',
        'temperature', 'precipitation', 'humidity', 'wind_speed', 'pressure',
    ]

    # Count real calendar days so leap years do not shorten the range
    start_date = datetime(start_year, 1, 1)
    end_date = datetime(start_year + num_years, 1, 1)
    num_days = (end_date - start_date).days

    # One persistent offset per district; redrawing it per record would make
    # it indistinguishable from the daily noise term
    district_offsets = {
        district_id: np.random.normal(0, 1) for district_id in districts_dict
    }

    data = []
    for day in range(num_days):
        current_date = start_date + timedelta(days=day)
        day_of_year = current_date.timetuple().tm_yday

        # Seasonal terms depend only on the date, so compute them once per day
        season_angle = 2 * np.pi * day_of_year / 365
        base_temp = 10 + 15 * np.sin(season_angle)
        precip_prob = 0.3 + 0.2 * np.sin(season_angle + np.pi / 2)
        humidity_mean = 50 + 20 * np.sin(season_angle + np.pi / 2)

        for district_id, info in districts_dict.items():
            # Seasonal temperature with district variation
            temperature = base_temp + district_offsets[district_id] + np.random.normal(0, 3)

            # Precipitation: dry day unless the random draw falls below precip_prob
            precipitation = np.random.gamma(2, 5) if np.random.random() < precip_prob else 0

            # Humidity, clipped to the valid percentage range
            humidity = np.clip(humidity_mean + np.random.normal(0, 10), 0, 100)

            # Wind speed (gamma draws are already non-negative)
            wind_speed = np.random.gamma(3, 2)

            # Pressure
            pressure = 1013 + np.random.normal(0, 8)

            data.append({
                'date': current_date,
                'district_id': district_id,
                'latitude': info['center_lat'],
                'longitude': info['center_lon'],
                'temperature': round(temperature, 1),
                'precipitation': round(precipitation, 1),
                'humidity': round(humidity, 1),
                'wind_speed': round(wind_speed, 1),
                'pressure': round(pressure, 1)
            })

    # Passing the column list keeps the schema intact even when data is empty
    df = pd.DataFrame(data, columns=columns)
    print(f"   ✅ Generated {len(df):,} records")
    print(f"   📅 Date range: {df['date'].min()} to {df['date'].max()}")

    return df

# Build the full synthetic dataset: 30 years starting 1 January 2015, so the
# generated dates run into 2044. Then preview the first rows.
weather_df = generate_weather_geodata(DISTRICTS, start_year=2015, num_years=30)
weather_df.head()

## 5. Convert Weather Data to GeoDataFrame

Transform the weather DataFrame into a GeoDataFrame with Point geometries.

In [None]:
def create_weather_geodataframe(weather_df):
    """Wrap a weather table in a GeoDataFrame with EPSG:4326 point geometry.

    One shapely ``Point(longitude, latitude)`` is built per row from the
    ``longitude`` and ``latitude`` columns and attached as the geometry.
    Row count, number of distinct locations and CRS are printed.
    """
    lons = weather_df['longitude']
    lats = weather_df['latitude']

    # Point takes (x, y), so longitude comes first
    points = list(map(Point, lons, lats))

    weather_points = gpd.GeoDataFrame(weather_df, geometry=points, crs='EPSG:4326')

    print(f"‚úÖ Created Weather GeoDataFrame")
    print(f"   Total points: {len(weather_points):,}")
    print(f"   Unique locations: {weather_points.geometry.nunique()}")
    print(f"   CRS: {weather_points.crs}")

    return weather_points

weather_gdf = create_weather_geodataframe(weather_df)
weather_gdf.head()

## 6. Spatial Operations: Calculate Distances

Calculate distance from each point to region center.

In [None]:
def add_distance_to_center(gdf, center_lat=47.2357, center_lon=39.7015):
    """Add the distance from each geometry to a reference center, in km.

    Distances are measured in a local UTM projection estimated from the data.
    Web Mercator (EPSG:3857) is not used because it stretches lengths by
    roughly 1/cos(latitude), which is about 47% too long near 47°N.

    Args:
        gdf: GeoDataFrame with a CRS set.
        center_lat: Latitude of the reference center (decimal degrees, WGS84).
        center_lon: Longitude of the reference center (decimal degrees, WGS84).

    Returns:
        A copy of ``gdf`` with a ``distance_to_center_km`` column added. The
        input frame is left untouched, matching the other feature helpers in
        this notebook.
    """

    print(f"📏 Calculating distances to center ({center_lat}, {center_lon})...")

    # Work on a copy so the caller's frame is not modified as a side effect
    gdf = gdf.copy()

    # Project both the points and the center into the same metric CRS
    metric_crs = gdf.estimate_utm_crs()
    points_proj = gdf.geometry.to_crs(metric_crs)
    center_proj = (
        gpd.GeoSeries([Point(center_lon, center_lat)], crs='EPSG:4326')
        .to_crs(metric_crs)
        .iloc[0]
    )

    # Calculate distance in km (projected units are metres)
    gdf['distance_to_center_km'] = points_proj.distance(center_proj) / 1000

    print(f"   ✅ Distance range: {gdf['distance_to_center_km'].min():.2f} - {gdf['distance_to_center_km'].max():.2f} km")

    return gdf

weather_gdf = add_distance_to_center(weather_gdf)
weather_gdf[['date', 'district_id', 'temperature', 'distance_to_center_km']].head(10)

## 7. Spatial Join: Assign Districts

Use spatial join to verify district assignments.

In [None]:
def spatial_join_districts(weather_gdf, districts_polygons):
    """Assign each weather point to a district polygon via a spatial join.

    The polygon frame's ``district_id`` is renamed to ``district_assigned``
    before joining. This keeps the weather frame's own ``district_id`` column
    under its original name instead of being suffixed by the join.

    District polygons may overlap, so a point can fall inside several of them.
    Exactly one output row is kept per input row: the match whose
    ``district_assigned`` equals the row's own ``district_id`` when there is
    one, otherwise the first match. Unmatched points keep a missing
    ``district_assigned``.

    Args:
        weather_gdf: Point GeoDataFrame of weather records.
        districts_polygons: Polygon GeoDataFrame with a ``district_id`` column.

    Returns:
        A new GeoDataFrame with the rows, order and index of ``weather_gdf``,
        plus ``district_assigned`` and any polygon attribute columns that the
        weather frame does not already have.

    Raises:
        KeyError: If ``districts_polygons`` has no ``district_id`` column.
    """

    print("🗺️ Performing spatial join...")

    if 'district_id' not in districts_polygons.columns:
        raise KeyError("districts_polygons must contain a 'district_id' column")

    # Drop a stale assignment so re-running the cell does not create suffixed
    # duplicates, and tag rows positionally for de-duplication afterwards
    left = weather_gdf.drop(columns=['district_assigned'], errors='ignore')
    left = left.assign(_row=np.arange(len(left)))

    # Rename the polygon id and remove any other attribute that would clash
    # with a weather column (sjoin would otherwise suffix both sides)
    right = districts_polygons.rename(columns={'district_id': 'district_assigned'})
    geom_col = right.geometry.name
    clashing = [c for c in right.columns if c != geom_col and c in left.columns]
    right = right.drop(columns=clashing)

    # Spatial join: which weather points fall within which district polygons
    joined = gpd.sjoin(left, right, how='left', predicate='within')
    joined = joined.drop(columns=['index_right'], errors='ignore')

    # Overlapping polygons yield several rows per point; prefer the match that
    # agrees with the recorded district, then keep one row per original point
    if 'district_id' in joined.columns:
        joined['_agrees'] = joined['district_assigned'] == joined['district_id']
    else:
        joined['_agrees'] = False
    joined = (
        joined.sort_values(['_row', '_agrees'], ascending=[True, False])
        .drop_duplicates(subset='_row', keep='first')
        .drop(columns=['_row', '_agrees'])
    )

    print(f"   ✅ Spatial join complete")
    print(f"   Points assigned: {joined['district_assigned'].notna().sum():,}")

    return joined

weather_gdf = spatial_join_districts(weather_gdf, districts_polygons)
weather_gdf.head()

## 8. Spatial Aggregation by District

Aggregate weather data by district and time period.

In [None]:
def aggregate_by_district(gdf, time_period='M'):
    """Aggregate weather data by district and time period.

    The period key is derived from the ``date`` column and used only for
    grouping. No ``period`` column is written back to ``gdf``, so the caller's
    frame is left unchanged.

    Args:
        gdf: Frame with the columns ``date`` (datetime), ``district_id``,
            ``temperature``, ``precipitation``, ``humidity``, ``wind_speed``,
            ``pressure``, ``latitude`` and ``longitude``.
        time_period: pandas period frequency passed to ``dt.to_period``
            (for example ``'M'`` for months).

    Returns:
        A DataFrame with one row per (district, period) and flattened column
        names: ``district_id``, ``period``, ``temperature_mean``,
        ``temperature_min``, ``temperature_max``, ``temperature_std``,
        ``precipitation_sum``, ``precipitation_mean``, ``humidity_mean``,
        ``wind_speed_mean``, ``pressure_mean``, ``latitude_first`` and
        ``longitude_first``.
    """

    print(f"📊 Aggregating by district (period: {time_period})...")

    # Grouping key kept as a standalone Series so the input is not mutated
    period = gdf['date'].dt.to_period(time_period).rename('period')

    # Aggregate
    agg_data = gdf.groupby(['district_id', period]).agg({
        'temperature': ['mean', 'min', 'max', 'std'],
        'precipitation': ['sum', 'mean'],
        'humidity': 'mean',
        'wind_speed': 'mean',
        'pressure': 'mean',
        'latitude': 'first',
        'longitude': 'first'
    }).reset_index()

    # Flatten the two-level column names; the group keys have an empty second
    # level, hence the strip of the trailing underscore
    agg_data.columns = ['_'.join(col).strip('_') for col in agg_data.columns.values]

    print(f"   ✅ Created {len(agg_data)} aggregated records")

    return agg_data

# Monthly ('M') aggregation per district; preview the first ten rows
monthly_agg = aggregate_by_district(weather_gdf, time_period='M')
monthly_agg.head(10)

## 9. Create Interactive Map with Folium

Visualize districts and weather data on an interactive map.

In [None]:
def create_interactive_map_with_polygons(districts_gdf, districts_polygons, weather_sample_gdf=None,
                                         pop_range=(90000, 210000)):
    """Create interactive Folium map with district polygons AND center points.

    Args:
        districts_gdf: GeoDataFrame of districts with ``district_id``,
            ``population`` and ``area_km2``. The centroid of each geometry is
            drawn as the district center and also sets the initial map view.
        districts_polygons: GeoDataFrame of district polygons with the same
            attribute columns, drawn as semi-transparent shapes.
        weather_sample_gdf: Optional point GeoDataFrame with ``district_id``,
            ``temperature``, ``precipitation`` and ``date``; each row is
            drawn as a small marker.
        pop_range: ``(low, high)`` population values mapped to the two ends of
            the blue-to-red fill gradient. Populations outside the range are
            clamped so the generated hex colour is always valid.

    Returns:
        The populated ``folium.Map``.
    """

    print("🗺️ Creating interactive map with polygons and points...")

    # Shapely-level centroids are the identity for points and avoid the
    # warning GeoPandas raises for centroid on a geographic CRS
    centers = [geom.centroid for geom in districts_gdf.geometry]
    center_lat = sum(pt.y for pt in centers) / len(centers)
    center_lon = sum(pt.x for pt in centers) / len(centers)

    # Create map
    m = folium.Map(
        location=[center_lat, center_lon],
        zoom_start=11,
        tiles='CartoDB positron'
    )

    # Add district POLYGONS with transparency
    pop_low, pop_high = pop_range
    pop_span = pop_high - pop_low
    for _, row in districts_polygons.iterrows():
        # Create color based on population (gradient), clamped to [0, 1]
        pop_normalized = (row['population'] - pop_low) / pop_span if pop_span else 0.0
        pop_normalized = min(max(pop_normalized, 0.0), 1.0)
        color_intensity = int(255 * pop_normalized)
        fill_color = f'#{color_intensity:02x}00{255-color_intensity:02x}'

        folium.GeoJson(
            row.geometry.__geo_interface__,
            # fc is bound as a default so each polygon keeps its own colour
            style_function=lambda x, fc=fill_color: {
                'fillColor': fc,
                'color': '#000000',
                'weight': 2,
                'fillOpacity': 0.3
            },
            tooltip=f"{row['district_id']}<br>Pop: {row['population']:,}<br>Area: {row['area_km2']} km²"
        ).add_to(m)

    # Add district CENTER POINTS (dots) from the frame that was passed in,
    # rather than rebuilding them from a module-level constant
    for (_, row), center in zip(districts_gdf.iterrows(), centers):
        folium.CircleMarker(
            location=[center.y, center.x],
            radius=8,
            popup=f"<b>{row['district_id']}</b><br>Pop: {row['population']:,}<br>Area: {row['area_km2']} km²",
            color='darkblue',
            fill=True,
            fillColor='blue',
            fillOpacity=0.8,
            weight=2
        ).add_to(m)

    # Add weather sample points if provided
    if weather_sample_gdf is not None:
        for _, row in weather_sample_gdf.iterrows():
            folium.CircleMarker(
                location=[row.geometry.y, row.geometry.x],
                radius=4,
                popup=f"<b>Weather Point</b><br>District: {row['district_id']}<br>Temp: {row['temperature']}°C<br>Precip: {row['precipitation']} mm<br>Date: {row['date']}",
                color='orange',
                fill=True,
                fillColor='red',
                fillOpacity=0.6
            ).add_to(m)

    # Add legend
    legend_html = '''
    <div style="position: fixed; bottom: 50px; left: 50px; width: 200px; height: 120px;
                background-color: white; border:2px solid grey; z-index:9999; font-size:12px;
                padding: 10px">
    <b>Legend:</b><br>
    <i style="background:blue; width:15px; height:15px; float:left; margin-right:5px; border-radius:50%"></i> District Centers<br>
    <i style="background:rgba(200,0,55,0.3); width:15px; height:15px; float:left; margin-right:5px;"></i> District Polygons<br>
    <i style="background:red; width:15px; height:15px; float:left; margin-right:5px; border-radius:50%"></i> Weather Points<br>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    print(f"   ✅ Map created with {len(districts_polygons)} polygons, {len(centers)} centers")
    return m

# Sample latest data for visualization
weather_sample = weather_gdf[weather_gdf['date'] == weather_gdf['date'].max()]
map_obj = create_interactive_map_with_polygons(districts_gdf, districts_polygons, weather_sample)
map_obj

## 10. Spatial Feature Engineering

Create spatial features for modeling.

In [None]:
def create_spatial_features(gdf):
    """Return a copy of ``gdf`` extended with coordinate and district features.

    Added columns, in order: ``lat`` and ``lon`` read from the point geometry,
    their z-scored versions ``lat_norm`` and ``lon_norm``, and one
    ``district_<id>`` indicator column per distinct ``district_id`` value.
    The input frame is not modified.
    """
    print(f"üîß Creating spatial features...")

    enriched = gdf.copy()

    # Raw coordinates taken from the geometry column
    enriched['lat'] = enriched.geometry.y
    enriched['lon'] = enriched.geometry.x

    # Standardise each coordinate: subtract the mean, divide by the std
    for axis in ('lat', 'lon'):
        values = enriched[axis]
        enriched[f'{axis}_norm'] = (values - values.mean()) / values.std()

    # One-hot encode the district and append the indicator columns
    one_hot = pd.get_dummies(enriched['district_id'], prefix='district')
    enriched = pd.concat([enriched, one_hot], axis=1)

    print(f"   ‚úÖ Created spatial features")
    print(f"   Total features: {len(enriched.columns)}")

    return enriched

weather_gdf = create_spatial_features(weather_gdf)
print(f"\nFeature columns: {list(weather_gdf.columns)}")

## 11. Temporal Feature Engineering

Add time-based features.

In [None]:
def create_temporal_features(gdf):
    """Return a copy of ``gdf`` with calendar and cyclical date features.

    Reads the datetime column ``date`` and appends, in order: ``year``,
    ``month``, ``day``, ``day_of_year``, ``week_of_year`` (ISO week number),
    ``season`` (1 for Dec-Feb, 2 for Mar-May, 3 for Jun-Aug, 4 for Sep-Nov),
    then sine/cosine encodings of the month (period 12) and of the day of
    year (period 365). The input frame is not modified.
    """
    print(f"üìÖ Creating temporal features...")

    featured = gdf.copy()
    stamp = featured['date'].dt

    # Plain calendar components
    featured['year'] = stamp.year
    featured['month'] = stamp.month
    featured['day'] = stamp.day
    featured['day_of_year'] = stamp.dayofyear
    featured['week_of_year'] = stamp.isocalendar().week.astype(int)

    # December wraps to 0 before the integer division, so Dec-Feb share season 1
    featured['season'] = featured['month'] % 12 // 3 + 1

    # Cyclical encoding: (output prefix, source column, cycle length)
    for prefix, source, cycle in (('month', 'month', 12), ('day', 'day_of_year', 365)):
        angle = 2 * np.pi * featured[source] / cycle
        featured[f'{prefix}_sin'] = np.sin(angle)
        featured[f'{prefix}_cos'] = np.cos(angle)

    print(f"   ‚úÖ Temporal features created")

    return featured

# Append the calendar and cyclical date features, then preview the result
weather_gdf = create_temporal_features(weather_gdf)
weather_gdf.head()

## 12. Save Processed GeoData

Save as GeoJSON and CSV formats.

In [None]:
# Save GeoDataFrame as GeoJSON
print("💾 Saving processed geodata...")

# Save districts (center points)
districts_gdf_centers = create_districts_geodataframe(DISTRICTS)
districts_gdf_centers.to_file('districts_centers.geojson', driver='GeoJSON')
print("   ✅ districts_centers.geojson (points)")

# Save districts (polygons)
districts_polygons.to_file('districts_polygons.geojson', driver='GeoJSON')
print("   ✅ districts_polygons.geojson (polygons)")

# Save weather data (sample - full dataset is large).
# The year filter is bounded on both sides so the file content matches its
# "2020_2025" name; an open-ended ">= 2020" would include every later year.
weather_recent = weather_gdf[weather_gdf['year'].between(2020, 2025)]

# NOTE(review): Period-typed columns (for example a 'period' column left behind
# by an aggregation step) are not a plain JSON type, so they are cast to text
# defensively before writing - confirm whether the installed driver needs this.
period_cols = [
    col for col in weather_recent.columns
    if isinstance(weather_recent[col].dtype, pd.PeriodDtype)
]
if period_cols:
    weather_recent = weather_recent.astype({col: str for col in period_cols})

weather_recent.to_file('weather_geodata_2020_2025.geojson', driver='GeoJSON')
print("   ✅ weather_geodata_2020_2025.geojson")

# Save full dataset as CSV (without geometry column for size)
weather_df_export = weather_gdf.drop('geometry', axis=1)
weather_df_export.to_csv('weather_geodata_full.csv', index=False)
print(f"   ✅ weather_geodata_full.csv ({len(weather_df_export):,} rows)")

# Save monthly aggregation
monthly_agg.to_csv('weather_monthly_aggregated.csv', index=False)
print("   ✅ weather_monthly_aggregated.csv")

print("\n✅ All data saved successfully!")

## 13. Summary Statistics

Generate summary statistics for the processed geodata.

In [None]:
# Report header
print("=" * 70, "GEODATA PROCESSING SUMMARY", "=" * 70, sep="\n")

# Size, coverage and spatial reference of the processed weather frame
print(
    f"\nüìä Weather GeoDataFrame:",
    f"   Total records: {len(weather_gdf):,}",
    f"   Total features: {len(weather_gdf.columns)}",
    f"   Districts: {weather_gdf['district_id'].nunique()}",
    f"   Date range: {weather_gdf['date'].min()} to {weather_gdf['date'].max()}",
    f"   CRS: {weather_gdf.crs}",
    f"   Bounds: {weather_gdf.total_bounds}",
    sep="\n",
)

# Distance-to-center summary
print(
    f"\nüìè Spatial Statistics:",
    f"   Distance to center (avg): {weather_gdf['distance_to_center_km'].mean():.2f} km",
    f"   Distance to center (max): {weather_gdf['distance_to_center_km'].max():.2f} km",
    sep="\n",
)

# Per-district weather summary, rounded to two decimals
print(f"\nüå°Ô∏è Weather Statistics by District:")
district_stats = (
    weather_gdf
    .groupby('district_id')
    .agg({
        'temperature': ['mean', 'min', 'max'],
        'precipitation': 'sum',
        'humidity': 'mean',
    })
    .round(2)
)
print(district_stats)

# Report footer
print(
    "\n" + "=" * 70,
    "‚úÖ PART A COMPLETED - GeoPandas Approach",
    "=" * 70,
    sep="\n",
)
print(f"\nüéØ Ready for Part B: Model Training with Spatial Features")