In [11]:
# Cell 1: Setup and Initialize Everything
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import folium
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Make the project's `utils` folder importable.
# The folder is located relative to the kernel's working directory, so this
# cell expects the notebook to be run from its own `notebooks/` directory.
current_dir = os.getcwd()
# abspath() collapses the '..' segment so the path is stable and comparable.
utils_dir = os.path.abspath(os.path.join(current_dir, '..', 'utils'))
if os.path.isdir(utils_dir):
    # Re-running this cell must not stack duplicate entries on sys.path.
    if utils_dir not in sys.path:
        sys.path.insert(0, utils_dir)
    print(f"✅ Added utils to path: {utils_dir}")
else:
    # Best effort only: the project imports that follow will fail if this happens.
    print(f"❌ Utils directory not found at: {utils_dir}")

# Import our modules
from config import Config
from gee_auth import initialize_earth_engine
from gee_data_extractor import GEEDataExtractor
from weather_api import WeatherAPI
from soil_data_collector import SoilDataCollector

# Run banner: title, rule, collection timestamp and the goal of this notebook.
_banner_lines = [
    "🌾 PUNJAB SMART CROP ADVISORY - COMPLETE DATA COLLECTION",
    "=" * 65,
    f"📅 Collection Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}",
    "🎯 Goal: Collect real satellite, weather & soil data for Punjab",
]
print("\n".join(_banner_lines))

# Let the project configuration prepare its directory layout before any
# later cell writes output files.
Config.create_directories()


✅ Added utils to path: c:\Users\DELL\OneDrive\Desktop\SIH2\Punjab_Crop_Advisory\notebooks\..\utils
🌾 PUNJAB SMART CROP ADVISORY - COMPLETE DATA COLLECTION
📅 Collection Date: 2025-09-01 02:37:22
🎯 Goal: Collect real satellite, weather & soil data for Punjab
✅ All directories created


In [15]:
# Cell 2: Initialize All Systems
# Coordinates used for the single-point smoke tests below (reported as
# "Ludhiana" in the status lines).
_PROBE_LAT, _PROBE_LON = 30.9010, 75.8573

print("\n🔧 INITIALIZING ALL SYSTEMS")
print("=" * 35)

# 1) Earth Engine: the helper's return value is used as the success flag.
print("1️⃣ Google Earth Engine...")
gee_success = initialize_earth_engine()

# 2) Weather: build the client, then probe it with one lookup.
print("\n2️⃣ Weather API System...")
weather_api = WeatherAPI()
test_weather = weather_api.get_weather_for_location(_PROBE_LAT, _PROBE_LON)
weather_success = test_weather is not None
_weather_label = '✅ Working' if weather_success else '❌ Failed'
print(f"   Weather API Status: {_weather_label}")

# 3) Soil: build the collector, then probe it with one lookup.
# Any non-None result counts as success, including results produced by the
# collector's fallback model when the remote SoilGrids service is unreachable.
print("\n3️⃣ Soil Data Collection System...")
soil_collector = SoilDataCollector()
test_soil = soil_collector.get_soilgrids_data(_PROBE_LAT, _PROBE_LON)
soil_success = test_soil is not None
_soil_label = '✅ Working' if soil_success else '❌ Failed'
print(f"   Soil Data Status: {_soil_label}")

# 4) Satellite extractor (created regardless of the Earth Engine flag).
print("\n4️⃣ Satellite Data Extractor...")
gee_extractor = GEEDataExtractor()

# Consolidated status table.
print("\n📊 SYSTEM STATUS:")
for _label, _ok in (
    ("🛰️ Google Earth Engine", gee_success),
    ("🌤️ Weather APIs", weather_success),
    ("🌱 Soil Data", soil_success),
):
    _mark = '✅' if _ok else '❌'
    print(f"   {_label}: {_mark}")

# Show one sample value from each probe when the probe returned something truthy.
if test_weather:
    _probe_temp = test_weather['temperature']
    print(f"   🌡️ Current temp in Ludhiana: {_probe_temp:.1f}°C")
if test_soil:
    _probe_ph = test_soil['pH']
    print(f"   🧪 Sample pH for Ludhiana: {_probe_ph:.1f}")



🔧 INITIALIZING ALL SYSTEMS
1️⃣ Google Earth Engine...
🛰️ Initializing Google Earth Engine...
🔑 Service Account: airflow-earth-engine-reader@smart-crop-advisory.iam.gserviceaccount.com
🔑 Project ID: smart-crop-advisory
✅ Google Earth Engine initialized successfully!

2️⃣ Weather API System...
   Weather API Status: ✅ Working

3️⃣ Soil Data Collection System...
✅ Google Earth Engine initialized successfully!

2️⃣ Weather API System...
   Weather API Status: ✅ Working

3️⃣ Soil Data Collection System...
SoilGrids API error: HTTPSConnectionPool(host='rest.soilgrids.org', port=443): Max retries exceeded with url: /soilgrids/v2.0/properties/query?lon=75.8573&lat=30.901&property=phh2o&property=soc&property=sand&property=silt&property=clay&property=nitrogen&depth=0-5cm (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000221ABCD6710>: Failed to resolve 'rest.soilgrids.org' ([Errno 11001] getaddrinfo failed)")), using Punjab model
   Soil Data Status: ✅ Working


In [14]:
# Debugging cell: pick up on-disk edits to `config` without restarting the
# kernel, patch the credentials path if it does not resolve, then retry the
# Earth Engine initialization. Statement order matters here (reload -> rebind
# -> patch -> reload gee_auth -> call), so keep it as is.
# Force reload the config module
import importlib
import config
importlib.reload(config)
from config import Config  # rebind the notebook-level name to the reloaded class

# Test Google Earth Engine Configuration
print("🔍 Testing RELOADED GEE Configuration...")
print(f"Config GEE Path: {Config.GEE_SERVICE_ACCOUNT_JSON}")
# os.path.exists resolves a relative path against the kernel's working directory.
print(f"File exists: {os.path.exists(Config.GEE_SERVICE_ACCOUNT_JSON)}")

# If still not found, let's manually set it
# NOTE(review): this override lives only in the running kernel; the value in
# config.py is unchanged and another reload of `config` would discard it.
if not os.path.exists(Config.GEE_SERVICE_ACCOUNT_JSON):
    Config.GEE_SERVICE_ACCOUNT_JSON = "../config/google_earth_engine.json"
    print(f"✅ Fixed path: {Config.GEE_SERVICE_ACCOUNT_JSON}")
    print(f"File exists now: {os.path.exists(Config.GEE_SERVICE_ACCOUNT_JSON)}")

# Now reload and test GEE auth
# NOTE(review): presumably gee_auth reads Config.GEE_SERVICE_ACCOUNT_JSON when
# initialize_earth_engine() is called — confirm, otherwise the patch above has no effect.
import gee_auth
importlib.reload(gee_auth)
from gee_auth import initialize_earth_engine  # rebind to the reloaded function

print("\n🛰️ Testing Google Earth Engine initialization...")
gee_success = initialize_earth_engine()
print(f"GEE Success: {gee_success}")

🔍 Testing RELOADED GEE Configuration...
Config GEE Path: config/service-account.json
File exists: False
✅ Fixed path: ../config/google_earth_engine.json
File exists now: True

🛰️ Testing Google Earth Engine initialization...
🛰️ Initializing Google Earth Engine...
🔑 Service Account: airflow-earth-engine-reader@smart-crop-advisory.iam.gserviceaccount.com
🔑 Project ID: smart-crop-advisory
✅ Google Earth Engine initialized successfully!
GEE Success: True
✅ Google Earth Engine initialized successfully!
GEE Success: True


In [16]:
# Cell 3: Create Punjab Farm Plot Network
print("\n🗺️ CREATING PUNJAB FARM PLOT NETWORK")
print("=" * 42)

# Ask the extractor for the plot network; each plot is a mapping that is
# indexed by key below (and by 'geometry' in the satellite cell).
farm_plots = gee_extractor.create_punjab_farm_plots(num_plots=50)

# Tabular view of the plots: keep only the scalar identification fields.
plots_df = pd.DataFrame([
    {field: plot[field] for field in ('plot_id', 'latitude', 'longitude', 'district')}
    for plot in farm_plots
])

print(f"✅ Created {len(plots_df)} farm plots across Punjab")

# Bounding box of the generated plots.
_lat = plots_df['latitude']
_lon = plots_df['longitude']
print("\n📍 Geographic Coverage:")
print(f"   Latitude: {_lat.min():.3f}°N to {_lat.max():.3f}°N")
print(f"   Longitude: {_lon.min():.3f}°E to {_lon.max():.3f}°E")

# Number of plots per district, most populated first (value_counts order).
district_counts = plots_df['district'].value_counts()
print("\n🏘️ District Distribution:")
for district, count in district_counts.items():
    print(f"   {district}: {count} plots")

# Persist the plot table for the downstream notebooks.
plots_df.to_csv('../data/raw/punjab_farm_plots.csv', index=False)
print("\n💾 Saved: data/raw/punjab_farm_plots.csv")

# Preview of the first rows.
print("\n🔍 Sample Farm Plots:")
print(plots_df.head())



🗺️ CREATING PUNJAB FARM PLOT NETWORK
✅ Created 50 farm plots across Punjab districts
✅ Created 50 farm plots across Punjab

📍 Geographic Coverage:
   Latitude: 30.009°N to 32.021°N
   Longitude: 74.625°E to 76.869°E

🏘️ District Distribution:
   Mohali: 9 plots
   Patiala: 8 plots
   Bathinda: 8 plots
   Jalandhar: 6 plots
   Gurdaspur: 5 plots
   Kapurthala: 5 plots
   Ludhiana: 5 plots
   Amritsar: 4 plots

💾 Saved: data/raw/punjab_farm_plots.csv

🔍 Sample Farm Plots:
  plot_id   latitude  longitude    district
0  PB_001  31.961465  75.484315   Gurdaspur
1  PB_002  31.211218  75.428835  Kapurthala
2  PB_003  31.311807  75.436676   Jalandhar
3  PB_004  30.577882  76.499115     Patiala
4  PB_005  30.715975  76.694073      Mohali


In [17]:
# Cell 4: Collect Real Satellite Data
print(f"\n🛰️ COLLECTING REAL SATELLITE DATA (SENTINEL-2)")
print("=" * 52)

# Define date range for satellite data
end_date = '2024-08-31'
start_date = '2024-06-01'  # Growing season

print(f"📅 Satellite Data Period: {start_date} to {end_date}")
print(f"🎯 Processing {len(farm_plots)} farm plots...")

satellite_data = []          # exactly one record per farm plot
successful_extractions = 0   # records tagged 'Sentinel2_Real' by the extractor

for idx, plot in enumerate(farm_plots):
    plot_id = plot['plot_id']
    geometry = plot['geometry']

    # Identification fields shared by the success and the failure record.
    plot_info = {
        'plot_id': plot_id,
        'latitude': plot['latitude'],
        'longitude': plot['longitude'],
        'district': plot['district'],
    }

    print(f"🔄 Processing {idx+1}/{len(farm_plots)}: {plot_id}", end=" ... ")

    try:
        # Extract satellite indices; the extractor tags each result with a
        # 'data_source' and may return synthetic values on its own errors.
        sat_data = gee_extractor.extract_ndvi_sentinel2(geometry, start_date, end_date)
        plot_data = {**plot_info, **sat_data}
        # Read the tag inside the try so a malformed result is handled below
        # BEFORE anything is appended (previously such a plot ended up with
        # two records: the partial one and the 'Failed' one).
        is_real = sat_data['data_source'] == 'Sentinel2_Real'
    except Exception as e:
        # Broad on purpose: one bad plot must not abort the whole collection run.
        print(f"❌ Error: {str(e)[:30]}...")
        plot_data = {
            **plot_info,
            'ndvi_mean': np.nan,
            'ndwi_mean': np.nan,
            'data_source': 'Failed',
        }
    else:
        if is_real:
            successful_extractions += 1
            print("✅ Real data")
        else:
            print("⚠️ Synthetic")

    satellite_data.append(plot_data)

# Create satellite DataFrame
satellite_df = pd.DataFrame(satellite_data)

# NOTE: the averages below cover every record regardless of data source
# (NaN values from failed plots are skipped by pandas' mean()).
print(f"\n📊 SATELLITE DATA COLLECTION RESULTS:")
print(f"   ✅ Total processed: {len(satellite_data)}")
print(f"   🛰️ Real Sentinel-2: {successful_extractions}")
print(f"   📈 Average NDVI: {satellite_df['ndvi_mean'].mean():.3f}")
print(f"   💧 Average NDWI: {satellite_df['ndwi_mean'].mean():.3f}")

# Data source breakdown
source_counts = satellite_df['data_source'].value_counts()
print(f"\n📊 Data Sources:")
for source, count in source_counts.items():
    print(f"   {source}: {count} plots")

# Save satellite data
satellite_df.to_csv('../data/raw/punjab_satellite_data.csv', index=False)
print(f"\n💾 Saved: data/raw/punjab_satellite_data.csv")

# Display sample
print(f"\n🔍 Sample Satellite Data:")
display_cols = ['plot_id', 'district', 'ndvi_mean', 'ndwi_mean', 'data_source']
print(satellite_df[display_cols].head())



🛰️ COLLECTING REAL SATELLITE DATA (SENTINEL-2)
📅 Satellite Data Period: 2024-06-01 to 2024-08-31
🎯 Processing 50 farm plots...
🔄 Processing 1/50: PB_001 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 2/50: PB_002 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 2/50: PB_002 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 3/50: PB_003 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 3/50: PB_003 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 4/50: PB_004 ... Sentinel-2 extraction error: 'NoneType' object is not subscriptable, using synthetic data
⚠️ Synthetic
🔄 Processing 4/50: PB_004 ... Sentinel-2 extraction error: 'NoneType' obje

In [18]:
# Cell 5: Collect Weather Data for All Plots
print("\n🌤️ COLLECTING WEATHER DATA FOR ALL PLOTS")
print("=" * 45)

# One bulk call covering every plot listed in plots_df.
weather_df = weather_api.get_weather_for_multiple_locations(plots_df)

# Headline statistics over all plots.
_temps = weather_df['temperature']
_humid = weather_df['humidity']
_rainy_plots = weather_df['rainfall'].gt(0).sum()
_mean_wind = weather_df['wind_speed'].mean()

print("\n📊 WEATHER DATA SUMMARY:")
print(f"   🌡️ Temperature range: {_temps.min():.1f}°C to {_temps.max():.1f}°C")
print(f"   💧 Humidity range: {_humid.min():.0f}% to {_humid.max():.0f}%")
print(f"   🌧️ Plots with rain: {_rainy_plots}")
print(f"   💨 Avg wind speed: {_mean_wind:.1f} km/h")

# How many plots each provider served.
weather_sources = weather_df['data_source'].value_counts()
print("\n📊 Weather Data Sources:")
for source, count in weather_sources.items():
    print(f"   {source}: {count} plots")

# Persist the raw weather table.
weather_df.to_csv('../data/raw/punjab_weather_data.csv', index=False)
print("\n💾 Saved: data/raw/punjab_weather_data.csv")

# Preview of the key columns.
print("\n🔍 Sample Weather Data:")
weather_display_cols = ['plot_id', 'temperature', 'humidity', 'rainfall', 'data_source']
print(weather_df[weather_display_cols].head())



🌤️ COLLECTING WEATHER DATA FOR ALL PLOTS
🌤️ Collecting weather data...
✅ Weather data collected for 50 locations
📊 Data sources used: {'OpenWeatherMap': 50}

📊 WEATHER DATA SUMMARY:
   🌡️ Temperature range: 22.4°C to 24.4°C
   💧 Humidity range: 93% to 98%
   🌧️ Plots with rain: 15
   💨 Avg wind speed: 3.5 km/h

📊 Weather Data Sources:
   OpenWeatherMap: 50 plots

💾 Saved: data/raw/punjab_weather_data.csv

🔍 Sample Weather Data:
  plot_id  temperature  humidity  rainfall     data_source
0  PB_001        22.53        96      0.00  OpenWeatherMap
1  PB_002        23.27        97      1.24  OpenWeatherMap
2  PB_003        23.04        97      0.60  OpenWeatherMap
3  PB_004        23.21        96      0.00  OpenWeatherMap
4  PB_005        23.17        94      0.00  OpenWeatherMap
✅ Weather data collected for 50 locations
📊 Data sources used: {'OpenWeatherMap': 50}

📊 WEATHER DATA SUMMARY:
   🌡️ Temperature range: 22.4°C to 24.4°C
   💧 Humidity range: 93% to 98%
   🌧️ Plots with rain: 15
  

In [19]:
# Cell 6: Collect Comprehensive Soil Data
print("\n🌱 COLLECTING COMPREHENSIVE SOIL DATA")
print("=" * 40)

# One soil record per plot in plots_df.
soil_df = soil_collector.collect_soil_data_for_plots(plots_df)

# Headline ranges for the main soil properties.
_ph = soil_df['pH']
_carbon = soil_df['organic_carbon']
_nitrogen = soil_df['N_available']

print("\n📊 SOIL DATA SUMMARY:")
print(f"   🧪 pH range: {_ph.min():.1f} to {_ph.max():.1f}")
print(f"   🌿 Organic Carbon: {_carbon.min():.2f}% to {_carbon.max():.2f}%")
print(f"   🍃 Nitrogen availability: {_nitrogen.min():.0f} to {_nitrogen.max():.0f} kg/ha")
print("   🏥 Soil Health Status:")

# Plots per soil-health class.
health_counts = soil_df['soil_health_status'].value_counts()
for status, count in health_counts.items():
    print(f"      {status}: {count} plots")

# Records per data source (remote service vs. any fallback the collector used).
soil_sources = soil_df['data_source'].value_counts()
print("\n📊 Soil Data Sources:")
for source, count in soil_sources.items():
    print(f"   {source}: {count}")

# Persist the raw soil table.
soil_df.to_csv('../data/raw/punjab_soil_data.csv', index=False)
print("\n💾 Saved: data/raw/punjab_soil_data.csv")

# Preview of the key columns.
print("\n🔍 Sample Soil Data:")
soil_display_cols = ['plot_id', 'pH', 'organic_carbon', 'N_available', 'soil_health_status', 'data_source']
print(soil_df[soil_display_cols].head())



🌱 COLLECTING COMPREHENSIVE SOIL DATA
🌱 Collecting soil data from global sources...
   Processing PB_001... SoilGrids API error: HTTPSConnectionPool(host='rest.soilgrids.org', port=443): Max retries exceeded with url: /soilgrids/v2.0/properties/query?lon=75.48431496086756&lat=31.961464826625406&property=phh2o&property=soc&property=sand&property=silt&property=clay&property=nitrogen&depth=0-5cm (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000221ABF15810>: Failed to resolve 'rest.soilgrids.org' ([Errno 11001] getaddrinfo failed)")), using Punjab model
✅
   Processing PB_002... SoilGrids API error: HTTPSConnectionPool(host='rest.soilgrids.org', port=443): Max retries exceeded with url: /soilgrids/v2.0/properties/query?lon=75.42883532770341&lat=31.21121798229296&property=phh2o&property=soc&property=sand&property=silt&property=clay&property=nitrogen&depth=0-5cm (Caused by NameResolutionError("<urllib3.connection.HTTPSConnection object at 0x00000221ABF1559

In [20]:
# Cell 7: Generate Realistic Crop Yield Data
# Section banner: title followed by a 42-character rule.
print("\n🌾 GENERATING REALISTIC CROP YIELD DATA")
print("=" * 42)

def generate_realistic_yield_data(plots=None, satellite=None, soil=None, weather=None, seed=None):
    """Generate synthetic crop yield data based on satellite, weather, and soil data.

    For every plot, three years (2022-2024) x three crops (Wheat, Rice, Cotton)
    are simulated. A crop's base yield is scaled by an NDVI factor, a soil
    health factor, a heat-stress factor and a random year-to-year factor, then
    crop-specific penalties are applied and the result is floored at 40% of
    the base yield.

    Args:
        plots: DataFrame with a ``plot_id`` column. Defaults to the notebook's
            ``plots_df``.
        satellite: DataFrame with ``plot_id`` and ``ndvi_mean``. Defaults to
            ``satellite_df``.
        soil: DataFrame with ``plot_id`` and ``soil_health_status``. Defaults
            to ``soil_df``.
        weather: DataFrame with ``plot_id``, ``temperature`` and ``rainfall``.
            Defaults to ``weather_df``.
        seed: Optional seed for the random generator; pass a value to make the
            generated data reproducible.

    Returns:
        pandas.DataFrame with columns ``plot_id``, ``year``, ``crop_type``,
        ``yield_kg_per_hectare``, ``sowing_date`` and ``harvest_date`` (dates
        as zero-padded ``YYYY-MM-DD`` strings). Plots that lack a satellite,
        soil or weather record are reported and skipped.
    """
    # Fall back to the notebook-level frames so the zero-argument call keeps working.
    plots = plots_df if plots is None else plots
    satellite = satellite_df if satellite is None else satellite
    soil = soil_df if soil is None else soil
    weather = weather_df if weather is None else weather

    rng = np.random.default_rng(seed)

    # Punjab major crops and their typical yields (kg/hectare)
    crop_base_yields = {
        'Wheat': 4200,
        'Rice': 5800,
        'Cotton': 480  # Cotton in kg/hectare (fiber)
    }

    # crop -> (sowing month, harvest month, harvest year offset).
    # Wheat is a winter crop harvested the following spring; rice and cotton
    # are summer crops harvested in the same calendar year.
    crop_calendar = {
        'Wheat': (11, 4, 1),
        'Rice': (6, 10, 0),
        'Cotton': (5, 10, 0),
    }

    soil_factors = {'Good': 1.1, 'Medium': 1.0, 'Poor': 0.85}

    print("📊 Generating yield data based on:")
    print("   • Satellite NDVI (crop health)")
    print("   • Soil health status")
    print("   • Weather conditions")
    print("   • Punjab crop patterns")

    # Index each source once by plot_id (first record wins, as before) instead
    # of re-filtering the whole frame for every plot.
    sat_by_plot = satellite.drop_duplicates('plot_id').set_index('plot_id')
    soil_by_plot = soil.drop_duplicates('plot_id').set_index('plot_id')
    weather_by_plot = weather.drop_duplicates('plot_id').set_index('plot_id')

    yield_data = []

    for plot_id in plots['plot_id']:
        if (plot_id not in sat_by_plot.index
                or plot_id not in soil_by_plot.index
                or plot_id not in weather_by_plot.index):
            print(f"⚠️ Skipping {plot_id}: missing satellite, soil or weather record")
            continue

        sat_row = sat_by_plot.loc[plot_id]
        soil_row = soil_by_plot.loc[plot_id]
        weather_row = weather_by_plot.loc[plot_id]

        # Extract key factors (0.6 is the neutral NDVI used when none is available)
        ndvi = sat_row['ndvi_mean'] if pd.notna(sat_row['ndvi_mean']) else 0.6
        soil_health = soil_row['soil_health_status']
        temperature = weather_row['temperature']
        rainfall = weather_row['rainfall']

        # Calculate influence factors
        ndvi_factor = max(0.5, min(1.5, (ndvi - 0.2) / 0.5))  # NDVI impact
        # Unrecognised soil classes are treated as neutral instead of raising.
        soil_factor = soil_factors.get(soil_health, 1.0)

        # Temperature stress (too hot reduces yield); an unknown temperature
        # is treated as no stress.
        if pd.isna(temperature) or temperature < 35:
            temp_factor = 1.0
        else:
            temp_factor = max(0.8, 1.0 - (temperature - 35) * 0.02)

        # Generate yields for 3 years and 3 crops
        for year in [2022, 2023, 2024]:
            for crop, base_yield in crop_base_yields.items():

                # Seasonal variation
                year_factor = rng.normal(1.0, 0.1)

                final_yield = (base_yield *
                               ndvi_factor *
                               soil_factor *
                               temp_factor *
                               year_factor)

                # Crop-specific adjustments
                if crop == 'Cotton' and temperature > 38:  # Cotton heat tolerance
                    final_yield *= 0.9
                elif crop == 'Rice' and rainfall < 1:  # Rice needs water
                    final_yield *= 0.85

                # Ensure minimum viable yield. Applied last so that the
                # penalties above cannot push the result under the floor.
                final_yield = max(base_yield * 0.4, final_yield)

                sow_month, harvest_month, harvest_offset = crop_calendar[crop]
                sow_day = int(rng.integers(1, 20))      # day 1..19
                harvest_day = int(rng.integers(1, 25))  # day 1..24

                yield_data.append({
                    'plot_id': plot_id,
                    'year': year,
                    'crop_type': crop,
                    'yield_kg_per_hectare': round(float(final_yield), 1),
                    'sowing_date': f'{year}-{sow_month:02d}-{sow_day:02d}',
                    'harvest_date': f'{year + harvest_offset}-{harvest_month:02d}-{harvest_day:02d}'
                })

    # Explicit columns keep the schema stable even when no record was produced.
    return pd.DataFrame(yield_data, columns=[
        'plot_id', 'year', 'crop_type', 'yield_kg_per_hectare',
        'sowing_date', 'harvest_date'
    ])

# Run the generator against the frames collected in the previous cells.
yield_df = generate_realistic_yield_data()

# Size and coverage of the generated table.
_crops_seen = list(yield_df['crop_type'].unique())
_years_seen = sorted(yield_df['year'].unique())
print("\n📊 CROP YIELD DATA SUMMARY:")
print(f"   📈 Total yield records: {len(yield_df):,}")
print(f"   🌾 Crops: {_crops_seen}")
print(f"   📅 Years: {_years_seen}")

# Mean yield per crop, highest first.
_mean_by_crop = yield_df.groupby('crop_type')['yield_kg_per_hectare'].mean()
avg_yields = _mean_by_crop.sort_values(ascending=False)
print("\n📊 Average Yields (kg/hectare):")
for crop, yield_val in avg_yields.items():
    print(f"   {crop:<8}: {yield_val:>6.0f} kg/ha")

# Mean yield per (crop, soil health class); the soil class is joined in by plot_id.
_yield_with_soil = pd.merge(
    yield_df,
    soil_df[['plot_id', 'soil_health_status']],
    on='plot_id',
)
yield_soil_health = (
    _yield_with_soil
    .groupby(['crop_type', 'soil_health_status'])['yield_kg_per_hectare']
    .mean()
)

print("\n📊 Yield by Soil Health (kg/ha):")
for crop in yield_df['crop_type'].unique():
    print(f"   {crop}:")
    for health in ['Good', 'Medium', 'Poor']:
        # Only report combinations that actually occur in the data.
        if (crop, health) not in yield_soil_health.index:
            continue
        yield_val = yield_soil_health[(crop, health)]
        print(f"      {health}: {yield_val:.0f} kg/ha")

# Persist the generated yields.
yield_df.to_csv('../data/raw/punjab_crop_yields.csv', index=False)
print("\n💾 Saved: data/raw/punjab_crop_yields.csv")



🌾 GENERATING REALISTIC CROP YIELD DATA
📊 Generating yield data based on:
   • Satellite NDVI (crop health)
   • Soil health status
   • Weather conditions
   • Punjab crop patterns

📊 CROP YIELD DATA SUMMARY:
   📈 Total yield records: 450
   🌾 Crops: ['Wheat', 'Rice', 'Cotton']
   📅 Years: [np.int64(2022), np.int64(2023), np.int64(2024)]

📊 Average Yields (kg/hectare):
   Rice    :   5129 kg/ha
   Wheat   :   4256 kg/ha
   Cotton  :    482 kg/ha

📊 Yield by Soil Health (kg/ha):
   Wheat:
      Good: 4951 kg/ha
      Medium: 4495 kg/ha
      Poor: 3693 kg/ha
   Rice:
      Good: 6068 kg/ha
      Medium: 5316 kg/ha
      Poor: 4425 kg/ha
   Cotton:
      Good: 558 kg/ha
      Medium: 495 kg/ha
      Poor: 426 kg/ha

💾 Saved: data/raw/punjab_crop_yields.csv


In [22]:
# Cell 8: Create Interactive Maps and Visualizations
%pip install folium --quiet
import folium
print(f"\n🗺️ CREATING INTERACTIVE MAPS AND VISUALIZATIONS")
print("=" * 52)
print(f"\n🗺️ CREATING INTERACTIVE MAPS AND VISUALIZATIONS")
print("=" * 52)

def create_comprehensive_punjab_map():
    """Create comprehensive interactive map with all data layers.

    Draws one circle marker per row of the notebook-level ``satellite_df``,
    coloured by NDVI band (>0.7 green, >0.5 orange, otherwise red, gray when
    NDVI is missing). Each popup combines the satellite row with the first
    matching ``soil_df`` and ``weather_df`` record for the same ``plot_id``.

    Missing numbers and missing soil/weather records are shown as ``N/A``
    instead of raising, and rows without coordinates are skipped because a
    marker cannot be placed for them.

    Returns:
        folium.Map: the populated map with a fixed-position legend.
    """

    def _fmt(value, spec, suffix=''):
        """Format a number with ``spec`` plus ``suffix``; 'N/A' when missing."""
        if value is None or pd.isna(value):
            return 'N/A'
        return f"{value:{spec}}{suffix}"

    def _text(record, key):
        """Return ``record[key]`` for display; 'N/A' when absent or missing."""
        value = record.get(key)
        if value is None or pd.isna(value):
            return 'N/A'
        return value

    # Center on Punjab
    punjab_center = [30.9, 75.8]

    # Create base map
    m = folium.Map(
        location=punjab_center,
        zoom_start=8,
        tiles='OpenStreetMap'
    )

    # Index the companion frames once by plot_id (first record wins).
    soil_by_plot = soil_df.drop_duplicates('plot_id').set_index('plot_id')
    weather_by_plot = weather_df.drop_duplicates('plot_id').set_index('plot_id')
    no_record = pd.Series(dtype=object)  # stand-in when a plot has no match

    # Add farm plots with multi-layer information
    for _, row in satellite_df.iterrows():
        plot_id = row['plot_id']

        # A marker needs real coordinates; skip rows that do not have them.
        if pd.isna(row['latitude']) or pd.isna(row['longitude']):
            continue

        # Get corresponding data from other DataFrames
        soil_info = soil_by_plot.loc[plot_id] if plot_id in soil_by_plot.index else no_record
        weather_info = weather_by_plot.loc[plot_id] if plot_id in weather_by_plot.index else no_record

        # Color code by NDVI (crop health)
        ndvi = row.get('ndvi_mean')
        if pd.notna(ndvi):
            if ndvi > 0.7:
                color = 'green'
                health = 'Healthy'
            elif ndvi > 0.5:
                color = 'orange'
                health = 'Moderate'
            else:
                color = 'red'
                health = 'Stressed'
        else:
            color = 'gray'
            health = 'No Data'

        # Create comprehensive popup
        popup_html = f"""
        <div style='width: 300px; font-family: Arial;'>
            <h4>🌾 Farm Plot: {plot_id}</h4>
            <hr>
            <b>📍 Location:</b><br>
            • Coordinates: {row['latitude']:.4f}°N, {row['longitude']:.4f}°E<br>
            • District: {row['district']}<br>
            
            <hr><b>🛰️ Satellite Data:</b><br>
            • NDVI: {_fmt(ndvi, '.3f')} ({health})<br>
            • NDWI: {_fmt(row.get('ndwi_mean'), '.3f')}<br>
            • Data Source: {row['data_source']}<br>
            
            <hr><b>🌱 Soil Health:</b><br>
            • Status: {_text(soil_info, 'soil_health_status')}<br>
            • pH: {_fmt(soil_info.get('pH'), '.1f')}<br>
            • Organic Carbon: {_fmt(soil_info.get('organic_carbon'), '.2f', '%')}<br>
            • N Available: {_fmt(soil_info.get('N_available'), '.0f', ' kg/ha')}<br>
            • Data Source: {_text(soil_info, 'data_source')}<br>
            
            <hr><b>🌤️ Weather Data:</b><br>
            • Temperature: {_fmt(weather_info.get('temperature'), '.1f', '°C')}<br>
            • Humidity: {_fmt(weather_info.get('humidity'), '.0f', '%')}<br>
            • Rainfall: {_fmt(weather_info.get('rainfall'), '.1f', 'mm')}<br>
            • Data Source: {_text(weather_info, 'data_source')}<br>
        </div>
        """

        # Add marker to map
        folium.CircleMarker(
            location=[row['latitude'], row['longitude']],
            radius=8,
            popup=folium.Popup(popup_html, max_width=350),
            color='black',
            weight=1,
            fillColor=color,
            fillOpacity=0.8
        ).add_to(m)

    # Add legend (comparison signs are written as HTML entities so the markup is valid)
    legend_html = f'''
    <div style="position: fixed; 
                bottom: 50px; left: 50px; width: 200px; height: 140px; 
                background-color: white; border:2px solid grey; z-index:9999; 
                font-size:12px; padding: 10px; box-shadow: 3px 3px 10px rgba(0,0,0,0.3);">
    <h4>🌾 Punjab Smart Crop Advisory</h4>
    <p><b>Crop Health (NDVI):</b></p>
    <p><span style="color:green;">●</span> Healthy (&gt;0.7)</p>
    <p><span style="color:orange;">●</span> Moderate (0.5-0.7)</p>
    <p><span style="color:red;">●</span> Stressed (&lt;0.5)</p>
    <p><span style="color:gray;">●</span> No Data</p>
    <small>📅 Data: {datetime.now().strftime('%Y-%m-%d')}</small>
    </div>
    '''
    m.get_root().html.add_child(folium.Element(legend_html))

    return m

# Build the map from the frames collected above.
print("🗺️ Creating interactive map with all data layers...")
punjab_map = create_comprehensive_punjab_map()

# Write a standalone HTML copy next to the processed data.
map_filename = '../data/processed/punjab_comprehensive_farm_map.html'
punjab_map.save(map_filename)

print("\n".join([
    "✅ Interactive map created!",
    f"🗺️ Saved: {map_filename}",
    "💡 Open this file in your browser to explore the data",
]))

# Last expression of the cell: lets the notebook render the map inline.
punjab_map



🗺️ CREATING INTERACTIVE MAPS AND VISUALIZATIONS
🗺️ Creating interactive map with all data layers...
✅ Interactive map created!
🗺️ Saved: ../data/processed/punjab_comprehensive_farm_map.html
💡 Open this file in your browser to explore the data


In [11]:
# Cell 9: Display the Interactive Map and Final Summary
print("🗺️ DISPLAYING INTERACTIVE MAP")
print("=" * 32)

# If the map is not displaying, click "Trust" at the top of the notebook
try:
    # Rebuild the map from the current frames and render it inline.
    punjab_map = create_comprehensive_punjab_map()
    display(punjab_map)
except Exception as e:
    print(f"❌ Error displaying map: {e}")
    print("💡 Please click 'Trust' at the top of this notebook to enable interactive content")
else:
    print("✅ Map should be displayed above!")
    print("💡 If you don't see the map, click 'Trust' at the top of this notebook")

# Final Summary: list every artifact written by the cells above.
print("\n🎯 DATA COLLECTION COMPLETE - FINAL SUMMARY")
print("=" * 48)

print("📊 DATASETS CREATED:")
print("   📁 Raw Data:")
for _raw_file in (
    'punjab_farm_plots.csv',
    'punjab_satellite_data.csv',
    'punjab_weather_data.csv',
    'punjab_soil_data.csv',
    'punjab_crop_yields.csv',
):
    print(f"      • data/raw/{_raw_file}")
print("   📁 Processed Data:")
print("      • data/processed/punjab_comprehensive_farm_map.html")

# Pointer to the next step of the workflow.
print("\n✅ DATA COLLECTION COMPLETE!")
print("🎯 Ready for Feature Engineering and Model Training!")
print("📚 Next: Open notebooks/02_Feature_Engineering_EDA.ipynb")

🗺️ DISPLAYING INTERACTIVE MAP


✅ Map should be displayed above!
💡 If you don't see the map, click 'Trust' at the top of this notebook

🎯 DATA COLLECTION COMPLETE - FINAL SUMMARY
📊 DATASETS CREATED:
   📁 Raw Data:
      • data/raw/punjab_farm_plots.csv
      • data/raw/punjab_satellite_data.csv
      • data/raw/punjab_weather_data.csv
      • data/raw/punjab_soil_data.csv
      • data/raw/punjab_crop_yields.csv
   📁 Processed Data:
      • data/processed/punjab_comprehensive_farm_map.html

✅ DATA COLLECTION COMPLETE!
🎯 Ready for Feature Engineering and Model Training!
📚 Next: Open notebooks/02_Feature_Engineering_EDA.ipynb
