In [2]:
# Standard Library Imports
from concurrent.futures import ThreadPoolExecutor, as_completed
from joblib import Parallel, delayed
from tqdm import tqdm
from datetime import datetime
from datetime import timedelta
from pathlib import Path
import os
import random

from tqdm.notebook import tqdm
import time

# Third-Party Imports
import ee
import geemap
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
from lightgbm import LGBMClassifier
from shapely.affinity import scale, translate
from skimage import exposure
from sklearn.metrics import classification_report, f1_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder

from shapely import wkt
import geopandas as gpd

import warnings
warnings.filterwarnings('ignore')

import logging
# Set up a logger to capture Rasterio warnings
logging.getLogger("rasterio._env").setLevel(logging.ERROR)

In [3]:
# Project root, expressed relative to the notebook's working directory
# (one level up). The data-loading cell resolves 'data/train.csv' and
# 'data/test.csv' against this path.
root_path = Path("..")

In [4]:
# Initialize Earth Engine against a Google Cloud project.
# The project ID defaults to the one this notebook was developed with; set the
# EE_PROJECT environment variable to point the notebook at your own project
# without editing the code.
# Uncomment the next line once per machine if no credentials are stored yet.
#ee.Authenticate()
EE_PROJECT = os.environ.get("EE_PROJECT", "ee-crop-health-telangana")
ee.Initialize(project=EE_PROJECT)

In [5]:
# Read both CSV splits, parse the WKT 'geometry' column into geometry objects
# and wrap each frame as a GeoDataFrame in EPSG:4326 (lon/lat).
train, test = (
    gpd.GeoDataFrame(
        raw_frame.assign(geometry=raw_frame['geometry'].apply(wkt.loads)),
        crs='epsg:4326',
    )
    for raw_frame in (
        pd.read_csv(root_path / 'data/train.csv'),
        pd.read_csv(root_path / 'data/test.csv'),
    )
)

# Stack the two splits so they can be processed together; the 'dataset'
# column records which split every row came from, and the index is renumbered.
data = pd.concat(
    [split.assign(dataset=label) for label, split in (('train', train), ('test', test))],
    ignore_index=True,
)


In [14]:
import ee
import geemap
import os
import pandas as pd
from concurrent.futures import ThreadPoolExecutor, as_completed
from datetime import timedelta
from tqdm.notebook import tqdm
import time

def setup_download_environment(base_dir='../data/sentinel-2-all', project=None):
    """Create the download directory and initialize Earth Engine.

    Args:
        base_dir: Directory that will receive the downloaded GeoTIFFs. It is
            created (including parents) if it does not exist yet.
        project: Optional Earth Engine / Google Cloud project ID forwarded to
            ``ee.Initialize``. Pass the same project the rest of the notebook
            uses so this call does not re-initialize the session without one.
            ``None`` keeps the previous behaviour (no explicit project).

    Returns:
        ``base_dir`` unchanged, for convenient chaining.
    """
    os.makedirs(base_dir, exist_ok=True)
    ee.Initialize(project=project)
    return base_dir

def _farm_region(geometry):
    """Convert a shapely-style Polygon/MultiPolygon into an ee.Geometry."""
    coords = geometry.__geo_interface__['coordinates']
    if geometry.geom_type == 'Polygon':
        return ee.Geometry.Polygon(coords)
    if geometry.geom_type == 'MultiPolygon':
        # Use the GeoJSON nesting (polygons -> rings -> points). Passing only
        # the exterior rings hands Earth Engine a list of rings instead of a
        # list of polygons and discards any interior rings.
        return ee.Geometry.MultiPolygon(coords)
    raise ValueError(f"Unsupported geometry type: {geometry.geom_type}")


def _closest_image(region, target_date, window_days, cloud_threshold):
    """Return the clipped Sentinel-2 image nearest to target_date, or None."""
    # filterDate() treats the end date as exclusive, so the end is pushed one
    # extra day to really include images taken `window_days` after the target.
    start_date = (target_date - timedelta(days=window_days)).strftime('%Y-%m-%d')
    end_date = (target_date + timedelta(days=window_days + 1)).strftime('%Y-%m-%d')

    collection = (
        ee.ImageCollection('COPERNICUS/S2_HARMONIZED')
        .filterDate(start_date, end_date)
        .filterBounds(region)
        .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE', cloud_threshold))
        .select(['B2', 'B3', 'B4', 'B5', 'B6', 'B7', 'B8', 'B8A', 'B11', 'B12'])
    )

    # Server round-trip: skip this time point when nothing matches.
    if collection.size().getInfo() == 0:
        return None

    target_ee_date = ee.Date(target_date.strftime('%Y-%m-%d'))

    def add_date_distance(image):
        # Tag each image with its absolute distance (in days) to the target.
        image_date = ee.Date(image.get('system:time_start'))
        diff = ee.Number(image_date.difference(target_ee_date, 'day')).abs()
        return image.set('date_diff', diff)

    return (
        collection.map(add_date_distance)
        .sort('date_diff')
        .first()
        .clip(region)
    )


def _export_with_retry(image, region, output_path, attempts=3):
    """Export image to output_path; return True only once the file exists."""
    for attempt in range(attempts):
        try:
            geemap.ee_export_image(
                image,
                filename=output_path,
                scale=10,
                region=region,
                file_per_band=False,
                crs='EPSG:4326'
            )
        except Exception as e:
            print(f"Export attempt {attempt + 1}/{attempts} failed for {output_path}: {e}")
        else:
            # NOTE(review): geemap appears to report download errors by
            # printing rather than raising, so the file on disk is used as the
            # success signal instead of the absence of an exception.
            if os.path.isfile(output_path):
                time.sleep(1)  # brief pause between successful downloads
                return True
            print(f"Export attempt {attempt + 1}/{attempts} produced no file for {output_path}")

        # Exponential back-off, but do not wait after the final attempt.
        if attempt < attempts - 1:
            time.sleep(2 ** attempt)
    return False


def process_single_farm(row, output_dir, cloud_threshold=20,
                        time_points=(-10, -5, 0, 5), window_days=2):
    """Download Sentinel-2 tiles around a farm's harvest date.

    For every offset in ``time_points`` (days relative to ``row['HDate']``) the
    image closest to that date within +/- ``window_days`` is exported as
    ``S2_<FarmID>_<YYYYMMDD>.tif`` into ``output_dir``. Files that already
    exist are reused without contacting Earth Engine.

    Args:
        row: Mapping/Series providing 'HDate', 'FarmID' and 'geometry'
            (an object exposing ``geom_type`` and ``__geo_interface__``).
        output_dir: Directory the GeoTIFFs are written to.
        cloud_threshold: Upper bound (exclusive) for CLOUDY_PIXEL_PERCENTAGE.
        time_points: Day offsets from the harvest date to sample.
        window_days: Search window, in days, on each side of a target date.

    Returns:
        List of paths to files that exist on disk, in ``time_points`` order,
        or ``None`` when no file is available or processing failed. Errors are
        printed, not raised.
    """
    try:
        harvest_date = pd.to_datetime(row['HDate'])
        temporal_files = []
        region = None  # built lazily so fully cached farms need no EE objects

        for days_offset in time_points:
            target_date = harvest_date + timedelta(days=days_offset)
            file_name = f"S2_{row['FarmID']}_{target_date.strftime('%Y%m%d')}.tif"
            output_path = os.path.join(output_dir, file_name)

            if os.path.exists(output_path):
                temporal_files.append(output_path)
                continue

            if region is None:
                region = _farm_region(row['geometry'])

            closest_image = _closest_image(region, target_date, window_days, cloud_threshold)
            if closest_image is None:
                continue

            # Only report the path when the export really produced a file.
            if _export_with_retry(closest_image, region, output_path):
                temporal_files.append(output_path)

        return temporal_files if temporal_files else None

    except Exception as e:
        print(f"Error processing FarmID {row['FarmID']}: {str(e)}")
        return None

def download_and_update_dataset(df, output_dir, max_workers=4, cloud_threshold=20):
    """Download imagery for every row of ``df`` in parallel.

    Each row is handed to ``process_single_farm`` on a thread pool. The input
    frame is not modified; a copy with an extra ``tif_paths`` column is
    returned.

    Args:
        df: Frame with one farm per row (see ``process_single_farm`` for the
            columns that are read).
        output_dir: Directory the GeoTIFFs are written to.
        max_workers: Number of worker threads.
        cloud_threshold: Forwarded to ``process_single_farm``.

    Returns:
        Copy of ``df`` whose ``tif_paths`` column holds, per row, the list of
        downloaded file paths or ``None`` when nothing was downloaded or the
        worker raised.
    """
    df = df.copy()
    df['tif_paths'] = None

    # Results are tracked by row position, not by index label, so frames with
    # duplicate index labels still get exactly one result per row.
    results = [None] * len(df)

    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = {
            executor.submit(process_single_farm, row, output_dir, cloud_threshold): (pos, idx)
            for pos, (idx, row) in enumerate(df.iterrows())
        }

        for future in tqdm(as_completed(futures), total=len(futures)):
            pos, idx = futures[future]
            try:
                results[pos] = future.result()
            except Exception as e:
                print(f"Failed to process index {idx}: {str(e)}")

    # Build a 1-D object array so each cell holds a whole list (or None) and
    # the assignment is positional rather than index-aligned.
    df['tif_paths'] = pd.Series(results, dtype=object).to_numpy()
    return df
# Usage: prepare the download directory, then run the pipeline on the first
# three rows of the combined frame as a quick smoke test.
output_dir = setup_download_environment()
enriched_data = download_and_update_dataset(df=data.iloc[:3], output_dir=output_dir)

  0%|          | 0/3 [00:00<?, ?it/s]

Generating URL ...
Downloading data from https://earthengine.googleapis.com/v1/projects/242223918869/thumbnails/3b0f8322a81a16330daa8db533db213e-68f0db2bd4dd5f542f06130b16a7ed9d:getPixels
Please wait ...
Generating URL ...
Data downloaded to /Users/jonas/Library/CloudStorage/GoogleDrive-jonas.nothnagel@gmail.com/My Drive/giz/coding/telangana-crop-health/data/sentinel-2-all/S2_1326577_20240416.tif
Generating URL ...
Downloading data from https://earthengine.googleapis.com/v1/projects/242223918869/thumbnails/77fcd73043859be516a24d6faf409e64-ed97303d6575d1e51cef1d1a39a7b73d:getPixels
Please wait ...
Downloading data from https://earthengine.googleapis.com/v1/projects/242223918869/thumbnails/48c801882ac4230b32b9b1339378b3fe-f9a815420b353a02d7e8626b01aa9d68:getPixels
Please wait ...
Data downloaded to /Users/jonas/Library/CloudStorage/GoogleDrive-jonas.nothnagel@gmail.com/My Drive/giz/coding/telangana-crop-health/data/sentinel-2-all/S2_1326576_20240404.tif
Generating URL ...
Data downloaded

In [29]:
# Display the combined train/test frame built in the loading cell
# (the 'dataset' column marks which split each row belongs to).
data

Unnamed: 0,FarmID,category,Crop,State,District,Sub-District,SDate,HDate,CropCoveredArea,CHeight,...,CLast,CTransp,IrriType,IrriSource,IrriCount,WaterCov,ExpYield,Season,geometry,dataset
0,1326576,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-25 0:00:00,2024-04-14 0:00:00,97,54,...,Lentil,Transplanting,Flood,Groundwater,4,87,17,Rabi,"POLYGON ((78.18143 17.97888, 78.18149 17.97899...",train
1,1326577,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-11-13 0:00:00,2024-04-26 0:00:00,82,58,...,Lentil,Transplanting,Flood,Canal,5,94,15,Rabi,"POLYGON ((78.17545 17.98107, 78.17578 17.98104...",train
2,1326578,Healthy,Paddy,Telangana,Medak,Kulcharam,2023-12-19 0:00:00,2024-04-28 0:00:00,92,91,...,Lentil,Transplanting,Flood,Canal,3,99,20,Rabi,"POLYGON ((78.16914 17.97621, 78.1693 17.97619,...",train
3,1331836,Diseased,Paddy,Telangana,Medak,Kulcharam,2023-02-11 0:00:00,2024-11-04 0:00:00,91,52,...,Lentil,Transplanting,Flood,Canal,5,92,16,Rabi,"POLYGON ((78.16889 17.97461, 78.16916 17.97471...",train
4,1326579,Diseased,Paddy,Telangana,Medak,Kulcharam,2023-12-12 0:00:00,2024-05-19 0:00:00,94,55,...,Lentil,Transplanting,Flood,Canal,5,97,20,Rabi,"POLYGON ((78.17264 17.96925, 78.17276 17.96926...",train
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10601,551119,,Maize,Telangana,Nirmal,Sarangapur,2023-10-12 0:00:00,2024-07-03 0:00:00,92,225,...,Maize,Drilling,Drip,Groundwater,4,44,39,Rabi,"POLYGON ((78.25909 19.22659, 78.25924 19.22656...",test
10602,916476,,Maize,Telangana,Nirmal,Sarangapur,2023-05-11 0:00:00,2024-04-03 0:00:00,79,236,...,Maize,Seed Drilling,Drip,Canal,3,41,40,Rabi,"POLYGON ((78.2574 19.20149, 78.25735 19.20115,...",test
10603,121860,,Maize,Telangana,Nirmal,Sarangapur,2023-11-14 0:00:00,2024-03-04 0:00:00,87,210,...,Maize,Broadcasting,Sprinkler,Canal,3,43,41,Rabi,"POLYGON ((78.25711 19.19664, 78.25713 19.19696...",test
10604,270936,,Maize,Telangana,Nirmal,Sarangapur,2023-02-12 0:00:00,2024-03-26 0:00:00,95,204,...,Maize,Drilling,Sprinkler,Canal,4,48,39,Rabi,"POLYGON ((78.25624 19.19529, 78.25624 19.19498...",test
