### What resources do we need to train on this large dataset? A GPU

### Completed Tasks:

--> GEE Asset/Crop polygons

--> polygon area estimation for each crop type data

--> Planet/NICFI pixel count for each crop and each polygon

--> Graphic representation for area vs Pixel count

### Next tasks:

--> Cloud masking for Planet (algorithms)

--> Image Normalization: Scale the pixel values (e.g., 0-255) to a range that is typical for neural network inputs, such as 0-1 or -1 to 1.

--> Data Augmentation: To increase the diversity of the training data and prevent overfitting, apply transformations like rotations, translations, scaling, and horizontal flipping.

--> Patch Extraction: For high-resolution images, it might be necessary to create smaller, manageable patches. This makes the training process more efficient and helps in handling large images during deployment.

--> DL (FCNN) training
--> Parallel ML training

### Reading corner about crop mapping in Senegal
SEN4STAT/ESA: https://www.esa-sen4stat.org/user-stories/senegal-prototype/

EOSTAT/FAO: https://data.apps.fao.org/catalog/dataset/5c377b2b-3c2e-4b70-afd7-0c80900b68bb/resource/50bc9ff5-95d2-40cd-af12-6aee2cfcc4ae

access to the data_storage.py: https://drive.google.com/file/d/1-6_x0L6_yxaj3oxwmGJoYbn6luBgcnwX/view?usp=drive_link
access to the code below as script: https://drive.google.com/file/d/1-9158gNZZzkJLlUvEiqUkq6S7cVNLMf4/view?usp=sharing

In [None]:
import ee
# @title Authenticate to the Earth Engine servers
# Run the Earth Engine authentication flow for the current user.
ee.Authenticate()
# Initialize the Earth Engine client against a Google Cloud project.
project_id = 'ee-janet'  # change to your own Google Cloud project ID
ee.Initialize(project=project_id)


In [None]:
from google.colab import drive
# Mount Google Drive so files under /content/drive are accessible from the notebook.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import os
# Work from the shared project folder; presumably this is where the `scripts`
# package imported further down lives — verify for your own Drive layout.
os.chdir('/content/drive/MyDrive/Crop Monitoring/crop_types_data/CSE_team')  # change to your drive path

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# @title Lib imports:
#import ee
#print('Using EE version ', ee.__version__)
import folium
#print('Using Folium version ', folium.__version__)
# NOTE(review): MFD_HUGE_1MB is not referenced in any visible cell and looks
# like an accidental auto-import; it is a platform-specific os constant, so
# this line can fail outside Linux — confirm it is unused and remove it.
from os import MFD_HUGE_1MB
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Iterable, List, Tuple
#from google.colab import auth
import datetime as dt
import time
import geopandas as gpd
from shapely.geometry import shape, Polygon, MultiPolygon


In [None]:
# Install the required packages
!pip install rasterio rasterstats fiona pyogrio geopandas earthpy geemap -q

In [None]:
#@title Running the imported files
import scripts.data_storage as ds  # Importing the data storage script
import scripts.process_raw_data as prd


Total number of features: 5832
Processing features from 0 to 5000
Processing features from 5000 to 10000
(5827, 7)
<class 'geopandas.geodataframe.GeoDataFrame'>


In [None]:
#@title Clean the raw data for all years
# Clean the raw collections for 2018, 2019, 2020 and 2023 in one pass.
# Adjust the directory and batch size below to match your own data.

# Directory where your shapefiles are stored.
shapefile_directory = '/content/drive/MyDrive/Crop Monitoring/crop_types_data/CSE_team/data_2018_2023'
# Batch size handed to prd.fetch_and_process_features.
batch_size = 5000
# Raw collections from data_storage.py, keyed by the year as a string.
data_years = {
    str(data_year): getattr(ds, f'data_{data_year}')
    for data_year in (2018, 2019, 2020, 2023)
}

for year, collection in data_years.items():
    prd.fetch_and_process_features(
        collection,
        year,
        batch_size,
        shapefile_directory,
    )

In [None]:
#@title import the data as GEE asset per subclass

# Cleaned shapefile name for every survey year.
shapefiles = {
    survey_year: f'clean_raw_data_{survey_year}.shp'
    for survey_year in (2018, 2019, 2020, 2023)
}

# Export each (year, subclass) subset through prd.export_to_asset.
for year, shapefile_name in shapefiles.items():
    shapefile_path = f'{shapefile_directory}/{shapefile_name}'
    gdf = gpd.read_file(shapefile_path)
    # Keep only rows that carry an ID (missing IDs occur in the 2020 data).
    has_id = gdf['ID'].notna()
    gdf = gdf[has_id]

    for subclass, subclass_gdf in gdf.groupby('Sub_class'):
        print(f"subclass_gdf for {year}:", subclass_gdf.shape)
        try:
            prd.export_to_asset(subclass_gdf, year, subclass)
        except Exception as exc:
            # Best effort: report the failure and carry on with the next subclass.
            print(f"Error exporting year {year}, subclass {subclass}: {exc}")

        # Pause between exports to avoid overwhelming the EE API.
        time.sleep(10)

In [None]:
# @title Export data using Sentinel 2
# Directory used to build the shapefile paths read by the loop below.
shapefile_directory = '/content/drive/MyDrive/ICRISAT/crop_type_classification/crop_types_data/clean_data_with_bands_s2'
# FeatureCollections from data_storage.py, keyed by '<year>_<subclass>'.
# NOTE(review): this mapping is not referenced again in this cell — confirm
# whether the loop below was meant to iterate over it instead of `shapefiles`.
feature_collections = {
    '2018_Cereals': ds.clean_raw_data_2018_Cereals,
    '2018_Legumes': ds.clean_raw_data_2018_Legumes,
    '2018_Noncrop': ds.clean_raw_data_2018_Noncrop,
    '2018_Tree_Crops': ds.clean_raw_data_2018_Tree_Crops,
    '2018_Vegetables': ds.clean_raw_data_2018_Vegetables,
    '2019_Cereals': ds.clean_raw_data_2019_Cereals,
    '2019_Legumes': ds.clean_raw_data_2019_Legumes,
    '2019_Noncrop': ds.clean_raw_data_2019_Noncrop,
    '2019_Vegetables': ds.clean_raw_data_2019_Vegetables,
    '2020_Bare_Built_Up': ds.clean_raw_data_2020_Bare_Built_Up,
    '2020_Cereals': ds.clean_raw_data_2020_Cereals,
    '2020_Fallow': ds.clean_raw_data_2020_Fallow,
    '2020_Legumes': ds.clean_raw_data_2020_Legumes,
    '2020_Noncrop': ds.clean_raw_data_2020_Noncrop,
    '2020_Other_Vegetation': ds.clean_raw_data_2020_Other_Vegetation,
    '2020_Tree_Crops': ds.clean_raw_data_2020_Tree_Crops,
    '2020_Vegetables': ds.clean_raw_data_2020_Vegetables,
    '2023_Cereals': ds.clean_raw_data_2023_Cereals,
    '2023_Fallow': ds.clean_raw_data_2023_Fallow,
    '2023_Legumes': ds.clean_raw_data_2023_Legumes,
    '2023_Noncrop': ds.clean_raw_data_2023_Noncrop,
    '2023_Tree_Crops': ds.clean_raw_data_2023_Tree_Crops,
    '2023_Vegetables': ds.clean_raw_data_2023_Vegetables
}
# Months (1-12) and spectral indices passed to the Sentinel-2 processor.
months = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
indices = ['NDVI']

# Process and export each subclass for each year.
# `shapefiles` is the year -> shapefile-name dict defined in the GEE asset cell;
# NOTE(review): those names ('clean_raw_data_<year>.shp') are now resolved
# against the Sentinel-2 directory above — confirm the files exist there.
for year, shapefile_name in shapefiles.items():
    # Load the GeoDataFrame from the shapefile
    shapefile_path = f'{shapefile_directory}/{shapefile_name}'
    gdf = gpd.read_file(shapefile_path)
    gdf = gdf[gdf['ID'].notna()]  # Remove rows with NaN values in the 'ID' column

    # Group by subclass and export each subclass
    for subclass, subclass_gdf in gdf.groupby('Sub_class'):
        print(f"subclass_gdf for {year}:", subclass_gdf.shape)
        try:
            # NOTE(review): `esp` is not imported in any visible cell. If it is
            # undefined, the NameError is caught by the broad `except` below and
            # only printed, so every export would fail without stopping the loop.
            subclass_processor = esp.SubclassProcessor(subclass_df=subclass_gdf, year=year, months=months, indices=indices, subclass_name=subclass)
            subclass_processor.export_to_drive(description=f'Sentinel2_Processed_Data_{year}_{subclass}')
        except Exception as e:
            print(f"Error exporting year {year}, subclass {subclass}: {str(e)}")

        # Add a delay between exports to avoid overwhelming the EE API
        time.sleep(10)

#### Optional: Exporting the shapefiles to a GEE asset

### Extracting remote sensing indices and normalized band values

Planet NICFI

**Biannual collection:** `PS_Tropical_Normalized_Analytic_Biannual`

* December 2015, June 2016, December 2016, June 2017, December 2017, June 2018, December 2018, June 2019, December 2019, June 2020
                              
                          

**Monthly collection:** `PS_Tropical_Normalized_Analytic_Monthly`

* September 2020, October 2020, November 2020, December 2020, January 2021, February 2021, March 2021, April 2021, May 2021, June 2021, July 2021, August 2021, September 2021, October 2021, November 2021, December 2021, January 2022, February 2022, March 2022, April 2022
                       

In [None]:
# Switch to the crop-types data folder so the relative paths below resolve.
os.chdir('/content/drive/MyDrive/Crop Monitoring/crop_monitoring/crop_types_data')

# Read the land-cover shapefile of every survey year into its own GeoDataFrame.
shp_2018, shp_2019, shp_2020, shp_2023 = (
    gpd.read_file(f'clean_data_no_bands/land_cover_{survey_year}.shp')
    for survey_year in (2018, 2019, 2020, 2023)
)

class PlanetNICFIProcessor:
    """Sample Planet NICFI basemap composites over a feature collection.

    For every configured date range the Africa NICFI basemaps are filtered to
    the footprint of the features, min-max normalised, given an NDVI band and
    reduced to a median composite. The composites are then averaged over each
    feature and the resulting table is exported to Google Drive.

    Args:
        year: Year label for this run (stored; not read by the methods here).
        date_ranges: Iterable of ``(start_date, end_date)`` pairs.
        indices: Requested index names (stored; NDVI is always computed).
        data_fc: ``ee.FeatureCollection`` whose features are sampled.
    """

    def __init__(self, year, date_ranges, indices, data_fc):
        self.year = year
        self.date_ranges = date_ranges  # List of tuples (start_date, end_date)
        self.indices = indices
        self.data_fc = data_fc

    def calculate_ndvi(self, image):
        """Return ``image`` with an added 'NDVI' band computed from 'N' and 'R'."""
        return image.addBands(image.normalizedDifference(['N', 'R']).rename('NDVI'))

    def normalize_bands(self, image, band_names, geometry):
        """Return ``image`` with a min-max scaled ``<band>_norm`` band per input band.

        Args:
            image: ``ee.Image`` holding the bands listed in ``band_names``.
            band_names: Names of the bands to normalise.
            geometry: Region over which each band's minimum/maximum is taken.

        A band whose statistics are missing, or whose minimum equals its
        maximum, is copied unscaled into its ``_norm`` band.
        """
        def normalize_band(band_name):
            band = image.select(band_name).toFloat()
            min_max = band.reduceRegion(
                reducer=ee.Reducer.minMax(),
                geometry=geometry,
                scale=4.77,
                maxPixels=1e13
            )
            min_raw = min_max.get(ee.String(band_name).cat('_min'))
            max_raw = min_max.get(ee.String(band_name).cat('_max'))

            # reduceRegion yields null statistics when no valid pixel falls in
            # the region. Substituting 0 for both collapses the range to zero,
            # which the validity test below then rejects. (The previous check
            # relied on an `isNumber()` method that does not appear to be part
            # of the ee.Number API.)
            min_val = ee.Number(ee.Algorithms.If(
                ee.Algorithms.IsEqual(min_raw, None), 0, min_raw))
            max_val = ee.Number(ee.Algorithms.If(
                ee.Algorithms.IsEqual(max_raw, None), 0, max_raw))

            # A zero-width range cannot be scaled (division by zero).
            is_valid = max_val.neq(min_val)

            normalized_band = ee.Image(ee.Algorithms.If(
                is_valid,
                band.subtract(min_val).divide(max_val.subtract(min_val)),
                band  # If normalization is not possible, keep the original band
            )).rename(ee.String(band_name).cat('_norm'))

            return normalized_band

        normalized_bands = [normalize_band(band) for band in band_names]
        return image.addBands(ee.Image.cat(normalized_bands))

    def process_all_months(self):
        """Build one median composite per date range.

        Returns:
            ``ee.ImageCollection`` with one composite per non-empty date range.
            Each composite carries the raw and normalised B/G/R/N bands, NDVI
            and a constant 'date' band holding the range start in milliseconds.

        Raises:
            ValueError: If no date range produced any image.
        """
        processed_images = []
        bands = ['B_norm', 'G_norm', 'R_norm', 'N_norm', 'NDVI', 'B', 'G', 'R', 'N']

        for start_date, end_date in self.date_ranges:
            print(f"Processing period: {start_date} to {end_date}")

            region = self.data_fc.geometry()
            nicfi_planet = ee.ImageCollection("projects/planet-nicfi/assets/basemaps/africa") \
                .filterBounds(region) \
                .filter(ee.Filter.date(start_date, end_date))

            # Skip empty periods (one blocking round-trip to the server).
            count = nicfi_planet.size().getInfo()
            if count == 0:
                print(f"No images found for the period: {start_date} to {end_date}")
                continue

            normalized_collection = nicfi_planet.map(
                lambda image: self.normalize_bands(image, ['B', 'G', 'R', 'N'], region))
            ndvi_collection = normalized_collection.map(self.calculate_ndvi)

            # Mapping never changes the collection size, so reuse the count
            # instead of paying for a second getInfo() call.
            print(f"Number of images: {count}")

            composite = ndvi_collection.median().select(bands)

            # Add a date band to the composite
            date_band = ee.Image.constant(ee.Date(start_date).millis()).rename('date')
            processed_images.append(composite.addBands(date_band))

            # Random pause between periods to throttle requests.
            time.sleep(random.uniform(5, 10))

        if not processed_images:
            raise ValueError("No images were processed for any date range")

        return ee.ImageCollection(processed_images)

    def add_bands_to_fc(self, image_collection):
        """Attach a 'time_series' property to every feature of ``self.data_fc``.

        Args:
            image_collection: ``ee.ImageCollection`` of composites to sample.

        Returns:
            The feature collection where each feature has a 'time_series'
            list with one dictionary of per-band mean values per composite.
        """
        # Functions mapped over an ee collection must return a Feature or an
        # Image, while reduceRegion returns a Dictionary. Mapping over an
        # ee.List has no such restriction, so sample from a list of images.
        image_list = image_collection.toList(image_collection.size())

        def sample_image_collection(feature):
            values = image_list.map(lambda image: ee.Image(image).reduceRegion(
                reducer=ee.Reducer.mean(),
                geometry=feature.geometry(),
                scale=4.77,
                maxPixels=1e13
            ))

            return feature.set('time_series', values)

        return self.data_fc.map(sample_image_collection)

    def export_to_drive(self, description, file_format='CSV', folder='clean_data_with_bands_planet_ncifi'):
        """Start an Earth Engine task exporting the sampled features to Google Drive.

        Args:
            description: Task description passed to the export.
            file_format: Output table format.
            folder: Google Drive folder receiving the export.

        Raises:
            ValueError: Propagated from ``process_all_months`` when no imagery
                was found for any date range.
        """
        image_collection = self.process_all_months()
        data_with_bands = self.add_bands_to_fc(image_collection)

        task = ee.batch.Export.table.toDrive(
            collection=data_with_bands,
            description=description,
            fileFormat=file_format,
            folder=folder
        )
        task.start()
        print(f"Started export task: {task.id}")
        print("Check your Earth Engine Tasks panel to monitor progress.")

class SubclassProcessor(PlanetNICFIProcessor):
    """NICFI processor for a single subclass supplied as a GeoDataFrame."""

    def __init__(self, subclass_df, year, date_ranges, indices, subclass_name):
        """Convert ``subclass_df`` to an Earth Engine collection and set up the processor.

        Args:
            subclass_df: GeoDataFrame with the features of one subclass.
            year: Year label forwarded to the base class.
            date_ranges: ``(start_date, end_date)`` pairs forwarded to the base class.
            indices: Index names forwarded to the base class.
            subclass_name: Name of the subclass, stored on the instance.
        """
        super().__init__(
            year=year,
            date_ranges=date_ranges,
            indices=indices,
            data_fc=gdf_to_ee_feature_collection(subclass_df),
        )
        self.subclass_name = subclass_name

def gdf_to_ee_feature_collection(gdf):
    """Convert a GeoDataFrame to an Earth Engine FeatureCollection.

    Args:
        gdf: GeoDataFrame whose geometry column is named 'geometry' and holds
            Polygon or MultiPolygon geometries.

    Returns:
        ``ee.FeatureCollection`` with one feature per row. All non-geometry
        columns become feature properties, with NaN replaced by 0.

    Raises:
        TypeError: If a row's geometry is neither a Polygon nor a MultiPolygon.
    """
    def ring_coords(ring):
        # Keep x/y only: an optional z value would give coordinate triples,
        # which are not valid 2D Earth Engine coordinates.
        return [[point[0], point[1]] for point in ring.coords]

    def polygon_rings(polygon):
        # Exterior ring first, then the holes, so interior rings are no
        # longer silently dropped.
        return [ring_coords(polygon.exterior)] + [
            ring_coords(interior) for interior in polygon.interiors
        ]

    features = []
    for _, row in gdf.iterrows():
        geometry = row.geometry
        if isinstance(geometry, Polygon):
            ee_geometry = ee.Geometry.Polygon(polygon_rings(geometry))
        elif isinstance(geometry, MultiPolygon):
            # Each part must be its own list of rings. A flat list of exterior
            # rings would be read as one polygon whose later rings are holes.
            ee_geometry = ee.Geometry.MultiPolygon(
                [polygon_rings(part) for part in geometry.geoms]
            )
        else:
            raise TypeError(f"Unsupported geometry type: {type(geometry)}")

        properties = row.drop('geometry').fillna(0).to_dict()  # Replace NaN with 0
        features.append(ee.Feature(ee_geometry, properties))

    return ee.FeatureCollection(features)

# Main execution
def sanitize_export_description(description):
    """Return ``description`` made safe for use as an Earth Engine task description.

    Earth Engine only accepts the characters a-z, A-Z, 0-9, ".", ",", ":",
    ";", "_" and "-" and at most 100 characters. Every other character
    (spaces, slashes, "%", braces, ...) is replaced by "_" and the result is
    truncated to 100 characters.
    """
    import re  # local import keeps this block self-contained

    return re.sub(r'[^A-Za-z0-9.,:;_-]', '_', str(description))[:100]


if __name__ == "__main__":
    # Set up your Google Drive folder
    os.chdir('/content/drive/MyDrive/Crop Monitoring/crop_monitoring/crop_types_data/data_with_bands_planet_ncifi')

    # Years to process
    years = [2018, 2019, 2020, 2023]

    # Date ranges for each year. 2018/2019 use windows that all start on
    # 1 June and grow by one month; 2020/2023 use one window per month.
    date_ranges = {
        2018: [
            ('2018-06-01', '2018-08-31'),
            ('2018-06-01', '2018-09-30'),
            ('2018-06-01', '2018-10-31'),
            ('2018-06-01', '2018-11-30')
        ],
        2019: [
            ('2019-06-01', '2019-08-31'),
            ('2019-06-01', '2019-09-30'),
            ('2019-06-01', '2019-10-31'),
            ('2019-06-01', '2019-11-30')
        ],
        2020: [
            ('2020-08-01', '2020-08-31'),
            ('2020-09-01', '2020-09-30'),
            ('2020-10-01', '2020-10-31'),
            ('2020-11-01', '2020-11-30')
        ],
        2023: [
            ('2023-08-01', '2023-08-31'),
            ('2023-09-01', '2023-09-30'),
            ('2023-10-01', '2023-10-31'),
            ('2023-11-01', '2023-11-30')
        ]
    }

    indices = ['NDVI']  # Add more indices as you wish

    # Land-cover GeoDataFrames loaded earlier in the notebook, keyed by year.
    shp_data = {
        2018: shp_2018,
        2019: shp_2019,
        2020: shp_2020,
        2023: shp_2023
    }

    # Process each year and each subclass separately
    for year in years:
        for subclass_name, subclass_df in shp_data[year].groupby('Sub_class'):
            try:
                # Build the processor inside the try block so that one
                # subclass with an unsupported geometry is reported and
                # skipped instead of aborting the whole run.
                processor = SubclassProcessor(subclass_df, year, date_ranges[year], indices, subclass_name)
                # Subclass names may contain spaces or other characters that
                # Earth Engine rejects in task descriptions.
                description = sanitize_export_description(
                    f"data_{year}_with_bands_subclass_{subclass_name}"
                )
                processor.export_to_drive(description)
            except ee.ee_exception.EEException as e:
                print(f"Error processing {year}, subclass {subclass_name}: {str(e)}")
            except ValueError as e:
                print(f"No data available for {year}, subclass {subclass_name}: {str(e)}")
            except Exception as e:
                print(f"Unexpected error processing {year}, subclass {subclass_name}: {str(e)}")

            # Add a delay between processing subclasses
            time.sleep(random.uniform(10, 20))