#### Set up Earth Engine and Google Drive

In [1]:
import ee
# @title Authenticate to the Earth Engine servers
# Authentication must happen before ee.Initialize() is called below.
ee.Authenticate()
# Initialize the Earth Engine object with Google Cloud project ID
# 'ee-username' is a placeholder: replace it with your own project ID.
project_id = 'ee-username' # change here
ee.Initialize(project=project_id)

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write files there.
from google.colab import drive
drive.mount('/content/drive')
import os
# Make the crop-types data folder the working directory
# (presumably so the `scripts` package imported further down resolves — TODO confirm).
# NOTE(review): this folder sits under 'Crop Monitoring/...', whereas the
# `directory` used by the cleaning cell sits under 'ICRISAT/...' — confirm both
# paths are intended.
os.chdir('/content/drive/MyDrive/Crop Monitoring/crop_monitoring/crop_types_data')

Mounted at /content/drive


#### Import libraries

In [None]:
# @title Lib imports:
# Third-party and standard-library modules used by the cells below.
#import ee
#print('Using EE version ', ee.__version__)
import folium
#print('Using Folium version ', folium.__version__)
# NOTE(review): MFD_HUGE_1MB is not referenced in any visible cell and looks like
# an accidental auto-completed import. It is a platform-specific constant of the
# os module, so this line can raise ImportError outside Linux — confirm and remove.
from os import MFD_HUGE_1MB
import pandas as pd
import random
import matplotlib.pyplot as plt
import seaborn as sns
from typing import Dict, Iterable, List, Tuple
#from google.colab import auth
import datetime as dt
import time
import geopandas as gpd
from shapely.geometry import shape, Polygon, MultiPolygon
from shapely import wkt

### Clean up the raw data

1. Import the data and processing scripts 

In [None]:
#@title Running the imported files
import scripts.data_storage as ds  # Importing the data storage script
import scripts.process_raw_data as prd # import the processing script

2. Process the data and export the cleaned data as shapefiles to Google Drive

In [None]:
#@title Clean the raw data for all years
# Cleans the 2018, 2019, 2020 and 2023 data in a single pass.
# Adjust the path and the parameters below so they match your own data.

# Directory where your shapefiles are stored
directory = '/content/drive/MyDrive/ICRISAT/crop_type_classification/crop_types_data/clean_data_no_bands'

# Year label -> raw data object taken from the data storage script (`ds`)
data_years = dict(
    [
        ('2018', ds.data_2018),
        ('2019', ds.data_2019),
        ('2020', ds.data_2020),
        ('2023', ds.data_2023),
    ]
)

# Run the processing script once per year, in the order listed above
for year, collection in data_years.items():
    prd.fetch_and_process_features(
        collection,
        year,
        batch_size=5000,
        shapefile_directory=directory,
    )

3. Export the cleaned data to GEE assets, one asset per subclass

In [None]:
#@title import the data as GEE asset per subclass
# Cleaned shapefile for each year; each file is looked up inside `directory`,
# which is defined in the cleaning cell.
shapefiles = {
    2018: 'clean_raw_data_2018.shp',
    2019: 'clean_raw_data_2019.shp',
    2020: 'clean_raw_data_2020.shp',
    2023: 'clean_raw_data_2023.shp'
}

# Pause (in seconds) after every export call to avoid overwhelming the EE API
EXPORT_DELAY_SECONDS = 10

# One (year, subclass, error message) entry per export that raised; reported
# after the loop so failures are not buried in the per-subclass log lines.
failed_exports = []

# Process and export each subclass for each year
for year, shapefile_name in shapefiles.items():
    # Load the GeoDataFrame from the shapefile
    shapefile_path = f'{directory}/{shapefile_name}'
    gdf = gpd.read_file(shapefile_path)
    # Drop rows without an ID (such rows occur in the 2020 data)
    gdf = gdf[gdf['ID'].notna()]

    # groupby() leaves out rows whose key is missing, so those rows would
    # never be exported; report them instead of losing them silently.
    missing_subclass = int(gdf['Sub_class'].isna().sum())
    if missing_subclass:
        print(f"Warning: {missing_subclass} row(s) for {year} have no Sub_class and will not be exported")

    # Group by subclass and export each subclass
    for subclass, subclass_gdf in gdf.groupby('Sub_class'):
        print(f"subclass_gdf for {year}, subclass {subclass}:", subclass_gdf.shape)
        try:
            prd.export_to_asset(subclass_gdf, year, subclass)
        except Exception as e:
            # Best effort: log the failure, remember it, and carry on with the
            # remaining subclasses.
            print(f"Error exporting year {year}, subclass {subclass}: {str(e)}")
            failed_exports.append((year, subclass, str(e)))

        # Add a delay between exports to avoid overwhelming the EE API
        time.sleep(EXPORT_DELAY_SECONDS)

# Final summary of everything that needs to be re-run
if failed_exports:
    print(f"{len(failed_exports)} export(s) failed:")
    for failed_year, failed_subclass, message in failed_exports:
        print(f"  year {failed_year}, subclass {failed_subclass}: {message}")
else:
    print("No export raised an error.")