# Reference Data Sampling

This notebook draws a sample of predictors that is stratified both by ecoregion and spatially (one sample set per training grid cell).

In [None]:
from notebook_setup import *

In [None]:
# Build the predictor image; it is passed to the sampler below as `predictor_stack`.
training_image = setup_training_image()

In [None]:
# Load the training grid. The export loop below iterates it with `.items()`,
# treating each entry as (cell name, cell geometry).
grid_cells = setup_training_grid()

In [None]:
def _tile_asset_exists(asset_path):
    """Return True when Earth Engine can resolve ``asset_path``.

    ``ee.data.getAsset`` raising ``ee.EEException`` is taken to mean the
    asset has not been exported yet.
    """
    try:
        ee.data.getAsset(asset_path)
    except ee.EEException:
        return False
    return True


# One entry per export started in this run: {"cell", "id", "asset_path"}.
export_tasks = []
# Asset path of every grid cell tile, whether pre-existing or newly exported.
grid_asset_paths = []

create_assets_folder(TRAINING_TILES_PATH)

for cell_name, cell_geom in grid_cells.items():
    print(f"Processing {cell_name}...")

    tile_asset_path = f"{TRAINING_TILES_PATH}/npp_training_{cell_name}_{GEE_MODEL_VERSION}"

    # Guard clause: tiles exported by an earlier run are reused, not recomputed.
    if _tile_asset_exists(tile_asset_path):
        print(f"- Training data already exists at: {tile_asset_path}")
        grid_asset_paths.append(tile_asset_path)
        continue

    print(f"- Computing and exporting to: {tile_asset_path}")

    # Sample this cell with a fixed seed so the draw uses the same seed on every run.
    cell_samples = sample_grid_cell_area_proportional(
        predictor_stack=training_image,
        grid_cell=cell_geom,
        grid_cell_name=cell_name,
        total_points=TOTAL_POINTS_PER_GRID_CELL,
        seed=42,
    )

    grid_asset_paths.append(tile_asset_path)

    # Start the table export and remember its task id for the wait step.
    export_task = ee.batch.Export.table.toAsset(
        collection=cell_samples,
        description=f"NPP_Training_{cell_name}",
        assetId=tile_asset_path,
    )
    export_task.start()
    export_tasks.append(
        {
            "cell": cell_name,
            "id": export_task.id,
            "asset_path": tile_asset_path,
        }
    )

In [None]:
# Wait on the tile exports started above (the list of {"cell", "id", "asset_path"} dicts).
# NOTE(review): presumably blocks until every task finishes — confirm in notebook_setup.
wait_for_completion(export_tasks)

In [None]:
combined_asset_path = f"{FINAL_TRAINING_ASSET_PATH}/final_reference_samples"


def get_all_assets(folder):
    """Return every asset that ``ee.data.listAssets`` reports for ``folder``.

    Follows ``nextPageToken`` until the listing is exhausted, so the result
    is complete even when the listing spans several pages.
    """
    assets = []
    page_token = None
    while True:
        params = {"parent": folder}
        if page_token:
            params["pageToken"] = page_token
        response = ee.data.listAssets(params)
        assets.extend(response.get("assets", []))
        page_token = response.get("nextPageToken")
        if not page_token:
            return assets


# Keep the try body to the existence probe only. Doing the merge/export inside
# the `except` handler would report any later failure as chained onto the
# (expected) "asset not found" error.
try:
    ee.data.getAsset(combined_asset_path)
except ee.EEException:
    combined_asset_exists = False
else:
    combined_asset_exists = True

if combined_asset_exists:
    print(f"Combined asset already exists at: {combined_asset_path}")
else:
    print(f"Creating combined asset at: {combined_asset_path}")

    assets = get_all_assets(TRAINING_TILES_PATH)
    table_ids = [a["name"] for a in assets if a["type"] == "TABLE"]

    if not table_ids:
        print("No training tiles found.")
    else:
        print(f"Merging {len(table_ids)} tiles...")
        # Combine all tiles with a single flatten() instead of a chain of
        # merge() calls: repeated merge() nests the expression once per tile
        # and produces increasingly long feature IDs and reduced performance.
        combined_fc = ee.FeatureCollection(
            [ee.FeatureCollection(t_id) for t_id in table_ids]
        ).flatten()

        task = ee.batch.Export.table.toAsset(
            collection=combined_fc,
            description=f"Combine_NPP_Training_Data_{GEE_MODEL_VERSION}",
            assetId=combined_asset_path,
        )
        task.start()
        print(f"Export task started: {task.id}")
        wait_for_completion(id_list=[task.id])