<a href="https://colab.research.google.com/github/jshogland/SpatialModelingTutorials/blob/main/Notebooks/GettingGeeInfoAndImagery.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Install needed software on Colab

In [None]:
# Install the third-party packages this notebook needs into the Colab runtime.
# NOTE(review): mapclassify is not imported directly in this notebook; presumably it is an optional dependency of GeoDataFrame.explore() - confirm.
!pip install mapclassify
!pip install geemap
!pip install raster_tools

# Getting image statistics and imagery from Google Earth Engine
Google Earth Engine is a great resource for accessing and processing data. However, to facilitate everyone's processing workflows, Google limits the amount, types, and ways processing can occur. Moreover, performing larger tasks on Google's servers can require purchasing additional resources. Alternatively, we can download base data and perform tasks off Google's servers. In this notebook we will demonstrate how to stream intermediate products from Google's servers for downstream analyses. Later we will use these data to estimate parameters of interest.

In [None]:
#import packages
import geopandas as gpd, pandas as pd, os, numpy as np
import ee, geemap, gdown, os, zipfile
import dask, xarray as xr, io, requests, shapely
from raster_tools import Raster, general

### Authenticate into Earth Engine

In [None]:
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library against a Google Cloud project.
# The project id below is the notebook author's; replace it with your own cloud project.
ee.Initialize(project='ee-jshogland') #you will want to select your personal cloud project

### Get Missoula County

In [None]:
#Download and unzip the Census TIGER county boundaries only when the shapefile is not already in the working directory
if(not os.path.exists('tl_2025_us_county.shp')):
    gdown.download('https://www2.census.gov/geo/tiger/TIGER2025/COUNTY/tl_2025_us_county.zip','tl_2025_us_county.zip',quiet=False,fuzzy=True)

    with zipfile.ZipFile('tl_2025_us_county.zip', 'r') as zip_ref:
        zip_ref.extractall(".")

#Read all counties, reproject to geographic coordinates (EPSG:4326), and keep the record(s) whose NAME is Missoula
cnty=gpd.read_file('tl_2025_us_county.shp').to_crs('EPSG:4326')
msl_cnty=cnty[cnty.NAME=='Missoula']

### Get the CHM asset
Set a filter date range that ensures you get complete coverage of CHM pixels

In [None]:
#Get the asset
chm=ee.ImageCollection('projects/naip-chm/assets/conus-structure-model')
#Filter for a given time span and composite the filtered images into a single image
#NOTE: mosaic() composites the images rather than taking a per-pixel maximum; use a max reducer if the maximum height is what is wanted
chm_img=chm.filterDate('2021','2024').mosaic() #use dates that cover all acquisitions

### Get mean estimate for the image
Note we will be setting the scale to 10 here to reduce computation. This amounts to sampling within the region of interest and will only have a minor impact on the mean estimate.

In [None]:
eeftr=geemap.gdf_to_ee(msl_cnty) # create an Earth Engine feature collection from the Missoula County geodataframe

#eeftr is the region the mean is computed over; scale is set to 10 to reduce the number of computations
#nothing is pulled to the client here; the values are fetched later with getInfo()
ee_dic=geemap.image_mean_value(chm_img,eeftr,10) #use the eeftr to clip the image and set scale to 10 to reduce the number of computations

### Look at the mean result

In [None]:
#getInfo() requests the computed result from Earth Engine and returns it to the client
#chm_mean is later merged with the other mean dictionaries using the | operator
chm_mean=ee_dic.getInfo()
chm_mean

Now look at the value and processing time for scales of 30, 20, 10, and 5

In [None]:
import datetime
#For each scale, time the full round trip: building the request plus getInfo(), which is the call that waits on the server
for s in [30,20,10,5]:
    t1=datetime.datetime.now()
    ee_dic=geemap.image_mean_value(chm_img,eeftr,s)
    info=ee_dic.getInfo()
    t2=datetime.datetime.now()
    et=t2-t1
    #prints the scale, the returned mean value(s), and the elapsed wall-clock seconds
    print('scale='+str(s), info, et.total_seconds())

### Create a clipped image for viewing

In [None]:
#Clip the CHM mosaic to the Missoula County feature collection; this clipped image is used for display and for the downloads below
chm_img_clip=chm_img.clip(eeftr)

## Look at the image

In [None]:
#Build an interactive map centered on the given lon/lat with satellite imagery as the basemap
m=geemap.Map()
m.set_center(lon=-113.9940,lat=46.8721,zoom=8)
m.add_basemap(basemap='ESRI.WorldImagery')

#Overlay the county boundary and the clipped CHM
m.add_gdf(msl_cnty,layer_name='Missoula County',)
#NOTE(review): the 0-3000 display stretch presumably reflects the CHM value units (e.g., centimeters) - confirm against the asset description
m.addLayer(
    chm_img_clip,
    {"min": 0, "max": 3000,"palette":'viridis'},
    'CHM',
)
m

## Create definitions to iteratively download imagery from Earth Engine
This code is a little detailed and will send many requests to Earth Engine to accommodate their quota limits. Specifically, each chunk within the raster dataset will be a request for an image from Google Earth Engine. So what are image chunks? Chunks are subsets of a large image that are used within Raster Tools to schedule processing. Each chunk is processed separately to accommodate parallel processing. In this case we are using dask and Raster Tools to parallelize the download of the image from Google Earth Engine. When we save out the final image it will also get saved in parallel.

In [None]:
def _convert_array2(x,bnds,rws,clms):
    outarr=np.zeros((bnds,rws,clms),dtype='f8')
    for r in range(x.shape[0]):
        for c in range(x.shape[1]):
            vls=x[r,c]
            for b in range(bnds):
                outarr[b,r,c]=vls[b]

    return outarr

def _get_block(ee_object,ee_geo,bnds,rws,clms):
    success=True
    url=ee_object.getDownloadURL({'format':'NPY','region':ee_geo})
    try:
        #print('downloading',url)
        resp=requests.get(url)
        data=np.load(io.BytesIO(resp.content))
        #print('reformatting data')
        outarr =_convert_array2(data,bnds,rws,clms)
        #print('finished')
    except Exception as e:
        #print(e,url)
        outarr=np.zeros((bnds,rws,clms))
        success=False
    finally:
        return success,outarr,url

def _get_block_values2(ee_object,xmin,ymax,res,oprj,retry=2,block_info=None):
    '''
    Downloads the image values for one chunk of the output array. Written to
    be called through dask.array.map_blocks, which supplies block_info.

    ee_object: earth engine image object
    xmin: x coordinate of the left edge of the full array (units of oprj)
    ymax: y coordinate of the top edge of the full array (units of oprj)
    res: cell size (units of oprj)
    oprj: coordinate reference system of xmin, ymax, and res
    retry: int maximum number of additional attempts after a failed download
    block_info: dictionary supplied by dask; block_info[None] must hold the
                'array-location' and 'chunk-shape' of the output chunk

    return: array of shape (bands, rows, columns) for the chunk. If every
            attempt fails a message is printed and the block is all zeros.
    '''
    bo_info=block_info[None]
    bnds,rws,clms=bo_info['chunk-shape']
    #array-location holds one (start, stop) cell range per axis: band, y, x
    aloc=bo_info['array-location']
    xrng=aloc[2]
    yrng=aloc[1]

    #convert the cell ranges of this chunk into map coordinates
    xmin2=xmin+(xrng[0]*res)
    xmax2=xmin+(xrng[1]*res)
    ymax2=ymax-(yrng[0]*res)
    ymin2=ymax-(yrng[1]*res)
    ply=[
        [xmin2,ymin2],
        [xmin2,ymax2],
        [xmax2,ymax2],
        [xmax2,ymin2],
        [xmin2,ymin2]
    ]
    tbb=shapely.Polygon(ply)
    #shrink the chunk outline by half a cell before requesting the data
    #NOTE(review): presumably this keeps cells of neighboring chunks from being returned along the edges - confirm
    tgdf=gpd.GeoSeries(tbb,crs=oprj).buffer(-res/2.0)
    #the region is handed to Earth Engine in geographic coordinates
    g=tgdf.to_crs('EPSG:4326').geometry.iloc[0]
    xx,yy=g.exterior.coords.xy
    ee_geo=ee.Geometry.Polygon(tuple(zip(list(xx),list(yy))))

    success,outarr,url=_get_block(ee_object,ee_geo,bnds,rws,clms)
    #retry in the case of a server error, at most `retry` additional attempts
    for attempt in range(1,retry+1):
        if(success):
            break
        print('Retry',attempt)
        success,outarr,url=_get_block(ee_object,ee_geo,bnds,rws,clms)

    if(not success):
        print("Problem with url",url)

    return outarr



def get_raster(gdf, ee_object,dvs=2048):
    '''
    Builds a lazy, chunked array of an Earth Engine image that covers the
    extent of a geodataframe. No image values are downloaded here; each chunk
    is requested from Earth Engine when it is computed.

    gdf: geodataframe whose total bounds define the extent of the output
    ee_object: earth engine image object; its projection supplies the output
               coordinate system and cell size
    dvs: int of number of cells of the length of a square chunk

    return: xarray DataArray (band, y, x) backed by a dask array with
            dvs x dvs chunks. The extent is padded right and down to a whole
            number of chunks.

    NOTE(review): the x and y coordinates are the left and top edges of the
    cells, not the cell centers - confirm this is what downstream tools expect.
    '''
    import dask.array as da #`import dask` alone does not load the array submodule

    prj=ee_object.projection()
    oprj=prj.crs().getInfo()
    res=prj.nominalScale().getInfo()
    bnds=ee_object.bandNames().getInfo()

    #snap the bounds outward to whole units so the extent always covers gdf
    #(integer truncation moves negative coordinates toward zero, i.e. inward)
    bxmin,bymin,bxmax,bymax=gdf.to_crs(oprj).total_bounds
    xmin=float(np.floor(bxmin))
    ymin=float(np.floor(bymin))
    xmax=float(np.ceil(bxmax))
    ymax=float(np.ceil(bymax))

    #number of chunks needed in each direction
    xchs=int((xmax-xmin)/(dvs*res))+1
    ychs=int((ymax-ymin)/(dvs*res))+1

    #build coordinates from cell indices so their lengths always equal the
    #chunk totals (np.arange with a float step can be off by one element)
    xsteps=xmin+(np.arange(xchs*dvs)*res)
    ysteps=ymax-(np.arange(ychs*dvs)*res)

    #make chunks
    xchunk=tuple([dvs]*xchs)
    ychunk=tuple([dvs]*ychs)

    tda=da.map_blocks(_get_block_values2,ee_object,xmin,ymax,res,oprj,chunks=((len(bnds)),ychunk,xchunk),dtype=float)
    xrs=xr.DataArray(tda,dims=('band','y','x'),coords={'band':bnds,'y':ysteps,'x':xsteps})
    return xrs

### When creating the image you must specify a projection and set a scale. Chunk size 2048 is also very important to address memory issues on GEE.

In [None]:
chm_img_prg=chm_img_clip.setDefaultProjection(crs='EPSG:5070',scale=1) # set the projection and scale of the output image
#Build the lazy array with 2048 x 2048 cell chunks; nothing is downloaded until values are requested
#NOTE: chunks whose download fails are zero filled, so zeros in the result can mean either a value of 0 or a failed request
chm_loc=get_raster(msl_cnty,chm_img_prg,2048).astype('uint16')
chm_loc

### This will make a very large image (>22.34 GB).
Let's download the values for a small area and plot them.

In [None]:
#Plotting forces the download of every chunk that overlaps the selected window
#NOTE(review): rows/columns 10000:12048 are not aligned to the 2048 cell chunk grid (10240 is a chunk edge), so this window overlaps 4 chunks and triggers 4 requests
chm_loc[:,10000:12048,10000:12048].plot(figsize=(15,15))

### What would happen if we changed the resolution to 10 meters?

In [None]:
chm_img_prg2=chm_img_clip.setDefaultProjection(crs='EPSG:5070',scale=10) # set the projection and scale of the output image
#Same lazy array as before but with 10 unit cells; the chunk size is still 2048 x 2048 cells
chm_loc2=get_raster(msl_cnty,chm_img_prg2,2048).astype('uint16')
chm_loc2

### We now have a much smaller dataset that has been resampled based on nearest neighbors to a spatial resolution of 10m

In [None]:
#This 205 x 205 cell window falls inside the first 2048 x 2048 chunk, so only that one chunk is requested
chm_loc2[:,1000:1205,1000:1205].plot(figsize=(15,15))

### Let's save the 10m image

In [None]:
#Wrap the DataArray as a Raster Tools Raster and assign the same CRS used to build it
#NOTE(review): load() presumably computes the whole array (requesting every chunk from Earth Engine) and holds it in memory before saving - confirm against the raster_tools documentation
rs=Raster(chm_loc2).set_crs('EPSG:5070').load()
rs.save('chm.tif',tiled=True)

## Now let's get RAP3 surfaces for the 30m collection
Create the asset. Note the other rap assets are commented out below

In [None]:
#Vegetation Cover 30m (the only RAP collection used in this notebook)
veg_yearly_30m = ee.ImageCollection("projects/rap-data-365417/assets/vegetation-cover-v3") # Plant functional types

#The remaining RAP collections are listed for reference; uncomment the ones you need
# #Vegetation Cover and Canopy Gap 10m
# veg_yearly_10m = ee.ImageCollection('projects/rap-data-365417/assets/vegetation-cover-10m') # Plant functional types
# iag_yearly_10m = ee.ImageCollection('projects/rap-data-365417/assets/invasive-annual-grass-cover-10m') # Invasive annual grasses
# sagebrush_yearly_10m = ee.ImageCollection('projects/rap-data-365417/assets/sagebrush-cover-10m') # Sagebrush (Artemisia spp.)
# pj_yearly_10m = ee.ImageCollection('projects/rap-data-365417/assets/pj-cover-10m') # Pinyon-juniper
# gap_yearly_10m = ee.ImageCollection('projects/rap-data-365417/assets/gap-cover-10m') # Canopy gaps

# #Rangeland Production 30m
# npp_yearly = ee.ImageCollection("projects/rap-data-365417/assets/npp-partitioned-v3") # Net primary production (yearly)
# npp_16d = ee.ImageCollection("projects/rap-data-365417/assets/npp-partitioned-16day-v3") # Net primary production (16-day)
# npp_16d_prov = ee.ImageCollection("projects/rap-data-365417/assets/npp-partitioned-16day-v3-provisional") # Net primary production (16-day) provisional


### Select the start and end year to filter the image collection and summarize the mean value for each image band within Missoula County

In [None]:
#Date range used to filter the collection
yr_start='2021'
yr_end='2022'

#eeftr is rebuilt here from the same geodataframe used earlier in the notebook
eeftr=geemap.gdf_to_ee(msl_cnty)

#Composite the filtered images into one image, then compute the mean of each band within the county at a 30 m scale
rap_img=veg_yearly_30m.filterDate(yr_start,yr_end).mosaic()
ee_dic=geemap.image_mean_value(rap_img,region=eeftr,scale=30)
rap_means=ee_dic.getInfo()
rap_means

## Now let's get the embedding values
### Get the asset

In [None]:
#Annual satellite embedding collection
emb=ee.ImageCollection("GOOGLE/SATELLITE_EMBEDDING/V1/ANNUAL")
#NOTE: yr_start and yr_end are reused from the RAP cell and now hold full dates
yr_start='2023-01-01'
yr_end='2024-01-01'
#Composite the filtered images into a single image
emb_img=emb.filterDate(yr_start,yr_end).mosaic()

### Get mean values

In [None]:
#eeftr is rebuilt here from the same geodataframe used earlier in the notebook
eeftr=geemap.gdf_to_ee(msl_cnty)

#Mean of each embedding band within the county at a 30 m scale
ee_dic=geemap.image_mean_value(emb_img,region=eeftr,scale=30)
emb_means=ee_dic.getInfo()
emb_means

### Add all mean values to the Missoula geodataframe

In [None]:
#Merge the three dictionaries of means into one and add each entry as a column of the county record
#If two dictionaries share a key, the value from the right-most dictionary is kept
#NOTE: msl_cnty is overwritten, so re-running this cell attempts to join columns that already exist
msl_cnty=msl_cnty.join(pd.DataFrame(chm_mean|rap_means|emb_means,index=msl_cnty.index))
msl_cnty

## Getting point data from GEE

### Create point locations within the Missoula county boundary

In [None]:
#sample_points(100) draws 100 locations inside the county; explode() splits the result into one point per row
#The points share the CRS of msl_cnty
pnts=gpd.GeoDataFrame(geometry=msl_cnty.sample_points(100).explode(),crs=msl_cnty.crs) #random points in Missoula county
pnts.explore()

### Create definition to extract values at those point locations

In [None]:
def _get_tiles(gdf,ntiles):
    '''
    Splits the area covered by gdf into square tiles to address quota limits.

    gdf = (geodataframe) the features used to create the boundary of the study area
    ntiles = (int) the approximate number of tiles to make. The tile side is
             sqrt(convex hull area / ntiles) and only tiles that intersect
             the features are kept, so the number returned can differ.

    returns a GeoSeries of tile polygons in the CRS of gdf. If a tile size
    cannot be derived (zero area hull, e.g. a single point, or ntiles < 1)
    a single geometry, the bounding envelope of the features, is returned.
    '''
    merged=gdf.union_all() #computed once; used for the hull and for the final filter
    chul=gpd.GeoSeries(merged.convex_hull,crs=gdf.crs)
    xmin,ymin,xmax,ymax=chul.total_bounds
    sp=np.sqrt(chul.area/ntiles).iloc[0] #tile side length
    if(not np.isfinite(sp) or sp<=0):
        #no usable tile size: fall back to one tile that contains every feature
        return gpd.GeoSeries([merged.envelope],crs=gdf.crs)

    sp2=sp/2
    #tile centers on a regular grid that starts half a tile outside the bounds
    xs=np.arange(xmin-sp2,xmax+sp2,sp)
    ys=np.arange(ymin-sp2,ymax+sp2,sp)
    xv,yv=np.meshgrid(xs,ys)
    centers=gpd.GeoSeries(gpd.points_from_xy(x=xv.flatten(),y=yv.flatten()),crs=gdf.crs)
    #a square buffer of half the side length turns each center into a tile
    tiles=centers.buffer(sp2,cap_style='square')
    #drop tiles that do not touch any feature
    return tiles[tiles.intersects(merged)]

def extract_data(gdf,img,ntiles,stats='FIRST',scale=30):
    '''
    Extracts image values for the features of a geodataframe, one tile of
    features per Earth Engine request.

    gdf = (geodataframe) of features used to extract values
    img = (ee image object) ee image to extract values from
    ntiles = (int) number of tiles the study area is split into; each tile is a separate request
    stats = (string) name of the ee statistic (e.g., FIRST, MEAN, MAX, MIN, MEDIAN, etc.)
    scale = (number) scale passed to the extraction

    returns a copy of gdf with the extracted values added as columns (one
    record for each observation in the gdf). Features of a tile whose
    request fails are left without values and the error is printed.

    NOTE(review): values are written back by position, which assumes Earth
    Engine returns the features of a tile in the order they were sent - confirm.
    '''
    result=gdf.copy()
    for tile in _get_tiles(gdf,ntiles):
        in_tile=result.intersects(tile)
        subset=result[['geometry']][in_tile]

        #best effort per tile: report a failure and move on to the next tile
        try:
            tile_fc=geemap.gdf_to_ee(subset) #subset geodataframe -> ee feature collection
            sampled_fc=geemap.extract_values_to_points(tile_fc,img,stats_type=stats,scale=scale) #image values at each feature
            sampled=geemap.ee_to_gdf(sampled_fc).drop(['geometry'],axis=1) #ee result -> table without geometry
            result.loc[in_tile,sampled.columns]=sampled.values #write the values into the matching records
        except Exception as e:
            print('Error: ',e)

    return result

### Get GEE Image point data for the 100 locations
Note that the CHM dataframe has a column named `first`, while the other dataframes have columns named after the bands. This is because of the way GEE coded their extraction procedure and because CHM only has one band.

In [None]:
#Extract CHM values at the points; ntiles=5, with the default statistic (FIRST) and scale (30)
chm_vls=extract_data(pnts,chm_img,5)
chm_vls

In [None]:
#Extract RAP vegetation cover values at the points; ntiles=5, with the default statistic (FIRST) and scale (30)
rap_vls=extract_data(pnts,rap_img,5)
rap_vls

In [None]:
#Extract embedding values at the points; ntiles=5, with the default statistic (FIRST) and scale (30)
emb_vls=extract_data(pnts,emb_img,5)
emb_vls