Should be enough to retrieve data from earth engine.

In [226]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, to_rgba
from matplotlib.widgets import Slider
from matplotlib.dates import date2num

import geopandas as gpd
from shapely import wkt

import ee

In [None]:
# Initialize the library, reusing stored Earth Engine credentials when they
# are available so that "Restart & Run All" does not stop for a login prompt.
try:
    ee.Initialize()
except Exception:
    # Initialization failed (typically: no usable stored credentials).
    # Trigger the authentication flow, then initialize again; if this second
    # attempt also fails the error is surfaced to the reader.
    ee.Authenticate()
    ee.Initialize()

Resources: \
Filter from date list: https://gis.stackexchange.com/questions/389005/filtering-image-collection-based-on-list-of-dates-using-google-earth-engine-pyth \
Get region of pixels from image collection: https://developers.google.com/earth-engine/apidocs/ee-imagecollection-getregion

Algorithm for the training data: \
For each cluster centroid, convert its coordinates to EPSG:3857 and build a square region around it by buffering the centroid by 10 km (10,000 projected metres) in every direction. We then convert the region back to EPSG:4326 so that Earth Engine can parse it. Using that polygon with `ee.ImageCollection.getRegion()`, we retrieve all the pixel values for the region on the specific dates. Finally, we take the returned values and pad the boundaries with absurdly large negative numbers so that they form a rectangle; the result is the training data.

In [278]:
# Read the processed clusters and parse the WKT centroids into geometries.
df = pd.read_csv(r"data_files/valid_clusters.csv")
df['Centroid'] = gpd.GeoSeries.from_wkt(df['Centroid'])

# Order and index the rows by cluster id, declare the centroids as lon/lat
# (EPSG:4326), then project to EPSG:3857 so the buffer distance is in metres.
df = (
    gpd.GeoDataFrame(df, geometry='Centroid')
    .sort_values('Cluster')
    .set_index('Cluster')
    .set_crs(epsg=4326)
    .to_crs(epsg=3857)
)

# cap_style=3 requests a square buffer. The distance is measured from the
# centroid to each side, so every square has sides of 2 * 10e3 = 20,000
# projected (EPSG:3857) units.
df['Region'] = df.buffer(10e3, cap_style=3)

# Make the square the active geometry, drop the columns that are no longer
# needed, and go back to lon/lat for Earth Engine.
df = (
    df.set_geometry('Region')
    .drop(columns=['geometry', 'Centroid'])
    .to_crs(epsg=4326)
)
df

Unnamed: 0_level_0,Date,FID,Density,Oceans,Region
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
...,...,...,...,...,...
168,1973-02-11,10642,0.001000,Pacific Ocean,"POLYGON ((-154.96017 31.17689, -154.96017 31.0..."
169,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
169,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
170,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ..."


https://developers.google.com/earth-engine/datasets/catalog/HYCOM_sea_water_velocity#description

The HYCOM sea water velocity dataset only has data from 1992-10-02 onwards, so filter out the earlier dates.

In [279]:
# Keep only the rows dated on or after the first day covered by HYCOM.
# 'Date' holds ISO 'YYYY-MM-DD' strings, so a plain string comparison orders
# them chronologically. The comparison is inclusive (>=): the first covered
# day itself has data and must not be dropped.
HYCOM_FIRST_DATE = '1992-10-02'
df = df[df['Date'] >= HYCOM_FIRST_DATE]
df

Unnamed: 0_level_0,Date,FID,Density,Oceans,Region
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
...,...,...,...,...,...
166,2009-08-14,10549,1.111167,Pacific Ocean,"POLYGON ((-139.77017 34.45161, -139.77017 34.3..."
169,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
169,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
170,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ..."


In [287]:
def extract_raster_values_from_df(df, image_collection, band_names, scale=1000, export=False):
    """Collect per-pixel band values for every cluster region in ``df``.

    For each unique index value (cluster) the function filters
    ``image_collection`` to the cluster's dates, selects ``band_names`` and
    calls ``getRegion`` over the cluster's polygon. Every returned pixel row
    becomes a point ``ee.Feature`` carrying ``time`` plus one property per
    band.

    Parameters
    ----------
    df : geopandas.GeoDataFrame
        Indexed by cluster id, with a ``Date`` column and an active polygon
        geometry in EPSG:4326. The geometry of the first row of each cluster
        is used as that cluster's region. The frame is not modified.
    image_collection : str
        Earth Engine image collection id.
    band_names : str or sequence of str
        Band(s) to select; they also become the feature property names.
    scale : int, optional
        Value passed as ``scale`` to ``getRegion``.
    export : bool, optional
        If True, return an unstarted Drive table-export task for the
        flattened features instead of the list of collections.

    Returns
    -------
    ee.batch.Task or ee.List
        The export task when ``export`` is True, otherwise an ``ee.List``
        holding one ``ee.FeatureCollection`` per cluster.
    """
    # Accept a single band name as well as a sequence of names.
    if isinstance(band_names, str):
        band_names = [band_names]
    band_names = list(band_names)

    def row_to_feature(row):
        # getRegion rows are laid out as [id, longitude, latitude, time, <bands...>],
        # with the bands in the order they were selected.
        row = ee.List(row)
        properties = {'time': row.get(3)}
        for position, band in enumerate(band_names, start=4):
            properties[band] = row.get(position)
        return ee.Feature(ee.Geometry.Point([row.get(1), row.get(2)], proj='EPSG:4326'), properties)

    regionCollection = ee.List([])

    # iterate through all clusters
    for cluster_id in df.index.unique():
        # List-based .loc always yields a DataFrame, even when the cluster has
        # a single row (a scalar label would return a Series in that case).
        cluster_rows = df.loc[[cluster_id]]

        # convert all the cluster's required dates into an ee.List of dates in millis
        datelist = ee.List(cluster_rows['Date'].tolist()).map(lambda date: ee.Date(date).millis())

        # build the Earth Engine polygon once per cluster, from its first row
        footprint = cluster_rows.geometry.iloc[0]
        ee_region = ee.Geometry.Polygon(list(footprint.exterior.coords), proj='EPSG:4326')

        # instantiate the image collection for the selected dates
        collection = (
            ee.ImageCollection(image_collection)
            .filter(ee.Filter.inList("system:time_start", datelist))
            .select(band_names)
        )
        # get the raster information based on the buffer region we created
        pixelInfoRegion = collection.getRegion(geometry=ee_region, scale=scale)

        # drop the first element, which is the header row, by position
        pixel_rows = pixelInfoRegion.slice(1)

        # add this cluster's features to the overall list of collections
        regionCollection = regionCollection.add(ee.FeatureCollection(pixel_rows.map(row_to_feature)))

    if export:
        return ee.batch.Export.table.toDrive(collection=ee.FeatureCollection(regionCollection).flatten(), description="water_velocity_train_data", fileFormat="csv", folder="exported_files")
    return regionCollection

In [288]:
# Build the Drive export for the two HYCOM velocity bands from a copy of the
# cluster frame, then submit the task to Earth Engine.
task = extract_raster_values_from_df(
    df.copy(),
    image_collection='HYCOM/sea_water_velocity',
    band_names=['velocity_u_0', 'velocity_v_0'],
    export=True,
)
task.start()

In [290]:
# List Earth Engine operations; each entry's metadata includes its
# 'description' and 'state', which shows whether the export has finished.
ee.data.listOperations()

[{'name': 'projects/earthengine-legacy/operations/V4U4PAQX4TFYQPRWA54HV7LY',
  'metadata': {'@type': 'type.googleapis.com/google.earthengine.v1alpha.OperationMetadata',
   'state': 'SUCCEEDED',
   'description': 'water_velocity_train_data',
   'createTime': '2022-09-04T16:43:43.482729Z',
   'updateTime': '2022-09-04T16:43:51.576962Z',
   'startTime': '2022-09-04T16:43:49.609784Z',
   'endTime': '2022-09-04T16:43:51.576962Z',
   'type': 'EXPORT_FEATURES',
   'destinationUris': ['https://drive.google.com/#folders/1G9uzDXDjugBJ_5Q5F79MBoIB6oGPnlwg'],
   'attempt': 1,
   'progress': 1,
   'stages': [{'displayName': 'Create Local Files',
     'completeWorkUnits': 1,
     'totalWorkUnits': '1',
     'description': 'Computation and writing of temporary files.'},
    {'displayName': 'Write Files to Destination',
     'completeWorkUnits': 1,
     'totalWorkUnits': '1',
     'description': 'Uploading of files to the export destination.'}],
   'batchEecuUsageSeconds': 0.02840505912899971},
  'don