# EE API call

In [245]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap, to_rgba
from matplotlib.widgets import Slider
from matplotlib.dates import date2num

import geopandas as gpd
from shapely import wkt
from shapely.geometry import shape, MultiPoint

import json
import pickle

import ee

In [None]:
# Trigger the Earth Engine authentication flow.
# NOTE(review): this may prompt interactively, so the cell can block on a fresh kernel.
ee.Authenticate()

# Initialize the Earth Engine client library; must run before any other `ee.*` call below.
ee.Initialize()

Resources: \
Filter from date list: https://gis.stackexchange.com/questions/389005/filtering-image-collection-based-on-list-of-dates-using-google-earth-engine-pyth \
Get region of pixels from image collection: https://developers.google.com/earth-engine/apidocs/ee-imagecollection-getregion

Algorithm for the training data: \
For each cluster centroid, convert its coordinates to EPSG:3857 and create a square region around it by buffering the centroid by 10 km on every side (i.e. a box of about 20 km × 20 km in EPSG:3857 units). We then convert the region back to EPSG:4326 for Earth Engine to parse. We can pass the polygon to `ee.ImageCollection.getRegion()` to retrieve all the pixel values for that region on the specific dates. After that we can take the returned values, pad the boundaries with absurdly large negative numbers so that the result becomes a rectangle, and use it as training data.

In [232]:
# Read the processed clusters and parse the WKT centroids into geometries.
df = pd.read_csv(r"data_files/valid_clusters.csv")

# Index by cluster, declare the centroids as WGS84 and project them to
# Web Mercator so that the buffer distance is expressed in projected units.
df = (
    gpd.GeoDataFrame(
        df.assign(Centroid=gpd.GeoSeries.from_wkt(df['Centroid'])),
        geometry='Centroid',
    )
    .sort_values('Cluster')
    .set_index('Cluster')
    .set_crs(epsg=4326)
    .to_crs(epsg=3857)
)

# cap_style=3 yields a square (not round) buffer around every centroid.
df = df.assign(Region=df.buffer(10e3, cap_style=3))

# Make the square the active geometry, discard the point columns and go
# back to WGS84, which is what Earth Engine is given later on.
df = (
    df.set_geometry('Region')
    .drop(columns=['geometry', 'Centroid'])
    .to_crs(epsg=4326)
)
df

Unnamed: 0_level_0,Date,FID,Density,Oceans,Region
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
...,...,...,...,...,...
168,1973-02-11,10642,0.001000,Pacific Ocean,"POLYGON ((-154.96017 31.17689, -154.96017 31.0..."
169,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
169,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
170,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ..."


https://developers.google.com/earth-engine/datasets/catalog/HYCOM_sea_water_velocity#description

The HYCOM sea water velocity dataset only has data from 1992-10-02 onwards, so we filter out the rows with earlier dates.

In [233]:
# HYCOM coverage starts on 1992-10-02, so that first day is kept as well
# (`>=`; a strict `>` would discard rows dated exactly 1992-10-02).
# `Date` holds ISO 'YYYY-MM-DD' strings, which compare chronologically as text.
df = df[df['Date'] >= '1992-10-02']
df

Unnamed: 0_level_0,Date,FID,Density,Oceans,Region
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
...,...,...,...,...,...
166,2009-08-14,10549,1.111167,Pacific Ocean,"POLYGON ((-139.77017 34.45161, -139.77017 34.3..."
169,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
169,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
170,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ..."


In [291]:
def extract_raster_values_from_df(df, image_collection, band_names, scale=1000, export=False):
    """Sample an Earth Engine image collection over every cluster region of ``df``.

    For each unique index label (cluster) of ``df``, the images whose
    ``system:time_start`` equals one of the cluster's dates are selected and
    sampled over the cluster's polygon with ``getRegion``. Every returned
    pixel row is turned into a point feature carrying ``time`` plus one
    property per requested band.

    Args:
        df: GeoDataFrame indexed by cluster, with a ``Date`` column and an
            active polygon geometry in EPSG:4326. Only the first polygon of
            each cluster is used. The frame is not modified.
        image_collection: Earth Engine image collection id (anything accepted
            by ``ee.ImageCollection``).
        band_names: Band name, or list of band names, to sample.
        scale: Sampling scale forwarded to ``getRegion``.
        export: When true, build a Google Drive table export instead of
            returning the features.

    Returns:
        If ``export`` is true, the (not yet started) export task created by
        ``ee.batch.Export.table.toDrive``; otherwise an ``ee.List`` holding
        one ``ee.FeatureCollection`` per cluster, in index order.
    """
    # Accept a single band name as well as a list of names.
    if isinstance(band_names, str):
        band_names = [band_names]
    band_names = list(band_names)

    def row_to_feature(row):
        # getRegion rows are laid out as [id, longitude, latitude, time, <bands...>].
        row = ee.List(row)
        properties = {'time': row.get(3)}
        for offset, band in enumerate(band_names):
            properties[band] = row.get(4 + offset)
        location = ee.Geometry.Point([row.get(1), row.get(2)], proj='EPSG:4326')
        return ee.Feature(location, properties)

    regionCollection = ee.List([])

    # iterate through all clusters
    for i in df.index.unique():
        # A list indexer always yields a DataFrame, even for one-row clusters.
        cluster_rows = df.loc[[i]]

        # convert all the cluster's required dates into an ee.List of dates in millis
        datelist = ee.List(cluster_rows['Date'].tolist()).map(lambda date: ee.Date(date).millis())

        # every row of a cluster shares the same region, so the first one is used
        polygon = cluster_rows.geometry.iloc[0]
        region = ee.Geometry.Polygon(list(polygon.exterior.coords), proj='EPSG:4326')

        # instantiate the image collection for the selected dates
        collection = (
            ee.ImageCollection(image_collection)
            .filter(ee.Filter.inList("system:time_start", datelist))
            .select(band_names)
        )
        # get the raster information based on the buffer region we created
        pixelInfoRegion = collection.getRegion(geometry=region, scale=scale)

        # drop the first element, which is the header row
        pixelInfoRegion = pixelInfoRegion.slice(1)

        # add this cluster's features to the overall list
        regionCollection = regionCollection.add(ee.FeatureCollection(pixelInfoRegion.map(row_to_feature)))

    if export:
        return ee.batch.Export.table.toDrive(collection=ee.FeatureCollection(regionCollection).flatten(), description="water_velocity_train_data", fileFormat="csv", folder="exported_files")
    else:
        return regionCollection

In [292]:
# Build the Drive export task for the two HYCOM velocity bands, then start it.
task = extract_raster_values_from_df(
    df=df.copy(),
    image_collection='HYCOM/sea_water_velocity',
    band_names=['velocity_u_0', 'velocity_v_0'],
    export=True,
)
task.start()

In [None]:
# List Earth Engine operations; presumably used to watch the export started above — TODO confirm.
ee.data.listOperations()

# Parsing data

Read the CSV file that we exported from Earth Engine.

In [234]:
# Load the exported table; `.geo` holds one GeoJSON string per row, which is
# decoded and converted to a shapely geometry in a single pass.
water_df = pd.read_csv(r"data_files/water_velocity_train_data.csv")
water_df['.geo'] = water_df['.geo'].map(lambda geojson: shape(json.loads(geojson)))
water_df = gpd.GeoDataFrame(water_df, geometry='.geo')
water_df

Unnamed: 0,system:index,time,velocity_u_0,velocity_v_0,.geo
0,0_0,9.724320e+11,-192.0,-88.0,POINT (-62.68893 27.52887)
1,0_1,9.725184e+11,-109.0,18.0,POINT (-62.68893 27.52887)
2,0_2,9.726048e+11,58.0,33.0,POINT (-62.68893 27.52887)
3,0_3,9.724320e+11,-214.0,-133.0,POINT (-62.67995 27.52887)
4,0_4,9.725184e+11,-115.0,-8.0,POINT (-62.67995 27.52887)
...,...,...,...,...,...
143215,163_555,1.433462e+12,35.0,47.0,POINT (3.32826 42.33311)
143216,163_556,1.433376e+12,39.0,-42.0,POINT (3.33724 42.33311)
143217,163_557,1.433462e+12,35.0,47.0,POINT (3.33724 42.33311)
143218,163_558,1.433376e+12,39.0,-42.0,POINT (3.34622 42.33311)


In [235]:
# `system:index` looks like "<cluster>_<row>": split it into two columns and
# make the cluster part an integer (the row part stays a string).
water_idx = (
    water_df['system:index']
    .str.split('_', expand=True)
    .rename(columns={0: 'cluster', 1: 'index'})
    .astype({'cluster': int})
)

# Replace the combined id by a (cluster, index) MultiIndex.
water_df = (
    water_df.join(water_idx)
    .drop(columns=['system:index'])
    .set_index(['cluster', 'index'])
)
water_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,velocity_u_0,velocity_v_0,.geo
cluster,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,9.724320e+11,-192.0,-88.0,POINT (-62.68893 27.52887)
0,1,9.725184e+11,-109.0,18.0,POINT (-62.68893 27.52887)
0,2,9.726048e+11,58.0,33.0,POINT (-62.68893 27.52887)
0,3,9.724320e+11,-214.0,-133.0,POINT (-62.67995 27.52887)
0,4,9.725184e+11,-115.0,-8.0,POINT (-62.67995 27.52887)
...,...,...,...,...,...
163,555,1.433462e+12,35.0,47.0,POINT (3.32826 42.33311)
163,556,1.433376e+12,39.0,-42.0,POINT (3.33724 42.33311)
163,557,1.433462e+12,35.0,47.0,POINT (3.33724 42.33311)
163,558,1.433376e+12,39.0,-42.0,POINT (3.34622 42.33311)


Renumber the cluster index of `df` consecutively (0, 1, 2, …) so that it matches the cluster prefix of `system:index` in the exported data.

In [236]:
# Map every remaining cluster label to its position among the unique labels.
remap = {old_label: new_label for new_label, old_label in enumerate(df.index.unique())}
df = df.rename(index=remap)
df

Unnamed: 0_level_0,Date,FID,Density,Oceans,Region
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527..."
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616..."
...,...,...,...,...,...
161,2009-08-14,10549,1.111167,Pacific Ocean,"POLYGON ((-139.77017 34.45161, -139.77017 34.3..."
162,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
162,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373..."
163,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ..."


In [243]:
# Display the cluster table.
# NOTE(review): the stored output of this cell shows an `Images` column that no
# visible cell creates — it presumably comes from a deleted cell; confirm before relying on it.
df


Unnamed: 0_level_0,Date,FID,Density,Oceans,Region,Images
Cluster,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,2000-10-25,2568,0.043196,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527...",0.0
0,2000-10-26,2569,0.016661,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527...",0.0
0,2000-10-27,2572,0.001440,Atlantic Ocean,"POLYGON ((-62.51684 27.68624, -62.51684 27.527...",0.0
1,1996-07-12,3129,0.015120,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616...",
1,1996-07-13,3130,0.000000,Atlantic Ocean,"POLYGON ((-56.88767 46.73993, -56.88767 46.616...",
...,...,...,...,...,...,...
161,2009-08-14,10549,1.111167,Pacific Ocean,"POLYGON ((-139.77017 34.45161, -139.77017 34.3...",
162,2015-03-17,10700,0.350167,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373...",
162,2015-03-18,10706,0.198000,Atlantic Ocean,"POLYGON ((-0.78017 37.60620, -0.78017 37.46373...",
163,2015-06-04,10715,0.311500,Atlantic Ocean,"POLYGON ((3.35483 42.34143, 3.35483 42.20850, ...",


Convert the time, x and y columns to categoricals and rename the categories to consecutive integers (0, 1, 2, …). Then convert the dataframe into a pivot table that can be turned directly into a numpy array. Using a sum as the aggregation function ensures that coordinates that don't exist still get filled with a 0.0 value.

In [249]:
def _as_ordinal_categories(values):
    """Return `values` as a categorical whose categories are renamed to 0..n-1."""
    categorical = pd.Categorical(values)
    return categorical.rename_categories(list(range(len(categorical.categories))))


images = []
for key, cluster_df in water_df.groupby(level=0):
    cluster_df = cluster_df.droplevel(0)

    # Replace timestamps and coordinates by ordinal categories so they can be
    # used directly as positions along the time / y / x axes.
    cluster_df['time'] = _as_ordinal_categories(cluster_df['time'])
    cluster_df['x'] = _as_ordinal_categories(cluster_df.geometry.x)
    cluster_df['y'] = _as_ordinal_categories(cluster_df.geometry.y)

    time = len(cluster_df['time'].cat.categories)
    maxx = len(cluster_df['x'].cat.categories)
    maxy = len(cluster_df['y'].cat.categories)

    # observed=False keeps every (time, y, x) combination of the categories,
    # and summing an empty group gives 0.0, so missing pixels are zero-filled
    # and the table always has time * maxy * maxx rows.
    cluster_df = cluster_df.pivot_table(
        values=['velocity_u_0', 'velocity_v_0'],
        index=['time', 'y', 'x'],
        aggfunc='sum',
        observed=False,
    )
    img = cluster_df.to_numpy().reshape([time, maxy, maxx, -1])

    # [labels, img]; the list indexer keeps a Series (and thus `.to_numpy()`)
    # even when the cluster has a single row.
    images.append([df.loc[[key], 'Density'].to_numpy(), img])

len(images)

164

In [250]:
# Persist the list of [labels, img] pairs built above to disk.
with open('train_data.pickle', 'wb') as handle:
    pickle.dump(images, handle, protocol=pickle.HIGHEST_PROTOCOL)

The pickle file holds a list of [label, time_series] pairs, where each label is itself a list of densities, one for each time step.

In [251]:
# Reload the training data written by this notebook.
# NOTE: unpickling can execute arbitrary code — only load pickle files you created yourself.
with open('train_data.pickle', 'rb') as handle:
    train_data = pickle.load(handle)

In [256]:
# Transpose the [label, image] pairs into one tuple of labels and one tuple of images.
labels, images = zip(*train_data)