I just realised that the time series data is only dependent on water velocity as we cannot easily add consecutive time series microplastics readings in addition to the image. So we'll do the API calls and everything to get time series data points as standalone training samples. Training the model will entail inputting time series data of multiple images of days before the reading, but not including the image on the day of the reading itself. This will prompt the model to "predict" the future, in a way, and make the microplastic prediction without needing the water velocity of the day itself.

In [1]:
import geopandas as gpd
import pandas as pd
from shapely.geometry import Point, shape
from datetime import timedelta, datetime

import ee
import json

import numpy as np
import pickle

In [4]:
# Trigger the authentication flow.
ee.Authenticate()

# Initialize the library.
ee.Initialize()

Enter verification code:  4/1ARtbsJq9PzEBmLPaTqmkUsTkNxExfV2xV0i0CKywfl3T9QTzJJZpczYIhM0



Successfully saved authorization token.


Read in microplastics data.

In [5]:
df = gpd.read_file(r"data_files/DataRecords.csv")

df['Longitude'] = df['Longitude'].astype('float')
df['Latitude'] = df['Latitude'].astype('float')

df['geometry'] = df.apply(lambda x: Point(x['Longitude'], x['Latitude']), axis=1)
df = df[['Date', 'geometry', 'Microplastics Measurement (density)']]
df['Date'] = pd.to_datetime(df['Date'])
df = df.rename(columns={'Microplastics Measurement (density)':'Density'})
df['Density'] = df['Density'].astype('float')
df.crs = "EPSG:4326"
df

  arr = construct_1d_object_array_from_listlike(values)


Unnamed: 0,Date,geometry,Density
0,1992-11-01,POINT (-59.35000 18.83000),0.004320
1,1992-11-01,POINT (-59.35000 18.83000),0.034556
2,1992-11-02,POINT (-59.43000 17.80000),0.008640
3,1992-11-02,POINT (-59.43000 17.80000),0.000000
4,1992-11-02,POINT (-59.57000 17.03000),0.000000
...,...,...,...
10715,2015-06-04,POINT (3.24000 42.02000),0.036000
10716,2015-06-05,POINT (3.29000 42.37000),1.004000
10717,2015-06-05,POINT (3.06000 41.75000),0.335000
10718,2015-06-05,POINT (2.78000 41.62000),1.986000


HYCOM only has data from after 1992-10-02, get rid of entries before that.

In [6]:
df = df[df['Date'] > '1992-10-02']
df

Unnamed: 0,Date,geometry,Density
0,1992-11-01,POINT (-59.35000 18.83000),0.004320
1,1992-11-01,POINT (-59.35000 18.83000),0.034556
2,1992-11-02,POINT (-59.43000 17.80000),0.008640
3,1992-11-02,POINT (-59.43000 17.80000),0.000000
4,1992-11-02,POINT (-59.57000 17.03000),0.000000
...,...,...,...
10715,2015-06-04,POINT (3.24000 42.02000),0.036000
10716,2015-06-05,POINT (3.29000 42.37000),1.004000
10717,2015-06-05,POINT (3.06000 41.75000),0.335000
10718,2015-06-05,POINT (2.78000 41.62000),1.986000


Calculate the region of interest of the subset of readings. Randomly sample a subset of the data to api request. 

In [7]:
#df = df.sample(n=2001, random_state=0)
df = df.to_crs(epsg=3857)

# cap_style=3 indicates square buffer
df['Region'] = df.buffer(10e3, cap_style=3)

df = df.set_geometry('Region')
df = df.drop(columns=['geometry'])

df = df.to_crs(epsg=4326)

df = df.reset_index(drop=True)

df

Unnamed: 0,Date,Density,Region
0,1992-11-01,0.004320,"POLYGON ((-59.26017 18.91500, -59.26017 18.744..."
1,1992-11-01,0.034556,"POLYGON ((-59.26017 18.91500, -59.26017 18.744..."
2,1992-11-02,0.008640,"POLYGON ((-59.34017 17.88551, -59.34017 17.714..."
3,1992-11-02,0.000000,"POLYGON ((-59.34017 17.88551, -59.34017 17.714..."
4,1992-11-02,0.000000,"POLYGON ((-59.48017 17.11587, -59.48017 16.944..."
...,...,...,...
9565,2015-06-04,0.036000,"POLYGON ((3.32983 42.08670, 3.32983 41.95323, ..."
9566,2015-06-05,1.004000,"POLYGON ((3.37983 42.43633, 3.37983 42.30360, ..."
9567,2015-06-05,0.335000,"POLYGON ((3.14983 41.81698, 3.14983 41.68295, ..."
9568,2015-06-05,1.986000,"POLYGON ((2.86983 41.68712, 2.86983 41.55281, ..."


Actual API call. The lookback_days is how many days back the time series will look, exclusive of the date of the reading itself.

In [8]:
def extract_raster_values_from_df(df, image_collection, band_names, task_name, folder="exported_files", lookback_days=7, scale=5000, export=False):
    df['ee_region'] = df.geometry.apply(lambda x: ee.Geometry.Polygon(list((x.exterior.coords)), proj='EPSG:4326'))
    regionCollection = ee.List([])
    emptyCol = ee.FeatureCollection(ee.Feature(None))
    
    # iterate through all entries
    for row in df.itertuples():
        # instantiate the image collection for the selected time series
        date = getattr(row, 'Date')
        collection = ee.ImageCollection(image_collection).filterDate((date - timedelta(days=lookback_days)).strftime('%Y-%m-%d'), date.strftime('%Y-%m-%d')).select(band_names)
        
        # get the raster information based on the buffer region we created
        pixelInfoRegion = collection.getRegion(geometry=getattr(row, 'ee_region'), scale=scale)

        # remove the first element, which is the header ['id', 'longitude', 'latitude', 'time', 'velocity_u_0', 'velocity_v_0']
        pixelInfoRegion = pixelInfoRegion.remove(pixelInfoRegion.get(0))
        # convert the 2d list of information to features. Going to hard code this part as I cba
        def func(x):
            x = ee.List(x)
            feat = ee.Feature(ee.Geometry.Point([x.get(1), x.get(2)], proj='EPSG:4326'), {'time':x.get(3),'velocity_u_0':x.get(4),'velocity_v_0':x.get(5)})
            return feat
        col = ee.FeatureCollection(pixelInfoRegion.map(func))
        # add the region that we created to the overall feature collection
        
        # ensure collection is not empty bands
        regionCollection = regionCollection.add(ee.Algorithms.If(collection.size(), col, emptyCol))
    
    if export:
        return ee.batch.Export.table.toDrive(collection=ee.FeatureCollection(regionCollection).flatten(), description=task_name, fileFormat="csv", folder=folder)
    else:
        return ee.FeatureCollection(regionCollection).flatten()

Doing the full api call crashes the kernel. Split it up into smaller subtasks.

In [9]:
size = 500
for i in range(0, 9500, size):
    task_name = f"water_velocities_{i}"
    task = extract_raster_values_from_df(df.iloc[i:i+size].copy(), 'HYCOM/sea_water_velocity',
                                         ['velocity_u_0', 'velocity_v_0'], task_name,
                                         folder="exported_files/train_data", export=True)
    task.start()

After that, load the data into dataframes.

In [10]:
df_list = []
for i in range(0, 9500, 500):
    sub_df = pd.read_csv(f'data_files/train_data/water_velocities_{i}.csv')
    
    water_idx = sub_df['system:index'].str.split('_', expand=True)
    water_idx = water_idx.rename(columns={0:'cluster', 1:'index'})
    water_idx['cluster'] = water_idx['cluster'].astype(int) + i
    
    sub_df = sub_df.join(water_idx).drop(columns=['system:index'])
    
    df_list.append(sub_df)
    
water_df = pd.concat(df_list, axis=0)
water_df

Unnamed: 0,time,velocity_u_0,velocity_v_0,.geo,cluster,index
0,7.199712e+11,-61.0,210.0,"{""type"":""Point"",""coordinates"":[-59.40109816240...",0,0
1,7.200576e+11,3.0,113.0,"{""type"":""Point"",""coordinates"":[-59.40109816240...",0,1
2,7.201440e+11,-2.0,288.0,"{""type"":""Point"",""coordinates"":[-59.40109816240...",0,2
3,7.202304e+11,-238.0,160.0,"{""type"":""Point"",""coordinates"":[-59.40109816240...",0,3
4,7.203168e+11,-103.0,202.0,"{""type"":""Point"",""coordinates"":[-59.40109816240...",0,4
...,...,...,...,...,...,...
48095,1.250035e+12,211.0,-9.0,"{""type"":""Point"",""coordinates"":[-139.7104845626...",9499,79
48096,1.250122e+12,294.0,-134.0,"{""type"":""Point"",""coordinates"":[-139.7104845626...",9499,80
48097,1.250208e+12,5.0,-210.0,"{""type"":""Point"",""coordinates"":[-139.7104845626...",9499,81
48098,1.250294e+12,35.0,109.0,"{""type"":""Point"",""coordinates"":[-139.7104845626...",9499,82


Some clusters will be NaN values, remove those.

In [11]:
water_df['.geo'] = water_df['.geo'].apply(json.loads).apply(shape)
water_df = water_df.dropna().set_index(['cluster'])
remap = dict(zip(water_df.index.unique(), list(range(len(water_df.index.unique())))))
water_df = water_df.rename(index=remap).set_index(['index'], append=True)
water_df = gpd.GeoDataFrame(water_df, geometry='.geo')
water_df

Unnamed: 0_level_0,Unnamed: 1_level_0,time,velocity_u_0,velocity_v_0,.geo
cluster,index,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,0,7.199712e+11,-61.0,210.0,POINT (-59.40110 18.75233)
0,1,7.200576e+11,3.0,113.0,POINT (-59.40110 18.75233)
0,2,7.201440e+11,-2.0,288.0,POINT (-59.40110 18.75233)
0,3,7.202304e+11,-238.0,160.0,POINT (-59.40110 18.75233)
0,4,7.203168e+11,-103.0,202.0,POINT (-59.40110 18.75233)
...,...,...,...,...,...
9282,79,1.250035e+12,211.0,-9.0,POINT (-139.71048 37.79662)
9282,80,1.250122e+12,294.0,-134.0,POINT (-139.71048 37.79662)
9282,81,1.250208e+12,5.0,-210.0,POINT (-139.71048 37.79662)
9282,82,1.250294e+12,35.0,109.0,POINT (-139.71048 37.79662)


In [14]:
images = []
largestx, largesty = -1, -1
for key, cluster_df in water_df.groupby(level=0):
    cluster_df = cluster_df.droplevel(0)

    cluster_df.time = pd.Categorical(cluster_df.time)
    cluster_df.time = cluster_df.time.cat.rename_categories(list(range(len(cluster_df.time.cat.categories))))

    cluster_df['x'] = pd.Categorical(cluster_df.geometry.x)
    cluster_df['y'] = pd.Categorical(cluster_df.geometry.y)
    cluster_df.x = cluster_df.x.cat.rename_categories(list(range(len(cluster_df.x.cat.categories))))
    cluster_df.y = cluster_df.y.cat.rename_categories(list(range(len(cluster_df.y.cat.categories))))

    time, maxx, maxy = cluster_df.time.cat.categories[-1] + 1, cluster_df.x.cat.categories[-1] + 1, cluster_df.y.cat.categories[-1] + 1
    
    # i don't want to deal with this shit
    if (time != 7): continue

    cluster_df = cluster_df.pivot_table(values=['velocity_u_0', 'velocity_v_0'], index=['time', 'y', 'x'], aggfunc=np.sum)
    img = cluster_df.to_numpy().reshape([time, maxy, maxx, -1])
    largestx, largesty = max(maxx, largestx), max(maxy, largesty)
    
    # [labels, img]
    images.append([df.loc[key, 'Density'], img])
    
len(images), largestx, largesty

(8910, 11, 11)

In [15]:
with open('train_data.pickle', 'wb') as handle:
    pickle.dump(images, handle, protocol=pickle.HIGHEST_PROTOCOL)

Test padding an image.

In [174]:
img = images[0][1]
img.shape

(3, 7, 4, 2)

In [137]:
height, width = 11, 11
# calculate margins for paddings
top = int(np.floor((height - img.shape[1])/2.0))
bottom = int(np.ceil((height - img.shape[1])/2.0))
left = int(np.floor((width - img.shape[2])/2.0))
right = int(np.ceil((width - img.shape[2])/2.0))

# pad the image -> [don't pad time, pad height, pad width, don't pad channels]
img = np.pad(img, [(0, 0), (top, bottom), (left, right), (0, 0)], 'constant', constant_values=0)
img.shape

(3, 11, 11, 2)