# Create training samples

Here I did not filter by air temperature. The training data should be diverse enough to cover the summer period, when algal blooms occur. If we did not include summer images, the model would risk misclassifying algal-bloom pixels as ice pixels, because it would never have seen the optical pattern of an algal bloom before.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import os
import glob
import geemap
import ee
import rasterio
import contextlib
from p_tqdm import p_map

In [2]:
# Authenticate with Earth Engine, then initialise the client for the
# project this notebook runs under.
EE_PROJECT = "ee-sarice"

ee.Authenticate()
ee.Initialize(project=EE_PROJECT)

In [3]:
def get_match_table(lake_id,
                    save_dir = "/home/xinchenh/Work/sentinel1ice/data/match_tables"
                   ):
    """Build and save the Sentinel-1 / Sentinel-2 match table for one lake.

    Every Sentinel-1 GRD image (IW mode, carrying both VV and VH) that
    intersects the lake polygon is paired with every Sentinel-2 SR image
    (``CLOUDY_PIXEL_PERCENTAGE`` below 50) over the same polygon that was
    acquired strictly less than 12 hours before or after it.

    Parameters
    ----------
    lake_id
        Value matched against the ``Hylak_id`` property of the HydroLAKES
        polygons to select the lake.
    save_dir : str
        Directory that receives ``{lake_id}_match_table.csv``. It is created
        if it does not exist yet.

    Returns
    -------
    pandas.DataFrame
        One row per matched image pair, with the columns ``s1_dates``,
        ``s1_system_ids``, ``s2_dates``, ``s2_system_ids`` and
        ``s2_granule_ids``. Sentinel-1 images without any Sentinel-2 match
        are dropped.
    """

    # hydrolakes
    hydrolakes = ee.FeatureCollection("projects/ee-lakeice/assets/HydroLAKES_polys_v10")
    aoi = hydrolakes.filterMetadata('Hylak_id', 'equals', lake_id)
    roi = aoi.geometry()

    # Load SENTINEL-1
    s1 = ee.ImageCollection('COPERNICUS/S1_GRD') \
            .filter(ee.Filter.eq('instrumentMode', 'IW')) \
            .filter(ee.Filter.listContains('transmitterReceiverPolarisation', 'VV')) \
            .filter(ee.Filter.listContains('transmitterReceiverPolarisation', 'VH')) \
            .filterBounds(roi)
    # tag every image with its acquisition time as a formatted string
    # (no speckle filtering is applied here)
    s1_with_dates = s1.map(lambda image: image.set('date',
                                                   ee.Date(image.get('system:time_start')).format('YYYY-MM-dd HH:mm:ss')))

    # SENTINEL-2
    s2 = ee.ImageCollection('COPERNICUS/S2_SR_HARMONIZED') \
            .filterBounds(roi) \
            .filter(ee.Filter.lt('CLOUDY_PIXEL_PERCENTAGE',50))

    s2_with_dates = s2.map(lambda image: image.set('date',
                                                    ee.Date(image.get('system:time_start')).format('YYYY-MM-dd HH:mm:ss')))

    # get dates
    s1_dates = pd.to_datetime(s1_with_dates.aggregate_array("date").getInfo())
    s2_dates = pd.to_datetime(s2_with_dates.aggregate_array("date").getInfo())

    # get granule ids
    s1_system_ids = s1_with_dates.aggregate_array("system:id").getInfo()
    s2_system_ids = s2_with_dates.aggregate_array("system:id").getInfo()
    s2_granule_ids = s2_with_dates.aggregate_array("GRANULE_ID").getInfo()

    # create 2 df for s1 and s2
    s1_df = pd.DataFrame(zip(s1_dates.to_list(), s1_system_ids),
                         columns = ["s1_dates", "s1_system_ids"])
    s2_df = pd.DataFrame(zip(s2_dates.to_list(), s2_system_ids, s2_granule_ids),
                         columns = ["s2_dates", "s2_system_ids", "s2_granule_ids"])

    # Find the sentinel-2 acquisition times that lie < 12 hours from a sentinel-1 image
    def lookfor_sen2(sen1_date,
                     sen2_dates = s2_dates, # dates of the available sentinel-2 images
                     delta_hr = 12):
        # absolute time difference between every sentinel-2 image and this sentinel-1 image
        intervals = np.abs(sen2_dates - sen1_date)
        # keyword form avoids the "H" unit alias, which newer pandas deprecates
        indexes = intervals < pd.Timedelta(hours = delta_hr)

        # Keep each matching timestamp once: several sentinel-2 granules can
        # share one timestamp, and the merge on "s2_dates" below already
        # expands a timestamp to all of its granules. Without de-duplication
        # k granules sharing a timestamp would produce k * k rows.
        match_sen2_date = sen2_dates[indexes].unique()

        if len(match_sen2_date) == 0:
            # None marks "no match" so that dropna() removes the row
            return None
        return match_sen2_date

    # create a match table between sentinel-1 dates and sentinel-2 dates
    match_table = s1_df.copy()
    match_table["s2_dates"] = match_table["s1_dates"].apply(lookfor_sen2)
    # one row per (sentinel-1 image, matching sentinel-2 timestamp)
    match_table = match_table.dropna().reset_index(drop = True).explode("s2_dates")
    # attach the ids of every sentinel-2 image acquired at that timestamp
    match_table = match_table.merge(s2_df, on = "s2_dates")

    # save the match table (make sure the target directory exists first)
    os.makedirs(save_dir, exist_ok = True)
    match_table.to_csv(os.path.join(save_dir, f"{lake_id}_match_table.csv"))

    return match_table

In [5]:
# Read the list of training lakes and keep their Hylak_id values as a plain list.
training_lakes = pd.read_csv("data/training_lakes.csv")
training_lake_ids = training_lakes["Hylak_id"].to_list()

In [8]:
# Build (and save) the match table of every training lake, one lake at a
# time, reporting progress after each lake.
for hylak_id in training_lake_ids:
    get_match_table(lake_id=hylak_id)
    print(hylak_id, "match table created")

137 match table created
168 match table created
212 match table created
238 match table created
257 match table created
287 match table created
673 match table created
892 match table created
959 match table created
1286 match table created
1354 match table created
1362 match table created
1367 match table created
1370 match table created
1375 match table created
1387 match table created
1397 match table created
1402 match table created
1407 match table created
1413 match table created
1421 match table created
1429 match table created
1430 match table created
1444 match table created
1448 match table created
1523 match table created
1536 match table created
1539 match table created
1552 match table created
1560 match table created
1615 match table created
1678 match table created
1763 match table created
1771 match table created
1807 match table created
1814 match table created
1864 match table created
1869 match table created
1988 match table created
2072 match table created
2162 matc

25562 match table created
25722 match table created
25751 match table created
25795 match table created
25821 match table created
25899 match table created
26043 match table created
26510 match table created
26550 match table created
26575 match table created
26589 match table created
26601 match table created
26730 match table created
26744 match table created
26779 match table created
26796 match table created
27135 match table created
27396 match table created
27436 match table created
27506 match table created
27834 match table created
27860 match table created
28022 match table created
28143 match table created
28173 match table created
28309 match table created
28490 match table created
28607 match table created
28628 match table created
28790 match table created
28817 match table created
28967 match table created
29064 match table created
29135 match table created
29394 match table created
29400 match table created
29550 match table created
30007 match table created
30119 match 

69253 match table created
69505 match table created
69888 match table created
69953 match table created
69965 match table created
70503 match table created
70617 match table created
71049 match table created
71163 match table created
71324 match table created
71932 match table created
71993 match table created
72025 match table created
72058 match table created
72406 match table created
72908 match table created
72978 match table created
73222 match table created
73397 match table created
73634 match table created
73826 match table created
73859 match table created
74101 match table created
74423 match table created
74523 match table created
74650 match table created
74706 match table created
74946 match table created
75069 match table created
75919 match table created
76037 match table created
76179 match table created
76305 match table created
76671 match table created
76679 match table created
76777 match table created
76830 match table created
76918 match table created
77840 match 

117754 match table created
117816 match table created
117821 match table created
117854 match table created
117856 match table created
117956 match table created
117962 match table created
117966 match table created
118013 match table created
118016 match table created
118022 match table created
118051 match table created
118098 match table created
118122 match table created
118129 match table created
118172 match table created
118204 match table created
118234 match table created
118256 match table created
118281 match table created
118313 match table created
118580 match table created
118594 match table created
118682 match table created
118942 match table created
119061 match table created
119169 match table created
119180 match table created
119412 match table created
119413 match table created
119416 match table created
119428 match table created
119521 match table created
119601 match table created
119640 match table created
119708 match table created
119733 match table created
1

175899 match table created
175924 match table created
176006 match table created
176028 match table created
176218 match table created
176298 match table created
176315 match table created
176352 match table created
176361 match table created
176507 match table created
176533 match table created
176539 match table created
176548 match table created
176639 match table created
176655 match table created
176660 match table created
176665 match table created
176667 match table created
176670 match table created
176681 match table created
176690 match table created
176691 match table created
176693 match table created
176694 match table created
176700 match table created
176715 match table created
176734 match table created
176778 match table created
176879 match table created
176925 match table created
176929 match table created
176943 match table created
176950 match table created
177016 match table created
177110 match table created
177194 match table created
177304 match table created
1

In [22]:
# Next step: cloud cover, ice cover (SCL) and total area per image.
# Keep only the Sentinel-2 images whose system:id appears in the match table.
# NOTE(review): `match_table` and `s2_with_dates` are expected to exist as
# notebook globals here; in the visible code they are only locals of
# get_match_table — confirm where they are defined.
matched_s2_ids = match_table["s2_system_ids"].to_list()
s2_sys_ids = ee.List(matched_s2_ids)
s2_img_col = s2_with_dates.filter(ee.Filter.inList("system:id", s2_sys_ids))
def get_scl_s2(img):
    """Derive three flag bands from the ``SCL`` band of ``img``.

    The returned image has the bands, in this order:

    * ``total`` - SCL is not 1 (pixels counted as valid),
    * ``cloud`` - SCL is 9 (high cloud), 3 (cloud shadow) or 2 (dark pixel),
    * ``ice``   - SCL is 11.
    """
    scl = img.select("SCL")

    # every pixel except class 1 counts towards the total
    total_band = scl.neq(1).rename("total")

    # high cloud, cloud shadow and dark pixels are all treated as cloud
    cloud_band = scl.eq(9).Or(scl.eq(3)).Or(scl.eq(2)).rename("cloud")

    # ice
    ice_band = scl.eq(11).rename("ice")

    return total_band.addBands(cloud_band).addBands(ice_band)

# Turn every selected image into its total/cloud/ice flag bands and flatten
# the whole collection into a single multi-band image.
s2_flag_images = s2_img_col.map(get_scl_s2)
s2_bands = s2_flag_images.toBands()

# Sum every band over the features in `aoi`.
# NOTE(review): `aoi` is expected to exist as a notebook global here; in the
# visible code it is only a local of get_match_table — confirm.
stats = s2_bands.reduceRegions(collection=aoi, reducer=ee.Reducer.sum(), scale=None)

In [44]:
# compute cloud cover for each image
# Bring the reduced statistics into pandas.
stats_table = geemap.ee_to_df(stats)

# NOTE(review): the last 21 columns are dropped by position — presumably the
# lake attribute columns rather than band sums; confirm the count.
# After the transpose each remaining column name becomes a row label.
stat_df = stats_table.iloc[:, :-21].T

# NOTE(review): a single column name assumes `stats` holds exactly one feature — confirm.
stat_df.columns = ["pixel_sum"]

In [45]:
# Each row label ends in "_<pixel type>"; split the label once and reuse the parts.
band_name_parts = [band_name.split("_") for band_name in stat_df.index]

# Everything before the last "_" identifies the image.
# NOTE(review): the pieces are joined with "" so the underscores of the
# original id are dropped — confirm this is intended.
stat_df["system_id"] = ["".join(parts[:-1]) for parts in band_name_parts]

# The last piece is the flag-band name.
stat_df["pixel_type"] = [parts[-1] for parts in band_name_parts]

In [48]:
# compute coverage
cloud_df = 


Unnamed: 0_level_0,pixel_sum,pixel_type
system_id,Unnamed: 1_level_1,Unnamed: 2_level_1
20190115T04413120190115T044130T46RBV,0.000000,cloud
20190115T04413120190115T044130T46RBV,273569.333333,ice
20190115T04413120190115T044130T46RBV,501412.776471,total
20190204T04400120190204T044305T46RBV,0.000000,cloud
20190204T04400120190204T044305T46RBV,151874.705882,ice
...,...,...
20240707T04370120240707T044634T46RBV,0.000000,ice
20240707T04370120240707T044634T46RBV,501412.776471,total
20240712T04365920240712T044748T46RBV,0.000000,cloud
20240712T04365920240712T044748T46RBV,0.000000,ice
