<b>Detection of Sargassum in open sea</b>  
Notebook for classifying and analyzing Sargassum in Bonaire with Sentinel-2 images

* Density slicing (GNDVI), PCA, Decision Tree Classifier (DTC) and Maximum Likelihood Classifier (MLC) are employed
* 7 different subsets in open sea are investigated


In [None]:
import os
import re
import pandas as pd
import numpy as np
import rasterio as rio
from rasterio.merge import merge
from rasterio.features import sieve
import matplotlib.pyplot as plt
from glob import glob
import geopandas as gpd
from joblib import load
from tqdm import tqdm,tqdm_notebook
from tqdm.contrib import tzip
import rasterio.features 
from shapely.geometry import shape
from shapely.geometry.multipolygon import MultiPolygon

#custom functions
from Python.prep_raster import stack_bands,clip_raster,pixel_sample,computeIndexStack,compute_index
from Python.pca_slice import compute_pca_score,compute_pca_image,pca_triangle,density_slice
from Python.mlc import mlClassifier
from Python.pred_raster import stack2pred, dtc_pred_stack
from Python.misc import get_feat_layer_order

#setup IO directories
parent_dir = os.path.join(os.path.abspath('..'),'objective2')                  #change according to preference
sub_dirs = ['fullstack','clippedstack','indexstack','predicted','stack2pred']

#create every working directory below parent_dir; exist_ok makes re-running this cell harmless
for name in sub_dirs:
    os.makedirs(os.path.join(parent_dir,name),exist_ok=True)

<b>Sentinel-2 data preparation</b>
* Resample coarse bands to 10m resolution
* Stack multiband images 
* Calculate spectral indices

In [None]:
#sensing dates (YYYYMMDD) considered for classification and analysis, grouped per tile ID
dates_tiles = {"T19PEP":[20180304, 20190309],
               "T19PFP":[20180304, 20190304, 20190428]}

#band names (band id + native resolution) as they appear in the .jp2 file names
bands = ['B01_60m','B02_10m','B03_10m','B04_10m','B05_20m','B06_20m',
         'B07_20m','B08_10m','B8A_20m','B09_60m','B11_20m','B12_20m']

#locate the level-2 products (.SAFE directories)
level2_dir = '...' #change according to preference
level2_files = glob(level2_dir+"/*.SAFE")

#keep only the products whose path contains a listed tile ID together with one of its dates
scene_paths = [
    safe_dir
    for safe_dir in level2_files
    for tile,tile_dates in dates_tiles.items()
    if tile in safe_dir and any(str(day) in safe_dir for day in tile_dates)
]

#map '<tile>_<date>' to the band image paths of that scene, ordered like `bands`
image_collection = {}

for scene in scene_paths:
    date = re.findall(r"(\d{8})T", scene)[0]
    tileid = re.findall(r"(T\d{2}P..)", scene)[0]

    #every .jp2 file found below the SAFE directory
    jp2_files = glob(scene + "*/**/*.jp2", recursive=True)

    #outer loop over `bands` fixes the band order of the resulting list
    image_collection[f'{tileid}_{date}'] = [jp2 for band in bands for jp2 in jp2_files if band in jp2]

#check nr. of images per date
for scene_key,band_images in image_collection.items():
    print(f'Date: {scene_key} Images: {len(band_images)}')

In [None]:
#stack the band images of every scene into one multiband geotiff (!computationally intensive)
for scene_id,band_paths in tqdm(image_collection.items(),position=0, leave=True):
    #second entry is band B02 (10m) given the band order used when the paths were collected;
    #it serves as the reference metadata for the stack
    ref10m = band_paths[1]
    outfile = os.path.join(parent_dir,'fullstack',f'stack_{scene_id}.tif')
    stack_bands(band_paths,ref10m,outfile)

In [None]:
#crop multiband image stack
roi_file = gpd.read_file('./data/boundaries/objective2/sea_rois.geojson') #polygons for cropping image

#map every ROI name (column 'subset') to a one-element list holding its GeoJSON geometry
roi_features = roi_file.__geo_interface__['features']
sub_geoms = {roi_name:[feature['geometry']] for roi_name,feature in zip(roi_file['subset'],roi_features)}

stack_files = glob(parent_dir + "/fullstack/*.tif")
clip_dir = os.path.join(parent_dir,'clippedstack')

#ROI names to cut out of each scene, keyed by the '<tile>_<date>' part of the stack file name
rois_per_scene = {
    'T19PEP_20180304': ['20180304_A','20180304_B'],
    'T19PEP_20190309': ['20190309'],
    'T19PFP_20180304': ['20180304_C'],
    'T19PFP_20190304': ['20190304'],
    'T19PFP_20190428': ['20190428_A','20190428_B'],
}

for file in tqdm(stack_files,position=0, leave=True):
    date = re.findall(r"(\d{8})",file)[0]
    subset = os.path.basename(file)

    for scene_key,roi_names in rois_per_scene.items():
        if scene_key not in subset:
            continue
        #one clipped raster per ROI; the date in the file name is replaced by '<roi>_clipped'
        for roi_name in roi_names:
            clipped_path = os.path.join(clip_dir,subset.replace(date,f'{roi_name}_clipped'))
            clip_raster(file,sub_geoms[roi_name],clipped_path,fill=True,nodat=0)
        break #only the first matching scene applies
        


<b>PCA</b>
* Sparsely distributed Sargassum slicks (aka windrows) in rough waters are more visible on principal components (PCs) 3-5
* Large seaweed patches in dark, calm waters are visible on PCs 1-2
* Additional stripe/detector artifacts may cause false/noisy pixels
* PCA performance depends on the scene and image scale

In [None]:
#crop pca border (fix)
roi_file = gpd.read_file('./data/boundaries/objective2/sea_rois.geojson') #polygons for cropping image

#map every ROI name (column 'subset') to a one-element list holding its GeoJSON geometry
roi_features = roi_file.__geo_interface__['features']
sub_geoms = {roi_name:[feature['geometry']] for roi_name,feature in zip(roi_file['subset'],roi_features)}

clip_files = glob(parent_dir+'/clippedstack/*_clipped.tif' )

#setup pca parameters and dir (path built from separate components so it is valid on every OS)
pca_dir = os.path.join(parent_dir,'predicted','pca')
os.makedirs(pca_dir,exist_ok=True)
bands = ['B02','B03','B04','B05','B06','B07','B08','B8A','B11','B12']
band_order = [2,3,4,5,6,7,8,9,11,12]

#collect pca score
pca_score_dfs = []

for file in tqdm(clip_files,position=0, leave=True):
    subset = re.search(r'stack_(.*?)_clipped', os.path.basename(file)).group(1)

    #compute pca raster with components = 5 into a temporary '_x' file, then crop it to fix
    #the nodata boundary; both names are built explicitly so no other '_x' in the path is touched
    tmp_file = os.path.join(pca_dir,f'pca_{subset}_x.tif')
    outfile = os.path.join(pca_dir,f'pca_{subset}.tif')
    compute_pca_image(file,band_order,nr_components=5,outfile=tmp_file)
    #subset looks like '<tile>_<roi>'; dropping the first 7 characters leaves the ROI name
    clip_raster(tmp_file,sub_geoms[subset[7:]],outfile,fill=True,nodat=0)
    os.remove(tmp_file)

    #compute pca score and variance
    #NOTE(review): all arguments are passed positionally because a positional argument may not
    #follow a keyword argument; assumes nr_components is the 4th and the subset label the 5th
    #parameter of compute_pca_score - confirm against Python/pca_slice.py
    pca_score_dfs.append(compute_pca_score(file,band_order,bands,5,subset))

#export pca score and variance
pd.concat(pca_score_dfs,axis=0).to_csv(r'./data/output/objective2/pca_score_obj2.csv')

* Triangle thresholding for unimodal image segmentation ([reference](https://imagej.net/Auto_Threshold))

In [None]:
#get PC images; sorted because glob returns paths in arbitrary order and the component
#numbers below are paired with the files by position
#NOTE(review): the list is assumed to follow the alphabetical order of the file names - confirm
pca_files = sorted(glob(parent_dir+'/predicted/pca/pca_*.tif' ))
pc_sel = dict(zip(pca_files,[1,1,5,2,4,4,5]))

#perform triangle thresholding on the selected component of every PCA image
for file,pc in pc_sel.items():
    subset = re.search(r'pca_(.*?)\.tif$', os.path.basename(file)).group(1)
    outfile = os.path.join(parent_dir,'predicted/pca',f'pca2_{subset}.tif')
    pca_triangle(file,pc,outfile)
    os.remove(file) #remove this line if you like to keep the original component images

<b>Density Slicing of GNDVI image </b>
* GNDVI value between -0.25 and -0.30 suitable for detecting Sargassum patches in calm dark waters (no clouds)
* GNDVI value > 0.05 more suitable for noisy scenes (rough waters/ cloudy)

In [None]:
#get clipped rasters and create output dir; sorted because glob returns paths in arbitrary
#order and the thresholds below are paired with the files by position
#NOTE(review): the list is assumed to follow the alphabetical order of the file names - confirm
clip_files = sorted(glob(parent_dir+'/clippedstack/*_clipped.tif'))
os.makedirs(os.path.join(parent_dir,'predicted/slice/'),exist_ok=True)

#GNDVI threshold per clipped raster
index_sel = dict(zip(clip_files,[-0.25,-0.3]+[0.05]*5))

for clip_file,thr in index_sel.items():

    #create GNDVI raster
    subset = re.search(r'stack_(.*?)_clipped\.tif$', os.path.basename(clip_file)).group(1)
    gndvi_outfile = os.path.join(parent_dir,f'indexstack/gndvi_{subset}.tif')
    computeIndexStack(clip_file,['GNDVI'],gndvi_outfile)

    #perform density slicing
    slice_outfile = os.path.join(parent_dir,f'predicted/slice/slice_{subset}.tif')
    density_slice(gndvi_outfile,1,thr,slice_outfile)

    os.remove(gndvi_outfile) #remove this line if you like to keep the original GNDVI image

<b>DTC and MLC classification</b>
* Only the MLC base model (no threshold) is used

In [None]:
#load models (forward slashes keep the paths valid on every OS, like the other data paths here)
#NOTE(review): joblib.load unpickles the file - only load model files from a trusted source
dtc = load('./data/models/dtc_model_sargassum.joblib')
mlc = load('./data/models/mlc_model_sargassum.joblib')

#one output directory per classifier
for name in ['dtc','mlc']:
    os.makedirs(os.path.join(parent_dir,f'predicted/{name}'),exist_ok=True)

#clipped files
clip_files = glob(parent_dir+'/clippedstack/*_clipped.tif')

for file in tqdm(clip_files,position=0, leave=True):
    #match on the file name only so directory names can never interfere
    subset = re.search(r'stack_(.*?)_clipped\.tif$',os.path.basename(file)).group(1)

    with rio.open(file) as src:
        #DTC and MLC classifications on a stack of NDVI, REP and raster bands 5 and 11
        stack2pred_img = np.concatenate((computeIndexStack(src.read(),['NDVI','REP']), src.read([5,11])))
        #wrap each result in a list to add a leading band axis for writing
        mlc_img = np.array([mlc.classify_raster_gx(stack2pred_img)])
        dtc_img = np.array([dtc_pred_stack(dtc,stack2pred_img)])

        #export results as single-band uint8 rasters with nodata = 0
        profile = src.profile.copy()
        profile.update({'nodata':0,'dtype':rio.uint8,'count':1})
        dtc_out,mlc_out = f'{parent_dir}/predicted/dtc/dtc_{subset}.tif',f'{parent_dir}/predicted/mlc/mlc_{subset}.tif'
        with rio.open(dtc_out,'w',**profile) as dtc_dst, rio.open(mlc_out,'w',**profile) as mlc_dst:
            dtc_dst.write(dtc_img.astype(rio.uint8))
            mlc_dst.write(mlc_img.astype(rio.uint8))

<b>Comparative analysis</b>  
* Compare Sargassum classified area in open sea across classifications (PCA, density slicing, DTC, MLC)

In [None]:
#get classification result paths; each list is sorted because glob returns paths in arbitrary
#order and the four lists are walked in lockstep, so the same position must mean the same subset
dtc_paths = sorted(glob(parent_dir+'/predicted*/dtc/dtc*.tif'))
mlc_paths = sorted(glob(parent_dir+'/predicted*/mlc/mlc*.tif'))
pca_paths = sorted(glob(parent_dir+'/predicted*/pca/pca2*.tif'))
slice_paths = sorted(glob(parent_dir+'/predicted*/slice/slice*.tif'))

#collect Sargassum pixel count, one row per subset
rows = []
for dtc_file,mlc_file,pca_file,slice_file in zip(dtc_paths,mlc_paths,pca_paths,slice_paths):

    #subset name is taken from the DTC file name
    subset = re.search(r'dtc_(.*?)\.tif$', os.path.basename(dtc_file)).group(1)

    with rio.open(dtc_file) as dtc_src, rio.open(mlc_file) as mlc_src,rio.open(pca_file) as pca_src, rio.open(slice_file) as slice_src:

        rows.append({
            'Subset': subset,
            #DTC and MLC rasters: count the pixels with value 3
            'DTC': np.count_nonzero(dtc_src.read(1)==3),
            'MLC Base': np.count_nonzero(mlc_src.read(1)==3),
            #PCA and slice rasters: count every nonzero pixel
            'PCA Triangle': np.count_nonzero(pca_src.read(1)),
            'GNDVI Slice': np.count_nonzero(slice_src.read(1)),
        })

#export data
data = pd.DataFrame(rows)
data.to_csv('./data/output/objective2/sargassum_count_obj2.csv',index=False)

* Plot Sargassum classified area

In [None]:
#load the pixel counts (one row per subset) and reorder the rows by position
#NOTE(review): the order moves the third row behind the fifth, presumably to sort the subsets by date - confirm
data = pd.read_csv('./data/output/objective2/sargassum_count_obj2.csv',index_col='Subset')
row_order = [0,1,3,4,2,5,6]
data = data.iloc[row_order]

#plot Sargassum: one line per classification method
#counts are divided by 100 to express them in hectares (presumably 100 pixels of 10m x 10m per ha)
plots = []
for col in data.columns:
    plots.append(plt.plot(data[col]/100,label=col))
plt.ylabel('Classified area (ha)')
plt.xticks(rotation=45)
plt.legend()

<b>Large scene classification</b>  
* Classify a larger scale image with the DTC, MLC, PCA and density slicing methods

In [None]:
#crop roi file and create output dir
roi_file = './data/boundaries/objective2/sea_extent_small.geojson'
os.makedirs(os.path.join(parent_dir,'predicted/large_scene'),exist_ok=True)

#load models (forward slashes keep the paths valid on every OS, like the other data paths here)
#NOTE(review): joblib.load unpickles the file - only load model files from a trusted source
dtc = load('./data/models/dtc_model_sargassum.joblib')
mlc = load('./data/models/mlc_model_sargassum.joblib')

#sorted because glob returns paths in arbitrary order and the principal components below
#are paired with the files by position
#NOTE(review): [3,2] is assumed to follow the alphabetical order of the file names - confirm
stack_files = sorted(glob(parent_dir+'/fullstack/*20180304.tif'))

#raster bands fed into the PCA
pca_bands = [2,3,4,5,6,7,8,9,11,12]

for file,pc in tzip(stack_files,[3,2]):
    #match on the file name only so directory names can never interfere
    scene_name = re.search(r'stack_(.*?)\.tif$',os.path.basename(file)).group(1)
    outfile = os.path.join(parent_dir,'predicted/large_scene',f'{scene_name}_clipped.tif')
    clip_raster(file,roi_file,outfile,fill=True)

    #continue only when the clip actually produced a file
    if os.path.exists(outfile):

        with rio.open(outfile) as src:

            #DTC and MLC classifications; pixels with value 3 become 1, everything else 0
            stack2pred_img = np.concatenate((computeIndexStack(src.read(),['NDVI','REP']), src.read([5,11])))
            mlc_img = np.where(np.array([mlc.classify_raster_gx(stack2pred_img)])==3,1,0)
            dtc_img = np.where(np.array([dtc_pred_stack(dtc,stack2pred_img)])==3,1,0)

            #PCA and GNDVI density slicing classifications
            pca_img = np.array([pca_triangle(compute_pca_image(outfile,pca_bands,nr_components=pc),pc-1)])
            slice_img = np.array([np.where(compute_index(src.read(),'GNDVI')>=0,1,0)])

            #stack all results and apply cloud mask (raster band 2 >= 0.09 is treated as cloud)
            results = np.concatenate((mlc_img,dtc_img,pca_img,slice_img))
            cloud_mask = np.where(src.read([2])>=0.09,1,0)
            results_masked = np.where(cloud_mask!=1,results,0)

            #export results as a 4-band uint16 raster (band order: MLC, DTC, PCA, GNDVI slice)
            profile = src.profile.copy()
            profile.update({'nodata':None,'dtype':rio.uint16,'count':4})

            results_out = outfile.replace('_clipped','_multi')
            with rio.open(results_out,'w',**profile) as dst:
                dst.write(results_masked.astype(rio.uint16))


* Merge classification results over two tiles

In [None]:
img_files = glob(parent_dir+'/predicted/large_scene/T*_multi.tif')

#open image files and merge them; the finally clause closes every dataset that was opened,
#even when opening or merging fails part-way
data = []
try:
    for file in img_files:
        data.append(rio.open(file))
    merged_data,data_affine = merge(data)
    #copy the profile while the first dataset is still open
    profile = data[0].profile.copy()
finally:
    #close image files
    for dataset in data:
        dataset.close()

#sieve filter (spatial filter) applied to every band of the merged image
sieved_data = np.array([sieve(img,size=8) for img in merged_data])

#export merged and sieved image
profile.update({'transform':data_affine,'width':sieved_data.shape[2],'height':sieved_data.shape[1],
                'nodata':0,'dtype':rio.uint16,'count':4})
outfile = os.path.join(parent_dir,'predicted/large_scene/merged_20180304_multi.tif')
with rio.open(outfile,'w',**profile) as dst:
    dst.write(sieved_data.astype(rio.uint16))

* Convert merged results into vector data to reduce data size

In [None]:
#merged image
img_file = glob(parent_dir+'/predicted/large_scene/merged*_multi.tif')[0]

with rasterio.open(img_file) as src:
    imgs = src.read()
    data = []

    #one band per classification method, in the order the bands were stacked
    method_names = ['MLC Base','DTC','PCA Triangle','GNDVI Slice']
    for img,method in tzip(imgs,method_names):

        #polygonize the band and keep only the shapes with value 1 (Sargassum)
        sargassum_polygons = [
            shape(geometry)
            for geometry,value in rasterio.features.shapes(img, transform=src.transform)
            if value==1
        ]

        #one row per method holding all of its polygons as a single multipolygon
        gdf = gpd.GeoDataFrame({'Result': [method]},geometry=[MultiPolygon(sargassum_polygons)],crs=src.crs)
        data.append(gdf)

    #export polygonized features as geojson
    data = pd.concat(data,ignore_index=True)
    data.to_file('./data/output/objective2/4classifications_20180304.geojson',driver='GeoJSON')