<b>Mapping seagrass distribution in Lac Bay</b>  
Notebook for classifying and analyzing seagrass distribution in Lac Bay, Bonaire with Sentinel-2 images

* Decision Tree Classifier (DTC) and Maximum Likelihood Classifier (MLC) are employed
* Training sites covering 2 different classes (non-seagrass,seagrass) are used to extract pixel values (training samples) over RGB bands 
* 80:20 train-test ratio for splitting the training samples
* K-Fold cross-validation performed for tuning the DTC model
* MLC model developed with 4 different chi-square thresholds: 0% (base), 10%,20%,50%


In [None]:
import os
import re
import pandas as pd
import numpy as np
import rasterio as rio
import matplotlib.pyplot as plt
import seaborn as sns
from glob import glob
import geopandas as gpd
from joblib import dump,load
from tqdm import tqdm,tqdm_notebook

#custom functions
from Python.prep_raster import stack_bands,clip_raster,pixel_sample,computeIndexStack
from Python.spec_analysis import transpose_df,jmd2df
from Python.data_viz import ridgePlot,validation_curve_plot
from Python.mlc import mlClassifier
from Python.calc_acc import calc_acc
from Python.pred_raster import stack2pred, dtc_pred_stack

#sklearn functions
from sklearn.model_selection import train_test_split,validation_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

#setup IO directories
#all outputs are written below <parent of the working directory>/objective3   (change according to preference)
parent_dir = os.path.join(os.path.abspath('..'),'objective3')
sub_dirs = ['fullstack','clippedstack','indexstack','predicted','stack2pred']

#create every output folder; exist_ok=True makes re-running this cell harmless
#(make_dirs only collects the None values returned by os.makedirs)
make_dirs = []
for name in sub_dirs:
    make_dirs.append(os.makedirs(os.path.join(parent_dir,name),exist_ok=True))

<b>Sentinel-2 data preparation</b>
* Resample coarse bands to 10m resolution
* Stack multiband images 
* Calculate spectral indices

In [None]:
#acquisition dates (YYYYMMDD) considered for classification and analysis
dates = [20190108,20190128,20190212,20190304,20190821,20191129]

#band names (10 m band images to collect)
bands = ['B02_10m','B03_10m','B04_10m']

#locate the Level-2 .SAFE products                                            (change level2_dir according to preference)
level2_dir = '...'
level2_files = glob(level2_dir+'/*.SAFE')

#keep products that match a requested date and tile ID T19PEP (covers Bonaire);
#iterating over the dates first keeps the result in the order of `dates`
scene_paths = []
for date in dates:
    for file in level2_files:
        if str(date) in file and 'T19PEP' in file:
            scene_paths.append(file)

#map each date to its band image paths (ordered like `bands`)
image_collection = {}
jp2_pattern = "*/**/*.jp2"

for scene in scene_paths:
    #the first 8-digit group followed by 'T' in the path is used as the date key
    date = re.findall(r"(\d{8})T", scene)[0]

    #collect all .jp2 band images in SAFE directory, then keep only the requested bands
    all_images = glob(scene + jp2_pattern, recursive=True)
    image_collection[date] = [img_path for band in bands for img_path in all_images if band in img_path]

#check nr. of images per date
for key,img_paths in image_collection.items():
    print(f'Date: {key} Images: {len(img_paths)}')

In [None]:
#polygons for cropping the stacked images
roi_file = './data/boundaries/objective3/lacbay_roi.geojson'
cm_20190128 = './data/boundaries/objective3/cloudmask_20190128.geojson'
cm_20190212 = './data/boundaries/objective3/cloudmask_20190212.geojson'

#dates that get their own clip polygon; every other date is clipped with the ROI
cloud_masks = {'20190128':cm_20190128,'20190212':cm_20190212}

for date in tqdm(image_collection.keys(),position=0, leave=True):
    band_images = image_collection[date]
    stack_file = os.path.join(parent_dir,'fullstack',f'stack_{date}.tif')
    clip_outfile = os.path.join(parent_dir,'clippedstack',f'stack_{date}_clipped.tif')

    #stack multiband images to a geotiff (the second band image is handed over as second argument)
    stack_bands(band_images,band_images[1],stack_file)

    #pick the clip polygon: first cloud mask whose date occurs in the key, else the ROI
    clip_polygon = roi_file
    for mask_date,mask_file in cloud_masks.items():
        if mask_date in date:
            clip_polygon = mask_file
            break

    #crop multiband image
    clip_raster(stack_file,clip_polygon,clip_outfile,fill=True,nodat=0)

<b>Sample pixel values from multiband images based on training sites</b>  
* Training scenes from 8 and 28 January and 12 February 2019

In [None]:
#get training sites and corresponding images
#forward slashes keep the pattern usable on Windows and POSIX alike (same style as the other ./data paths);
#sorted() makes the site order reproducible, since glob itself returns files in arbitrary order
#NOTE(review): the pairing below assumes the training-site file names sort in the same order as `dates` - confirm
train_sites = sorted(glob("./data/training_input/objective3/*_lac.geojson"))
dates = [20190108,20190128,20190212]

#clipped stacks ordered like `dates`
#named stack_files (not stack_bands) so the stack_bands() function imported from Python.prep_raster is not overwritten
stack_files = [f for date in dates for f in glob(parent_dir+'/clipped*/*_clipped.tif') if str(date) in f]

#every training site needs a raster to sample from
if len(stack_files) < len(train_sites):
    raise ValueError(f'Found {len(train_sites)} training sites but only {len(stack_files)} clipped stacks')

#bands
band_names = ['B02','B03','B04']

#sample the band values of each stack at its training site
dataset = []
for stack_file,train_site in zip(stack_files,train_sites):
    df_sample = pixel_sample(stack_file,train_site,band_names)
    dataset.append(df_sample)

#final dataset
dataset = pd.concat(dataset,sort=False).reset_index(drop=True)
dataset.to_csv(r'./data/training_input/csv/training_samples_20190108_20190212_seagrass.csv',index=False)

<b>Explore spectral signature</b>  
* Jeffries-Matusita distance (JMD) used for feature selection ([reference](https://books.google.nl/books?id=RxHbb3enITYC&pg=PA52&lpg=PA52&dq=for+one+feature+and+two+classes+the+Bhattacharyya+distance+is+given+by&source=bl&ots=sTKLGl1POo&sig=ACfU3U2s7tv0LT9vfSUat98l4L9_dyUgeg&hl=nl&sa=X&ved=2ahUKEwiKgeHYwI7lAhWIIlAKHZfJAC0Q6AEwBnoECAkQAQ#v=onepage&q&f=false))
* The RGB bands (B04, B03, B02) are selected as input features for the classifiers, even though they have the worst JMD scores

In [None]:
#load training sample
df = pd.read_csv(r'./data/training_input/csv/training_samples_20190108_20190212_seagrass.csv')

#compute one JMD table per band (class column 'C') and plot them together as an annotated heatmap
spectral_bands = ['B02','B03','B04']
jmd_bands = []
for band in spectral_bands:
    jmd_bands.append(jmd2df(transpose_df(df,'C',band)))
jmd_table = pd.concat(jmd_bands,sort=True)
sns.heatmap(jmd_table,annot=True)

#ridge plot of the band values, with 'C' passed as the class column
ridgePlot(df[['C',*spectral_bands]],'C')


<b>Build classifiers</b>  

In [None]:
#load training sample
df = pd.read_csv(r'./data/training_input/csv/training_samples_20190108_20190212_seagrass.csv')
subset_df = df[['C','B02','B03','B04']]

#split into train and test datasets 80:20 (stratified on the class column 'C')
train,test = train_test_split(subset_df, train_size = 0.8,random_state=1,shuffle=True,stratify=np.array(subset_df['C']))
train = train.sort_values(by='C',ascending=True) #sort labels

#split predictors from labels (for DTC)
feature_cols = ['B02','B03','B04']
le = LabelEncoder()
X_train,y_train = train[feature_cols],le.fit_transform(train['C'])

#encode the test labels with the mapping fitted on the training labels;
#re-fitting on the test split could silently change the label-to-integer mapping,
#whereas transform() raises if the test split holds a label never seen in training
X_test,y_test = test[feature_cols],le.transform(test['C'])

* Decision Tree Classifier

In [None]:
#perform k-fold (=10) cross-validation: one validation curve per hyperparameter

#parameter ranges considered in this step
max_depth = np.arange(1,40,2)
min_samples_split = list(range(2, 100,10))
max_leaf_nodes= list(range(2, 50,5))
min_samples_leaf= list(range(1, 100,10))
min_impurity_decrease=[0,0.00005,0.0001,0.0002,0.0005,0.001,0.0015,0.002,0.005,0.01,0.02,0.05,0.08]
criterion = ['gini','entropy']

#parameter name -> range; the insertion order decides which subplot each curve lands in
params = {
    'max_depth':max_depth,
    'min_samples_split':min_samples_split,
    'max_leaf_nodes':max_leaf_nodes,
    'min_samples_leaf':min_samples_leaf,
    'min_impurity_decrease':min_impurity_decrease,
    'criterion':criterion,
}

#3x2 grid of axes, flattened so it can be indexed by parameter position
fig,axs = plt.subplots(3,2,figsize=(10,8))
axs = axs.ravel()

#base model whose parameters are varied one at a time
dtc = DecisionTreeClassifier(random_state=1,criterion='entropy')

#plot validation curve
for i,(param_name,param_range) in enumerate(params.items()):
    train_scores,test_scores = validation_curve(dtc,X_train.values,y_train,
                                                param_name=param_name,param_range=param_range,
                                                cv=10,scoring='accuracy',n_jobs=-1)
    validation_curve_plot(train_scores,test_scores,param_range,param_name,axs[i])
plt.show()



In [None]:
#train dtc model based on best parameters
#NOTE(review): values presumably read off the validation curves - confirm
dtc_params = {
    'criterion':'entropy',
    'max_depth':5,
    'min_samples_split':50,
    'max_leaf_nodes':10,
    'min_samples_leaf':30,
    'min_impurity_decrease':0.02,
    'random_state':42,
}
dtc = DecisionTreeClassifier(**dtc_params).fit(X_train,y_train)

#export model as joblib file
dtc_model_file = r".\data\models\dtc_model_seagrass.joblib"
dump(dtc,dtc_model_file)

* Maximum Likelihood Classifier

In [None]:
#train mlc model on the training split, passing 'C' as the label column
mlc = mlClassifier(train,'C')

#export model as joblib file
mlc_model_file = r".\data\models\mlc_model_seagrass.joblib"
dump(mlc,mlc_model_file)

* Compute model accuracies (based on test split)

In [None]:
#load models
dtc = load(r".\data\models\dtc_model_seagrass.joblib")
mlc = load(r".\data\models\mlc_model_seagrass.joblib")

#DTC model accuracy: decode the integer labels back to the original class labels before scoring
dtc_y_pred = dtc.predict(X_test)
observed_labels = le.inverse_transform(y_test)
predicted_labels = le.inverse_transform(dtc_y_pred)
con_mat_dtc = calc_acc(observed_labels,predicted_labels)
con_mat_dtc['classifier'] = 'DTC'

#MLC model accuracies with chi-square threshold (None for the base model)
#NOTE(review): 7.78/5.99/3.36 look like chi-square critical values for 4 degrees of freedom,
#              while only 3 bands are used as features here - confirm the intended values
chi_table = {'MLC base':None,'MLC 10%':7.78,'MLC 20%':5.99,'MLC 50%':3.36}

#one accuracy table per threshold, tagged with the classifier name
mlc_conmats = []
for name,threshold in chi_table.items():
    con_mat_mlc = mlc.classify_testdata(test,'C',threshold=threshold)
    con_mat_mlc['classifier'] = name
    mlc_conmats.append(con_mat_mlc)
mlc_conmats = pd.concat(mlc_conmats)

#export model accuracies
model_acc = pd.concat([con_mat_dtc,mlc_conmats])
model_acc.to_csv('./data/output/objective3/dtc_mlc_model_acc_obj3.csv')

<b>Classification</b>  

In [None]:
#load models
dtc = load(r".\data\models\dtc_model_seagrass.joblib")
mlc = load(r".\data\models\mlc_model_seagrass.joblib")

#output dirs
dtc_dir = os.path.join(parent_dir,'predicted/dtc')
mlc_dir = os.path.join(parent_dir,'predicted/mlc')
os.makedirs(dtc_dir,exist_ok=True)
os.makedirs(mlc_dir,exist_ok=True)

#clipped stacks of the scenes to classify
dates= [20190108,20190304,20190821,20191129]
clipped_files = glob(parent_dir+'/clippedstack/*_clipped.tif')
clipped_files = [path for path in clipped_files for date in dates if str(date) in path]

#chi-square thresholds, one MLC output band each (None for the base model)
#NOTE(review): these look like chi-square critical values for 4 degrees of freedom while 3 bands are used - confirm
chi_probs = [None,7.78,5.99,3.36]

for file in clipped_files:
    #first 8-digit group in the path names the outputs
    date = re.findall(r"(\d{8})", file)[0]
    mlc_out = os.path.join(mlc_dir,f'mlc_{date}.tif')
    dtc_out = os.path.join(dtc_dir,f'dtc_{date}.tif')

    with rio.open(file) as src:
        stack2pred_img = src.read()

        #one MLC layer per threshold, stacked along the first axis
        mlc_layers = []
        for prob in chi_probs:
            mlc_layers.append(mlc.classify_raster_gx(stack2pred_img,threshold=prob))
        mlc_imgs = np.array(mlc_layers)

        #single DTC layer, wrapped in a list so it is written as one band
        dtc_img = np.array([dtc_pred_stack(dtc,stack2pred_img)])

        #output profiles reuse the source profile, adjusted for band count and dtype
        mlc_profile = src.profile.copy()
        mlc_profile.update(nodata=None,dtype=rio.uint16,count=4)

        dtc_profile = src.profile.copy()
        dtc_profile.update(nodata=None,dtype=rio.uint8,count=1)

        #export results
        with rio.open(mlc_out,'w',**mlc_profile) as mlc_dst, rio.open(dtc_out,'w',**dtc_profile) as dtc_dst:
            mlc_dst.write(mlc_imgs.astype(rio.uint16))
            dtc_dst.write(dtc_img.astype(rio.uint8))

<b>External validity</b>  
* Validate the DTC and MLC classification results for the scene taken on 2019-03-04 against independent validation samples
* Seagrass pixel value = 2 in the DTC and MLC rasters 

In [None]:
#get file paths
val_samples = gpd.read_file(r'./data/training_input/objective3/sg_validation_2019.geojson')
dtc_file = glob(parent_dir+'/predicted*/dtc/dtc*20190304*.tif')[0]
mlc_file = glob(parent_dir+'/predicted*/mlc/mlc*20190304*.tif')[0]

#x/y of the first point of every (multi-part) validation geometry
#.geoms[0] is used instead of geometry[i][0]: indexing a multi-part geometry directly
#was deprecated in Shapely 1.8 and removed in 2.0, while .geoms works in both
coords = [(geom.geoms[0].x,geom.geoms[0].y) for geom in val_samples.geometry]

#prediction columns added below: the DTC band followed by the 4 MLC bands
pred_cols = ['DTC','MLC base','MLC 10%','MLC 20%','MLC 50%']

with rio.open(dtc_file) as dtc_src, rio.open(mlc_file) as mlc_src:
    #sample from dtc raster (first and only band)
    val_samples['DTC'] = [pt[0] for pt in dtc_src.sample(coords)]

    #sample from multilayer mlc raster: one row per point, one column per band
    mlc_multi = pd.concat([pd.DataFrame(pt).T for pt in mlc_src.sample(coords)],ignore_index=True)
    val_samples[pred_cols[1:]] = mlc_multi

#convert pixel values to 1 if seagrass (raster value 2), else to 0 for others
val_samples[pred_cols] = (val_samples[pred_cols]==2).astype(int)
val_samples.drop(['site','mean_cover'],axis=1,inplace=True)

#compute classification (validation) accuracy
df_val = pd.DataFrame(val_samples.drop(columns='geometry'))

#one accuracy table per prediction column, scored against the 'label' column
acc_val_dfs = []
for pred in df_val.columns[df_val.columns!='label']:
    acc = calc_acc(df_val['label'].values, df_val[pred].values)
    acc['classifier'] = pred
    acc_val_dfs.append(acc)
acc_val_dfs = pd.concat(acc_val_dfs)
acc_val_dfs.to_csv('./data/output/objective3/dtc_mlc_external_val_obj3.csv')

In [None]:
#load the model (test split) and external validation accuracy tables
model_df = pd.read_csv('./data/output/objective3/dtc_mlc_model_acc_obj3.csv').set_index('Unnamed: 0')
val_df  = pd.read_csv('./data/output/objective3/dtc_mlc_external_val_obj3.csv').set_index('Observed')

#series to plot, one value per classifier
#the accuracy cells are stored as text, so only their first 4 characters are parsed as a number
model_accuracy = model_df.loc['PA','UA'].str[:4].astype(float)
model_f1 = model_df.loc['sg','F1-score'].astype(float)
val_accuracy = val_df.loc['PA','UA'].str[:4].astype(float)
val_f1 = val_df.loc['1','F1-score'].astype(float)

acc2plot = {'Model accuracy (2 classes)':model_accuracy,
            'Model F1-score (Sg)':model_f1,
            'Validation accuracy (2 classes)':val_accuracy,
            'Validation F1-score (Sg)':val_f1}

#one line per metric, classifiers on the x-axis
classifiers = val_df['classifier'].unique()
for key,value in acc2plot.items():
    plt.plot(classifiers,value,label=key)
plt.legend()

<b>Comparative analysis</b>  
* Compare seagrass (Sg) classified area across different scenes for each model

In [None]:
#get classification result paths
#sorted so that dtc_paths[i] and mlc_paths[i] belong to the same date (files are named dtc_<date>.tif / mlc_<date>.tif)
#and so that the exported rows are in chronological order; glob alone returns files in arbitrary order
dtc_paths = sorted(glob(parent_dir+'/predicted/dtc/dtc*.tif'))
mlc_paths = sorted(glob(parent_dir+'/predicted/mlc/mlc*.tif'))

#raster value of the seagrass class, the same value the external validation step tests for (==2)
#NOTE(review): the previous np.unique(...)[1][1] counted the second-smallest value present in the raster,
#              which is only the seagrass class when no smaller value (e.g. 0) occurs - confirm against old outputs
seagrass_value = 2

#one column per MLC band (same order as the bands in the MLC raster) plus the DTC result
mlc_columns = ['Sg MLC Base','Sg MLC 10%','Sg MLC 20%','Sg MLC 50%']
#each key gets its own list (dict.fromkeys(keys, []) would share a single list between all keys)
data = {column: [] for column in ['Date',*mlc_columns,'Sg DTC']}

for dtc_path,mlc_path in zip(dtc_paths,mlc_paths):
    #take the date from the file name only, so digits in the directory path cannot interfere
    date = re.findall(r"(\d{8})", os.path.basename(mlc_path))[0]
    data['Date'].append(str(pd.to_datetime(date).date()))

    with rio.open(dtc_path) as dtc_src, rio.open(mlc_path) as mlc_src:
        #number of seagrass pixels in the DTC raster
        data['Sg DTC'].append(int(np.count_nonzero(dtc_src.read()==seagrass_value)))

        #number of seagrass pixels in each MLC band (raster bands are 1-indexed)
        for band,column in enumerate(mlc_columns,start=1):
            data[column].append(int(np.count_nonzero(mlc_src.read(band)==seagrass_value)))

#export data
data = pd.DataFrame(data)
data.to_csv('./data/output/objective3/classified_area_obj3.csv',index=False)

* Plot seagrass classified area in 2019

In [None]:
#load the classified pixel counts per date (no further subsetting is done here)
data = pd.read_csv('./data/output/objective3/classified_area_obj3.csv',index_col='Date')

#convert pixel counts to hectares: divide by 100
#NOTE(review): this assumes 10 m x 10 m pixels (100 m2 each) - confirm
area_ha = data/100

#plot seagrass classified area in Lac Bay, one line per classifier column
plt.plot(area_ha)
plt.ylabel('Classified area (ha)')
plt.legend(data.columns,loc='upper left')