# Train CoastSat classifier

In this notebook the CoastSat classifier is trained using satellite images from new sites. This can improve the accuracy of the shoreline detection if the users are experiencing issues with the default classifier.

#### Initial settings

In [26]:
# load modules
%load_ext autoreload
%autoreload 2
import os
import numpy as np
import pickle
import warnings
warnings.filterwarnings("ignore")
import matplotlib.pyplot as plt

# sklearn modules
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.externals import joblib
from sklearn.model_selection import cross_val_score

# coastsat modules
from coastsat import SDS_download, SDS_preprocess, SDS_shoreline, SDS_tools, SDS_classify

# filepaths 
filepath_images = os.path.join(os.getcwd(), 'data')
filepath_train = os.path.join(os.getcwd(), 'examples', 'training_data')

# settings
settings ={'cloud_thresh':0.1, # percentage of cloudy pixels accepted on the image
           'cloud_mask_issue':False, # set to True if problems with the default cloud mask 
           'inputs':{'filepath':filepath_images}, # folder where the images are stored
           'labels':{'sand':1,'white-water':2,'water':3,'other land features':4}, # labels for the classifier
           'flood_fill': True, # set to True to use the flood fill functionality
           'tolerance':0.02, # if flood_fill set to True, this is the pixel intensity tolerance 
           'filepath_train':filepath_train} # folder where the labelled images are stored

# read kml files for the training sites
filepath_sites = os.path.join(os.getcwd(), 'examples', 'training_sites')
train_sites = os.listdir(filepath_sites)
print('Sites for training:\n%s\n'%train_sites)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload
Sites for training:
['COLLAROY.kml']



### 1. Download images

For each site on which you want to train the classifier, save a .kml file with the region of interest (5 vertices clockwise, first and last points are the same, can be created from Google myMaps) in the folder *\training_sites*

In [4]:
# download images at the sites
filepath = os.path.join(os.getcwd(), 'data')
# start and end date of the period for which images are requested
dates = ['2017-01-01', '2017-03-01']
# satellite missions to download, given as a list of mission names
# NOTE(review): a bare string such as 'L8' is not a list of missions - confirm
# against the SDS_download.retrieve_images documentation of your CoastSat version
sat_list = ['L8']
for site in train_sites:
    # region of interest stored in the site's kml file
    polygon = SDS_tools.polygon_from_kml(os.path.join(filepath_sites, site))
    # site name = file name up to the first dot
    sitename = site[:site.find('.')]
    inputs = {'polygon':polygon, 'dates':dates, 'sat_list':sat_list,
              'sitename':sitename, 'filepath':filepath}
    print(site)
    metadata = SDS_download.retrieve_images(inputs)

Sites for training:
['COLLAROY.kml']
COLLAROY.kml
Downloading images:
L8: 6 images
100%


### 2. Label images

Label the images into 4 classes: sand, white-water, water and other land features.

The labelled images are saved in the *filepath_train* folder and can be visualised afterwards for quality control. If you make a mistake, don't worry: it can be fixed later by deleting the labelled image.

In [31]:
# label the images with an interactive annotator
%matplotlib qt
for site in train_sites:
    settings['inputs']['sitename'] = site[:site.find('.')] 
    # load metadata
    metadata = SDS_download.get_metadata(settings['inputs'])
    # label images
    SDS_classify.label_images(metadata,settings)

[45 46]
[46 45]
[45 43]
[45 41]
[103   1]
[45 47]
[46 46]


StopIteration: User cancelled labelling images

### 3. Train Classifier

A Multilayer Perceptron is trained with *scikit-learn*. To train the classifier, the training data needs to be loaded.

You can use the data that was labelled here and/or the original CoastSat training data.

In [None]:
# load the labelled data
# number of features stored for each labelled pixel
n_features = 20
# nan-initialised row, kept because other cells may still refer to it
first_row = np.nan*np.ones((1,n_features))
# one list of feature blocks per class; each list starts with an empty float block
# so that a class without any labelled pixel still ends up as a (0, n_features) array
feature_blocks = {'sand':[np.empty((0,n_features))], 'white-water':[np.empty((0,n_features))],
                  'water':[np.empty((0,n_features))], 'other land features':[np.empty((0,n_features))]}
# read image labels from each site (only .kml files are training sites)
train_sites = sorted(f for f in os.listdir(filepath_sites) if f.lower().endswith('.kml'))
for site in train_sites:
    sitename = site[:site.find('.')]
    filepath = os.path.join(filepath_train,sitename)
    # sites that have not been labelled yet have no folder
    if not os.path.exists(filepath):
        continue
    # only keep the .pkl files (match on the extension, not on any substring)
    list_files_pkl = [name for name in os.listdir(filepath) if name.endswith('.pkl')]
    # collect the training data of every labelled image
    for file in list_files_pkl:
        # NOTE: unpickling can execute arbitrary code - only load files you produced yourself
        with open(os.path.join(filepath, file), 'rb') as f:
            training_data = pickle.load(f)
        for key, pixels in training_data['features'].items():
            # skip classes that were not labelled on this image
            if len(pixels) > 0:
                feature_blocks[key].append(pixels)
# stack the blocks once per class instead of re-allocating the array for every image
features_matrix = {key: np.concatenate(blocks, axis=0) for key, blocks in feature_blocks.items()}
print('Number of pixels per class in training data:')
for key in settings['labels'].keys():
    print('%s : %d pixels'%(key,len(features_matrix[key])))

Or load the original CoastSat dataset (from NSW beaches), which is stored in a .pkl file (you can also combine both):

In [None]:
# load the original CoastSat training data and either use it on its own or merge it
# with the data labelled in this notebook
# set to True to append the original dataset to the `features_matrix` loaded above
# (that cell must have been run first); leave False to use the original dataset only
merge_with_labelled = False
# NOTE: unpickling can execute arbitrary code - only load files from a trusted source
with open(os.path.join(os.getcwd(),'training_data', 'CoastSat_original_training_set_L8.pkl'), 'rb') as f:
    features_matrix_original = pickle.load(f)
if merge_with_labelled:
    # append the original pixels to the labelled pixels, class by class
    features_matrix = {key: np.append(features_matrix[key], features_matrix_original[key], axis=0)
                       for key in features_matrix.keys()}
else:
    # shallow copy, so that re-assigning classes of `features_matrix` later on
    # does not modify the loaded dataset
    features_matrix = dict(features_matrix_original)
print('Number of pixels per class in training data:')
for key in features_matrix.keys():
    print('%s : %d pixels'%(key,len(features_matrix[key])))

As the classes do not have the same number of pixels, it is good practice to subsample the very large classes (in this case 'water' and 'other land features'):

In [None]:
# subsample randomly the land and water classes
n_samples = 7000 # as the most important class is 'sand', this value should be close to the number of sand pixels
for key in ['water', 'other land features']:
    n_available = features_matrix[key].shape[0]
    # never request more pixels than the class contains: np.random.choice raises a
    # ValueError when sampling without replacement beyond the population size
    idx_keep = np.random.choice(n_available, min(n_samples, n_available), replace=False)
    features_matrix[key] = features_matrix[key][idx_keep,:]
for key in features_matrix.keys():
    print('%s : %d pixels'%(key,len(features_matrix[key])))

# combine into X matrix of features and y vector with the corresponding labels (for each row of X)
# label_names[i] is encoded with the integer labels[i] ('other land features' is class 0)
label_names = ['sand','white-water','water','other land features']
labels = [1,2,3,0]
# built directly from the per-class arrays, so this cell does not rely on variables
# defined in the cell that loads the labelled data
X = np.concatenate([features_matrix[key] for key in label_names], axis=0).astype(float)
# y is a column vector holding one label per row of X
y = np.concatenate([labels[i]*np.ones((features_matrix[key].shape[0],1))
                    for i,key in enumerate(label_names)], axis=0)
X[np.isnan(X)] = 1e-9 # nan values will break the training algorithms

Divide the dataset into train and test: train on 70% of the data and evaluate on the other 30%:

In [None]:
# hold out 30% of the samples for testing (fixed seed for the split),
# fit the classifier on the remaining 70% and report its accuracy on the held-out part
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.3,
    shuffle=True,
    random_state=0,
)
classifier = MLPClassifier(solver='adam')
classifier.fit(X_train, y_train)
test_accuracy = classifier.score(X_test, y_test)
print(test_accuracy)

A more robust evaluation is 10-fold cross-validation (may take a few minutes to run):

In [None]:
# 10-fold cross-validation on the complete dataset
scores = cross_val_score(classifier, X, y, cv=10)
# report the mean accuracy and twice the standard deviation across the folds
mean_accuracy = scores.mean()
accuracy_spread = scores.std() * 2
print("Accuracy: %0.4f (+/- %0.4f)" % (mean_accuracy, accuracy_spread))

Plot a confusion matrix:

In [None]:
# plot non-normalized confusion matrix
%matplotlib inline
y_pred = classifier.predict(X_test)
SDS_classify.plot_confusion_matrix(y_test, y_pred,
                                   classes=['other land features','sand','white-water','water'],
                                   normalize=False);

When satisfied with the accuracy and confusion matrix, train the model using ALL the training data and save it:

In [None]:
# train the final classifier on ALL the training data and save it to disk
clf_final = MLPClassifier(solver='adam')
clf_final.fit(X,y)
# make sure the output folder exists, otherwise the dump fails on a fresh checkout
dir_classifiers = os.path.join(os.getcwd(), 'classifiers')
os.makedirs(dir_classifiers, exist_ok=True)
joblib.dump(clf_final, os.path.join(dir_classifiers, 'NN_4classes_L8_test.pkl'))

### 4. Visually evaluate the classifier

In [None]:
# load the classifier to be tested
%matplotlib qt
classifier = joblib.load(os.path.join(os.getcwd(), 'classifiers', 'NN_4classes_L8_test.pkl'))
# visualise the classified images
%matplotlib qt
for site in train_sites:
    settings['inputs']['sitename'] = site[:site.find('.')] 
    # load metadata
    metadata = SDS_download.get_metadata(settings['inputs'])
    # plot the classified images
    SDS_classify.check_classifier(clf_final,metadata,settings)