# Comparing CNN and RF

Rather than training RF on 30k sample points, this time I'm going to train it on the entire image

In [21]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
import pickle
import ee
import os
import rasterio
ee.Initialize()

In [22]:
# Parameters
path_clear = 'C:/Users/ipdavies/CPR/data/images/clear_4337_LC08_026038_20160325'
path_clouds = 'C:/Users/ipdavies/CPR/data/images/clouds_4337_LC08_026038_20160325'
model_path = 'C:/Users/ipdavies/CPR/data/models/'

# Misc functions
import time
def timer(start,end):
    hours, rem = divmod(end-start, 3600)
    minutes, seconds = divmod(rem, 60)
    return str("{:0>2}:{:0>2}:{:05.2f}".format(int(hours),int(minutes),seconds))

## Random Forest with oversampling of flooded pixels
### Load images, stack and convert to np.array

In [23]:
# Stack exported tifs from GEE into one multiband tif

def listdir_fullpath(d):
    return [os.path.join(d, f) for f in os.listdir(d)]

file_list = []
for file in listdir_fullpath(path_clear):
    if file.endswith('.tif'):
        file_list.append(file)

feat_list_files = list(map(lambda x: x.split('.')[-2], file_list)) # list of features in file order


#=========== Want to rearrange the order of files so that target feature is last

# Create list of features with target feature (flooded) last
feat_list_new = ['aspect','curve', 'developed', 'distExtent', 'elevation', 'forest',
 'GSW_maxExtent', 'hand', 'other_landcover', 'planted', 'slope', 'spi', 'twi', 'wetlands', 'flooded']

# Create 1 row df of file names where each col is a feature name, in the order files are stored locally
file_arr = pd.DataFrame(data=[file_list], columns=feat_list_files)

# Then index the file list by the ordered list of feature names used in training
file_arr = file_arr.loc[:, feat_list_new]

# The take this re-ordered row as a list - the new file_list
file_list = list(file_arr.iloc[0,:])
    
# Read metadata of first file. This needs to be a band in float32 dtype, because it sets the metadata for the entire stack
# and we are converting the other bands to float64
with rasterio.open(file_list[1]) as src0:
    meta = src0.meta
    meta['dtype'] = 'float32'
#         print(meta)

# Update meta to reflect the number of layers
meta.update(count = len(file_list))

# Read each layer, convert to float, and write it to stack
# There's also a gdal way to do this, but unsure how to convert to float: https://gis.stackexchange.com/questions/223910/using-rasterio-or-gdal-to-stack-multiple-bands-without-using-subprocess-commands

# Make new directory for stacked tif if it doesn't already exist
try:
    os.mkdir(path_clear+'/stack')
except OSError:
    pass

# Remove stack file if already exists
try:
    os.remove(path_clear + '/stack/stack.tif')
except OSError:
    pass

with rasterio.open(path_clear + '/stack/stack.tif', 'w', **meta) as dst:
    for id, layer in enumerate(file_list, start=0):
        with rasterio.open(layer) as src1:
            dst.write_band(id+1, src1.read(1).astype('float32'))

  transform = guard_transform(transform)


In [24]:
# Preprocess the array

# Get local image
with rasterio.open(path_clear + '/stack/stack.tif', 'r') as ds:
    data = ds.read()
    data = data.transpose((1, -1, 0)) # Not sure why the rasterio.read output is originally (D, W, H)

# Need to remove NaNs because any arithmetic operation involving an NaN will result in NaN

# Convert -999999 to None
data[data == -999999] = np.nan

# Get indices of non-nan values. These are the indices of the original image array
# data_ind = np.where(data[:,:,1] != None)
data_ind = np.where(~np.isnan(data[:,:,1]))
row, col = zip(np.where(~np.isnan(data[:,:,1]))) # image row and col of values
len(*row)

14000674

### Shuffle, split into train/test, normalize

In [25]:
# # Can't train on the entire image, so let's take a random sample

# HOLDOUT_FRACTION = 0.1

# # Reshape into a single vector of pixels.
# data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])

# # Remove NaNs
# data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
# data_vector.shape

# # Shuffle data
# np.random.shuffle(data_vector)

# # Compute per-band means and standard deviations of the input bands.
# data_mean = training_data[:,0:14].mean(0)
# data_std = training_data[:,0:14].std(0)

# # Hold out a fraction of the labeled data for validation.
# training_size = int(data_vector.shape[0] * (1 - HOLDOUT_FRACTION))
# training_data = data_vector[0:training_size,:]
# validation_data = data_vector[training_size:-1,:]

In [26]:
# # Let's create another sample, this time stratified flood/non-flood. We oversample flooded pixels
# from imblearn.over_sampling import SMOTE

# # Reshape into a single vector of pixels.
# data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])

# # Remove NaNs
# data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
# data_vector.shape

# # Compute per-band means and standard deviations of the input bands.
# data_mean = data_vector[:,0:14].mean(0)
# data_std = data_vector[:,0:14].std(0)

# # Normalize data
# X = (data_vector[:,0:14] - data_mean) / data_std
# y = data_vector[:,14]

# X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.1, random_state=12)
# sm = SMOTE(random_state=12, ratio = 1.0)
# X_train_res, y_train_res = sm.fit_sample(X_train, y_train)

In [27]:
# sm.fit_sample is taking way too long to run, so let's try just running it on a sample.
# This time with ALL flooded pixels and a subset of non-flooded pixels

# Reshape into a single vector of pixels.
data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])

# Remove NaNs
data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
data_vector.shape

(14000674, 15)

In [28]:
sample_per = 0.1 # Size of sample as a %

training_size = int(data_vector.shape[0] * (sample_per)) 
training_data = data_vector[0:training_size,:]
validation_data = data_vector[training_size:-1,:]

flooded = data_vector[data_vector[:,14]==1] # Select only flooded pixels
np.random.shuffle(flooded) # Shuffle
sample = data_vector[data_vector[:,14]==0] # Select only non-flooded pixels
np.random.shuffle(sample) # Shuffle

sample_size = int(sample.shape[0] * (sample_per)) # Get sample size in number of rows
sample = sample[0:sample_size,:] # Sample non-flooded pixels
training_data = np.concatenate((flooded, sample), axis=0) # Combine flooded and non-flooded
training_data.shape

# Compute per-band means and standard deviations of the input bands.
data_mean = training_data[:,0:14].mean(0)
data_std = training_data[:,0:14].std(0)

# Normalize data
X = (training_data[:,0:14] - data_mean) / data_std
y = training_data[:,14]

X_train_res, X_val_res, y_train_res, y_val_res = train_test_split(X, y, test_size = 0.2, random_state=12)

In [29]:
# No oversampling

sample_per = 0.1 # Size of sample as a %

sample = data_vector

np.random.shuffle(sample) # Shuffle dataset

# Sample a percentage of pixels
sample_size = int(sample.shape[0] * sample_per)
sample = sample[0:sample_size,:]

# Compute per-band means and standard deviations of the input bands.
data_mean = sample[:,0:14].mean(0)
data_std = sample[:,0:14].std(0)

# Normalize data
X = (data_vector[:,0:14] - data_mean) / data_std
y = data_vector[:,14]

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size = 0.2, random_state=12)

In [30]:
from sklearn.ensemble import RandomForestClassifier

model_name = 'RF_oversamp'

# Hyperparameters optimized for the 30k sample (using a random search)
# best_n_estimators = 800
best_n_estimators = 15
best_min_samples_split = 10
best_min_samples_leaf = 2
best_max_features = 'sqrt'
best_max_depth = 20
best_bootstrap = False

# Instantiate model with best parameters from random search
clf = RandomForestClassifier(n_estimators = best_n_estimators, 
#                              max_depth = best_max_depth,
#                              max_features = best_max_features,
#                              min_samples_split = best_min_samples_split,
#                              min_samples_leaf = best_min_samples_leaf,
                             bootstrap = best_bootstrap,
                             random_state=0)



start_time = time.time()

# Fit the RF model to data
clf.fit(X_train_res, y_train_res)

print('RF training time: ' + timer(start_time, time.time()))

# Save model
pickle.dump(clf, open(model_path+model_name+'.sav', 'wb'))

RF training time: 00:02:58.21


In [None]:
# No oversampling RF

model_name = 'RF_NoOversamp'

# Hyperparameters optimized for the 30k sample (using a random search)
# best_n_estimators = 800
best_n_estimators = 15
best_min_samples_split = 10
best_min_samples_leaf = 2
best_max_features = 'sqrt'
best_max_depth = 20
best_bootstrap = False

# Instantiate model with best parameters from random search
clf = RandomForestClassifier(n_estimators = best_n_estimators, 
#                              max_depth = best_max_depth,
#                              max_features = best_max_features,
#                              min_samples_split = best_min_samples_split,
#                              min_samples_leaf = best_min_samples_leaf,
                             bootstrap = best_bootstrap,
                             random_state=0)



start_time = time.time()

# Fit the RF model to data
clf.fit(X_train, y_train)

print('RF training time: ' + timer(start_time, time.time()))

# Save model
pickle.dump(clf, open(model_path+model_name+'.sav', 'wb'))

### Performance Metrics of Random Forest models on train/test

In [48]:
# Import models
clf_rf_oversamp = pickle.load(open(model_path+'RF_oversamp'+'.sav', 'rb'))
clf_rf_NoOversamp = pickle.load(open(model_path+'RF_NoOversamp'+'.sav', 'rb'))
clf_rf = pickle.load(open(model_path+'RF1'+'.sav', 'rb'))


models = {'RF_oversamp':clf_rf_oversamp,
          'RF_NoOversamp':clf_rf_NoOversamp,
         'RF':clf_rf}

# Performance metrics
from sklearn import metrics

# Confusion matrix
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()

for name, model in models.items():
    y_pred = model.predict(X_val)
    print(name)
    print("Classification report for classifier %s:\n%s\n"
      % (model, metrics.classification_report(y_val, y_pred)))
    print_cm(metrics.confusion_matrix(y_val, y_pred)/len(y_pred)*100, labels=['flooded','notFlooded'])
# Top row = predicted, left col = true



RF_oversamp
Classification report for classifier RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False):
              precision    recall  f1-score   support

         0.0       1.00      0.86      0.92   2688332
         1.0       0.22      0.94      0.36    111803

   micro avg       0.87      0.87      0.87   2800135
   macro avg       0.61      0.90      0.64   2800135
weighted avg       0.97      0.87      0.90   2800135


                  flooded notFlooded 
       flooded       82.8       13.2 
    notFlooded        0.2        3.8 
RF_NoOversamp
Classification report for classifier RandomForestClassifier(bootstrap=False, class_w

### Performance on gap filling

In [7]:
# Stack exported tifs from GEE into one multiband tif
import rasterio
import os

path = '../data/images/clouds_4337_LC08_026038_20160325'

def listdir_fullpath(d):
    return [os.path.join(d, f) for f in os.listdir(d)]

file_list = []
for file in listdir_fullpath(path):
    if file.endswith('.tif'):
        file_list.append(file)

feat_list_files = list(map(lambda x: x.split('.')[-2], file_list)) # list of features in file order

#=========== Want to rearrange the order of files so that target feature is last

# Create list of features with target feature (flooded) last
feat_list_new = ['aspect','curve', 'developed', 'distExtent', 'elevation', 'forest',
 'GSW_maxExtent', 'hand', 'other_landcover', 'planted', 'slope', 'spi', 'twi', 'wetlands', 'flooded']

# Create 1 row df of file names where each col is a feature name, in the order files are stored locally
file_arr = pd.DataFrame(data=[file_list], columns=feat_list_files)

# Then index the file list by the ordered list of feature names used in training
file_arr = file_arr.loc[:, feat_list_new]

# The take this re-ordered row as a list - the new file_list
file_list = list(file_arr.iloc[0,:])
    
# Read metadata of first file. This needs to be a band in float32 dtype, because it sets the metadata for the entire stack
# and we are converting the other bands to float64
with rasterio.open(file_list[1]) as src0:
    meta = src0.meta
    meta['dtype'] = 'float32'
#         print(meta)

# Update meta to reflect the number of layers
meta.update(count = len(file_list))

# Read each layer, convert to float, and write it to stack
# There's also a gdal way to do this, but unsure how to convert to float: https://gis.stackexchange.com/questions/223910/using-rasterio-or-gdal-to-stack-multiple-bands-without-using-subprocess-commands

# Make new directory for stacked tif if it doesn't already exist
try:
    os.mkdir(path+'/stack')
except OSError:
    pass

# Remove stack file if already exists
try:
    os.remove(path + '/stack/stack.tif')
except OSError:
    pass

with rasterio.open(path + '/stack/stack.tif', 'w', **meta) as dst:
    for id, layer in enumerate(file_list, start=0):
        with rasterio.open(layer) as src1:
            dst.write_band(id+1, src1.read(1).astype('float32'))


  transform = guard_transform(transform)


In [19]:
# Get local image
with rasterio.open(path + '/stack/stack.tif', 'r') as ds:
    data = ds.read()
    data = data.transpose((1, -1, 0)) # Not sure why the rasterio.read output is originally (D, W, H)

# Need to remove NaNs because any arithmetic operation involving an NaN will result in NaN

# Convert -999999 to None
data[data == -999999] = np.nan

# Get indices of non-nan values. These are the indices of the original image array
# data_ind = np.where(data[:,:,1] != None)
data_ind = np.where(~np.isnan(data[:,:,1]))
row, col = zip(np.where(~np.isnan(data[:,:,1]))) # image row and col of values
len(*row)

# Reshape into an array of pixels.
data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])

# Remove NaNs
data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]
data_vector.shape

X = data_vector[:,0:14]
y = data_vector[:,14]

In [20]:
# Predict using models

# Import models
clf_rf_oversamp = pickle.load(open(model_path+'RF_oversamp'+'.sav', 'rb'))
clf_rf_NoOversamp = pickle.load(open(model_path+'RF_NoOversamp'+'.sav', 'rb'))
clf_rf = pickle.load(open(model_path+'RF1'+'.sav', 'rb'))


models = {'RF_oversamp':clf_rf_oversamp,
          'RF_NoOversamp':clf_rf_NoOversamp,
         'RF':clf_rf}

# Performance metrics
from sklearn import metrics

# Confusion matrix
def print_cm(cm, labels, hide_zeroes=False, hide_diagonal=False, hide_threshold=None):
    """pretty print for confusion matrixes"""
    columnwidth = max([len(x) for x in labels] + [5])  # 5 is value length
    empty_cell = " " * columnwidth
    # Print header
    print("    " + empty_cell, end=" ")
    for label in labels:
        print("%{0}s".format(columnwidth) % label, end=" ")
    print()
    # Print rows
    for i, label1 in enumerate(labels):
        print("    %{0}s".format(columnwidth) % label1, end=" ")
        for j in range(len(labels)):
            cell = "%{0}.1f".format(columnwidth) % cm[i, j]
            if hide_zeroes:
                cell = cell if float(cm[i, j]) != 0 else empty_cell
            if hide_diagonal:
                cell = cell if i != j else empty_cell
            if hide_threshold:
                cell = cell if cm[i, j] > hide_threshold else empty_cell
            print(cell, end=" ")
        print()
        
for name, model in models.items():
    y_pred = model.predict(X)
    print(name)
    print("Classification report for classifier %s:\n%s\n"
      % (model, metrics.classification_report(y, y_pred)))
    print_cm(metrics.confusion_matrix(y, y_pred)/len(y_pred)*100, labels=['flooded','notFlooded'])
# Top row = predicted, left col = true



RF_oversamp
Classification report for classifier RandomForestClassifier(bootstrap=False, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=15, n_jobs=None,
            oob_score=False, random_state=0, verbose=0, warm_start=False):
              precision    recall  f1-score   support

         0.0       0.89      1.00      0.94    970486
         1.0       0.69      0.00      0.00    124961

   micro avg       0.89      0.89      0.89   1095447
   macro avg       0.79      0.50      0.47   1095447
weighted avg       0.86      0.89      0.83   1095447


                  flooded notFlooded 
       flooded       88.6        0.0 
    notFlooded       11.4        0.0 
RF_NoOversamp
Classification report for classifier RandomForestClassifier(bootstrap=False, class_w

In [None]:
# Plot results
# Reshape predicted values back into image band
with rasterio.open(path + '/stack/stack.tif', 'r') as ds:
        shape = ds.read(2).shape # Shape of full original image
        arr_empty = np.zeros(shape) # Create empty array with this shape

output_image = arr_empty
rows, cols = zip(data_ind)
output_image[rows, cols] = output_data

plt.figure(figsize=(20,30))
columns = 2
images = [output_image, data[:,:,14]]
titles = ['Predicted Flooding','Actual Flooding']
for i, image in enumerate(images):
    plt.subplot(len(images) / columns + 1, columns, i + 1)
    plt.title(titles[i], fontdict = {'fontsize' : 18})
    plt.imshow(image)
    plt.colorbar