# Comparing CNN and RF

Rather than training the RF on 30k sample points, this time I'm going to train it on the entire image.

In [8]:
import os
import pickle

import ee
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import rasterio
import sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

%matplotlib inline
ee.Initialize()

In [21]:
# Parameters
# Root data directory. Defaults to the original local path; set the CPR_DATA_DIR
# environment variable to run the notebook against data stored somewhere else.
DATA_DIR = os.environ.get('CPR_DATA_DIR', 'C:/Users/ipdavies/CPR/data').rstrip('/\\')

# Directories holding the exported per-feature tifs of the clear and cloudy images
path_clear = DATA_DIR + '/images/clear_4337_LC08_026038_20160325'
path_clouds = DATA_DIR + '/images/clouds_4337_LC08_026038_20160325'

# Directory for saved models. The trailing slash is required because model file
# names are appended to this string by plain concatenation.
model_path = DATA_DIR + '/models/'

# Misc functions
import time
def timer(start,end):
    """Format the elapsed time between two timestamps as 'HH:MM:SS.ss'.

    Parameters
    ----------
    start, end : float
        Timestamps in seconds (e.g. values returned by time.time()).

    Returns
    -------
    str
        Zero-padded hours and minutes, and seconds with two decimals.
    """
    elapsed = end - start
    whole_hours, leftover = divmod(elapsed, 3600)
    whole_minutes, secs = divmod(leftover, 60)
    return f"{int(whole_hours):0>2}:{int(whole_minutes):0>2}:{secs:05.2f}"

In [None]:
# Import CNN?

## Pre-process image

In [9]:
# Stack exported tifs from GEE into one multiband tif

def listdir_fullpath(d):
    """Return every entry of directory `d`, each joined onto `d` as a path.

    The entries come back in whatever order os.listdir() yields them.
    """
    full_paths = []
    for entry in os.listdir(d):
        full_paths.append(os.path.join(d, entry))
    return full_paths

# Keep only the GeoTIFFs found in the clear-image directory
file_list = [path for path in listdir_fullpath(path_clear) if path.endswith('.tif')]

# Feature name of each file, in on-disk order: the second-to-last
# dot-separated token of the path
feat_list_files = [path.split('.')[-2] for path in file_list]


#=========== Re-order the files so that the target feature comes last

# Feature order used for the stack, with the target feature (flooded) at the end
feat_list_new = ['aspect','curve', 'developed', 'distExtent', 'elevation', 'forest',
 'GSW_maxExtent', 'hand', 'other_landcover', 'planted', 'slope', 'spi', 'twi', 'wetlands', 'flooded']

# One-row frame of file paths, one column per feature, in on-disk order
file_arr = pd.DataFrame(data=[file_list], columns=feat_list_files)

# Select the columns in the desired feature order
file_arr = file_arr.loc[:, feat_list_new]

# The re-ordered row, taken back out as a plain list, is the new file_list
file_list = file_arr.iloc[0, :].tolist()
    
# Use the second file in the ordered list (index 1) as the metadata template for
# the whole stack. Its dtype is overridden to float32 because every band is cast
# to float32 when it is written below.
with rasterio.open(file_list[1]) as src0:
    meta = src0.meta
    meta['dtype'] = 'float32'

# Update meta to reflect the number of layers
meta.update(count=len(file_list))

# Read each layer, convert to float32, and write it to the stack
# There's also a gdal way to do this, but unsure how to convert to float: https://gis.stackexchange.com/questions/223910/using-rasterio-or-gdal-to-stack-multiple-bands-without-using-subprocess-commands

stack_dir = path_clear + '/stack'
stack_path = stack_dir + '/stack.tif'

# Make the output directory if needed. Unlike a blanket `except OSError: pass`,
# this still surfaces real failures such as permission errors.
os.makedirs(stack_dir, exist_ok=True)

# Remove a stack left over from a previous run
if os.path.exists(stack_path):
    os.remove(stack_path)

with rasterio.open(stack_path, 'w', **meta) as dst:
    # Band indices in the output are 1-based
    for band_idx, layer in enumerate(file_list, start=1):
        with rasterio.open(layer) as src1:
            dst.write_band(band_idx, src1.read(1).astype('float32'))

  transform = guard_transform(transform)


In [10]:
# Preprocess the array

# Read the stacked image and move the band axis (axis 0 of the read result)
# to the end, so that data[:, :, k] is band k
with rasterio.open(path_clear + '/stack/stack.tif', 'r') as ds:
    data = ds.read()
    data = data.transpose((1, 2, 0))

# Need to remove NaNs because any arithmetic operation involving an NaN will result in NaN

# Convert the -999999 sentinel to NaN
data[data == -999999] = np.nan

# Indices (into the original image array) of pixels whose band 1 is not NaN.
# Computed once and reused; the original ran the same full-image np.where twice.
valid_mask = ~np.isnan(data[:,:,1])
data_ind = np.where(valid_mask)

# row / col are kept as 1-tuples wrapping the index arrays, matching what
# `zip(np.where(...))` produced, so any code that unpacks them keeps working
row, col = (data_ind[0],), (data_ind[1],)

# Number of valid pixels
len(row[0])

14000674

In [22]:
HOLDOUT_FRACTION = 0.1
RANDOM_SEED = 0  # fixes the shuffle so the train/validation split is reproducible

# Reshape into a single vector of pixels.
data_vector = data.reshape([data.shape[0] * data.shape[1], data.shape[2]])

# Remove every pixel that has a NaN in any band
data_vector = data_vector[~np.isnan(data_vector).any(axis=1)]

# Shuffle the rows in place, using a local generator so the global NumPy
# random state is left untouched
np.random.RandomState(RANDOM_SEED).shuffle(data_vector)

# Hold out a fraction of the labeled data for validation.
# The validation slice runs to the end of the array; slicing to -1 would
# silently drop the last sample.
training_size = int(data_vector.shape[0] * (1 - HOLDOUT_FRACTION))
training_data = data_vector[0:training_size,:]
validation_data = data_vector[training_size:,:]

# Compute per-band means and standard deviations of the input bands (columns 0-13).
# This must come after the split, because the statistics are taken from training_data.
# Accumulate in float64 to avoid float32 round-off over millions of rows, then
# cast back so the normalized data keeps its original dtype.
data_mean = training_data[:,0:14].mean(0, dtype=np.float64).astype(data_vector.dtype)
data_std = training_data[:,0:14].std(0, dtype=np.float64).astype(data_vector.dtype)

# A constant band has zero standard deviation; dividing by it would give NaN/inf,
# so such bands are only centred, not scaled
data_std[data_std == 0] = 1

## Random forest model

In [None]:
model_name = 'RF2'

# Normalize the 14 feature bands (columns 0-13) with the training-set statistics.
# NOTE: this rescales the arrays in place, so re-running this cell without first
# re-running the cell that builds training_data / validation_data normalizes twice.
training_data[:,0:14] = (training_data[:,0:14] - data_mean) / data_std
validation_data[:,0:14] = (validation_data[:,0:14] - data_mean) / data_std

# Split into features (columns 0-13) and target (column 14)
X_train = training_data[:,0:14]
y_train = training_data[:,14]
X_test = validation_data[:,0:14]
y_test = validation_data[:,14]

# Hyperparameters optimized for the 30k sample (using a random search)
best_n_estimators = 800
best_min_samples_split = 10
best_min_samples_leaf = 2
best_max_features = 'sqrt'
best_max_depth = 20
best_bootstrap = False

# Instantiate model with best parameters from random search
clf = RandomForestClassifier(n_estimators = best_n_estimators,
                             max_depth = best_max_depth,
                             max_features = best_max_features,
                             min_samples_split = best_min_samples_split,
                             min_samples_leaf = best_min_samples_leaf,
                             bootstrap = best_bootstrap,
                             random_state=0)

start_time = time.time()

# Fit the RF model to data
clf.fit(X_train, y_train)

print('RF training time: ' + timer(start_time, time.time()))

# Save model. The context manager closes the file handle even if pickling fails.
with open(model_path+model_name+'.sav', 'wb') as model_file:
    pickle.dump(clf, model_file)