In [None]:
import os
from google.colab import drive

##Mount Google Drive and make the project folder the working directory
##(the data directory used later is built from os.getcwd(), so it resolves inside this folder)
drive.mount('/content/drive')
os.chdir('/content/drive/MyDrive/Colab Notebooks/Dissertation/Space Intelligence/')
##List the folder contents to confirm the working directory is the expected one
!ls

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
 checkpoints
'Copy of cyclegan.ipynb'
'Copy of pix2pix.ipynb'
 Data
 Exploratory_Data_Analysis_and_Linear_Models.ipynb
'Filling the GAPS S-CycleGAN_Attempt.ipynb'
 JRH_Filling_the_GAPS_Build_Training_Sets.ipynb
 JRH_Filling_the_Gaps_Image_Generation2.ipynb
 JRH_Filling_the_Gaps_Image_Generation.ipynb
 JRH_Filling_the_Gaps_Model_Building.ipynb
 logs
 output_images
 pix2pix_cGAN
 S-CycleGAN_Attempt.ipynb


In [None]:
#Installations
%%capture
!pip install tensorflow-io-nightly[tensorflow-gpu]
!apt install gdal-bin python-gdal python3-gdal ##base software for rasterio
!pip install rasterio ##raster image handling


In [None]:
import tensorflow as tf

import os
import pathlib
import time
import datetime
import re
import rasterio as rio
import numpy as np
import pandas as pd
import seaborn as sns

from matplotlib import pyplot as plt
from IPython import display
from natsort import natsorted, index_natsorted  ##natural sorting of strings
from rasterio.plot import show
from os.path import isfile, join

We process only ONE image at a time because of computational and storage constraints.

In [None]:
##specify which image to choose (out of two generated)
choose_ind = 0

##Load Data

##creating directory strings
data_dir = os.getcwd() + "/Data"
sen1_dir = data_dir + "/s1_filled" ##sen1 data for training
sen2_dir = data_dir + "/s2_reproject" ##reading in the sentinel_2 images for testing
numpy_masks_dir = data_dir + "/numpy_cloud_masks" 

##List image names for sentinel 1 and 2 (NOT full path) that we select
sen1_images = [f for f in os.listdir(sen1_dir) if isfile(join(sen1_dir, f))]
sen2_images = [f for f in os.listdir(sen2_dir) if isfile(join(sen2_dir, f))]


##Pick the cloud mask directory that belongs to the chosen image (choose_ind)
cloud_mask_dirs = [join(numpy_masks_dir,f) for f in natsorted(os.listdir(numpy_masks_dir))][choose_ind]
##List the mask file names in that directory in natural order.
##os.listdir returns names in arbitrary order, so without sorting the "cloud k" numbering
##used further down could change between runs or file systems
cloud_mask_f = natsorted([f for f in os.listdir(cloud_mask_dirs) if isfile(join(cloud_mask_dirs, f))])

##order file names according to ao1_{number} and pick the chosen image
sen1_number = [re.search(r'(?<=_)[0-9]+', i).group() for i in sen1_images] ##gets ao1 number for sen1
sen1_images = [sen1_images[i] for i in index_natsorted(sen1_number)][choose_ind] ##sorts according to sorted sen1_number and chooses selected (a single file name from here on)

sen2_number = [re.search(r'(?<=_)[0-9]+', i).group() for i in sen2_images] ##gets ao1 number for sen2
sen2_images = [sen2_images[i] for i in index_natsorted(sen2_number)][choose_ind] ##sorts according to sorted sen2_number and chooses selected (a single file name from here on)

##open raster objects (one dataset each, not a list, because only the chosen image is opened)
sen1_datasets = rio.open(join(sen1_dir,sen1_images)) ##open the chosen sen1 raster
sen2_datasets = rio.open(join(sen2_dir,sen2_images)) ##open the chosen sen2 raster

##Read in numpy cloud masks (array to tell us whether a pixel is part of a cloud or not)
##(0 if no cloud on the pixel and 1 if cloud is on the pixel)
##NOTE: allow_pickle=True can execute arbitrary code when loading a malicious file, so only load masks produced by this project
cloud_mask_arrays = [np.load(join(cloud_mask_dirs,f), allow_pickle=True) for f in cloud_mask_f]

Define helper functions to convert raster objects to tensors

In [None]:
##write function to convert raster objects to arrays and arrange so that bands are the third dimension
def raster_to_array(raster, band_order=None):
  """
  Read a raster object into a numpy array whose last dimension is the band/channel.

  The raster is read through its .read() method, which returns the bands as the
  first dimension; the result is then re-ordered so that the bands come last.

  Input:
  raster: raster object exposing a .read() method
  band_order: optional list of bands to read, in the order they should appear.
              When omitted, every band is read in the raster's own order.

  Output:
  array: numpy array with dimensions: (height (x), width (y), channel)

  Raises:
  ValueError: if band_order is supplied but is not a list
  """

  ##Reject anything that is not a list before touching the raster
  if band_order is not None and not isinstance(band_order, list):
    raise ValueError("band_order must be a list of numbers")

  ##Read either the full raster or only the requested bands (in the requested order)
  bands_first = raster.read() if band_order is None else raster.read(band_order)

  ##(channel, height, width) -> (height, width, channel)
  return np.transpose(bands_first, axes=(1, 2, 0))


In [None]:
##turn both rasters into arrays with the bands as the last dimension
##sen1 is read with all of its bands; sen2 is read with the four bands listed below, in that order
sen2_band_order = [3,2,1,4] ##3 is Red, 2 is Green, 1 is Blue, 4 is NIR
sen1_arrays = raster_to_array(sen1_datasets)
sen2_arrays = raster_to_array(sen2_datasets, band_order = sen2_band_order) ##Places bands in order R,G,B,NIR

In [None]:
##get max values for normalization
##NOTE(review): the slice [:,:,:2] takes the max over the FIRST TWO sen1 channels only, so it is the
##third channel that is left out. The previous note on this line said VV/VH are not included -
##confirm which channels hold VV/VH and which channels are meant to be normalized
sen1_max_vals = np.max(sen1_arrays[:,:,:2])
sen2_max_vals = np.max(sen2_arrays) ##max over every sen2 channel that was read

print(f'Max Values for Sentinel-1 Arrays: {sen1_max_vals}')
print(f'Max Values for Sentinel-2 Arrays: {sen2_max_vals}')

Now we need to split images into 256x256 tiles and select training and test sets. We will take all tiles without a cloud as a training set and all tiles with a cloud as the test set.

In [None]:
##Create raster tiles with sliding window
##Code adapted from:
##https://towardsdatascience.com/efficiently-splitting-an-image-into-tiles-in-python-using-numpy-d1bf0dd7b6f7

##https://stackoverflow.com/questions/9979800/efficiently-reshaping-reordering-numpy-array-to-properly-ordered-tiles-image

##Define function that uses np.reshape to create tiles
def reshape_tiling(image: np.ndarray, tile_edge_size: int):
  """
  This function takes an image and splits it into non-overlapping tiles of size
  tile_edge_size x tile_edge_size using a reshape (no pixel data is dropped or padded).

  Input:
  image: a numpy array with image pixel intensities and dimensions (height, width, channels)
  tile_edge_size: Integer for width/height of tile (in pixels)

  Output:
  tiled_array: array with dimensions
               (number of tile rows, number of tile columns, tile_edge_size, tile_edge_size, channels)

  Raises:
  ValueError: if tile_edge_size is not positive, if image is not 3-dimensional, or if the image
              height/width is not an exact multiple of tile_edge_size
  """
  ##A zero or negative tile size cannot tile anything (and would otherwise divide by zero below)
  if tile_edge_size <= 0:
    raise ValueError(f"tile_edge_size must be a positive integer, got {tile_edge_size}")

  ##Extract image dimensions (unpacking fails with a ValueError if the image is not 3-dimensional)
  img_height, img_width, channels = image.shape

  ##The reshape below only works when the tiles cover the image exactly, so check this up front
  ##and fail with a message that names the real problem instead of a generic reshape error
  if img_height % tile_edge_size != 0 or img_width % tile_edge_size != 0:
    raise ValueError(
        f"image height and width ({img_height}, {img_width}) must both be exact "
        f"multiples of tile_edge_size ({tile_edge_size})")

  ##Create tiled array
  tile_height = tile_edge_size
  tile_width = tile_edge_size
  tiled_array = image.reshape(img_height // tile_height, ##how many tiles for the height
                              tile_height, ##size of tile height
                              img_width // tile_width, ##how many tiles for width
                              tile_width, ##size of tile width
                              channels) ##number of channels
  ##swap the axes to get the order (tile row, tile col, tile_height, tile_width, channels)
  tiled_array = tiled_array.swapaxes(1, 2)
  return tiled_array



In [None]:
##Create a tiled array for the chosen image (both sentinel-1 and sentinel-2)
##tile_edge_size is the tile width/height in pixels; reshape_tiling requires the image
##height and width to be multiples of it
tile_edge_size = 256

##Sentinel-1 tile generation
sen1_tiled = reshape_tiling(np.array(sen1_arrays), tile_edge_size) ##get the tiled array
##Sentinel-2 tile generation
sen2_tiled = reshape_tiling(np.array(sen2_arrays), tile_edge_size) ##get the tiled array

In [None]:
#Show untiled image and tiled image for one sen2 image (to show clouds)



print(f"Full Image {choose_ind + 1}")
plt.imshow(sen2_arrays[:,:,0], cmap = "pink")
plt.title("Sentinel-2: Red Channel")
plt.show()

print(f"Tiled Image {choose_ind + 1}")
##Loop over both tile dimensions (number of height tiles, number of width tiles) to plot all
##squeeze=False keeps ax two-dimensional even when there is only one row or one column of tiles,
##so the ax[i,j] indexing below always works
figure, ax = plt.subplots(nrows = sen2_tiled.shape[0], ncols = sen2_tiled.shape[1], figsize = (20,20), squeeze = False)
for i in range(0, sen2_tiled.shape[0]):
  for j in range(0, sen2_tiled.shape[1]):

    ax[i,j].imshow(sen2_tiled[i,j,:,:,0], cmap = "pink") ##Plot the tile and the first (Red channel)

    ##Remove ticks
    ax[i,j].set_xticks([])
    ax[i,j].set_yticks([])
    # figure.title(f"Sentinel-2 Tiles: Red Channel")

##Remove the gaps between the tiles so the grid reads as one image
figure.subplots_adjust(wspace=0, hspace=0)
plt.show()

In [None]:
##get tiled arrays for cloud masks (one tiled array per cloud mask that was loaded)
##Each mask first gets a trailing channel dimension of size 1 (3D with x,y, z=1) because
##reshape_tiling unpacks (height, width, channels) from the array shape.
##The target shape is passed positionally: the newshape keyword of np.reshape is deprecated in recent NumPy releases
cloud_mask_tiled = [
    reshape_tiling(np.reshape(mask, (mask.shape[0], mask.shape[1], 1)), tile_edge_size) ##split into tile_edge_size x tile_edge_size tiles
    for mask in cloud_mask_arrays
]

Now we will split each tiled image into a training and testing set. We do not want to include any tiles with a cloud in the training set because these are "unknown" data that we wish to predict. As such, we need to identify whether a cloud is present in each tile using the cloud masks that were loaded at the beginning of this file. We will define some functions to make this process easier.

In [None]:
##define a helper function to check if a cloud is present in a tiled image
def check_tiles(cloud_mask_tile_array):
  """
  Flag every tile of a tiled cloud mask that contains at least one cloud pixel.

  Input:
  cloud_mask_tile_array: tiled cloud mask image array, indexed as
                         (tile row, tile col, tile width, tile height, channel); only channel 0 is read

  Output:
  cloud_check: Array of shape (tile rows, tile cols) holding 1 where the tile's mask
               sums to more than zero (a cloud is present) and 0 otherwise
  """

  ##Size of the tile grid
  n_tile_rows = cloud_mask_tile_array.shape[0]
  n_tile_cols = cloud_mask_tile_array.shape[1]

  ##Start from "no cloud anywhere" and switch individual tiles to 1 below
  cloud_check = np.zeros(shape = (n_tile_rows, n_tile_cols))

  ##Visit every (row, col) position of the tile grid
  for row, col in np.ndindex(n_tile_rows, n_tile_cols):
    ##The mask has a single channel, so channel index 0 holds the whole tile mask
    tile_mask = cloud_mask_tile_array[row, col, :, :, 0]

    ##ANY cloud pixel makes the sum positive, which marks the tile as cloudy
    if tile_mask.sum() > 0:
      cloud_check[row, col] = 1

  return cloud_check



##Write a function that will get the indices required to split a tiled dataframe into train and test data for each image
##Train is the non-cloud tiles and test is the cloud tiles
def get_train_test_indices(cloud_check_array):
  """
  Split the tiles of an image into a training set (tiles WITHOUT a cloud) and a
  test set (tiles WITH a cloud) based on a per-tile cloud check array.

  Every cloud-free tile is used for training, which maximizes the amount of
  training data that can be used in the deep learning model.

  Input: 
  cloud_check_array: Array for each tile with a value of 1 if there is a cloud in the tile and 0 if there is not
  
  Output: 
  train_ind: Indices of tiles used for training (no cloud present), as returned by np.where
             (a tuple with one index array per dimension of cloud_check_array)
  test_ind: Indices of tiles used for testing (cloud present), in the same format
  no_cloud_perc: fraction of all tiles that have no cloud (i.e. the share of tiles used for training)
  """

  ##Tiles flagged 0 are cloud free -> training; tiles flagged 1 are cloudy -> testing
  train_ind = np.where(cloud_check_array == 0)
  test_ind = np.where(cloud_check_array == 1)

  ##Share of tiles without clouds: (all tiles - cloudy tiles) / all tiles
  ##The flags are 0/1, so their sum is the number of cloudy tiles
  total_tiles = cloud_check_array.size
  cloudy_tiles = cloud_check_array.sum()
  no_cloud_perc = (total_tiles - cloudy_tiles) / total_tiles

  ##return train indices, test indices, and no cloud fraction (true training fraction)
  return train_ind, test_ind, no_cloud_perc

##Define function that will separate a tiled image into train and test tiles based on the cloud mask
##Tiles with clouds are never placed in the training set: the training set is every tile without a cloud
##and the test set is every tile with a cloud. It has an option to return the fraction of all tiles
##that ended up in the training set for visualization purposes

##This function saves each tile to a defined output directory rather than returning the tiles


def train_test_split_image(tiled_image, cloud_mask_tile_array, train_output_dir, test_output_dir,
                           train_image_identifier, test_image_identifier, return_total_train_perc= False):
  """
  Split a tiled image into training tiles (no cloud in the tile) and test tiles
  (a cloud in the tile) and save every tile as its own .npy file.

  Each tile is written to <output dir>/<image identifier>_(<tile row>,<tile col>).npy

  Input:
  tiled_image: An image separated into tiles, indexed as (tile row, tile col, ...)
  cloud_mask_tile_array: A boolean array (1 for if a pixel is a cloud and 0 for if a pixel is not a cloud) separated into tiles
  train_output_dir: Output directory for the separated training tiled arrays
  test_output_dir: Output directory for the separated test tiled arrays
  train_image_identifier: String name for image (no spaces) that can be used to identify the training array/image it came from 
  test_image_identifier: String name for image (no spaces) that can be used to identify the test array/image it came from 
  return_total_train_perc: Boolean for whether to return the fraction of all tiles that was used for training

  Output:
  Saves the tiles as numpy arrays in train_output_dir / test_output_dir
  total_percentage_selected(Optional): the fraction of all tiles (cloudy tiles included in the total) used for training;
                                       only returned when return_total_train_perc is True, otherwise None is returned
  """

  def _save_tiles(tile_indices, output_dir, image_identifier):
    ##Save every tile addressed by tile_indices into output_dir
    ##tile_indices holds one array of tile rows and one of tile columns; as a (2, n) array
    ##each column is the (row, col) position of one tile
    index_pairs = np.array(tile_indices).reshape(2,-1)
    for position in range(index_pairs.shape[1]):
      row_ind, col_ind = index_pairs[:,position] ##get tile row and column indices
      destination_path = join(output_dir, f"{image_identifier}_({row_ind},{col_ind}).npy") ##unique destination path for tile
      np.save(destination_path, np.array(tiled_image[row_ind, col_ind, :, :, :])) ##save a copy of the selected tile

  ##Flag the tiles that contain a cloud (1 = cloud, 0 = no cloud)
  cloud_check_array = check_tiles(cloud_mask_tile_array)

  ##Cloud-free tiles form the training set, cloudy tiles the test set;
  ##the third value is the share of all tiles that is cloud free
  train_index, test_index, total_percentage_selected = get_train_test_indices(cloud_check_array)

  ##Write the training tiles first, then the test tiles
  _save_tiles(train_index, train_output_dir, train_image_identifier)
  _save_tiles(test_index, test_output_dir, test_image_identifier)

  ##If the total share of data used for training is desired, return it
  if return_total_train_perc == True:
    return total_percentage_selected
  return None






For a tensorflow pipeline, it is easier to manage the data if the input and output images are located in the same file. As such, we will stack the sentinel-1 tiles on top of the sentinel-2 tiles, so the output image will have 7 bands: 3 for sentinel-1 (channels 0,1,2) and 4 for sentinel-2 (channels 3,4,5,6).

In [None]:
##define a function to stack sentinel-1 and sentinel-2 tiles
def stack_tiled_arrays(sentinel_1_tiled, sentinel_2_tiled):
  """
  This function stacks a sentinel-1 tiled array on top of a sentinel-2 tiled array along
  the channel dimension, so both images live in a single tiled array.

  The dimensions are: (tile_row, tile_col, tile_width, tile_height, channel)
  The sentinel-1 channels come first and the sentinel-2 channels follow them. With 3 sentinel-1
  channels and 4 sentinel-2 channels the result has 7 channels: sentinel-1 in channels 0,1,2
  and sentinel-2 in channels 3,4,5,6.

  Input:
  sentinel_1_tiled: sentinel-1 tiled array (5 dimensions, channels last)
  sentinel_2_tiled: sentinel-2 tiled array (5 dimensions, channels last); every dimension except
                    the channel dimension must match sentinel_1_tiled

  Output:
  stacked_tiled_array: array of dim (tile_row, tile_col, tile_width, tile_height, channel)
                       with the sentinel-1 channels first and the sentinel-2 channels last
  """

  ##stacking sentinel-1 images on top of sentinel-2 images for tensorflow pipeline management
  channel_axis = 4 ##the axis for channels is dimension 4
  ##joining along the existing channel axis keeps every channel in its original order,
  ##so there is no need to split the arrays into single channels and re-stack them
  stacked_tiled_array = np.concatenate((sentinel_1_tiled, sentinel_2_tiled), axis = channel_axis)

  return stacked_tiled_array


Stacking sentinel-1 and sentinel-2 as input for the train-test split

In [None]:
##stack the sentinel-1 and sentinel-2 tiles into one array for the train/test split below
stacked_tiles = stack_tiled_arrays(sen1_tiled, sen2_tiled)

We are NO LONGER creating a separate testing set. The cloud data will be the testing set itself.

Need to create different training sets for each cloud image

In [None]:
##Separate the tiled image into train and test sets, once for every random cloud mask
##sen-1 and sen-2 are stacked in the same array (stacked_tiles) and saved together for ease of use with tensorflow pipeline

##Get the image identifier used in the tile file names: aoi_{number}_cloud_{number}
identifiers = [f"{re.search(r'aoi_[0-9]+',sen1_images).group()}_cloud_{i+1}" for i in list(range(len(cloud_mask_tiled)))]  ##aoi_{number} is taken from the chosen sen1 file name, cloud numbers start at 1

##Loop through the total number of random clouds and generate the separates train/test sets for each
##Also generate the total train percentages for each random cloud
total_train_percentages = [None] * len(cloud_mask_tiled) ##Initialize percentage list

##create overall directories to house training/testing data
training_dir = join(data_dir, "training_data") #training dir
testing_dir = join(data_dir, "testing_data") ##testing dir

##set seed for consistent train/test images
##NOTE(review): nothing visible in this cell draws random numbers (tiles are split purely on the cloud mask),
##so the seed looks like a leftover from a random split - confirm before removing
seed = 92
np.random.seed(seed)

if not os.path.exists(training_dir):
    os.mkdir(training_dir)
    print("Directory created!")

if not os.path.exists(testing_dir):
    os.mkdir(testing_dir)
    print("Directory created!")


##Loop through the random clouds
for k in range(len(cloud_mask_tiled)):

  ##Make output directories depending on the image and cloud mask for training and testing

  ##training: format is aoi_{choose_ind + 1}_cloud{k+1}_training
  ##NOTE(review): the directory uses choose_ind + 1 while the file identifiers use the aoi number from the
  ##sen1 file name - presumably these agree; verify against the file names
  train_output_dir = join(training_dir,f"aoi_{choose_ind + 1}_cloud{k+1}_training")
  if not os.path.exists(train_output_dir):
    os.mkdir(train_output_dir)
    print("Directory created!")

  #testing: format is aoi_{choose_ind + 1}_cloud{k+1}_testing
  test_output_dir = join(testing_dir,f"aoi_{choose_ind + 1}_cloud{k+1}_testing") 
  if not os.path.exists(test_output_dir):
    os.mkdir(test_output_dir)
    print("Directory created!")


  ##separate the sen-1/sen-2 stacked array into train/test tiles, save them, and keep the returned training fraction
  ##tiles already present in the output directories from an earlier run are not removed here
  print(f"Saving Data for Cloud {k+1}")

  total_train_percentages[k] = train_test_split_image(tiled_image = stacked_tiles, 
                                                      cloud_mask_tile_array = cloud_mask_tiled[k],
                                                      train_output_dir = train_output_dir, 
                                                      test_output_dir = test_output_dir,
                                                      train_image_identifier = identifiers[k], 
                                                      test_image_identifier = identifiers[k], 
                                                      return_total_train_perc= True)

  

Checking the actual percentages used for train test

In [None]:
##Get "training" percentages: for every random cloud mask, the share of ALL tiles that is
##cloud free and was therefore saved for training
##There is one value per cloud mask, so the x values are numbered from the percentages list itself
##(len(sen1_tiled) is the number of tile rows in the image, not the number of cloud masks)
bar_x = list(range(1, len(total_train_percentages) + 1)) ##Random cloud image numbers
total_percentage_data = pd.DataFrame({"Random Cloud Image": bar_x, "Percentage of Tiles Selected for Training": np.round(total_train_percentages, 2)}) ##
ax = sns.barplot(x='Random Cloud Image', y='Percentage of Tiles Selected for Training', data = total_percentage_data)
ax.bar_label(ax.containers[0])


The following code was used for a previous version. I kept it because it is useful, but it is not relevant for this analysis

In [None]:
# ##define a helper function to check if a cloud is present in a tiled image
# def check_tiles(cloud_mask_tile_array):
#   """
#   This function takes a tiled cloud mask image and checks to see if any clouds are present in each tile

#   Input:
#   cloud_mask_tile_array: tiled cloud mask image array

#   Output:
#   cloud_check: Array for each tile with a value of 1 if there is a cloud in the tile and 0 if there is not
#   """

#   tile_rows, tile_cols = cloud_mask_tile_array.shape[0], cloud_mask_tile_array.shape[1] ##get number of tile rows and cols

#   ##Initialize cloud checking array with a zeros array
#   cloud_check = np.zeros(shape = (tile_rows, tile_cols))
#   ##Loop over tile rows and columns to check if there is a cloud in each tile
#   ##If there is a cloud, change that index from 0 to 1
#   for i in range(0, tile_rows):

#     for j in range(0, tile_cols):
#       ##if there are ANY cloud pixels (array.sum > 0), that tile has a cloud
#       ##image dimensions are (tile row, tile col, tile width, tile height, 1)
#       ##It only has one channel, so the channel's index is 0
#       if cloud_mask_tile_array[i,j,:,:,0].sum() > 0: 
#         cloud_check[i,j] = 1 ##set to 1 if there is a cloud

#   return(cloud_check)



# ##Write a function that will get the indices required to split a tiled dataframe into train and test data for each image
# def get_train_test_indices(cloud_check_array, train_perc):
#   """
#   This function takes a boolean cloud check array (whether a cloud is in a tile) and gets the
#   indices of a random training set and test set that do NOT include cloudy tiles.
#   These are based on the defined train_perc where train_perc is the percent of tiles to take for training
#   FROM the NON-cloudy tiles.

#   It also gets the indices for the verification set (the set WITH clouds) to select these tiles for verification

#   Input: 
#   cloud_check_array: Array for each tile with a value of 1 if there is a cloud in the tile and 0 if there is not
#   train_perc: percent of tiles to take for training
  
#   Output: 
#   train_ind: Indices of tiles used for training (no cloud tiles present)
#   test_ind: Indices of tiles used for testing (cloud tiles present)
#   total_percentage_selected: total percentage of data used for training to yield a "train_perc" (which does not include cloud tiles)
#   """

#   ##get indices where NO clouds are present
#   no_cloud_index = np.where(cloud_check_array == 0) ##this returns a tuple with dimensions (0: 2, 1: number of no cloud pixels)
#   no_cloud_index_array = np.asarray(no_cloud_index).reshape(2,-1) ##reshape to an array for easy manipulation

#   ##Calculate the percentage of tiles not covered by clouds for each image
#   ##the no_cloud_index dimensions are (0: 2, 1: count of tiles with no clouds)
#   ##so to get the count of tiles with no clouds we do no_cloud_index.shape[1]
#   ##To get total number of tiles, we can just do cloud_check_array.size
#   n_no_clouds = no_cloud_index_array.shape[1] ##count of tiles with no clouds
#   n_tiles = cloud_check_array.size ##total number of tiles
#   no_cloud_perc = n_no_clouds/n_tiles ##percentage of tiles with no clouds

#   ##Calculate the size of the training data required to yield a *train_perc* where the
#   ##training percentage is defined as train_perc = n_train/n_no_clouds
#   n_train = np.floor(n_no_clouds * train_perc).astype(int) ##number to select for training to yield a "train_perc" that doesn't include cloud tiles
#   total_percentage_selected = train_perc * no_cloud_perc ##total percentage of data selected including cloud tiles

#   ##Identify training Indices
#   ##We first randomly select n_train integers from 0 to total number of no clouds
#   ##Then using these integers as indices, we select the indices of tiles (with no clouds) to use for training
#   no_cloud_select = np.sort(np.random.choice(range(0, n_no_clouds), n_train, replace = False)) ##random integers used to select training indices
#   train_ind = tuple(no_cloud_index_array[:,no_cloud_select]) ##indices from original tiled image to choose for training (set to tuple because this is required for array indexing)

#   ##Identify testing Indices (portion of no_cloud index without clouds)
#   ##We will do this by creating a boolean masks for test data so we can get the actual indices using np.where
#   test_mask = np.ones(shape = cloud_check_array.shape) ##create ones array same shape as tiled image
#   test_mask[train_ind] = 0 ##set the train indices to zero (Need to convert to tuple for it to work)
#   test_ind = np.where((test_mask == 1) * (cloud_check_array == 0) == True) ##get indices for test

#   ##Identify cloud/verification indices for model verification
#   verification_indices = np.where(cloud_check_array == 1)


#   return train_ind, test_ind, verification_indices, total_percentage_selected

# ##Define function that will separate a tiled image into train and test depending on a defined training percentage
# ##This function does NOT include tiles with clouds in the training dataset, so the "train_perc"
# ##Is actually defined by n_train/number_tiles_without_clouds. The test set will have the remaining tiles without clouds
# ##AND the tiles with clouds. It has an option to return the "true" training percentage for visualization purposes

# ##This funciton saves the tiled image to a defined output path rather than outputing the object


# def train_test_split_image(tiled_image, cloud_mask_tile_array, train_perc, train_output_dir, test_output_dir, verification_output_dir,
#                            train_image_identifier, test_image_identifier, verification_image_identifier, return_total_train_perc= False):
#   """
#   This function takes a tiled image and a boolean mask for whether a cloud is in a tile,
#   and selects the indices of a random training set, test set, and cloud index (verification). This selection is based
#   on whether a cloud is present in a tile (cloud_check_array) and and the defined train_perc. 
#   The defined train_perc is actually n_train/number_tiles_without_clouds.

#   The train set is the random training set without clouds,
#   the test set is the remaining tiles without clouds, and 
#   the cloud/verification set is all tiles with a cloud.

#   It has an option to return the "true" training percentage for visualization purposes.

#   Input:
#   tiled_image: An image separated into tiles
#   cloud_mask_tile_array: A boolean array (1 for if a pixel is a cloud and 0 for if a pixel is not a cloud) separated into tiles
#   train_perc: The desired percentage of NON-cloud images to be included in the training set (ie n_train/number_tiles_without_clouds)
#   train_output_dir: Output directory for the separated training tiled arrays
#   test_output_dir: Output directory for the separated test tiled arrays
#   verification_output_dir: Output directory for the separated verification (cloud) tiled arrays
#   train_image_identifier: String name for image (no spaces) that can be used to identify the training array/image it came from 
#   test_image_identifier: String name for image (no spaces) that can be used to identify the test array/image it came from 
#   verification_image_identifier: String name for image (no spaces) that can be used to identify the verification (cloudy) array/image it came from 
#   return_true_train_perc: Boolean for whether to return the total percentage of data used for training (not just tiles with no clouds)

#   Output:
#   Saves tiled numpy arrays to destination_path
#   total_percentage_selected(Optional): the total percentage of data used for training (percent calculated including cloudy tiles)
#   """

#   ##First step is to check each tile to see if clouds are present and return
#   ##a boolean array for each tile where 1 = cloud and 0 = no cloud
#   cloud_check_array = check_tiles(cloud_mask_tile_array)

#   ##Next we get the indices of the training, test, and verification/cloud sets that yield a training percentage of train_perc
#   ##Again train_perc is the percent out of the NON-cloudy tiles
#   ##We also output the total percentage of data used for training INCLUDING cloudy tiles
#   train_index, test_index, verification_index, total_percentage_selected = get_train_test_indices(cloud_check_array, train_perc)
#   train_index = np.array(train_index).reshape(2,-1) ##turn to array for easy indexing
#   test_index = np.array(test_index).reshape(2,-1) ##turn to array for easy indexing
#   verification_index = np.array(verification_index).reshape(2,-1) ##turn to array for easy indexing


#   ##separate the train tiles into separate images and save
#   n_train = train_index.shape[1] ##size of training set
#   for i in range(0, n_train):
#     row_ind, col_ind = train_index[:,i] ##get tile row and column indices
#     destination_path = join(train_output_dir, f"{train_image_identifier}_({row_ind},{col_ind}).npy") ##unique destination path for tile
#     tile = np.array(tiled_image[row_ind, col_ind, :, :, :]) ##select the tile of interest in the loop
#     np.save(destination_path, tile) ##save the tile of interest

#   ##separate the test tiles into separate images and save
#   n_test = test_index.shape[1] ##size of training set
#   for i in range(0, n_test):
#     row_ind, col_ind = test_index[:,i] ##get tile row and column indices
#     destination_path = join(test_output_dir, f"{test_image_identifier}_({row_ind},{col_ind}).npy") ##unique destination path for tile
#     tile = np.array(tiled_image[row_ind, col_ind, :, :, :]) ##select the tile of interest in the loop
#     np.save(destination_path, tile) ##save the tile of interest

#   ##separate the verification/cloud tiles into separate images and save
#   n_cloud = verification_index.shape[1] ##size of training set
#   for i in range(0, n_cloud):
#     row_ind, col_ind = verification_index[:,i] ##get tile row and column indices
#     destination_path = join(verification_output_dir, f"{verification_image_identifier}_({row_ind},{col_ind}).npy") ##unique destination path for tile
#     tile = np.array(tiled_image[row_ind, col_ind, :, :, :]) ##select the tile of interest in the loop
#     np.save(destination_path, tile) ##save the tile of interest

#   ##If the total percentage of data used for training is desired, return it
#   if return_total_train_perc == True:
#     return(total_percentage_selected)





In [None]:
##Helper function to convert sen1 and sen2 rasters to tensors to prevent saving multiple intermediate arrays
# def raster_to_tensor(raster, band_order = None):
#   """
#   This function will convert sentinel-1 and sentinel-2 raster objects to tensors by first converting
#   to an array and then converting to a tensor. It also allows specifying the band order, 
#   which we will use to ensure sentinel 2 is in the order Red, Green, Blue, NIR.

#   Note that the intermediate conversion from raster to array changes the dimension
#   order from (channel, height-x, width-y) to (height-x, width-y, channel)

#   Input:
#   raster: raster object to convert to tensor
#   band_order: list of bands to extract and order to extract them in

#   Output:
#   tensor: raster converted to tensor
#   """

#   ##For both sentinel-1 and sentinel-2 we convert to an array
#   array = raster_to_array(raster, band_order) ##convert to array

#   ##For both convert from array to tensor
#   tensor = tf.convert_to_tensor(array, dtype = None, dtype_hint = None, name = None)

#   return tensor