In [None]:
import numpy as np
import os
import PIL
import PIL.Image
import glob
from skimage import io
from skimage.transform import resize
import random
import matplotlib.pyplot as plt
import pandas as pd
import subprocess as sp

In [None]:
# Seed Python's `random` module. split_train_test draws the train/val/test
# indices with random.sample, so a fixed seed keeps the split repeatable as long
# as the cells are run in order from a fresh kernel.
random.seed(170)

In [None]:
# Directory holding the source rasters and their '*mask.png' files. The trailing
# slash matters: paths are built from it with str.format / str.replace.
input_dir = './data/sentinel/'
# Root of the generated dataset; the img_dir/ and ann_dir/ split folders are
# created beneath it.
out_dir = './data/reservoirs_10band/'

In [None]:
# Create the <out_dir>/{img_dir,ann_dir}/{train,val,test} folder tree.
# os.makedirs replaces shelling out to `mkdir -p`: it works on every platform,
# is a no-op when the folder already exists, and raises on failure instead of
# returning an exit code that nobody checks.
for subdir in ('img_dir', 'ann_dir'):
    for split in ('train', 'val', 'test'):
        os.makedirs(out_dir + '{}/{}'.format(subdir, split), exist_ok=True)


In [None]:
# Fractions of the (non-floodplain) images assigned to the validation and test
# splits by list_and_split_imgs; the remainder goes to training.
val_frac = 0.2
test_frac = 0.2

In [None]:
# When True, images whose base name appears in the 'name' column of cloudy_csv
# are left out of the dataset.
remove_cloudy_images = True
cloudy_csv = './data/cloud_images.csv'
# When True, masks whose base name appears in the 'name' column of bad_mask_csv
# are overwritten with all zeros when the annotation files are written.
replace_bad_masks = True
bad_mask_csv = './data/replace_w_zeromask.csv'

In [None]:
# Base names (the 'name' column of cloudy_csv) of images to exclude; an empty
# array when the filter is switched off.
cloudy_base_list = (pd.read_csv(cloudy_csv)['name'].values
                    if remove_cloudy_images
                    else np.array([]))

# Base names (the 'name' column of bad_mask_csv) of images whose mask is blanked
# to zeros; an empty array when the replacement is switched off.
replace_mask_base_list = (pd.read_csv(bad_mask_csv)['name'].values
                          if replace_bad_masks
                          else np.array([]))

## First round of loading data:
 1. Save mins and maxes of every band for every image

In [None]:
 # Get mask image names and base image patterns
# Every '*mask.png' in input_dir marks one image; stripping the 'mask.png'
# suffix leaves the path prefix shared by that image's band files.
mask_images = sorted(glob.glob('{}*mask.png'.format(input_dir)))
image_patterns = [mi.replace('mask.png', '') for mi in mask_images]
# Drop the images listed as cloudy.
# NOTE(review): the basename compared here keeps whatever precedes 'mask.png'
# (presumably a trailing '_'), so the names in cloudy_csv must use the same
# form -- confirm against the CSV.
image_patterns = np.array([image_pat for image_pat in image_patterns
                           if os.path.basename(image_pat) not in cloudy_base_list])

band_mins = []
band_maxes = []

for image_base in image_patterns:
    # Stack the three source rasters of this image along the band axis.
    stacked_ar = np.concatenate([
        io.imread('{}og.tif'.format(image_base)),
        io.imread('{}s1_v2_og.tif'.format(image_base)),
        io.imread('{}s2_20m_og.tif'.format(image_base)),
    ], axis=2)

    # Per-band extrema over all pixels of this image.
    band_mins.append(np.min(stacked_ar, axis=(0, 1)))
    band_maxes.append(np.max(stacked_ar, axis=(0, 1)))

all_mins = np.stack(band_mins)
all_maxes = np.stack(band_maxes)
# Shape (2, n_images, n_bands): index 0 holds the minima, index 1 the maxima.
bands_min_max_all_imgs = np.stack([all_mins, all_maxes], axis=0)

# Write the file to the location the second pass loads it from
# ('./data/bands_minmax/'). It was previously saved one directory higher, so a
# fresh Restart & Run All either failed on the load or silently picked up a
# stale file.
band_minmax_path = './data/bands_minmax/all_imgs_bands_min_max_sentinel_v7.npy'
os.makedirs(os.path.dirname(band_minmax_path), exist_ok=True)
np.save(band_minmax_path, bands_min_max_all_imgs)

## Second round of loading data:
1. Skip any cloudy images
2. Split into train, val, test
3. Rescale bands using the previously calculated min and max
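4. Calculate the normalized-difference (ND) index bands
5. Select the bands to feed into the CNN (10 by default, see `band_selection`)
6. Calculate the per-band mean and std of the training images in a running fashion and save them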


In [None]:
# Per-image band extrema, indexed as [0] = minima and [1] = maxima.
# NOTE(review): make sure this path is the one the first pass writes to.
bands_min_max_all_imgs = np.load('./data/bands_minmax/all_imgs_bands_min_max_sentinel_v7.npy')

# Rescaling bounds per band: row 0 is the smallest minimum over all images,
# row 1 is the 80th percentile of the per-image maxima.
bands_min_max = np.stack([
    bands_min_max_all_imgs[0].min(axis=0),
    np.percentile(bands_min_max_all_imgs[1], 80, axis=0),
])


In [None]:
def normalized_diff(ar1, ar2):
    """Return the element-wise normalized difference (ar1 - ar2) / (ar1 + ar2).

    Both inputs are converted to float64 first, so unsigned-integer arrays
    cannot wrap around in the subtraction. Wherever the denominator is zero the
    result is 0, and no divide-by-zero / invalid-value warnings are emitted.
    NaNs coming from the inputs themselves are also mapped to 0.
    """
    ar1 = np.asarray(ar1, dtype='float64')
    ar2 = np.asarray(ar2, dtype='float64')

    numerator = ar1 - ar2
    denominator = ar1 + ar2

    # Divide only where it is defined; the remaining cells keep the 0 they were
    # initialised with.
    result = np.zeros_like(numerator)
    np.divide(numerator, denominator, out=result, where=denominator != 0)

    # nan_to_num's default NaN replacement is 0.0. The original passed a
    # positional 0 here, which numpy reads as `copy`, not as the fill value.
    return np.nan_to_num(result)
    
    
def calc_nd(img, band1, band2):
    """Normalized difference of two bands of `img`, rescaled to uint8.

    The index (band1 - band2) / (band1 + band2) is computed in float64, mapped
    linearly from [-1, 1] onto [0, 255], rounded and returned as uint8. A value
    above 255 is only reported on stdout; it is not corrected.
    """
    first = img[:, :, band1].astype('float64')
    second = img[:, :, band2].astype('float64')
    nd = normalized_diff(first, second)

    # Shift [-1, 1] to [0, 2], then stretch to [0, 255].
    nd = np.round(255. * (nd + 1) / 2)

    peak = nd.max()
    if peak > 255:
        print(peak)
        print('Error: overflow')

    return nd.astype(np.uint8)

def calc_all_nds(img):
    """Compute the four normalized-difference bands of `img` and stack them.

    Returns an array with the four uint8 index bands along the last axis, in the
    order listed below.
    """
    # (band1, band2) indices into the last axis of `img`, in output order.
    # NOTE(review): the index labels assume a particular band layout of the
    # stacked rasters -- confirm against the source files.
    band_pairs = [
        (3, 11),  # Gao NDWI
        (1, 11),  # MNDWI
        (1, 3),   # McFeeters NDWI
        (3, 2),   # NDVI
    ]
    return np.stack([calc_nd(img, first, second) for first, second in band_pairs],
                    axis=2)

In [None]:
def rescale_to_minmax_uint8(img, bands_min_max):
    """Linearly rescale each band of `img` to the uint8 range [0, 255].

    Args:
        img: array whose last axis indexes bands.
        bands_min_max: array-like where element 0 holds the per-band lower
            bounds and element 1 the per-band upper bounds (broadcast against
            the last axis of `img`).

    Returns:
        uint8 array of the same shape as `img`. Values are clipped to
        [lower, upper] on both sides before scaling, so nothing can wrap around
        in the uint8 cast. A band whose lower and upper bounds are equal maps
        to 0 instead of dividing by zero.
    """
    lower = np.asarray(bands_min_max[0], dtype='float64')
    upper = np.asarray(bands_min_max[1], dtype='float64')

    # Guard against constant bands (upper == lower).
    span = upper - lower
    safe_span = np.where(span == 0, 1.0, span)

    clipped = np.clip(np.asarray(img, dtype='float64'), lower, upper)
    scaled = np.round(255. * (clipped - lower) / safe_span)

    # Final clip is a safety net against floating-point spill past the range.
    return np.clip(scaled, 0, 255).astype(np.uint8)

def select_bands_save_png(fp_base, out_path, band_selection, bands_min_max, crop_size, resample_size,
                          calc_mean_std=False):
    """Build the multi-band image for one path prefix and write it to `out_path`.

    Steps: read and stack the three source rasters, rescale every band to uint8
    with `bands_min_max`, append the normalized-difference bands, keep only
    `band_selection`, then crop / resize to the requested sizes and save.
    Despite the name, the file format is whatever `out_path`'s extension
    selects in io.imsave.

    Args:
        fp_base: path prefix of the image; the suffixes 'og.tif',
            's1_v2_og.tif' and 's2_20m_og.tif' are appended to it.
        out_path: destination file.
        band_selection: indices (into the stacked + ND bands) to keep.
        bands_min_max: per-band bounds passed to rescale_to_minmax_uint8.
        crop_size: (rows, cols) the image is cropped to if it differs.
        resample_size: (rows, cols) the image is resized to if it differs.
        calc_mean_std: when truthy, return the saved pixels for statistics.

    Returns:
        The saved pixels reshaped to (n_pixels, len(band_selection)) when
        `calc_mean_std` is truthy, otherwise None.
    """
    ar = np.concatenate([
        io.imread('{}og.tif'.format(fp_base)),
        io.imread('{}s1_v2_og.tif'.format(fp_base)),
        io.imread('{}s2_20m_og.tif'.format(fp_base))
    ], axis=2)

    ar = rescale_to_minmax_uint8(ar, bands_min_max)

    nds = calc_all_nds(ar)

    ar = np.concatenate([ar, nds], axis=2)[:, :, band_selection]

    # Compare as tuples so list-valued sizes are not treated as "always different".
    if ar.shape[:-1] != tuple(crop_size):
        ar = crop(ar, crop_size)
    if ar.shape[:-1] != tuple(resample_size):
        resized = resize(ar, resample_size, preserve_range=True)
        # Round (as the mask pipeline does) instead of truncating, which would
        # bias every interpolated pixel downwards; clip before the uint8 cast.
        ar = np.clip(np.round(resized), 0, 255).astype(np.uint8)
    io.imsave(out_path, ar)
    if calc_mean_std:
        return ar.reshape((-1, len(band_selection)))
    return None
    
def crop(img, out_size):
    """Center-crop the first two axes of `img` to `out_size` = (rows, cols).

    Each axis is handled independently, so non-square inputs and odd size
    differences produce exactly `out_size`. An axis that already has the target
    length is left untouched (the original slice `0:-0` returned an empty
    array in that case). An axis shorter than requested is returned uncropped.
    Any trailing axes (e.g. bands) are preserved.
    """
    row_start = max((img.shape[0] - out_size[0]) // 2, 0)
    col_start = max((img.shape[1] - out_size[1]) // 2, 0)
    return img[row_start:row_start + out_size[0],
               col_start:col_start + out_size[1]]

def create_nband_pngs(img_list, output_dir, band_selection = [0, 1, 2, 3, 4, 5, 12, 13, 14, 15], crop_size=(640,640),
                      resample_size=(640, 640),
                      calc_mean_std=False):
    """Write the band-selected image for every path prefix in `img_list`.

    Reads the notebook-level names `input_dir` (to derive output paths) and
    `bands_min_max` (rescaling bounds), so both must be defined before calling.

    Args:
        img_list: iterable of image path prefixes.
        output_dir: directory substituted for `input_dir` in each prefix.
        band_selection: band indices forwarded to select_bands_save_png.
        crop_size, resample_size: (rows, cols) forwarded to select_bands_save_png.
        calc_mean_std: when truthy, accumulate per-band statistics over all
            written pixels.

    Returns:
        (mean, std) per selected band when `calc_mean_std` is truthy (std uses
        the n - 1 denominator), otherwise None.
    """
    n_bands = len(band_selection)
    if calc_mean_std:
        pixel_count = 0
        running_mean = np.zeros(n_bands)
        band_sums = np.zeros(n_bands)
        sq_dev_sum = np.zeros(n_bands)

    for fp_base in img_list:
        # Swap the input directory for the output one, drop the last character
        # of the prefix (expected to be a trailing '_') and add the extension.
        # Files are written as '.tif', not '.png'.
        out_path = fp_base.replace(input_dir, output_dir)[:-1] + '.tif'
        pixels = select_bands_save_png(fp_base, out_path, band_selection,
                                       bands_min_max, crop_size, resample_size,
                                       calc_mean_std=calc_mean_std)
        if not calc_mean_std:
            continue

        # Batched running update of the mean and of the sum of squared
        # deviations: the deviation from the old mean times the deviation from
        # the updated mean, summed over the new pixels.
        pixel_count += pixels.shape[0]
        band_sums += np.sum(pixels, axis=0)
        offset = pixels - running_mean
        running_mean += np.sum(offset/pixel_count, axis=0)
        sq_dev_sum += np.sum(offset*(pixels - running_mean), axis=0)

    if calc_mean_std:
        return band_sums/pixel_count, np.sqrt(sq_dev_sum / (pixel_count - 1))
    
        
def save_mask_pngs(ann_list, output_dir, resample_size=(500, 500)):
    """Binarize, clean, resize and save the mask of every prefix in `ann_list`.

    Reads the notebook-level names `input_dir` (to derive output paths) and
    `replace_mask_base_list` (masks to blank out).

    NOTE(review): the default `resample_size` here is (500, 500) while
    create_nband_pngs defaults to (640, 640) -- confirm that images and masks
    are meant to end up with different sizes.

    Args:
        ann_list: iterable of image path prefixes; 'mask.png' is appended.
        output_dir: directory substituted for `input_dir` in each path.
        resample_size: (rows, cols) the mask is resized to if it differs.

    Returns:
        Total number of positive pixels over all masks, counted before
        resizing; masks with exactly one positive pixel are zeroed and add
        nothing.

    Raises:
        ValueError: if a mask holds values other than 0 and 1 after resizing.
    """
    total_res_pixels = 0
    for fp_base in ann_list:
        fp = '{}mask.png'.format(fp_base)
        mask = io.imread(fp)

        # Some PNGs load with a channel axis; keep only the first channel.
        if mask.ndim > 2:
            mask = mask[:, :, 0]
        # Any positive value counts as foreground.
        mask[mask > 0] = 1

        # Blank the masks that are listed as bad.
        base_name = os.path.basename(fp).replace('_mask.png', '')
        if base_name in replace_mask_base_list:
            mask[:] = 0

        # A lone positive pixel is treated as noise and removed.
        positive_pixels = mask.sum()
        if positive_pixels == 1:
            mask[:] = 0
        else:
            total_res_pixels += positive_pixels

        if mask.shape != resample_size:
            resized = resize(mask, resample_size, preserve_range=True)
            mask = np.round(resized).astype(np.uint8)

        # The saved mask must be strictly binary.
        not_binary = (mask != 0) & (mask != 1)
        if not_binary.any():
            print(fp)
            print(np.unique(mask))
            raise ValueError('Mask has non-0 and/or non-1 values')

        out_path = fp.replace(input_dir, output_dir).replace('_mask.png', '.png')
        io.imsave(out_path, mask)

    return total_res_pixels

In [None]:
def split_train_test(img_patterns, test_frac, val_frac):
    """Randomly split the positions of `img_patterns` into train/val/test.

    Uses the global `random` module state, so seed it beforehand for a
    repeatable split.

    Args:
        img_patterns: sized collection; only its length is used.
        test_frac: fraction of items for the test split. 0 disables splitting
            entirely (val_frac is then ignored).
        val_frac: fraction of items for the validation split; 0 means no
            validation split.

    Returns:
        - test_frac == 0: an array with every index.
        - val_frac == 0: (train_indices, test_indices).
        - otherwise: (train_indices, val_indices, test_indices), where the
          validation split receives every index not drawn for train or test.
    """
    total_ims = len(img_patterns)
    all_indices = np.arange(total_ims)
    if test_frac == 0:
        return all_indices

    train_count = round(total_ims * (1 - test_frac - val_frac))
    train_indices = random.sample(range(total_ims), train_count)

    # Boolean bookkeeping of indices that are still unassigned; unlike
    # np.delete / np.append this also works when a split is empty.
    unassigned = np.ones(total_ims, dtype=bool)
    unassigned[np.asarray(train_indices, dtype=int)] = False
    test_val_indices = all_indices[unassigned]

    # Rounding both counts independently can ask for more test items than are
    # left (e.g. 5 items with test_frac=0.3 -> 4 train + 2 test), which made
    # random.sample raise ValueError. Cap at what remains.
    test_count = min(round(total_ims * test_frac), len(test_val_indices))
    test_indices = random.sample(list(test_val_indices), test_count)

    if val_frac == 0:
        return train_indices, test_indices

    unassigned[np.asarray(test_indices, dtype=int)] = False
    val_indices = all_indices[unassigned]
    return train_indices, val_indices, test_indices

# Get train, test, and val lists, skipping cloudy images
def list_and_split_imgs(input_dir, cloudy_base_list):
    """List the usable image prefixes in `input_dir` and split them.

    Reads the notebook-level `test_frac` and `val_frac`.

    Args:
        input_dir: directory (with trailing separator) searched for '*mask.png'.
        cloudy_base_list: prefix basenames to exclude.

    Returns:
        (train, val, test) arrays of image path prefixes. Prefixes containing
        'floodplains' are never split: all of them are appended to train.
    """
    # One prefix per mask file, in sorted order, minus the cloudy ones.
    mask_paths = sorted(glob.glob('{}*mask.png'.format(input_dir)))
    all_prefixes = [path.replace('mask.png', '') for path in mask_paths]
    image_patterns = np.array([prefix for prefix in all_prefixes
                               if os.path.basename(prefix) not in cloudy_base_list])

    # Floodplain images are kept out of the val/test splits.
    floodplain_images = np.array([ip for ip in image_patterns if 'floodplains' in ip])
    non_floodplain_images = np.array([ip for ip in image_patterns if 'floodplains' not in ip])

    train_indices, val_indices, test_indices = split_train_test(
        non_floodplain_images, test_frac=test_frac, val_frac=val_frac)

    # Alternative not used here: also split the floodplain images into
    # train/val with split_train_test(floodplain_images, test_frac=val_frac,
    # val_frac=0) and concatenate those parts onto the train and val lists.
    train_basename_list = np.concatenate([non_floodplain_images[train_indices],
                                          floodplain_images])
    val_basename_list = non_floodplain_images[val_indices]
    test_basename_list = non_floodplain_images[test_indices]

    return train_basename_list, val_basename_list, test_basename_list

In [None]:
# Path prefixes of the images in each split (cloudy images already removed).
train_basename_list, val_basename_list, test_basename_list = list_and_split_imgs(input_dir, cloudy_base_list)

In [None]:
# Write the band-selected images for every split. Only the training call
# computes statistics: means_std is the (mean, std) pair per selected band.
means_std = create_nband_pngs(train_basename_list, output_dir = '{}/img_dir/train/'.format(out_dir),
                              calc_mean_std = True)
create_nband_pngs(test_basename_list, output_dir = '{}/img_dir/test/'.format(out_dir), calc_mean_std = False)
create_nband_pngs(val_basename_list, output_dir = '{}/img_dir/val/'.format(out_dir), calc_mean_std = False)

In [None]:
# Write the annotation masks for every split. sum_res_pixels is the number of
# positive mask pixels in the training split, counted before any resizing.
sum_res_pixels = save_mask_pngs(train_basename_list, output_dir = '{}/ann_dir/train/'.format(out_dir))
save_mask_pngs(test_basename_list, output_dir = '{}/ann_dir/test/'.format(out_dir))
save_mask_pngs(val_basename_list, output_dir = '{}/ann_dir/val/'.format(out_dir))

In [None]:
# Persist the training statistics as one array: row 0 = per-band mean,
# row 1 = per-band std.
np.save('./mean_std_sentinel_v7.npy', np.vstack(means_std))