# 01_pre-segment

This script loads Sentinel data for a given location and time window using DEA sandbox. Load all bands and metadata neccesary for estimating indices

All runs should be initited with a unique identifier 'stub' that carries through to the outputs

Then:
1. Calculates fourier transform 3-band image for input to segment anything
2. Calculate band indices as additional variables in the xarray ds
3. Saves the xarray ds with pickle to be reloaded later on. 

In [1]:
#import hdstats
import joblib

%matplotlib inline

import os
import datacube
import numpy as np
import pandas as pd
import xarray as xr
import rioxarray  # activate the rio accessor

import datetime as dt
import matplotlib.pyplot as plt

import sys
sys.path.insert(1, '../Tools/')
from dea_tools.temporal import xr_phenology, temporal_statistics
from dea_tools.datahandling import load_ard
from dea_tools.bandindices import calculate_indices
from dea_tools.plotting import display_map, rgb
from dea_tools.dask import create_local_dask_cluster

import hdstats

import pickle

In [2]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Create local dask cluster to improve data load time
client = create_local_dask_cluster(return_client=True)




0,1
Connection method: Cluster object,Cluster type: distributed.LocalCluster
Dashboard: /proxy/8787/status,

0,1
Dashboard: /proxy/8787/status,Workers: 1
Total threads: 7,Total memory: 32.00 GiB
Status: running,Using processes: True

0,1
Comm: tcp://127.0.0.1:40653,Workers: 1
Dashboard: /proxy/8787/status,Total threads: 7
Started: 4 minutes ago,Total memory: 32.00 GiB

0,1
Comm: tcp://127.0.0.1:35805,Total threads: 7
Dashboard: /proxy/37539/status,Memory: 32.00 GiB
Nanny: tcp://127.0.0.1:35353,
Local directory: /jobfs/127931293.gadi-pbs/dask-scratch-space/worker-k50cs950,Local directory: /jobfs/127931293.gadi-pbs/dask-scratch-space/worker-k50cs950


In [3]:
## Connect to datacube
dc = datacube.Datacube(app='Vegetation_phenology')

In [4]:

#os.chdir('/home/106/jb5097/Projects/PaddockTS')

# set output vars
#stub = 'GRANT'
stub = 'MILG_frac_test'
#out_path = 'Data/'
outdir = "/g/data/xe2/John/Data/PadSeg/" # best if output is stored in gdata



In [5]:
# Define area of interest

# # 186 Milgadara Rd, Barwang NSW: -34.38904277303204, 148.46949938279096
# Yelkin -33.47904684379098, 146.3094839864518
# Boomahnoomoona -36.11965805095775, 146.08472404116773
# Adam O'tool site: -33.5040228817206, 148.6385170105664
# Grant Sims multispecies cover crop experiment sites -36.22746736927963, 144.40088864017818
lat = -34.38904277303204
lon = 148.46949938279096
lon_buffer = 0.02
lat_buffer = 0.02

# Set the range of dates for the analysis
time_range = ('2019-01-01', '2019-04-01') # when is the earliest? 2016?

# Combine central lat,lon with buffer to get area of interest
lat_range = (lat-lat_buffer, lat+lat_buffer)
lon_range = (lon-lon_buffer, lon+lon_buffer)

display_map(x=lon_range, y=lat_range)


In [6]:

# MAKE SAME AS MILGADARA REGION
# TEST
query = {
    'y': lat_range,
    'x': lon_range,
    'time': time_range,
    'resolution': (-20, 20),
    'output_crs': 'epsg:6933',
    'group_by':'solar_day'
}

# Load available data from Sentinel-2
ds = load_ard(
    dc=dc,
    products=['ga_s2am_ard_3', 'ga_s2bm_ard_3'],
    cloud_mask='s2cloudless',
    min_gooddata=0.9,
    **query,
)

ds

Finding datasets
    ga_s2am_ard_3
    ga_s2bm_ard_3
Counting good quality pixels for each time step using s2cloudless


  _reproject(
  _reproject(


Filtering to 21 out of 36 time steps with at least 90.0% good quality pixels
Applying s2cloudless pixel quality/cloud mask
Loading 21 time steps


In [7]:
# Create a reusable query
query = {
    'y': lat_range,
    'x': lon_range,
    'time': time_range,
    #'measurements': ['nbart_red', 'nbart_green', 'nbart_blue', 'nbart_nir_1'],
    #'measurements': ['nbart_band02', 'nbart_band03', 'nbart_band04', 'nbart_band05'],
    #'measurements': ['nbart_band02', 'nbart_band03', 'nbart_band04', 'nbart_band05','nbart_band06', 'nbart_band07', 'nbart_band08', 'nbart_band8a','nbart_band11','nbart_band12'], 
    
    # 'measurements': ['nbart_blue', 'nbart_green', 'nbart_red', 'nbart_red_edge_1','nbart_red_edge_2', 'nbart_red_edge_3', 'nbart_nir_1', 'nbart_nir_2','nbart_swir_2','nbart_swir_3',
    #                 'incident_angle'],
    'resolution': (-20, 20),
    'output_crs': 'epsg:6933',
    'group_by':'solar_day'
}

In [8]:

# Load available data from Sentinel-2
ds = load_ard(
    dc=dc,
    products=['ga_s2am_ard_3', 'ga_s2bm_ard_3'],
    cloud_mask='s2cloudless',
    min_gooddata=0.9,
    **query,
)

# Shut down Dask client now that we have loaded the data we need
client.close()

# Preview data
ds

# guide to Sentinel2 bands:
# https://sentinels.copernicus.eu/web/sentinel/user-guides/sentinel-2-msi/resolutions/spatial

Finding datasets
    ga_s2am_ard_3
    ga_s2bm_ard_3
Counting good quality pixels for each time step using s2cloudless
Filtering to 21 out of 36 time steps with at least 90.0% good quality pixels
Applying s2cloudless pixel quality/cloud mask
Loading 21 time steps


In [9]:
# # check out different measurement names:
# dc.list_products().to_csv("dc.list_products.tsv", sep = "\t")
# dc.list_measurements().to_csv("dc.list_measurements.tsv", sep = "\t")

In [None]:
### TO DO

''' 
Modify the function provided by Scarth et al to estimate veg fractional cover from Sentinel data.
Insert it in here and save 3 new bands to the output ds.

Also can potentially remove some of the index bands
'''


### Write a function based on Scarth et al code to estimate veg fractional cover. Add this to the xarray. 

In [11]:
from fractionalcover3 import unmix_fractional_cover # this works
from fractionalcover3 import data # this now works


ModuleNotFoundError: No module named 'fractionalcover3'

In [None]:
# Test the function for estimating fractions from example reflectance data.
# which 7 bands do these represent>???
inref = np.array([562, 825, 1088, 2056, 2951, 2187]) * 0.0001
inref.shape = (6, 1, 1)

for i in [1, 2, 3, 4]:
    fractions = unmix_fractional_cover(inref,
    fc_model=data.get_model(n = i))
    print('n = ', i)
    print(fractions)
    print('--------')


### Plot to check it out:

### Create band indices and save the xarray object

In [8]:
# Also add: dNDVI, NDYI, CAI, faPAR (if possible)

def NDYI(green,blue):
    '''Normalized Difference Yellowness Index (NDYI) formula'
    Para E. S. https://custom-scripts.sentinel-hub.com/sentinel-2/ndyi/
    See link for idea on how to mask pixels with high NDYI'''
    
    return((green-blue)/(green+blue))

# The only FAPAR equation I've found for Sentinel 2 data was developed as a neural network model (https://step.esa.int/docs/extra/ATBD_S2ToolBox_L2B_V1.1.pdf)
# This function will not be easy to implement with our data as currently formatted (https://custom-scripts.sentinel-hub.com/custom-scripts/sentinel-2/fapar/)


In [9]:
ds_ind = calculate_indices(ds, 
                           index=['NDVI', # Normalised Difference Vegation Index, Rouse 1973
                                  'kNDVI', # Non-linear Normalised Difference Vegation Index,
                                  'EVI', # Enhanced Vegetation Index, Huete 2002
                                  'LAI', # Leaf Area Index, Boegh 2002
                                  'SAVI', # Soil Adjusted Vegetation Index, Huete 1988
                                  'MSAVI', # Mod. Soil Adjusted Vegetation Index, Qi et al. 1994
                                  'NDMI', # Normalised Difference Moisture Index, Gao 1996
                                  'NDWI', # Normalised Difference Water Index, McFeeters 1996
                                  'MNDWI', # Modified Normalised Difference Water Index, Xu 2006 (USES SWIR1)
                                  'NBR', # Normalised Burn Ratio, Lopez Garcia 1991
                                  'NDCI', # Norm dif chlorophyll index
                                  'NDTI', # Normalised Difference Tillage Index,
                                  'BSI' # Bare soil
                                  ], 
                           collection='ga_s2_3')  # https://knowledge.dea.ga.gov.au/notebooks/How_to_guides/Calculating_band_indices/

ds_ind['NDYI'] = NDYI(ds_ind['nbart_green'], ds_ind['nbart_blue'])

ds_ind

In [10]:
## Get the info about how each index calculated... 
?calculate_indices()

Object `calculate_indices()` not found.


In [None]:

# pickle dump (2 refers to this being the 'full' dataset, rather than just the bands used for vis)

with open(outdir+stub+'_ds2.pickle', 'wb') as handle:
    pickle.dump(ds_ind, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
## Test openning pickle.. 
with open(outdir+stub+'_ds2.pickle', 'rb') as handle:
    ds_ind = pickle.load(handle)


In [None]:
#### TO DO
''' get code blocks to save/open using netcdf, transfer this to .py codes for 01_preseg and 02_SAMGeo and all subsequent...
Then make sure it's repeatable and replace those older runs with new ones..'''



In [None]:
ds_ind
ds = ds_ind
ds

### Trying again to emulate past method using hdstats


In [None]:
# Get a RGBN as numpy array
keep_vars = ['nbart_red','nbart_green','nbart_blue','nbart_nir_1']

# interpolated version. select vars, convert zeros to nans, interpolate nans, make np array, then transpose
#ds = ds[keep_vars]

In [None]:
data = ds[keep_vars].to_array().transpose('y', 'x','variable', 'time').values.astype(np.float32)
data[data == 0] = np.nan
data /= 10000.

dayofyear = np.array(ds.time.dt.dayofyear)
ts = dayofyear/365.

print(data.shape)

In [None]:
ndwi_obs = (data[:,:,1,:]-data[:,:,3,:])/(data[:,:,1,:]+data[:,:,3,:]) # w = water. (g-nir)/(g+nir)
print('number of missing pixels to be dealt with:', np.count_nonzero(np.isnan(ndwi_obs)))


In [None]:
ndwi = hdstats.completion(ndwi_obs)

In [None]:
yloc, xloc = data.shape[0]//4, data.shape[1]//4

# specify a time vector
ts = dayofyear/365. # ts x-axis as day of year
# OR
ts = np.array(range(1,data.shape[3]+1)) # ts x-axis as sequential obs days.  

plt.plot(ts, ndwi[yloc,xloc,:], 'ro', alpha=0.5, label='Completed')
plt.plot(ts, ndwi_obs[yloc,xloc,:], 'ko', label='Observations')
plt.xlabel('Fraction of year')
plt.ylabel('NDWI')
plt.legend();

In [None]:
def imshow(data, fs=6, pc=2):
    img = data.copy()
    plt.figure(figsize=(fs,fs*(img.shape[0]/img.shape[1])))

    if len(img.shape) == 3:
        for i in range(min(3, img.shape[2])):
            dd=img[:,:,i]
            pl, pu = np.percentile(dd[np.isfinite(dd)], (pc,100-pc))
            dd[dd<pl], dd[dd>pu] = pl, pu
            dd -= pl
            dd /= (pu - pl)
        plt.imshow(img, aspect='auto')
    else:
        pl, pu = np.percentile(img[np.isfinite(img)], (pc,100-pc))
        plt.imshow(img, aspect='auto', vmin=pl, vmax=pu)
    plt.tick_params(axis='both', which='both', bottom=False, top=False, labelbottom=False, right=False, left=False, labelleft=False)
    plt.tight_layout()

In [None]:
f2 = hdstats.fourier_mean(ndwi)
imshow(f2)
print(f2.shape)

# this is the one

In [None]:
# rescale each band of a numpy array so that it's between 0 and 255

def rescale(im):
    
    '''rescale raster (im) to between 0 and 255.
    Attempts to rescale each band separately, then join them back together to achieve exact same shape as input.
    Note. Assumes multiple bands, otherwise breaks'''
    
    n_bands = im.shape[2]
    #print(n_bands)

    #print(im.shape)

    _im = np.empty(im.shape)

    #print(_im)
    for n in range(0,n_bands):

        #print(n)

        matrix = im[:,:,n]
        scaled_matrix = (255*(matrix - np.min(matrix))/np.ptp(matrix)).astype(int)

        #_im = np.append(_im, scaled_matrix, axis = 0)
        _im[:,:,n] = scaled_matrix

    print('output shape equals input:', im.shape == im.shape)

    imshow(_im)
    
    return(_im)

#t = rescale(drimg[:,:,:3])

In [None]:
def export_for_segmentation(ds, inp, out_stub):
    
    '''prepares a 3-band image for SAMgeo. 
    First rescale bands in the image. Then convert to xarray with original geo info. Then save geotif'''
    
    if inp.shape[2] == 3:
    
        image = rescale(inp) # 3d array 
        lat = list(ds.y.values) # latitude is the same size as the first axis
        lon = list(ds.x.values) # longitude is the same size as second axis
        bands = list(range(1,image.shape[2]+1)) # band is the 3rd axis
        crs = ds.rio.crs
                
        # create xarray object
        data_xr = xr.DataArray(image, 
                       coords={'y': lat,'x': lon,'band': bands}, 
                       dims=["y", "x", "band"])

        data_xr.rio.write_crs(crs, inplace=True)
        
        # save as geotif:
        data_xr.transpose('band', 'y', 'x').rio.to_raster(out_stub + '.tif')

    else:
        print("Input image is wrong shape! No action taken")
        
    #return(image, data_xr)

In [None]:
### Set the stub name and export the ds for later analysis as well as an RGB for segmentation with SAMGeo
export_for_segmentation(ds, f2, outdir+stub)

In [15]:
### Save the input params for record keeping and possible future use

# Saving the objects:
with open(outdir+stub+'_ds2_query.pkl', 'wb') as f:
    pickle.dump(query, f)

# Getting back the objects:
with open(outdir+stub+'_ds2_query.pkl', 'rb') as f:
    query = pickle.load(f)

In [16]:
query


{'y': (-34.40904277303204, -34.369042773032035),
 'x': (148.44949938279095, 148.48949938279097),
 'time': ('2019-01-01', '2022-01-01'),
 'resolution': (-10, 10),
 'output_crs': 'epsg:6933',
 'group_by': 'solar_day'}

### Next step. (AFTER RUNNING SAMGEO) Time series summary and extraction at paddock level
1. Load polygons generated by segment anything 
2. extract spectral time series summaries for each polyon
3. Save crucial files for next  script



In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np

out_path

In [None]:
#data_dir = 'Data/Paddocks/'

stub = 'MILG_small'
stub = 'fm_ndwi_4'
stub = 'ADAMO'
stub = 'GRANT'

pol = gpd.read_file(out_path+stub+'_filt.gpkg')

pol['paddock'] = range(1,len(pol)+1)
pol['paddock'] = pol.paddock.astype('category')

pol.plot(column = 'paddock')

In [None]:
gdf = pol
gdf.explore(column='paddock') # note, I think column must be categorical for this... 

In [None]:
# add some indices to the data
ds_indi = calculate_indices(ds, index=['NDVI','NDWI'], collection='ga_s2_3') # calculate indices using the DEA tool. # calculate indices using the DEA tool.

ds_indi


In [None]:
# Loop through every polygon and extract the smoothed time series for some number of bands. 

ts = []

for datarow in pol.itertuples(index=True):
    
    #print(datarow)
    
    ds_clipped = ds_indi.rio.clip([datarow.geometry])
    
    pol_ts = ds_clipped.where(ds_clipped > 0).median(dim = ['x','y'])
    
    array = pol_ts.to_array().transpose('variable', 'time').values.astype(np.float32)
    
    ts.append(array[None,:] )
        
pvt = np.vstack(ts)

In [None]:
# the result is an array of paddocks by variable (e.g. band) by time
# use ds.time and ds.geometry to bring in the time axis in proper format and location for each paddock
print(pvt.shape)
print(len(pol.geometry), len(ds.time))

np.save(out_path+stub+'_pvt', pvt, allow_pickle=True, fix_imports=True)


In [None]:
import seaborn as sns

pvt_ndvi = pvt[:,4,:]

sns.heatmap(pvt_ndvi)

In [None]:
ds.assign_coords

In [None]:

# # NO LONGER NEEDED?


# # save the time axis of the ds array as a csv so it can be opened as a pd.series

# pd.Series(ds.time.values).to_csv('Data/'+stub+'_ds-time.csv',
#                                 header = False,
#                                 mode = 'w',
#                                 index_label = 'False')


In [None]:
# make a copy of the ds next time!!
ds_ = ds

ds_.time.attrs.pop('units', None)
ds_.to_netcdf(out_path+stub+'_ds.nc')



In [None]:
ds_

In [None]:
ds