In [None]:
import rasterio
import numpy as np
import pandas as pd

# Report which rasterio release this notebook is running against.
print("rasterio: %s" % (rasterio.__version__,))

In [None]:
# Map human-readable Landsat band names to band numbers so that the rest of
# the notebook can refer to bands by name.
band_def5 = {'blue':1,'green':2,'red':3,'nir':4,'swir1':5,'thermal':6,'swir2':7}
# NOTE(review): 'cirus' looks like a misspelling of 'cirrus'; it is left
# unchanged because it is a lookup key.
band_def8 = {'ultra_blue':1,'blue':2,'green':3,'red':4,'nir':5,'swir1':6,'swir2':7,
             'pan':8,'cirus':9,'thermal1':10,'thermal2':11}

# Generate one file name per band.  The lists are indexed elsewhere as
# images[band_number - 1], so build them in ascending band-number order
# explicitly instead of relying on the iteration order of the dictionaries.
images5 = ["LT05_L1TP_042033_19881022_20161001_01_T1_sr_band%d.tif"%b
           for b in sorted(band_def5.values())]
images8 = ["LC08_L1TP_042033_20171022_20171107_01_T1_sr_band%d.tif"%b
           for b in sorted(band_def8.values())]

# Which bands do we care about?
bands = ["red","green","blue","nir"]

# Which derived columns are we going to generate?
calculated = ['ndvi','bn','bnn']

# How shall we subsample the ranked pixel values?  Every `step`-th value
# among the first `_end` is kept, which yields `num_pts` values.
num_pts = 20
step = 5
_end = num_pts * step

In [None]:
# Find some of the brightest NDVI, BN and BNN values in the Landsat-5 scene.
# The selection is rather arbitrary but provides some useful sample data.
with rasterio.open(images5[band_def5["red"]-1]) as red_img:
    red = red_img.read()
with rasterio.open(images5[band_def5["nir"]-1]) as nir_img:
    nir = nir_img.read()
    # Pixel -> map coordinate transform; find_loc reads this module-level name.
    affine = nir_img.affine
# Read the blue band inside a context manager as well, so that its file
# handle is closed as soon as the pixels are in memory.
with rasterio.open(images5[band_def5["blue"]-1]) as blue_img:
    blue = blue_img.read()

# Because we are using Landsat reflectance images, the pixel values are not
# the raw sensor info and the numbers can be outside the theoretical range,
# so mask out the nonsensical values that might cause problems.
bnmask = (nir>0) & (red>0) & (blue>0) & (nir<10000) & (red<10000) & (blue<10000)

ndvi = (nir-red)/(nir+red)
bn = blue/nir
bnn = (nir-blue)/(nir+blue)

# Rank the valid pixels from brightest to darkest and keep every `step`-th
# value among the first `_end`.  np.sort works on the masked array directly,
# which avoids running Python's sorted() over every valid pixel of the scene.
sndvi = np.sort(ndvi[bnmask])[::-1][0:_end:step]
sbn = np.sort(bn[bnmask])[::-1][0:_end:step]
sbnn = np.sort(bnn[bnmask])[::-1][0:_end:step]
    
def find_loc(band,values,img):
    """Return the map coordinates of the first pixel matching each value.

    Parameters
    ----------
    band : numpy.ndarray
        Pixel array to search.  Its last two axes are taken as (row, col),
        so both a 2-D array and a 3-D array with a leading band axis work.
    values : iterable
        Pixel values to look up.  Each one must occur in ``band``.
    img
        Not used.  Kept so that existing calls keep working; the
        pixel -> map transform is read from the module-level ``affine``.

    Returns
    -------
    list of tuple
        Unique transformed coordinates, in the order in which their values
        were first encountered in ``values``.

    Raises
    ------
    IndexError
        If a value does not occur anywhere in ``band``.
    """
    seen = set()
    pts = []
    for v in values:
        # First matching pixel only; [-2:] selects the row and column index
        # arrays regardless of whether a leading band axis is present.
        row,col = [idx[0] for idx in np.where(band==v)[-2:]]
        # The transform takes (col, row).  The quarter-pixel offset is
        # presumably there to place the point inside the pixel rather than
        # on its corner.
        pt = tuple(affine*(col+0.25,row+0.25))
        # De-duplicate while preserving the ranked order of `values`
        # (a plain set() would return the points in arbitrary hash order).
        if pt not in seen:
            seen.add(pt)
            pts.append(pt)

    return pts

# Map coordinates of the ranked NDVI / BN / BNN pixels.
ndvi_pts = find_loc(band=ndvi, values=sndvi, img=nir_img)
bn_pts = find_loc(band=bn, values=sbn, img=nir_img)
bnn_pts = find_loc(band=bnn, values=sbnn, img=nir_img)

# Hand-coded water locations, given as (east, north) pairs.
water_pts = [
    (348586, 4286269),
    (338690, 4323890),
]

# The full-scene arrays are no longer needed, so release them.
del red, nir, blue, ndvi, bnn, bn, blue_img

In [None]:
# Report how many sample locations each selection produced.
for caption, points in (
    ("num NDVI points =", ndvi_pts),
    ("num BN points =", bn_pts),
    ("num BNN points =", bnn_pts),
    ("num Water points =", water_pts),
):
    print(caption, len(points))

In [None]:
# NOTE(review): incomplete draft of fill_pt.  It only builds the one-row
# frame and implicitly returns None; the complete fill_pt defined in the
# following cell replaces it when the notebook is run from top to bottom.
def fill_pt(images,band_def,bands,name,
            north=None,east=None,row=None,col=None,
            ptype=None):
    ndf = pd.DataFrame(data=[name],columns=["image"])

In [None]:
# Extract the basic band values for one location.
def fill_pt(images,band_def,bands,name,
            north=None,east=None,row=None,col=None,
            ptype=None):
    """Sample every requested band at one location.

    The location is given either as map coordinates (``north`` + ``east``)
    or as pixel indices (``row`` + ``col``).  When both pairs are complete
    the map coordinates take precedence.

    Parameters
    ----------
    images : sequence of str
        Image paths, indexed as ``images[band_def[band] - 1]``.
    band_def : dict
        Maps a band name to its 1-based band number.
    bands : sequence of str
        Names of the bands to sample.
    name
        Value stored in the 'image' column.
    north, east : number, optional
        Map coordinates of the location.
    row, col : int, optional
        Pixel indices of the location.
    ptype
        Value stored in the 'type' column.

    Returns
    -------
    pandas.DataFrame or dict
        A one-row frame with the columns 'image', 'type', 'east', 'north',
        'row', 'col' and one column per band.  If any sampled value is
        negative or >= 10000, 'row', 'col' and every band column are set to
        NaN so that the location can be filtered out later.  An empty dict
        is returned (after printing an error) when neither coordinate pair
        is complete.
    """
    # Validate the arguments up front, before any file is opened.
    have_map = north is not None and east is not None
    have_pixel = row is not None and col is not None
    if not (have_map or have_pixel):
        print("Error: need either north+east or row+col")
        return {}

    ndf = pd.DataFrame(data=[name],columns=["image"])
    ndf['type'] = ptype

    for band in bands:
        # The context manager closes the dataset on every exit path.
        with rasterio.open(images[band_def[band]-1]) as img:
            if have_map:
                row,col = img.index(y=north,x=east)
            else:
                # Keep the caller's row/col for every band instead of
                # re-deriving them from the pixel-corner coordinates.
                # NOTE(review): newer rasterio releases expose this
                # transform as `.transform` rather than `.affine` -
                # confirm against the installed version.
                east,north = img.affine * (col,row)
            val = img.read(window=((row,row+1),(col,col+1)))[0][0][0]

        ndf['east'],ndf['north'] = east,north
        ndf['row'],ndf['col'] = row,col
        ndf[band] = val

        if (val<0) or (val>=10000):
            # Out-of-range sample: scrub the pixel indices and all band
            # values, then stop reading further bands.
            ndf['row'],ndf['col'] = np.nan,np.nan
            for bad_band in bands:
                ndf[bad_band] = np.nan
            return ndf

    return ndf

# Apply user-supplied calculations to the frame of sampled points.
def fill_computed(df,**kwargs):
    """Call every keyword-supplied callable with ``df`` as its only argument.

    The keyword names themselves are not used; the callables are invoked in
    the iteration order of ``kwargs`` and their return values are discarded.
    """
    for compute in kwargs.values():
        compute(df)

In [None]:
# Process the Landsat-5 images: sample every band at every selected location.
scene5 = images5[0].split("_sr_band")[0]
point_groups5 = [
    (water_pts, 'water'),
    (ndvi_pts, 'veg'),
    (bn_pts, 'back'),
    (bnn_pts, 'norm_back'),
]
# Collect the one-row frames first and concatenate once; growing the frame
# with DataFrame.append inside the loops copies it on every iteration, and
# that method is no longer available in recent pandas releases.
frames5 = [
    fill_pt(images5, band_def5, bands, scene5,
            north=north, east=east, ptype=ptype)
    for pts, ptype in point_groups5
    for east, north in pts
]
df = pd.concat(frames5, ignore_index=True) if frames5 else pd.DataFrame()

In [None]:
# Process the Landsat-8 images at the same locations and add the rows to df.
# Note: running this cell again adds the Landsat-8 rows to df a second time.
scene8 = images8[0].split("_sr_band")[0]
point_groups8 = [
    (water_pts, 'water'),
    (ndvi_pts, 'veg'),
    (bn_pts, 'back'),
    (bnn_pts, 'norm_back'),
]
# Collect the one-row frames first and concatenate once; growing the frame
# with DataFrame.append inside the loops copies it on every iteration, and
# that method is no longer available in recent pandas releases.
frames8 = [
    fill_pt(images8, band_def8, bands, scene8,
            north=north, east=east, ptype=ptype)
    for pts, ptype in point_groups8
    for east, north in pts
]
df = pd.concat([df] + frames8, ignore_index=True)

In [None]:
# Define the user-defined calculations.
def ndvi(df):
    """Add an 'ndvi' column to ``df`` in place: (nir - red) / (nir + red)."""
    nir, red = df['nir'], df['red']
    df['ndvi'] = (nir - red) / (nir + red)
def bn(df):
    """Add a 'bn' column to ``df`` in place: blue / nir."""
    blue, nir = df['blue'], df['nir']
    df['bn'] = blue / nir
def bnn(df):
    """Add a 'bnn' column to ``df`` in place: (nir - blue) / (nir + blue)."""
    nir, blue = df['nir'], df['blue']
    df['bnn'] = (nir - blue) / (nir + blue)

# Add the derived index columns to df in place.
derived_columns = {'ndvi': ndvi, 'bn': bn, 'bnn': bnn}
fill_computed(df, **derived_columns)

In [None]:
# Find the locations with NaNs.  fill_pt scrubbed the row/col values of
# every invalid sample, which makes them easy to find.
rf = df[df['row'].isna()]

# Drop those locations together with their counterparts in the other images.
# Match on the (east, north) pair as a whole: testing each coordinate
# independently would also drop a valid location that merely shares its
# easting with one bad point and its northing with another.
bad_locations = set(zip(rf['east'], rf['north']))
at_bad_location = np.array(
    [loc in bad_locations for loc in zip(df['east'], df['north'])],
    dtype=bool)
df = df[~at_bad_location]

In [None]:
# Show the first few rows for inspection.
df.head(n=5)

In [None]:
# Save the training samples as CSV, without the index column.
output_csv = "landsat_training.csv"
df.to_csv(output_csv, index=False)