# Identify Landsat Scenes for Covariate

Using the USGS STAC based sat-api, query Landsat 8 scences for a given aoi (analysis tile), over time window of interest, with additional quality filters. Save the resulting query to a geojson catalog file for use in the next step Greenest Pixel Compositing.

In [1]:
import os
import json
import requests
import datetime

from CovariateUtils import get_index_tile

  shapely_geos_version, geos_capi_version_string


In [2]:
# Search for imagery
# https://github.com/developmentseed/example-jupyter-notebooks/blob/landsat-search/notebooks/Landsat8-Search/L8-USGS-satapi.ipynb


sat_api_url = "https://landsatlook.usgs.gov/sat-api"

def query_satapi(query):
    headers = {
            "Content-Type": "application/json",
            "Accept-Encoding": "gzip",
            "Accept": "application/geo+json",
        }

    url = f"{sat_api_url}/stac/search"
    data = requests.post(url, headers=headers, json=query).json()
    
    return data

def query_year(year, bbox, min_cloud, max_cloud):
    '''Given the year, finds the number of scenes matching the query and returns it.'''
    date_min = '-'.join([str(year), "06-01"])
    date_max = '-'.join([str(year), "09-15"])
    start_date = datetime.datetime.strptime(date_min, "%Y-%m-%d")
    end_date = datetime.datetime.strptime(date_max, "%Y-%m-%d") 
    start = start_date.strftime("%Y-%m-%dT00:00:00Z")
    end = end_date.strftime("%Y-%m-%dT23:59:59Z")
    
    query = {
    "time": f"{start}/{end}",
    "bbox":bbox,
    "query": {
        "collections": ["landsat-c2l2-sr"],
        "platform": {"in": ["LANDSAT_8"]},
        "eo:cloud_cover": {"gte": min_cloud, "lt": max_cloud},
        "landsat:collection_category":{"in": ["T1"]}
    },
    "limit": 20 # We limit to 500 items per Page (requests) to make sure sat-api doesn't fail to return big features collection
    }
    
    data = query_satapi(query)
    
    # you can't trouble shoot if you don't return the actual results
    return data

In [4]:
geojson_path_albers = "/projects/shared-buckets/alexdevseed/boreal_tiles.gpkg"
layer = "boreal_tiles_albers"
tile_n = 30543

tile_id = get_index_tile(geojson_path_albers, tile_n, buffer=0, layer = layer)

In [15]:
# Accessing imagery
# Select an area of interest
#bbox_list = [[-105,45,-100,50], [-101,45,-100,46]] # Not Boreal Enough
bbox_list = [tile_id['bbox_4326']]
min_cloud = 0
max_cloud = 20
years = range(2015,2020 + 1)
for bbox in bbox_list:
    # Geojson of total scenes - Change to list of scenes
    response_by_year = [query_year(year, bbox, min_cloud, max_cloud ) for year in years]
    scene_totals = [each['meta']['found'] for each in response_by_year]
    print(scene_totals)

[4, 7, 12, 7, 4, 6]


In [8]:
# Take the search over several years, write the geojson response for each
## TODO: need unique catalog names that indicate bbox tile, and time range used.
save_path = '/projects/shared-buckets/alexdevseed/landsat8/sample2'
if (not os.path.isdir(save_path)): os.mkdir(save_path)
catalogs = []
for yr in range(0,len(years)):
    catalog = os.path.join(save_path, f'response-{tile_n}-{years[yr]}.json')
    with open(catalog, 'w') as jsonfile:
        json.dump(response_by_year[yr], jsonfile)
        catalogs.append(catalog)

OSError: [Errno 30] Read-only file system: '/projects/shared-buckets/alexdevseed/landsat8/sample2/response-30543-2015.json'

In [16]:
import boto3
def get_json(s3path, output_dir):
    '''
    Download a json from S3 to the output directory
    '''
    aws_session = boto3.session.Session()
    s3 = aws_session.resource('s3')
    output_file = os.path.join(output_dir, os.path.basename(s3path))
    #TODO split the bucket name from the s3 path
    bucket_name = s3path.split("/")[2]
    s3_key = "/".join(samples3.split("/")[3:])
    s3.Bucket(bucket_name).download_file(s3_key, output_file)
    
    with open(output_file) as f:
        catalog = json.load(f) 
    return catalog

samplehttp = "https://maap-ops-dataset.s3.amazonaws.com/maap-users/alexdevseed/landsat8/sample2/locals3-30543-2018.json"
samples3 = '"s3://maap-ops-dataset/maap-users/alexdevseed/landsat8/sample2/locals3-30543-2018.json'
test = get_json(samples3, '/projects/tmp')

ClientError: An error occurred (403) when calling the HeadObject operation: Forbidden

7

### Exploring Results as Spatial layer

The return from the api is a valid geojson per page. You can directly plot this on a map (e.g. folium). Below is an expirement to convert to geopandas, however only the 'properties' key is maintained as an attribute, which isn't going to work since we also need the 'assests'. TODO: Need to flatten the json before loading to geopandas.

In [10]:
## TODO: change to conda install -c conda-forge
%pip install -q folium
%pip install -q geopandas
%pip install -q shapely

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


In [11]:
import geopandas as gpd
import folium
import shapely as shp

In [12]:
# Silly but it's easier to write the json to file and read back in right now to get a valid geopandas frame.
# Dilemna reading geojson with geopandas only pull the "properties" in as attributes
#with open(f'response-{yr}.json', 'w') as jsonfile:
#    json.dump(response, jsonfile)

#scenes_poly1 = gpd.read_file('response.json')
#scenes_poly1.head()

In [13]:
# This works but you have to manually set the crs
yr = 5
scenes_poly = gpd.GeoDataFrame.from_features(response_by_year[yr], crs='epsg:4326')
for col in scenes_poly.columns: print(col)

scenes_poly.head()


geometry
datetime
eo:cloud_cover
view:sun_azimuth
view:sun_elevation
platform
instruments
view:off_nadir
landsat:cloud_cover_land
landsat:wrs_type
landsat:wrs_path
landsat:wrs_row
landsat:scene_id
landsat:collection_category
landsat:collection_number
landsat:correction
proj:epsg
proj:shape
proj:transform


Unnamed: 0,geometry,datetime,eo:cloud_cover,view:sun_azimuth,view:sun_elevation,platform,instruments,view:off_nadir,landsat:cloud_cover_land,landsat:wrs_type,landsat:wrs_path,landsat:wrs_row,landsat:scene_id,landsat:collection_category,landsat:collection_number,landsat:correction,proj:epsg,proj:shape,proj:transform
0,"POLYGON ((-117.90855 52.75991, -118.62326 51.0...",2020-09-10T18:42:19.025128Z,3.36,159.224638,41.108349,LANDSAT_8,"[OLI, TIRS]",0,3.36,2,44,24,LC80440242020254LGN00,T1,2,L2SP,32611,"[8021, 7921]","[30, 0, 384285, 0, -30, 5846715]"
1,"POLYGON ((-118.49411 51.34454, -119.17569 49.6...",2020-08-25T18:42:36.040695Z,12.54,154.152875,47.615272,LANDSAT_8,"[OLI, TIRS]",0,12.54,2,44,25,LC80440252020238LGN00,T1,2,L2SP,32611,"[8021, 7931]","[30, 0, 340785, 0, -30, 5689815]"
2,"POLYGON ((-116.94429 51.34515, -117.62484 49.6...",2020-08-18T18:36:21.560496Z,5.57,152.456053,49.743891,LANDSAT_8,"[OLI, TIRS]",0,5.57,2,43,25,LC80430252020231LGN00,T1,2,L2SP,32611,"[7941, 7831]","[30, 0, 452985, 0, -30, 5688915]"
3,"POLYGON ((-116.34411 52.76023, -117.05765 51.0...",2020-08-18T18:35:57.673692Z,5.78,153.937001,48.631092,LANDSAT_8,"[OLI, TIRS]",0,5.78,2,43,24,LC80430242020231LGN00,T1,2,L2SP,32611,"[7921, 7821]","[30, 0, 493785, 0, -30, 5846415]"
4,"POLYGON ((-116.94300 51.34515, -117.62359 49.6...",2020-08-02T18:36:16.128698Z,6.58,148.999814,54.014274,LANDSAT_8,"[OLI, TIRS]",0,6.58,2,43,25,LC80430252020215LGN00,T1,2,L2SP,32611,"[7941, 7841]","[30, 0, 452685, 0, -30, 5688915]"


In [17]:
#sc_bbox = scenes_poly.total_bounds #scene boundaries is not as useful as the original bbox to query
# TODO: loop over years to check that the bbox is covered, n < 8 in 1 year testing has less than full coverage.
sc_bbox = bbox_list[1]
sc_bbox_polygon = shp.geometry.box(sc_bbox[0], sc_bbox[1], sc_bbox[2], sc_bbox[3])
center = sc_bbox_polygon.centroid

m = folium.Map(
    location=[center.y,center.x],
    tiles="cartodbpositron",
    zoom_start=6,
)

bbox_style = {'fillColor': '#ff0000', 'color': '#ff0000'}

folium.GeoJson(scenes_poly, name="geojson").add_to(m)
folium.GeoJson(sc_bbox_polygon,
               name="bbox",
               style_function=lambda x:bbox_style).add_to(m)

m

IndexError: list index out of range

### Cache the results

> Note: For testing purposes, a local copy of a sample bbox was downloaded. In the real analysis the DPS jobs will read data as needed directly from USGS buckets. This code is here simply to demonstrate how it can be done.

TODO move data reading to next step.

1. Cache the json from the query (save as files)
1. Loop over a few of them, and download the source
1. Write a new copy of the json with internal ADE urls to the same files

In [18]:
import boto3
import botocore

aws_session2 = boto3.session.Session(
    aws_access_key_id=aws_access_key_id,
    aws_secret_access_key=aws_secret_access_key,
)
s3 = aws_session2.resource('s3')

In [7]:
# read the json file catalog
def write_local_data_and_catalog(catalog, bands):
    '''Given path to a response json from a sat-api query, make a copy changing urls to local paths'''
    with open(catalog) as f:
        asset_catalog = json.load(f)
        for feature in asset_catalog['features']:
            new_dir = os.path.join(save_path, feature['id'])
            if (not os.path.isdir(new_dir)): os.mkdir(new_dir)
            #download the assests
            for band in bands:
                try:
                    key = feature['assets'][f'SR_{band}.TIF']['href'].replace('https://landsatlook.usgs.gov/data/', '')
                    output_file = os.path.join(new_dir, os.path.basename(key))
                    #print(key)
                    ## Uncomment next line to actually download the data as a local sample
                    #s3.Bucket('usgs-landsat').download_file(key, output_file, ExtraArgs={'RequestPayer':'requester'})
                    feature['assets'][f'SR_{band}.TIF']['href'] = output_file
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == "404":
                        print("The object does not exist.")
                    else:
                        raise
        # save and updated catalog with local paths
        local_catalog = catalog.replace('response', 'locals3')
        with open(local_catalog,'w') as jsonfile:
            json.dump(asset_catalog, jsonfile)
        
        return local_catalog

In [8]:
# if not catalogs then read file names from save_path response-{yr}.json
bands = [''.join(["B",str(item)])for item in range(2,8,1)]
local_catalogs = [write_local_data_and_catalog(catalog, bands) for catalog in catalogs]

In [33]:
write_local_data_and_catalog(catalogs[0], bands)

'/projects/alexdevseed/landsat8/sample2/local-2015.json'

In [16]:
#write a maap s3 based catalog
def write_local_data_and_catalog_s3(catalog, bands):
    '''Given path to a response json from a sat-api query, make a copy changing urls to local paths'''
    with open(catalog) as f:
        asset_catalog = json.load(f)
        for feature in asset_catalog['features']:
            #new_dir = os.path.join(save_path, feature['id'])
            #if (not os.path.isdir(new_dir)): os.mkdir(new_dir)
            #download the assests
            for band in bands:
                try:
                    key = feature['assets'][f'SR_{band}.TIF']['href'].replace(
                        'https://landsatlook.usgs.gov/data/',
                        '')
                    output_file = os.path.join(
                        f's3://maap-ops-dataset/alexdevseed/landsat8/sample2/{feature["id"]}/'
                        , os.path.basename(key))
                    #print(key)
                    ## Uncomment next line to actually download the data as a local sample
                    #s3.Bucket('usgs-landsat').download_file(key, output_file, ExtraArgs={'RequestPayer':'requester'})
                    feature['assets'][f'SR_{band}.TIF']['href'] = output_file
                except botocore.exceptions.ClientError as e:
                    if e.response['Error']['Code'] == "404":
                        print("The object does not exist.")
                    else:
                        raise
        # save and updated catalog with local paths
        local_catalog = catalog.replace('response', 'locals3')
        with open(local_catalog,'w') as jsonfile:
            json.dump(asset_catalog, jsonfile)
        
        return local_catalog

In [17]:
# if not catalogs then read file names from save_path response-{yr}.json
bands = [''.join(["B",str(item)])for item in range(2,7,1)]
local_catalogs = [write_local_data_and_catalog_s3(catalog, bands) for catalog in catalogs]

In [None]:
## Retrieving Pixels
bands = [2,3,4,5,6]
bbox = bbox_list[1]
# TODO: 
# Loop over each scene
response = response_by_year[1]
# Each season should actually be it's own DPS job
for item in response['features']:
    # for each scene Loop over bands 2,3,4,5,6 (assets)
    for band in bands:
        # For each scene, read subset set by bounding box
        asset = item['assets'][f'SR_B{band}.TIF']['href']
        # Convert to S3 url for use with requester pays
        cog = asset.replace('https://landsatlook.usgs.gov/data/', 's3://usgs-landsat/')
        print(cog) 
        # Since the source files are per band, the 1st band in a given file is default
        # Bound Box reprojected on the fly to the native projection of the asset
        #subset, crs, transform = extract_subset(cog, bbox, 1)

    # stack the bands into the same array with n layers (z direction)?
    # optional: calculate indexes based on the bands and store as additional layers
    # save cog to disk (could be a kea or Zarr(xarray))
# after looping, make a VRT of cogs so it can be treated as a single file ?

# Questions, 
# 1. should the tiling scheme be LonLat 1 degree, the end Equal area projection, or utm zone based? take bbox reproject to LonLat for the query
# 2. do we need the QA band or can we do that separate, the only limitation on cogs is that the storage type needs to be identical in all bands. Kea or Zarr could accomodate, or if the pixels are cloud filtered before saving the stack then a COG is ok.

In [18]:
import boto3
from rasterio.session import AWSSession
import rasterio as rio
aws_session = AWSSession(boto3.Session())
with rio.Env(aws_session):
    with rio.open('s3://maap-ops-dataset/alexdevseed/landsat8/sample2/LC08_L2SP_043024_20160604_20200906_02_T1/LC08_L2SP_043024_20160604_20200906_02_T1_SR_B4.TIF', 'r') as src:
        print(src.profile)

{'driver': 'GTiff', 'dtype': 'uint16', 'nodata': 0.0, 'width': 7831, 'height': 7921, 'count': 1, 'crs': CRS.from_epsg(32611), 'transform': Affine(30.0, 0.0, 492885.0,
       0.0, -30.0, 5846415.0), 'blockxsize': 256, 'blockysize': 256, 'tiled': True, 'compress': 'deflate', 'interleave': 'band'}


In [59]:
test = "LC08_L2SP_044024_20150812_20200909_02_T1_SR"
f'stuff{test[:-3]}'

'stuffLC08_L2SP_044024_20150812_20200909_02_T1'