In [2]:
import fnmatch
import logging
import os
import tempfile
import time
from getpass import getpass

import numpy as np

In [None]:
# NASA login passed to download_hdfs_to_bucket further down. The password is
# prompted for interactively so that it is never stored in the notebook.
username='harrygibson'
password=getpass()

In [17]:
import os

# Point the Google Cloud client libraries at a service-account key file.
# setdefault() keeps a value that is already exported in the environment, so
# the machine-specific path below is only a fallback and no longer overrides
# credentials configured outside the notebook.
os.environ.setdefault(
    'GOOGLE_APPLICATION_CREDENTIALS',
    r"C:\Users\zool1301\Documents\GitHub\modis_acquisition_pipeline\dev\map-oxford-hsg-bf2341e4c187.json",
)

In [37]:
# Year range handed to generate_selected_dates (both ends inclusive there).
YEAR_FROM = 2000
YEAR_TO = 2020
# Day-of-year window; -1 as the end makes generate_selected_dates run through
# the last day of each year.
DOY_START = 1
DOY_END = -1
# '*' is the value get_download_urls_for_date treats as "download all tiles".
TILE = '*'
# The three pieces below are joined into the product listing URL.
BASE_URL = "http://e4ftl01.cr.usgs.gov"
platform = "MOLT"
product = "MOD11A2.006"

In [40]:
# Top-level listing URL for the chosen product: <base>/<platform>/<product>
product_url = "/".join((BASE_URL, platform, product))
product_url

'http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006'

In [49]:
def generate_selected_dates(year_from=2000, year_to=2020, doy_start=1, doy_end=-1):
    """Build "YYYY.MM.DD" strings for a day-of-year window in each year.

    Parameters
    ----------
    year_from, year_to: int
        First and last year to cover (both inclusive).
    doy_start: int
        First day-of-year to include.
    doy_end: int
        Day-of-year at which to stop; this day itself is NOT included.
        The default of -1 means "run through the last day of the year"
        (day 366 in leap years, day 365 otherwise).

    Returns
    -------
    A list of date strings, ordered by year and then by day-of-year.
    """
    import calendar, time

    def _as_date_string(doy, year):
        # Parse "<doy>/<year>" and re-emit it as a dotted calendar date.
        parsed = time.strptime("%d/%d" % (doy, year), "%j/%Y")
        return time.strftime("%Y.%m.%d", parsed)

    selected = []
    for year in range(year_from, year_to + 1):
        if doy_end != -1:
            stop_before = doy_end
        else:
            # range() excludes its stop value, hence last day + 1.
            stop_before = 367 if calendar.isleap(year) else 366
        selected += [_as_date_string(doy, year)
                     for doy in range(doy_start, stop_before)]
    return selected

In [73]:
def get_existing_files(out_dir):
    """Return the names of the entries found directly inside ``out_dir``.

    Kept as its own function in case listing files on a bucket needs to be
    done differently later.
    """
    existing_names = os.listdir(out_dir)
    return existing_names

def load_page_text(url, timeout=60):
    """Fetch ``url`` with an HTTP GET and return the response body as text.

    Parameters
    ----------
    url: str
        The page to request.
    timeout: float
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` can block indefinitely on an unresponsive server.

    Returns
    -------
    The decoded body of the response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status, so an error page is
        never handed to the HTML parsing code as if it were a listing.
    """
    import requests
    # nasa data pools are unavailable for maintenance on wednesday afternoons,
    # so wait until 18:00 (local clock) before sending the request.
    now = time.localtime()
    if now.tm_wday == 2 and 14 <= now.tm_hour <= 17:  # tm_wday 2 == Wednesday
        hours_to_wait = 18 - now.tm_hour
        logging.getLogger(__name__).info("Sleeping for %d hours... Yawn!", hours_to_wait)
        time.sleep(60 * 60 * hours_to_wait)
    resp = requests.get(url, timeout=timeout)
    resp.raise_for_status()
    return resp.text
    
def parse_modis_dates(product_url, requested_dates, product, out_dir, check_existing_dates=False):
    """Intersect the requested dates with the dates listed on the server.

    The product listing page at ``product_url`` is fetched and every line
    that contains both ``href`` and ``[DIR]`` is treated as a dated
    sub-directory. Those directory names are crossed with
    ``requested_dates`` and the intersection is returned. If
    ``check_existing_dates`` is set, dates for which an HDF file is already
    present in ``out_dir`` are skipped. Note that a file whose download
    failed might still be around incomplete, and would still cause its date
    to be skipped.

    Parameters
    ----------
    product_url: str
        A top level product URL such as "http://e4ftl01.cr.usgs.gov/MOTA/MCD45A1.005/"
    requested_dates: list
        The required dates in the format "YYYY.MM.DD"
    product: str
        The product name, e.g. MOD09GA.005. Accepted for interface
        compatibility only: the name used to match existing files is taken
        from the last path component of ``product_url``.
    out_dir: str
        The output dir that is searched for already-downloaded files
    check_existing_dates: bool
        Whether to check for present files

    Returns
    -------
    A (sorted) list with the dates that will be downloaded.
    """
    already_here_dates = set()
    if check_existing_dates:
        product_no_version = product_url.strip('/').split('/')[-1].split(".")[0]
        already_here = fnmatch.filter(get_existing_files(out_dir),
                                      "%s*hdf" % product_no_version)
        for fname in already_here:
            # e.g. MOD11A2.A2000065.h17v03.006.2015057173849.hdf -> "2000065"
            parts = fname.split(".")
            if len(parts) >= 5:  # ignore names that do not follow the pattern
                already_here_dates.add(parts[-5][1:])

    html = load_page_text(product_url)

    available_dates = set()
    # The page arrives as one string, so it has to be split into lines;
    # iterating the string directly would yield single characters.
    for line in html.splitlines():
        if "href" not in line or "[DIR]" not in line:
            continue
        # Points to a directory
        _, found, rest = line.partition('href="')
        if not found:
            continue
        the_date = rest.split('"')[0].strip("/")
        if check_existing_dates:
            try:
                modis_date = time.strftime("%Y%j",
                                           time.strptime(the_date, "%Y.%m.%d"))
            except ValueError:
                # Directory name is not a date
                continue
            if modis_date in already_here_dates:
                continue
        available_dates.add(the_date)

    return sorted(set(requested_dates) & available_dates)

In [50]:
# Every date in the configured window, then only those the server lists.
real_dates = generate_selected_dates(
    year_from=YEAR_FROM, year_to=YEAR_TO, doy_start=DOY_START, doy_end=DOY_END)
dates_to_download = parse_modis_dates(
    product_url=product_url, requested_dates=real_dates, product=product, out_dir="C:\\temp")

### We will make dates_to_download the point at which we split off jobs

In [101]:
def get_download_urls_for_date(product_url, date, tile='*'):
    """List the .hdf download URLs found on the listing page for one date.

    Parameters
    ----------
    product_url: str
        Top level product URL; the date is appended to it as a sub-path.
    date: str
        Name of the date directory below ``product_url``.
    tile: str or iterable of str
        '*' selects every tile, a single string selects lines containing
        that string, and a list (or other iterable) selects lines
        containing any of its items.

    Returns
    -------
    A list of full URLs, in page order, one per link whose ``href`` ends in
    ``.hdf`` on a line that matches the tile selection.
    """
    date_page_url = product_url + "/" + date
    print(date_page_url)
    date_page_html = load_page_text(date_page_url)

    # Normalise the selection up front. The previous combined condition fell
    # through to line.find(<list>) whenever a line matched none of the listed
    # tiles, which raises TypeError.
    if isinstance(tile, str):
        wanted_tiles = None if tile == '*' else [tile]
    else:
        wanted_tiles = list(tile)

    hdf_files = []
    for line in date_page_html.split('\n'):
        if '.hdf"' not in line:  # the closing quote excludes e.g. .hdf.xml
            continue
        if wanted_tiles is not None and not any(t in line for t in wanted_tiles):
            continue
        _, found, rest = line.partition('<a href="')
        if not found:
            continue
        hdf_files.append(date_page_url + '/' + rest.split('">')[0])
    return hdf_files

In [134]:
# Use the third available date as a test case and list its HDF URLs for four tiles.
test_date = dates_to_download[2]
this_job_download_urls = get_download_urls_for_date(
    product_url,
    dates_to_download[2],
    tile=['h17v03', 'h18v03', 'h17v04', 'h18v04'],
)
this_job_download_urls

http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006/2000.03.05


['http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf',
 'http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf',
 'http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf',
 'http://e4ftl01.cr.usgs.gov/MOLT/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v04.006.2015057173924.hdf']

In [128]:
def download_hdfs_to_bucket(hdf_urls, dest_folder, nasa_username, nasa_pw,
                            project="map-oxford-hsg", bucket_name="hsg-dataflow-test",
                            max_attempts=10):
    """Download each HDF file to a temp file and upload it to a GCS bucket.

    Each file is written to the system temp directory, uploaded to
    ``<dest_folder>/<product>/<date>/<filename>`` in the bucket (the last
    three components of the URL), and then removed locally.

    Parameters
    ----------
    hdf_urls: iterable of str
        URLs ending in ``.../<product>/<date>/<filename>``.
    dest_folder: str
        Top-level folder (prefix) inside the bucket.
    nasa_username, nasa_pw: str
        Credentials set as the HTTP auth of the download session.
    project: str
        Project passed to ``storage.Client``.
    bucket_name: str
        Name of the destination bucket.
    max_attempts: int
        How many times to try downloading each file before giving up.

    Raises
    ------
    IOError
        If a file could not be downloaded after ``max_attempts`` tries; the
        last underlying error is attached as the cause. Upload errors are
        not retried and propagate unchanged.
    """
    import requests, tempfile, os
    from google.cloud import storage
    storage_client = storage.Client(project)
    # Create a bucket object for our bucket
    bucket = storage_client.get_bucket(bucket_name)
    with requests.Session() as s:
        s.auth = (nasa_username, nasa_pw)
        for url in hdf_urls:
            # Derived before the retry loop so that fname is always bound
            # when the final error message is built.
            product, datestr, fname = url.split("/")[-3:]
            tempfilename = os.path.join(tempfile.gettempdir(), fname)
            bucketpath = '/'.join([dest_folder, product, datestr, fname])
            last_error = None
            for attempt in range(max_attempts):
                try:
                    # Two-step request kept from the original: the first call
                    # resolves the final URL, the second streams from it.
                    r1 = s.request('get', url)
                    r = s.get(r1.url, stream=True)
                    if not r.ok:
                        raise IOError("Can't start download for {}".format(url))
                    with open(tempfilename, 'wb') as fp:
                        for chunk in r.iter_content(chunk_size=65536):
                            if chunk:
                                fp.write(chunk)
                        fp.flush()
                        os.fsync(fp)
                except IOError as err:
                    # requests' exceptions derive from IOError, so connection
                    # problems end up here and trigger another attempt. The
                    # previous bare "except: raise" aborted on the first error.
                    last_error = err
                    continue
                print(f"Downloaded {fname}")
                break
            else:
                if os.path.exists(tempfilename):
                    os.remove(tempfilename)
                raise IOError("connection error occurred {} times on {}".format(
                    max_attempts, fname)) from last_error

            try:
                # Create a blob object from the filepath
                blob = bucket.blob(bucketpath)
                blob.upload_from_filename(tempfilename)
                print(f"Uploaded {fname} to bucket")
            finally:
                # Remove the temp file even when the upload fails.
                os.remove(tempfilename)

In [129]:
# Download the URLs selected for the test date and upload them under the
# "dev_hdf" folder of the bucket.
download_hdfs_to_bucket(this_job_download_urls, "dev_hdf", username, password)

Downloaded MOD11A2.A2000065.h17v03.006.2015057173849.hdf
Uploaded MOD11A2.A2000065.h17v03.006.2015057173849.hdf to bucket
Downloaded MOD11A2.A2000065.h17v04.006.2015057173846.hdf
Uploaded MOD11A2.A2000065.h17v04.006.2015057173846.hdf to bucket
Downloaded MOD11A2.A2000065.h18v03.006.2015057173926.hdf
Uploaded MOD11A2.A2000065.h18v03.006.2015057173926.hdf to bucket
Downloaded MOD11A2.A2000065.h18v04.006.2015057173924.hdf
Uploaded MOD11A2.A2000065.h18v04.006.2015057173924.hdf to bucket


In [138]:
# `bucket` only exists as a local variable inside download_hdfs_to_bucket, so
# a handle on the same bucket is created here; otherwise this cell raises
# NameError on a fresh kernel.
from google.cloud import storage
bucket = storage.Client("map-oxford-hsg").get_bucket("hsg-dataflow-test")

# List the blobs uploaded for the test date; the prefix match is narrowed to
# names whose third path component is exactly the test date.
this_day_blobs = list(bucket.list_blobs(prefix=f"dev_hdf/{product}/{test_date}"))
[b.name for b in this_day_blobs if b.name.split('/')[2] == test_date]

['dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf',
 'dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf',
 'dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf',
 'dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v04.006.2015057173924.hdf']

In [192]:
# Turn each blob name into a fully qualified gs:// path.
gs_template_lst_file = "gs://hsg-dataflow-test/{}"
gs_paths = list(map(gs_template_lst_file.format,
                    (blob.name for blob in this_day_blobs)))
gs_paths

['gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf',
 'gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf',
 'gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf',
 'gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v04.006.2015057173924.hdf']

In [208]:
# Copy every gs:// object into the system temp directory with gsutil and
# remember the local path of each copy.
tempfolder = tempfile.gettempdir()
tmppaths = []
for f in gs_paths:
    # Local file keeps the object's base name.
    tmppath = os.path.join(tempfolder, os.path.basename(f))
    # NOTE(review): the paths are interpolated into the shell command unquoted;
    # a temp directory containing spaces would presumably break this - confirm.
    !gsutil cp {f} {tmppath}
    tmppaths.append(tmppath)

Copying gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf...
/ [0 files][    0.0 B/  3.1 MiB]                                                
-
- [1 files][  3.1 MiB/  3.1 MiB]                                                

Operation completed over 1 objects/3.1 MiB.                                      
Copying gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf...
/ [0 files][    0.0 B/  3.4 MiB]                                                
-
- [1 files][  3.4 MiB/  3.4 MiB]                                                

Operation completed over 1 objects/3.4 MiB.                                      
Copying gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf...
/ [0 files][    0.0 B/  5.2 MiB]                                                
-
- [0 files][  2.8 MiB/  5.2 MiB]                                                
\
\ [1 fi

In [211]:
# gdal is otherwise only imported in a cell further down the notebook, so it
# is imported here to let this cell run in top-to-bottom order.
from osgeo import gdal

# Mosaic the locally copied HDF files into one VRT in the temp folder.
vrt_path = os.path.join(tempfolder, "test.vrt")
d = gdal.BuildVRT(vrt_path, tmppaths)
if d is None:
    # BuildVRT returns None on failure when GDAL exceptions are not enabled.
    raise RuntimeError(f"gdal.BuildVRT could not create {vrt_path}")
d.FlushCache()
# Dropping the reference closes the dataset.
d=None

In [203]:
# One line per blob, naming the day / night LST layer inside each HDF file.
# Each entry ends in a newline so the lists can be passed to writelines().
gs_template_lst_day_lyr = "gs://hsg-dataflow-test/{}:MODIS_Grid_8Day_1km_LST:LST_Day_1km\n"
gs_template_lst_night_lyr = "gs://hsg-dataflow-test/{}:MODIS_Grid_8Day_1km_LST:LST_Night_1km\n"
# Use the templates defined just above; the names lst_day_template and
# lst_night_template are not defined anywhere and raised NameError.
vrtListDay = [gs_template_lst_day_lyr.format(blob.name) for blob in this_day_blobs]
vrtListNight = [gs_template_lst_night_lyr.format(blob.name) for blob in this_day_blobs]

In [165]:
# Paths in the temp directory for the input file list (.txt) and the VRT
# built from it (.vrt), named after the test date.
vrtListDayFile = os.path.join(tempfile.gettempdir(), "LST_Day_Files_"+test_date+".txt")
vrtDayFile = vrtListDayFile.replace('.txt', '.vrt')

# Same pair of paths for the variant whose names contain "Local".
vrtListDayFileLocal = os.path.join(tempfile.gettempdir(), "LST_Day_Files_Local"+test_date+".txt")
vrtDayFileLocal = vrtListDayFileLocal.replace('.txt', '.vrt')

In [179]:
# Write the list of gs:// day-layer entries; each entry already ends in "\n".
with open(vrtListDayFile, 'w') as txtfile:
    txtfile.writelines(vrtListDay)

# NOTE(review): vrtListDayLocal is not defined in any visible cell, so this
# raises NameError on a fresh kernel. Presumably it should hold the locally
# copied files with the layer suffix appended - confirm and define it.
with open(vrtListDayFileLocal, 'w') as txtfile:
    txtfile.writelines(vrtListDayLocal)

In [187]:
this_day_blobs

[<Blob: hsg-dataflow-test, dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf, 1591365860649300>,
 <Blob: hsg-dataflow-test, dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf, 1591365871783016>,
 <Blob: hsg-dataflow-test, dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf, 1591365880871137>,
 <Blob: hsg-dataflow-test, dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v04.006.2015057173924.hdf, 1591365898026581>]

In [167]:
# gdalbuildvrt command line: reads its inputs from the "Local" file list and
# writes the "Local" VRT, with a fixed target extent (-te) and resolution (-tr).
vrtCommandLocal = f"gdalbuildvrt -input_file_list {vrtListDayFileLocal} {vrtDayFileLocal} -te -20015109.356 -10007554.678 20015109.356 10007554.678 -tr 926.625433138760630 926.625433138788940"
vrtCommandLocal

'gdalbuildvrt -input_file_list C:\\Users\\zool1301\\AppData\\Local\\Temp\\LST_Day_Files_Local2000.03.05.txt C:\\Users\\zool1301\\AppData\\Local\\Temp\\LST_Day_Files_Local2000.03.05.vrt -te -20015109.356 -10007554.678 20015109.356 10007554.678 -tr 926.625433138760630 926.625433138788940'

In [178]:
# Run the command held in vrtCommandLocal in the system shell.
!{vrtCommandLocal}

0...10...20...30...40...50...60...70...80...90...100 - done.


ERROR 6: CPLRSASHA256Sign() not implemented: GDAL must be built against libcrypto++ or libcrypto (openssl)
ERROR 4: `/vsigs_streaming/hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf:MODIS_Grid_8Day_1km_LST:LST_Day_1km' does not exist in the file system, and is not recognized as a supported dataset name.
ERROR 6: CPLRSASHA256Sign() not implemented: GDAL must be built against libcrypto++ or libcrypto (openssl)
ERROR 4: `/vsigs_streaming/hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v04.006.2015057173846.hdf:MODIS_Grid_8Day_1km_LST:LST_Day_1km' does not exist in the file system, and is not recognized as a supported dataset name.
ERROR 6: CPLRSASHA256Sign() not implemented: GDAL must be built against libcrypto++ or libcrypto (openssl)
ERROR 4: `/vsigs_streaming/hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h18v03.006.2015057173926.hdf:MODIS_Grid_8Day_1km_LST:LST_Day_1km' does not exist in the file sys

Now check that VRT creation works on Dataflow, based on how `gs://` paths are presented there.

Assuming it does, then calculate output tiffs in sinusoidal based on how the NDVI sample does it

Then translate with gdalwarp as per my code


In [177]:
from osgeo import gdal

In [None]:
# this does not work for hdf but ok for tif
with beam.io.gcp.gcsio.GcsIO().open('gs://hsg-dataflow-test/dev_hdf/MOD11A2.006/2000.03.05/MOD11A2.A2000065.h17v03.006.2015057173849.hdf', 'rb') as f:
    content = f.read()
    gdal.FileFromMemBuffer('/vsimem/som_memfile', f.read())
    ds = gdal.Open('/vsimem/som_memfile')
    print(ds.GetGeoTransform())
    ds = None
    gdal.Unlink('/vsimem/som_memfile')