In [1]:
# The Cryocloud environemnt may not have the newest version of earthaccess, with newest h5coro features.
#%%capture
# suppress install outputs

!pip uninstall -y earthaccess h5coro
!pip install earthaccess==0.6.1

# h5coro has new features that we need that are not released
!pip install git+https://github.com/ICESat2-SlideRule/h5coro.git@main

# !!! Restart the kernal and clean output after running this cell

Found existing installation: earthaccess 0.6.1
Uninstalling earthaccess-0.6.1:
  Successfully uninstalled earthaccess-0.6.1
Found existing installation: h5coro 0.0.7
Uninstalling h5coro-0.0.7:
  Successfully uninstalled h5coro-0.0.7
Collecting earthaccess==0.6.1
  Using cached earthaccess-0.6.1-py3-none-any.whl.metadata (9.4 kB)
Using cached earthaccess-0.6.1-py3-none-any.whl (54 kB)
Installing collected packages: earthaccess
Successfully installed earthaccess-0.6.1
Collecting git+https://github.com/ICESat2-SlideRule/h5coro.git@main
  Cloning https://github.com/ICESat2-SlideRule/h5coro.git (to revision main) to /tmp/pip-req-build-eckxj1e7
  Running command git clone --filter=blob:none --quiet https://github.com/ICESat2-SlideRule/h5coro.git /tmp/pip-req-build-eckxj1e7
  Resolved https://github.com/ICESat2-SlideRule/h5coro.git to commit dddd6583ecf567a752cd3db9bcc36a616d0c7677
  Preparing metadata (setup.py) ... [?25ldone
Building wheels for collected packages: h5coro
  Building wheel f

In [2]:
import geopandas as gpd
import earthaccess
import pprint as pp
import os
import h5py
import pandas as pd
import numpy as np
import datetime as dt
import matplotlib.pyplot as plt

from h5coro import s3driver
import h5coro
from itertools import product
from pqdm.threads import pqdm

# Authenticate with NASA Earthdata Login and report which earthaccess version is active.
auth = earthaccess.login()
print(f"earthaccess: {earthaccess.__version__}")

EARTHDATA_USERNAME and EARTHDATA_PASSWORD are not set in the current environment, try setting them or use a different strategy (netrc, interactive)
You're now authenticated with NASA Earthdata Login
Using token with expiration date: 09/16/2024
Using .netrc file for EDL
earthaccess: 0.6.1


In [7]:
roi_name = 'YKflats'
roi_path = f'data/{roi_name}_roi_shape.shp'
roi = gpd.read_file(roi_path)

# earthaccess needs a single polygon with fewer than ~500 vertices, so the ROI is
# buffered by 10 km, dissolved into one geometry (multipolygon -> polygon) and
# then simplified to fewer vertices.
ROI_BUFFER_M = 10000          # buffer distance, applied in the projected (UTM, metre) CRS
ROI_SIMPLIFY_TOL_DEG = 0.01   # simplification tolerance, applied in EPSG:4326 (degrees)

roi_est_crs = roi.estimate_utm_crs()
print(roi_est_crs)
roi = roi.to_crs(roi_est_crs)
roi['geometry'] = roi.geometry.buffer(distance = ROI_BUFFER_M)
roi = roi.to_crs('EPSG:4326')
roi_geom = roi.geometry.unary_union
roi_geom = roi_geom.simplify(tolerance=ROI_SIMPLIFY_TOL_DEG, preserve_topology=True)

# The buffer only merges parts that lie close together; if the union is still
# made of several pieces there is no single exterior ring to query with.
if roi_geom.geom_type != 'Polygon':
    raise ValueError(
        f"ROI '{roi_name}' is a {roi_geom.geom_type} after buffering; "
        "a single Polygon is required for the earthaccess polygon search"
    )

# Coordinates must be a list of (lon, lat) tuples (shapely x, y order), AND
# counter-clockwise. Only reverse the ring when it is not already counter-clockwise.
coords_list = list(roi_geom.exterior.coords)
coords_list_counter = coords_list if roi_geom.exterior.is_ccw else coords_list[::-1]
print(len(coords_list_counter))

EPSG:32606
132


In [8]:
# Granule search results, keyed by year
results_dict = {}

# Month-day bounds of the summer observation window, applied to every year.
# Both are zero-padded so the assembled dates are valid ISO 8601 (YYYY-MM-DD).
spring = '05-29'
fall = '10-03'

for year in range(2019, 2024):
    print(f"Querying {year}")
    results = earthaccess.search_data(
        short_name = "ATL06",
        version = '006',
        polygon = coords_list_counter,
        cloud_hosted = True,
        temporal = (f"{year}-{spring}", f"{year}-{fall}")
    )
    results_dict[year] = results

Querying 2019
Granules found: 86
Querying 2020
Granules found: 93
Querying 2021
Granules found: 92
Querying 2022
Granules found: 91
Querying 2023
Granules found: 94


In [9]:
# Fetch S3 credentials for NSIDC through earthaccess and re-key them to the
# names expected by the reading function.
env = "cloud"
aws_credentials = earthaccess.get_s3_credentials("NSIDC")
_cred_key_map = {
    "aws_access_key_id": "accessKeyId",
    "aws_secret_access_key": "secretAccessKey",
    "aws_session_token": "sessionToken",
}
cred = {target: aws_credentials[source] for target, source in _cred_key_map.items()}

In [10]:
# Need to modify this if ever running locally
def get_data_links(granules):
    """Return one direct-access link per granule with the "s3://" prefix removed.

    Parameters
    ----------
    granules : iterable
        Objects exposing ``data_links(access=...)``; only the first link
        returned for each granule is used.

    Returns
    -------
    list of str
        The first direct-access link of every granule, with every
        occurrence of "s3://" stripped out.
    """
    links = []
    for granule in granules:
        first_link = granule.data_links(access="direct")[0]
        links.append(first_link.replace("s3://", ""))
    return links

In [11]:
def filter_criteria(df, max_dem_dif=20, max_slope=0.05):
    """Keep only rows that agree with the reference DEM and have a small fitted slope.

    A ``dem_dif`` column (``dem_h - h_li``) is added to the returned frame.
    The input frame itself is not modified.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``dem_h``, ``h_li`` and ``dh_fit_dx``.
    max_dem_dif : float, default 20
        Rows are kept when ``dem_dif`` lies in ``[-max_dem_dif, max_dem_dif]``.
    max_slope : float, default 0.05
        Rows are kept when ``dh_fit_dx`` lies in ``[-max_slope, max_slope]``.

    Returns
    -------
    pandas.DataFrame
        A new frame with the rows that satisfy both criteria (bounds inclusive).
    """
    # assign() builds a new frame, so the caller's DataFrame does not silently
    # gain a column as a side effect of filtering.
    with_dif = df.assign(dem_dif=df['dem_h'] - df['h_li'])

    # Criteria for filtering
    dem_match = with_dif['dem_dif'].between(-max_dem_dif, max_dem_dif)
    not_slope = with_dif['dh_fit_dx'].between(-max_slope, max_slope)

    # Return an independent copy so later column assignments on the result
    # do not act on a slice of the intermediate frame.
    return with_dif[dem_match & not_slope].copy()

In [12]:
def calendar_from_delta(delta_time_series):
    """Convert seconds elapsed since 2018-01-01 00:00 into calendar dates.

    Parameters
    ----------
    delta_time_series : pandas.Series
        Elapsed time in seconds, counted from the 2018-01-01 epoch.

    Returns
    -------
    pandas.Series
        ``datetime.date`` objects (time of day discarded), one per input value.
    """
    elapsed = pd.to_timedelta(delta_time_series, unit='s')
    # Shift the elapsed time onto the fixed epoch, then keep only the date part.
    return (elapsed + dt.datetime(2018, 1, 1)).dt.date

In [13]:
def read_atl06(files, year=None, executors=4, credentials=None):
    """Read ATL06 land-ice segments from S3 in parallel, filter them and save a CSV.

    For every file the six beam groups are read with h5coro, stacked into one
    frame, and passed through ``filter_criteria``. The per-file frames are then
    combined, ``delta_time`` is replaced by a calendar ``obs_date`` column, and
    the result is written to ``data/IS2raw_{roi_name}_{year}.csv``.

    Parameters
    ----------
    files : iterable of str
        Object paths as produced by ``get_data_links``.
    year : int
        Year label used in the output file name. Required; it has a ``None``
        default only to keep the parameter order unchanged.
    executors : int, default 4
        Number of parallel jobs handed to ``pqdm``.
    credentials : dict, optional
        Credentials passed to ``h5coro.H5Coro``. When omitted, the
        notebook-level ``cred`` is looked up at call time.

    Returns
    -------
    pandas.DataFrame
        The combined, filtered observations. An empty frame is returned (and no
        CSV is written) when no file produced any usable rows.

    Raises
    ------
    ValueError
        If ``year`` is not given.
    """
    if year is None:
        # Previously the default was whatever the global loop variable `year`
        # happened to hold when this cell was run, which silently mislabelled
        # output files (or failed on a fresh kernel).
        raise ValueError("read_atl06 requires an explicit `year`; it names the output CSV")
    if credentials is None:
        # Resolve at call time so re-running the credentials cell takes effect
        # without having to re-define this function.
        credentials = cred

    # The dataset paths are identical for every file, so build them once.
    subgroups = ['gt1l/land_ice_segments/', 'gt1r/land_ice_segments/',
                 'gt2l/land_ice_segments/', 'gt2r/land_ice_segments/',
                 'gt3l/land_ice_segments/', 'gt3r/land_ice_segments/'
    ]
    variables = ['latitude', 'longitude', 'delta_time', 'h_li', 'dem/dem_h', 'fit_statistics/dh_fit_dx']
    # Subgroups already end in '/', so plain concatenation avoids a doubled slash.
    ds_list = [f"{subgroup}{variable}" for subgroup, variable in product(subgroups, variables)]

    def read_h5coro(file):
        """Read and filter one file; return an empty DataFrame if anything fails."""
        try:
            h5 = h5coro.H5Coro(str(file), s3driver.S3Driver, credentials = credentials)

            # block = True means all datasets are read before continuing execution
            f = h5.readDatasets(datasets = ds_list, block = True)

            tracks = []
            for subgroup in subgroups:
                # Column name is the last path component, e.g. 'dem/dem_h' -> 'dem_h'
                ds = {path.split('/')[-1]: f[path][:] for path in ds_list if path.startswith(subgroup)}
                ds['beam'] = subgroup[0:4]
                tracks.append(pd.DataFrame(ds))

            return filter_criteria(pd.concat(tracks))

        # One file from RU2, 2022 threw an error. Added these exception statements to return an empty df
        except AttributeError as e:
            # Handle cases where an expected attribute is missing
            print(f"AttributeError in file {file}: {e}")
            return pd.DataFrame()

        except Exception as e:
            # Handle any other unexpected exceptions
            print(f"Unexpected error in file {file}: {e}")
            return pd.DataFrame()

    # Parallel time
    dfs = pqdm(files, read_h5coro, n_jobs=executors)

    # Keep only real, non-empty frames: failed files come back as empty frames.
    # NOTE(review): pqdm may also return exception objects in the result list
    # depending on its exception_behaviour setting - confirm.
    frames = [d for d in dfs if isinstance(d, pd.DataFrame) and not d.empty]
    if not frames:
        # Without any rows there is no 'delta_time' column to convert.
        print(f"No usable ATL06 data for {year}; no file written")
        return pd.DataFrame()

    combined = pd.concat(frames)

    # Convert to calendar date and drop delta time
    combined['obs_date'] = calendar_from_delta(combined['delta_time'])
    combined = combined.drop('delta_time', axis = 1)

    # Write the file out
    combined.to_csv(f'data/IS2raw_{roi_name}_{year}.csv')

    return combined

  

In [None]:
# Read, filter and save the granules of each queried year in turn.
for year, file_info in results_dict.items():
    # file_info is the list of granules stored for this year
    links = get_data_links(file_info)
    read_atl06(links, year=year, executors=4)

QUEUEING TASKS | :   0%|          | 0/86 [00:00<?, ?it/s]

PROCESSING TASKS | :   0%|          | 0/86 [00:00<?, ?it/s]