This script will access the FRF TDS, list instrument types available within user specified data range, download netcdf files for the selected instrument in the data range, collate the nc files and clip them to the dates desired, and save the file to disk. 

# imports

In [1]:
import os
from datetime import datetime, timedelta
from dateutil.relativedelta import relativedelta

import pandas as pd
import xarray as xr
from siphon.catalog import TDSCatalog
import requests

**Notes on the imports:**

`os` module alows for operating system dependent functionality. Let's the script interact with underlying OS, like for handling file paths, creating/copying/moving directories, checking if a file eixts... 

`datetime` & `timedelta` from `datetime` module & `relativedelta` from `dateutil` module: needed these to parse dates from strings and extract dates from filenames. And determine relative time & time differences/comparisons. Needed to add `relativedelta` to get the end date/time of the query (`dataset_end`) while accounting for diff days per calendar month.  

# define inputs and prep 

**input date range:**

In [2]:
startDate = pd.to_datetime('2019-08-15')
endDate   = pd.to_datetime('2019-09-30')

**access TDS:**

In [3]:
catalog_url = 'https://chldata.erdc.dren.mil/thredds/catalog/frf/oceanography/waves/catalog.xml'
catalog = TDSCatalog(catalog_url) # make TDSCatalog object 
print(catalog.catalog_refs)
# instrumentsAll = list(catalog.catalog_refs.keys())
# print(instrumentsAll)
# print(catalog.catalog_refs[0].follow().catalog_refs.keys()) # for 8m-array
# print(catalog.catalog_refs[1].follow().catalog_refs.keys()) # for CS01...
# print(catalog.catalog_refs[10].follow().catalog_refs.keys()) # for aquadop 2m 

['8m-array', 'CS01-SBE26', 'CS02-SBE26', 'CS03-SBE26', 'CS04-SBE26', 'CS05-SBE26', 'DWG-BB02', 'DWG-BB03', 'DWG-OC02', 'DWG-OC03', 'adop-2m', 'adop-3.5m', 'awac-11m', 'awac-4.5m', 'awac-5m', 'awac-6m', 'awac-8m', 'awac-jpier-11m', 'lidarHydrodynamics', 'lidarWaveGauge080', 'lidarWaveGauge090', 'lidarWaveGauge100', 'lidarWaveGauge110', 'lidarWaveGauge140', 'lidarWaveRunup', 'paros-340x-940y-top', 'paros940-200', 'paros940-250', 'sig769-300', 'sig940-300', 'sig940-400', 'sig940-600', 'waverider-17m', 'waverider-17m-1D', 'waverider-20m-1d', 'waverider-26m', 'waverider-guam', 'waverider-nags-head-nc', 'waverider-oregon-inlet-nc', 'waverider-oregoninlet-17m', 'waverider-wallops', 'waverider-wilmington-nc', 'xp100m', 'xp125m', 'xp150m', 'xp200m', 'xp340m']


# **TEST**
comment everything here to run the final version 

In [None]:
# test for 8-m array: 
i=0
inst_catalog = catalog.catalog_refs[i].follow()
# get the available years in integers 
years = [int(year) for year in inst_catalog.catalog_refs.keys() if year.isdigit()]
print(years) 
# is it in the year range I want? 
yearsInRange = [str(year) for year in years if startDate.year <= year <= endDate.year]
print(yearsInRange) 

#presets: 
has_data=False # preset as False 
instrumentsInRange = [] #pre-allocate the final list array 

# ADD IF: if there is yearsInRange: 
for year in yearsInRange:
    # which data nc files are avaialbe for that instrument and year?
    dataInYear = list(inst_catalog.catalog_refs[year].follow().datasets.keys())
    print(dataInYear)

    for ncfile in dataInYear:  
    # get the month from the file name, from YYYYMM format before `.nc`
        date_part = ncfile.split('_')[-1].split('.')[0]  
        dataset_date = datetime.strptime(date_part, '%Y%m') # form the full datetime for that month 
    # 
    #   ADD TRY/EXCEPT STATEMENT HERE: error if the file doesn't match the pattern 
    # 
    # for each datast_date, define the end date depending on # of days in that month 
        dataset_start = dataset_date
        dataset_end = dataset_start + relativedelta(months=1) - timedelta(seconds=1)
        print(dataset_start, dataset_end)
    # is there overlap 
        if (dataset_start <= endDate) and (dataset_end >= startDate):
            has_data=True
            break 
        # end of the ncfile for-loop 
    if has_data: 
        break # if has_data=T end the year in yearsInRange loop, and append to final inst list 


[1990, 1991, 1992, 1993, 1994, 1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010, 2011, 2012, 2015, 2016, 2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024]
['2019']
['FRF-ocean_waves_8m-array_201901.nc', 'FRF-ocean_waves_8m-array_201902.nc', 'FRF-ocean_waves_8m-array_201903.nc', 'FRF-ocean_waves_8m-array_201904.nc', 'FRF-ocean_waves_8m-array_201905.nc', 'FRF-ocean_waves_8m-array_201906.nc', 'FRF-ocean_waves_8m-array_201907.nc', 'FRF-ocean_waves_8m-array_201908.nc', 'FRF-ocean_waves_8m-array_201909.nc', 'FRF-ocean_waves_8m-array_201910.nc', 'FRF-ocean_waves_8m-array_201911.nc', 'FRF-ocean_waves_8m-array_201912.nc']
2019-01-01 00:00:00 2019-01-31 23:59:59
2019-02-01 00:00:00 2019-02-28 23:59:59
2019-03-01 00:00:00 2019-03-31 23:59:59
2019-04-01 00:00:00 2019-04-30 23:59:59
2019-05-01 00:00:00 2019-05-31 23:59:59
2019-06-01 00:00:00 2019-06-30 23:59:59
2019-07-01 00:00:00 2019-07-31 23:59:59
2019-08-01 00:00:00 2019-08-31 23:59:59
2019-09-01 00:0

**Notes on `.follow`:**
This is related to using the Siphon library for working with TDS Catalogs. TDS Catalogs are hiercharical, with nested datasets. First I'm accessing the main catalog using `TDSCatalog`, then I'm getting the "CatalogRef" class object from the main catalog (this is instruments list). The `follow()` method gets the catalog object nested inside each instrument. 

In [None]:
# instrumentsInRange = [] 
# for i in catalog.catalog_refs:
#     inst_catalog_ref = catalog.catalog_refs[i]
#     inst_catalog = inst_catalog_ref() -- doen'st work 

#     # all avail years for instrument i: 
#     years = [int(year) for year in inst_catalog.catalog_refs.keys() if year.isdigit()]


In [13]:
i=2
inst_catalog = catalog.catalog_refs[i]
print(inst_catalog)

CS02-SBE26


In [14]:
print(instrumentsInRange)

[8m-array, 8m-array]


In [15]:
instrumentsInRange.append(inst_catalog)
print(instrumentsInRange)

[8m-array, 8m-array, CS02-SBE26]


In [None]:
# selected_indices = input("\nEnter numbers of instrument types eg, 1,3,5: ")
# print(selected_indices) 

print("Instruments with data in the date range:")
for idx, inst in enumerate(instrumentsInRange):
    print(f"{idx+1}. {inst}")  # 1. 8m-array 

Instruments with data in the date range:
1. 8m-array
2. 8m-array
3. CS02-SBE26


... have list of instruments with data in the date range, now want to choose the one(s) I want and to downalod them to netcdf ...


for now just repopulate the yearsInRange etc. Can probably save those variables from the above steps

In [29]:
# choose: 
selected_instruments = ['8m-array']

# set up for download 
downloadedFiles = [] 

for inst in selected_instruments:
    print(f"\nGetting instrument {inst}...")
    inst_catalog = catalog.catalog_refs[inst].follow()
    years = [int(year) for year in inst_catalog.catalog_refs.keys() if year.isdigit()]
    yearsInRange = [str(year) for year in years if startDate.year <= year <= endDate.year]
    
    if not yearsInRange:
        print(f"There is no {inst} data available for this date range.")
        continue
    for year in yearsInRange:
        year_catalog_ref = inst_catalog.catalog_refs[year]
        year_catalog = year_catalog_ref.follow()
        dataInYear = list(year_catalog.datasets.keys())

        print(dataInYear)
        
        for dataset_name in dataInYear: 
            date_part = ncfile.split('_')[-1].split('.')[0]  
            dataset_date = datetime.strptime(date_part, '%Y%m') # form the full datetime for that month 
        # 
    #   ADD TRY/EXCEPT STATEMENT HERE: error if the file doesn't match the pattern 
    # 
    # for each datast_date, define the end date depending on # of days in that month 
            dataset_start = dataset_date
            dataset_end = dataset_start + relativedelta(months=1) - timedelta(seconds=1)
            print(dataset_start, dataset_end)
            dataset = year_catalog.datasets[dataset_name]
            url = dataset.access.urls['HTTPServer']
            local_filename = f"{inst}_{dataset_name}"
            print(local_filename)
    # is there overlap [start_date > data > end_date]
            # if (dataset_start <= endDate) and (dataset_end >= startDate):
            #     # yes, then get using requests package 
            #     dataset = year_catalog.datasets[dataset_name]
            #     url = dataset.access.urls['HTTPServer']
            #     local_filename = f"{inst}_{dataset_name}"
            #     print(local_filename)


Getting instrument 8m-array...
['FRF-ocean_waves_8m-array_201901.nc', 'FRF-ocean_waves_8m-array_201902.nc', 'FRF-ocean_waves_8m-array_201903.nc', 'FRF-ocean_waves_8m-array_201904.nc', 'FRF-ocean_waves_8m-array_201905.nc', 'FRF-ocean_waves_8m-array_201906.nc', 'FRF-ocean_waves_8m-array_201907.nc', 'FRF-ocean_waves_8m-array_201908.nc', 'FRF-ocean_waves_8m-array_201909.nc', 'FRF-ocean_waves_8m-array_201910.nc', 'FRF-ocean_waves_8m-array_201911.nc', 'FRF-ocean_waves_8m-array_201912.nc']
2019-12-01 00:00:00 2019-12-31 23:59:59


AttributeError: 'Dataset' object has no attribute 'access'

# List instruments with data in the date range

In [None]:
instrumentsInRrange = []

# iterate over all instruments that exist in the TDS 
for inst in catalog.catalog_refs:
    inst_catalog = catalog.catalog_refs[inst].follow() 
    #  available years for the instrument
    years = [int(year) for year in inst_catalog.catalog_refs.keys() if year.isdigit()]
    yearsInRange = [str(year) for year in years if startDate.year <= year <= endDate.year]
    
    if yearsInRange:
        #are there datasets within the months ?
        has_data = False
        for year in yearsInRange:
            year_catalog_ref = inst_catalog.catalog_refs[year]
            year_catalog = year_catalog_ref.follow()
            dataInYear = list(year_catalog.datasets.keys())
            
            for ncfile in dataInYear:
                try:
                    date_part = ncfile.split('_')[-1].split('.')[0]  # Extract 'YYYYMM'
                    dataset_date = datetime.strptime(date_part, '%Y%m')
                except ValueError:
                    continue  # Skipping files that don't match teh naming pattern
                
                #start and end dates
                dataset_start = dataset_date
                dataset_end = dataset_start + relativedelta(months=1) - timedelta(seconds=1)
                
                # overlaps?
                if (dataset_start <= endDate) and (dataset_end >= startDate):
                    has_data = True
                    break 
            if has_data:
                break  
        if has_data:
            instrumentsInRrange.append(inst)

print("Instruments with data in the date range:")
for idx, inst in enumerate(instrumentsInRrange):
    print(f"{idx+1}. {inst}")

# Choose which to download

In [None]:
selected_instruments = ['lidarWaveGauge090']

# Download .nc files 

In [None]:
downloadedFiles = []

for inst in selected_instruments:
    print(f"\nGetting instrument {inst}...")
    inst_catalog = catalog.catalog_refs[inst].follow()

    years = [int(year) for year in inst_catalog.catalog_refs.keys() if year.isdigit()]
    yearsInRange = [str(year) for year in years if startDate.year <= year <= endDate.year]
    
    if not yearsInRange:
        print(f"No data available for {inst} in the specified date range.")
        continue
    
    for year in yearsInRange:
        year_catalog_ref = inst_catalog.catalog_refs[year]
        year_catalog = year_catalog_ref.follow()
        dataInYear = list(year_catalog.datasets.keys())
        
        for dataset_name in dataInYear:
            # Extract date
            try:
                date_part = dataset_name.split('_')[-1].split('.')[0]  # Extract 'YYYYMM'
                dataset_date = datetime.strptime(date_part, '%Y%m')
            except ValueError:
                continue  
            
            # start and end dates
            dataset_start = dataset_date
            dataset_end = dataset_start + relativedelta(months=1) - timedelta(seconds=1)
            
            # is there overlap [start_date > data > end_date]
            if (dataset_start <= endDate) and (dataset_end >= startDate):
                dataset = year_catalog.datasets[dataset_name]
                url = dataset.access_urls['HTTPServer']
                local_filename = f"{inst}_{dataset_name}"
                
                if not os.path.exists(local_filename):
                    print(f"Downloading {url} to {local_filename}")
                    with requests.get(url, stream=True) as r:
                        r.raise_for_status()
                        with open(local_filename, 'wb') as f:
                            for chunk in r.iter_content(chunk_size=8192):
                                f.write(chunk)
                    downloadedFiles.append(local_filename)
                else:
                    print(f"{local_filename} already exists.")
                    downloadedFiles.append(local_filename)

# Collate the nc files and clip to date range 

In [None]:
ds_list = []
for f in downloadedFiles:
    ds = xr.open_dataset(f)
    ds_list.append(ds)

# concat along 'time' dimension
ds_combined = xr.concat(ds_list, dim='time', data_vars='minimal', coords='minimal', compat='override', join='override')

# Convert 'time' to datetime64 
if ds_combined['time'].dtype != 'datetime64[ns]':
    ds_combined['time'] = pd.to_datetime(ds_combined['time'].values)
    
# clip to range
ds_clipped = ds_combined.sel(time=slice(startDate, endDate))    
    
# save to new netCDF file
output_filename = f"{inst}_collated_clipped_data.nc"
ds_clipped.to_netcdf(output_filename)
print(f"Collated/clipped data saved to {output_filename}")