In [1]:
import pandas as pd
import numpy as np
import geopandas
import xarray as xr 
import netCDF4 as nc

from matplotlib import pyplot as plt 
from matplotlib.colors import LinearSegmentedColormap 
np.warnings.filterwarnings('ignore')

import urllib.request 
from urllib.error import HTTPError

In [2]:
noaa = pd.read_csv("../data/Data Summary.xlsx - NOAA CoastWatch API.csv")

# griddap is the main url link we'll use to contruct API request
# Title and Summary have descriptions on data
# we don't care for NA rows
noaa = noaa[['griddap', 'Title', 'Summary', "Info", "Dataset ID"]].dropna()

# upper case column names
noaa.columns = [c.upper() for c in noaa.columns]

# lower case all string values in title and summary
noaa['TITLE'] = noaa.TITLE.str.lower()
noaa['SUMMARY'] = noaa.SUMMARY.str.lower()

noaa.head()

Unnamed: 0,GRIDDAP,TITLE,SUMMARY,INFO,DATASET ID
1,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,"abi g17-star-l3c-v2.71 (abi l3c sst), 0.02\u00...",abi l3c sst. sea surface temperature retrieval...,https://coastwatch.pfeg.noaa.gov/erddap/info/j...,jplABI_G17-STAR-L3C-v2_71
2,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,"amsre model output, obs4mips nasa-jpl, global,...",this data set contains sea surface temperature...,https://coastwatch.pfeg.noaa.gov/erddap/info/j...,jplAmsreSstMon
3,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,"amsre model output, obs4mips nasa-jpl, global,...",this data set contains sea surface temperature...,https://coastwatch.pfeg.noaa.gov/erddap/info/j...,jplAmsreSstMon_LonPM180
5,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,"aquarius sea surface salinity, l3 smi, version...",this dataset has 3-month composites of aquariu...,https://coastwatch.pfeg.noaa.gov/erddap/info/j...,jplAquariusSSS3MonthV5
6,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,"aquarius sea surface salinity, l3 smi, version...",this dataset has 3-month composites of aquariu...,https://coastwatch.pfeg.noaa.gov/erddap/info/j...,jplAquariusSSS3MonthV5_Lon0360


In [3]:
# need the following parameters
# Sea Temperature           | weekly
# Degree Heating Weeks      | weekly
# Chlorophyll               | monthly
# PAR                       | monthly
# Bathymetry                

In [4]:
# check for parameter in summary description
param = 'photosynthetically available radiation'
param_cond = noaa.SUMMARY.str.contains(param)

# date range - usually in form of YYYY-YYYY or YYYY-present
date_range = 'present'
date_cond = noaa.TITLE.str.contains(date_range)

# location 
loc = 'global'
loc_cond = noaa.TITLE.str.contains('global')

# temporal frequency
freq = 'weekly|daily'
freq_cond = ~noaa.TITLE.str.contains(freq)

# exclude from title...
other = 'lon0360|experimental'
other_cond = ~noaa.TITLE.str.contains(other)

# noaa[param_cond & date_cond & loc_cond & freq_cond & other_cond].to_csv('par.csv')
noaa[param_cond & date_cond & loc_cond & freq_cond & other_cond].info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1107 to 1845
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   GRIDDAP     5 non-null      object
 1   TITLE       5 non-null      object
 2   SUMMARY     5 non-null      object
 3   INFO        5 non-null      object
 4   DATASET ID  5 non-null      object
dtypes: object(5)
memory usage: 240.0+ bytes


In [5]:
noaa[noaa['DATASET ID'] == 'nceiPH53sstd1day']

Unnamed: 0,GRIDDAP,TITLE,SUMMARY,INFO,DATASET ID
15,https://coastwatch.pfeg.noaa.gov/erddap/gridda...,avhrr pathfinder version 5.3 l3-collated (l3c)...,advanced very high resolution radiometer (avhr...,https://coastwatch.pfeg.noaa.gov/erddap/info/n...,nceiPH53sstd1day


In [6]:
# avhrr pathfinder version 5.3 l3-collated (l3c) sst, global, 0.0417\u00b0, 1981-present, daytime (1 day composite)
# datasaet ID nceiPH53sstd1day
dataset_id = 'nceiPH53sstd1day'

# retrieve data set info to get the variables in data set
url_info = noaa[noaa['DATASET ID'] == dataset_id].INFO.values[0]
urllib.request.urlretrieve(url_info, "temp_info.csv")

# read in temp file
info = pd.read_csv("temp_info.csv")
display(info[info['Row Type'] == 'variable'])

var = info[info['Row Type'] == 'variable']['Variable Name'].values
print(var)

Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value
106,variable,sea_surface_temperature,,double,"time, latitude, longitude"
122,variable,dt_analysis,,double,"time, latitude, longitude"
137,variable,wind_speed,,byte,"time, latitude, longitude"
152,variable,sea_ice_fraction,,double,"time, latitude, longitude"
168,variable,quality_level,,byte,"time, latitude, longitude"
187,variable,pathfinder_quality_level,,byte,"time, latitude, longitude"
205,variable,l2p_flags,,short,"time, latitude, longitude"


['sea_surface_temperature' 'dt_analysis' 'wind_speed' 'sea_ice_fraction'
 'quality_level' 'pathfinder_quality_level' 'l2p_flags']


In [7]:
# input parameters to retrieve data
lat_start = '27.2'
lat_stop = '24'
lon_start = '-83.2'
lon_stop = '-80'

date_start = '2018-06-01'
date_stop = '2018-10-30'

file_type = '.csvp'

# lets take a look at the url...
"https://coastwatch.pfeg.noaa.gov/erddap/griddap/" # base url
"nceiPH53sstd1day" #dataset_id
".csvp" #file_type

# for each variable name
# variable_name
# %5B(start_time):1:(end_time)%5D
# %5B(lat_start):1:(lat_end)%5D
# %5B(lon_start):1:(lon_end)%5D

"sea_surface_temperature%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",dt_analysis%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",wind_speed%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",sea_ice_fraction%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",quality_level%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",pathfinder_quality_level%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"
",l2p_flags%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D"

',l2p_flags%5B(2021-06-30T12:00:00Z):1:(2021-06-30T12:00:00Z)%5D%5B(25.6):1:(24)%5D%5B(-83.2):1:(-80)%5D'

In [8]:
# construct url
base_url = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/" + dataset_id + file_type + '?'
filter_url = ''

for v in var:
    filter_url += v
    
    for val in [(date_start, date_stop), (lat_stop, lat_start), (lon_start, lon_stop)]:
        
        filter_url += '%5B' + val[0] + ':1:' + val[1] + '%5D'
    filter_url += ','
    
filter_url = filter_url[:-1]

url = base_url + filter_url

print(url)

https://coastwatch.pfeg.noaa.gov/erddap/griddap/nceiPH53sstd1day.csvp?sea_surface_temperature%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,dt_analysis%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,wind_speed%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,sea_ice_fraction%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,quality_level%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,pathfinder_quality_level%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D,l2p_flags%5B2018-06-01:1:2018-10-30%5D%5B24:1:27.2%5D%5B-83.2:1:-80%5D


In [9]:
def construct_url(dataset_id, file_type, lat_start, lat_stop, lon_start, lon_stop, 
                  date_start, date_stop, altitude, rev_lat):
    
    # retrieve data set info to get the variables in data set
    url_info = noaa[noaa['DATASET ID'] == dataset_id].INFO.values[0]
    urllib.request.urlretrieve(url_info, "temp_info.csv")

    # read in temp file
    info = pd.read_csv("temp_info.csv")
    
    var = info[info['Row Type'] == 'variable']['Variable Name'].values
    print('variables in dataset...')
    print(var)
    
    # add parenthesis around inputs to construct url easier
    lat_start = '(' + lat_start + ')'
    lat_stop = '(' + lat_stop + ')'
    lon_start = '(' + lon_start + ')'
    lon_stop = '(' + lon_stop + ')'
    
    # reformate date to datetime
    date_start = '(' + date_start + 'T12:00:00Z)'
    date_stop = '(' + date_stop + 'T12:00:00Z)'
    
    # some data retrievals require lat_stop and lat_start order to be switched... 
    # this will be handled via http exception

    filters = [(date_start, date_stop)]
    
    if altitude:
        filters.append(('(0)', '(0)'))

    if rev_lat: # used for htpp excpetion
        filters.append((lat_stop, lat_start))
    else: # should work with most data sets
        filters.append((lat_start, lat_stop))

    filters.append((lon_start, lon_stop))

    # base url 
    base_url = "https://coastwatch.pfeg.noaa.gov/erddap/griddap/" + dataset_id + file_type + '?'
    
    # subset data based on above inputs
    filter_url = ''
    
    # construct filter_url 
    for v in var:
        
        filter_url += v
    
        for val in filters:
            
            filter_url += '%5B' + val[0] + ':1:' + val[1] + '%5D'
            
        filter_url += ','

    # remove last comma
    filter_url = filter_url[:-1]

    url = base_url + filter_url
    
    return url

In [10]:
def noaa_data(dataset_id, file_output, file_type, lat_start, lat_stop, 
              lon_start, lon_stop, date_start, date_stop, altitude):
    
    print("starting data extraction...")
    try:

        url = construct_url(dataset_id, file_type, lat_start, lat_stop, lon_start, lon_stop, 
                      date_start, date_stop, altitude=altitude, rev_lat = False)

        urllib.request.urlretrieve(url, file_output)
        print("completed data extraction...", file_output)

    except HTTPError as err:
    
        if err.code == 400:
            print("switching order of lat input...")

            url = construct_url(dataset_id, file_type, lat_start, lat_stop, lon_start, lon_stop, 
                      date_start, date_stop, altitude=altitude, rev_lat = True)

            urllib.request.urlretrieve(url, file_output)
            print("completed data extraction...", file_output)

        else:
            print("unable to extract data", file_output)
            
    

In [25]:
# avhrr pathfinder version 5.3 l3-collated (l3c) sst, global, 0.0417\u00b0, 1981-present, daytime (1 day composite)
# datasaet ID nceiPH53sstd1day

########################
###     INPUTS       ###
########################
# dataset_id = 'nceiPH53sstd1day'
dataset_id = 'nesdisVHNSQchlaMonthly'

file_output = 'florida_chl_2019.csv'

file_type = '.csvp'

lat_start = '27.2'
lat_stop = '24'
lon_start = '-83.2'
lon_stop = '-80'

# 2018
# date_start = '2018-06-01'
# date_stop = '2018-10-30'

# 2019
# date_start = '2019-03-30'
# date_stop = '2019-10-15'

# 2020
date_start = '2020-06-01'
date_stop = '2020-10-16'


# noaa_data(dataset_id, file_output, file_type, lat_start, lat_stop, 
#               lon_start, lon_stop, date_start, date_stop, rev_lat = False)

array(['chlor_a'], dtype=object)

In [7]:
# florida dataset
dataset = ['nceiPH53sstd1day', 'nesdisVHNSQchlaMonthly', 'erdMH1par0mday']
params = ['florida_sst_', 'florida_chl_', 'florida_par_']
file_output = []

dates = [('2018-06-01', '2018-10-30'), 
         ('2019-03-30', '2019-10-15'), 
         ('2020-06-01', '2020-10-16')]
        
file_type = '.csvp'

lat_start = '27.2'
lat_stop = '24'
lon_start = '-83.2'
lon_stop = '-80'

for data, param in zip(dataset, params):
    
    dataset_id = data

    for date in dates:
        
        date_start = date[0]
        date_stop = date[1]
        file_output = param + date[0][:4] + '.csv'
        
        if param == 'florida_chl_':
            altitude = True
        else:
            altitude = False

        noaa_data(dataset_id, file_output, file_type, lat_start, lat_stop, 
              lon_start, lon_stop, date_start, date_stop, altitude)

starting data extraction...
variables in dataset...
['sea_surface_temperature' 'dt_analysis' 'wind_speed' 'sea_ice_fraction'
 'quality_level' 'pathfinder_quality_level' 'l2p_flags']
completed data extraction... florida_sst_2018.csv
starting data extraction...
variables in dataset...
['sea_surface_temperature' 'dt_analysis' 'wind_speed' 'sea_ice_fraction'
 'quality_level' 'pathfinder_quality_level' 'l2p_flags']
completed data extraction... florida_sst_2019.csv
starting data extraction...
variables in dataset...
['sea_surface_temperature' 'dt_analysis' 'wind_speed' 'sea_ice_fraction'
 'quality_level' 'pathfinder_quality_level' 'l2p_flags']
completed data extraction... florida_sst_2020.csv
starting data extraction...
variables in dataset...
['chlor_a']
completed data extraction... florida_chl_2018.csv
starting data extraction...
variables in dataset...
['chlor_a']
completed data extraction... florida_chl_2019.csv
starting data extraction...
variables in dataset...
['chlor_a']
completed da

In [100]:
# bahamas dataset
dataset = ['nceiPH53sstd1day', 'nesdisVHNSQchlaMonthly', 'erdMH1par0mday']
params = ['bahamas_sst_', 'bahamas_chl_', 'bahamas_par_']
file_output = []

dates = [('2021-03-01', '2021-06-30')]
        
file_type = '.csvp'

lat_start = '27'
lat_stop = '25.2'
lon_start = '-79.2'
lon_stop = '-78'

for data, param in zip(dataset, params):
    
    dataset_id = data

    for date in dates:
        
        date_start = date[0]
        date_stop = date[1]
        file_output = param + date[0][:4] + '.csv'
        
        if param == 'bahamas_chl_':
            altitude = True
        else:
            altitude = False

        noaa_data(dataset_id, file_output, file_type, lat_start, lat_stop, 
              lon_start, lon_stop, date_start, date_stop, altitude)

starting data extraction...
variables in dataset...
['sea_surface_temperature' 'dt_analysis' 'wind_speed' 'sea_ice_fraction'
 'quality_level' 'pathfinder_quality_level' 'l2p_flags']
completed data extraction... bahamas_sst_2021.csv
starting data extraction...
variables in dataset...
['chlor_a']
completed data extraction... bahamas_chl_2021.csv
starting data extraction...
variables in dataset...
['par']
completed data extraction... bahamas_par_2021.csv
