In [1]:
# from siphon.simplewebservice.ndbc import NDBC
import pandas as pd
from erddapy import ERDDAP
from joblib import Parallel, delayed
import multiprocessing
import xarray as xr

# Notes

Now using ERDDAP directly instead. Need to make sure to get:

Sensors:
* covered by below but need to maybe add:
    * More NDBC
    * More CO-OPS
    * PORTS (not in erddap) (or do separately)
    * Work on ACCESS sensor ingestion first to see how that works.
    * Make sure that stations I am counting are not being counted under other organizations
    * Two directories of NDBC currents data. Still figuring out how to account for them. Some of the data is at GCOOS.

Platforms:
* how to access this data? shipboard ADCPs, etc
* should we use gliders? These are in IOOS erddap apparently.

Other:
* HF Radar DONE
* sea ice concentration, extent (grids) DONE
* Look at data inventory list to make sure all are being captured
* OOI?
* ARGO (https://github.com/euroargodev/argopy) not currently included.


* Clearly show that this can be used to just get station id's or other metadata, also.
* also want to be able to get all station id's available but then return their lon/lat and time range in case want to choose when to compare. Histogram?

How should this package behave?

Wrapper to outside:
* Input kw containing min/max lon/lat and time range.
  * Should be able to choose to return either metadata or the data
    * Ideally would be able to return the lon/lat and time range info with dataset_ids, as well as a type for the dataset which will group it for metric calculation and plotting.
  * If don't input min/max lon/lat or time range, return all dataset_ids for the variables
  * If also input a variable type or types (to override the default list):
    * Return the dataset_ids associated with only those variables
  * Should be able to query by type of data (time series/sensor/station, track/platform/glider, surface/grid, etc), which would be useful for the comparison and plotting step.
  
Internally:
* There will need to be a function for each collection/server of data
  * Each function should have the same inputs and outputs
  * Should accept kw, data type, variable (could do combinations of data type and variable); should have defaults available for data type and variable
  * Should return dictionary with 1 entry for each dataset that contains:
    * organization name, website, and id; axiom id (if relevant, or wherever to find the data if not the first place?); data itself (if requested, as combined dataframe or dataset); data type tag; time range; spatial location or box; depth(s)
    
Another use case might be a surface of x, y values to compare like a surface to a scatter plot.

Two separate tasks:
1. Package to access data through several servers.
    * Need to figure out how to access data for each type, write function, include.
2. Assessment of which data is available (generally) and how much is already on Axiom servers.
    * Need to list all possible data types and locations and see what we have.

In [2]:
# user input

# Gulf of Mexico 
kw = {
    "min_lon": -99.0,
    "max_lon": -88.0,
    "min_lat": 20.0,
    "max_lat": 30.0,
    "min_time": '2015-7-15', # "2016-07-10T00:00:00",#Z",
    "max_time": '2015-8-1', # "2017-02-10T00:00:00",#Z"
}

# # full U.S.
# kw = {
#     "min_lon": -195,# -99.0,
#     "max_lon": -60, #-88.0,
#     "min_lat": 17, #20.0,
#     "max_lat": 80, #30.0,
#     "min_time": '2001-1-1', # "2016-07-10T00:00:00",#Z",
#     "max_time": '2021-4-1', # "2017-02-10T00:00:00",#Z"
# }


# Test setup for managing data access

## Sensors

In [None]:
# check each data source for data in the kw-defined region and time frame
# input kw, return dataframes or datasets in some manner TBD

In [611]:
dataset_ids = sensors(kw, standard_names=['sea_water_practical_salinity'], only_meta=True)
len(dataset_ids)

TypeError: sensors() got an unexpected keyword argument 'only_meta'

In [315]:
# dfs = sensors(kw)
dataset_ids = sensors(kw, only_meta=True)

In [313]:
len(dataset_ids)

8297

## HF Radar

In [321]:
dataset_ids = hfradar(kw, standard_names='surface_eastward_sea_water_velocity', only_ids=True)
len(dataset_ids)
# dss = hfradar(kw)

https://coastwatch.pfeg.noaa.gov/erddap/search/advanced.csv?page=1&itemsPerPage=10000&protocol=griddap&cdm_data_type=(ANY)&institution=(ANY)&ioos_category=(ANY)&keywords=(ANY)&long_name=(ANY)&standard_name=(ANY)&variableName=(ANY)&minLon=-195&maxLon=-60&minLat=17&maxLat=80&minTime=978307200.0&maxTime=1617235200.0&searchFor=Coastal+Observing+Research+and+Development+Center%2C+Scripps+Institution+of+Oceanography


11

## Sea ice

In [479]:
ds = seaice_extent(kw)
ds

    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]


Unnamed: 0,Array,Chunk
Bytes,34.51 GB,71.30 MB
Shape,"(3530, 1174, 2082)","(272, 256, 256)"
Count,19046 Tasks,585 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 34.51 GB 71.30 MB Shape (3530, 1174, 2082) (272, 256, 256) Count 19046 Tasks 585 Chunks Type float32 numpy.ndarray",2082  1174  3530,

Unnamed: 0,Array,Chunk
Bytes,34.51 GB,71.30 MB
Shape,"(3530, 1174, 2082)","(272, 256, 256)"
Count,19046 Tasks,585 Chunks
Type,float32,numpy.ndarray


In [480]:
ds = seaice_con(kw)
ds

  decode_timedelta=decode_timedelta,
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': False}):
    ...     array[indexer]

To avoid creating the large chunks, set the option
    >>> with dask.config.set(**{'array.slicing.split_large_chunks': True}):
    ...     array[indexer]
  value = value[(slice(None),) * axis + (subkey,)]


Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 420.99 kB 420.99 kB Shape (368, 143) (368, 143) Count 5 Tasks 1 Chunks Type float64 numpy.ndarray",143  368,

Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray
"Array Chunk Bytes 420.99 kB 420.99 kB Shape (368, 143) (368, 143) Count 5 Tasks 1 Chunks Type float64 numpy.ndarray",143  368,

Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,5 Tasks,1 Chunks
Type,float64,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,15 Tasks,1 Chunks
Type,object,numpy.ndarray
"Array Chunk Bytes 420.99 kB 420.99 kB Shape (368, 143) (368, 143) Count 15 Tasks 1 Chunks Type object numpy.ndarray",143  368,

Unnamed: 0,Array,Chunk
Bytes,420.99 kB,420.99 kB
Shape,"(368, 143)","(368, 143)"
Count,15 Tasks,1 Chunks
Type,object,numpy.ndarray

Unnamed: 0,Array,Chunk
Bytes,1.55 GB,42.54 MB
Shape,"(7385, 368, 143)","(332, 224, 143)"
Count,287 Tasks,46 Chunks
Type,float32,numpy.ndarray
"Array Chunk Bytes 1.55 GB 42.54 MB Shape (7385, 368, 143) (332, 224, 143) Count 287 Tasks 46 Chunks Type float32 numpy.ndarray",143  368  7385,

Unnamed: 0,Array,Chunk
Bytes,1.55 GB,42.54 MB
Shape,"(7385, 368, 143)","(332, 224, 143)"
Count,287 Tasks,46 Chunks
Type,float32,numpy.ndarray


## Sketch of outside wrapper

In [None]:
# turn into class probably
def Data(kw, only_ids=False, standard_names=None, data_type=None):
    '''Overall wrapper.'''
    
    # can limit which data readers are called based on data_type
    # can also skip some readers based on data e.g. sea ice
    # pass in kw, only_ids, and standard_names

* initialize fetch object with region and station id(s) with desired variables and optional time range, or point to local files
* if input region, first step is to find the metadata
  * check each source for the desired variables
* find data
  * read in each dataset by row in the metadata dataframe
  * if local file, read in with known reader (?)

In [907]:
# data functions by data_type
DATASOURCES_GRID = [hfradar, seaice_extent, seaice_con]
DATASOURCES_SENSOR = [sensors]
DATASOURCES_PLATFORM = [sensors, argo]  # has gliders

# MAYBE SHOULD BE ABLE TO INITIALIZE THE CLASS WITH ONLY METADATA OR DATASET NAMES?
# to skip looking for the datasets

class Data(object):
    
    def __init__(self, kw=None, standard_names=None, data_types=None):
        
        # default kw to all U.S. and most recent 4 weeks
        if kw is None:
            now = pd.Timestamp.now().normalize()

            # full U.S.
            kw = {
                "min_lon": -195,
                "max_lon": -60, 
                "min_lat": 17, 
                "max_lat": 80, 
                "min_time": (now - pd.Timedelta('4W')).strftime('%Y-%m-%d'),
                "max_time": (now).strftime('%Y-%m-%d'),
            }

        self.kw = kw
        
#         self.only_meta = only_meta
        
        # default to all reasonable options
        # Note that `sea_ice_concentration` is not a standard name but 
        # we do want to include it from NSIDC.
        if standard_names is None:
            
            standard_names = ['sea_water_temperature', 
                              'sea_water_practical_salinity', 
                              'sea_water_speed', 
                              'sea_water_velocity_to_direction', 
                              'sea_surface_height', 
                              'sea_surface_height_above_sea_level', 
                              'sea_surface_height_amplitude_due_to_geocentric_ocean_tide',
                              'surface_eastward_sea_water_velocity',  # hfradar
                              'surface_northward_sea_water_velocity',  # hfradar
                              'sea_ice_speed',
                              'direction_of_sea_ice_velocity',
                              'eastward_sea_ice_velocity',
                              'northward_sea_ice_velocity',
                              'sea_ice_extent',
                              'sea_ice_area_fraction'  # multiply by 100 to get sea_ice_concentration which is not a standard name
                              ]
            
        self.standard_names = standard_names

        # default to including all data types
        # These should map to the type of plot that will be possible.
        if data_types is None:

            data_types = ['sensor', 'platform', 'grid']
            
        self.data_types = data_types
        
        
    @property
    def sources(self):
        '''Which data sources to include.
        
        This selection is based on data_types.
        '''
        
        if not hasattr(data, '_sources'):
            
            # Only check DATASOURCE if relevant data_type
            sourcestocheck = []
            if 'sensor' in self.data_types:
                sourcestocheck += DATASOURCES_SENSOR
            if 'platform' in self.data_types:
                sourcestocheck += DATASOURCES_PLATFORM
            if 'grid' in self.data_types:
                sourcestocheck += DATASOURCES_GRID

            # don't need duplicates
            sourcestocheck = list(set(sourcestocheck))
            
            self._sources = sourcestocheck
        
        return self._sources
            
    
    @property
    def meta(self):
        '''Find and return metadata for datasets.
        
        Do this by querying each data source function for metadata
        then use the metadata for quick returns.
        
        This will not rerun if the metadata has already been found.
        
        SEPARATE DATASOURCE FUNCTIONS INTO A PART THAT RETRIEVES THE 
        DATASET_IDS AND METADATA AND A PART THAT READS IN THE DATA.
        
        DIFFERENT SOURCES HAVE DIFFERENT METADATA
        
        '''
        
        if not hasattr(data, '_meta'):

            # loop over data sources to read in metadata
            meta = []
            for source in self.sources:

                meta.append(source(self.kw, standard_names=self.standard_names, 
                                   only_meta=True, data_types=self.data_types))

            self._meta = meta
        
        return self._meta
    
    
    @property
    def data(self):
        '''Return the data, given metadata.'''
        
        if not hasattr(self, '_data'):
            
            # loop over data sources to read in data
            data = []
            for source in self.sources:
                
                # THIS SHOULD USE THE FACT THAT WE ALREADY KNOW THE METADATA
                data.append(source(self.kw, standard_names=self.standard_names, 
                                   only_meta=False, data_types=self.data_types))
                
            self._data = data
                
        return self._data

### Demo use cases

In [908]:
# %%time
data = Data(kw={"min_lon": -99.0,
                "max_lon": -88.0,
                "min_lat": 20.0,
                "max_lat": 30.0,
                "min_time": '2015-7-15', 
                "max_time": '2015-8-1', 
                },
            standard_names=['sea_water_practical_salinity'],
            data_types=['sensor']
            )

# # explore defaults
# print('Default keyword arguments: ', data.kw)
# print('Default variables to search for: ', data.standard_names)
# print('By default, search for only metadata (only_meta): ', data.only_meta)
# print('Default data_types: ', data.data_types)

# Return metadata 
data.meta;

In [909]:
data.data

[{'gov_usgs_waterdata_073802512': None,
  'noaa_nos_co_ops_8771013':                            longitude (degrees_east)  latitude (degrees_north)  \
  time (UTC)                                                                      
  2015-08-01 00:00:00+00:00                  -94.9172                   29.4814   
  2015-07-31 23:54:00+00:00                  -94.9172                   29.4814   
  2015-07-31 23:48:00+00:00                  -94.9172                   29.4814   
  2015-07-31 23:42:00+00:00                  -94.9172                   29.4814   
  2015-07-31 23:36:00+00:00                  -94.9172                   29.4814   
  ...                                             ...                       ...   
  2015-07-15 00:24:00+00:00                  -94.9172                   29.4814   
  2015-07-15 00:18:00+00:00                  -94.9172                   29.4814   
  2015-07-15 00:12:00+00:00                  -94.9172                   29.4814   
  2015-07-15 00:06:0

#### Find data given the metadata

In [893]:
data.meta[1].index[0]

'gov_usgs_waterdata_073814675'

#### Small region, short time

In [773]:
%%time
data = Data(kw={"min_lon": -99.0,
                "max_lon": -88.0,
                "min_lat": 20.0,
                "max_lat": 30.0,
                "min_time": '2015-7-15', 
                "max_time": '2015-8-1', 
                },
            standard_names=['sea_water_practical_salinity']
            )

# # explore defaults
# print('Default keyword arguments: ', data.kw)
# print('Default variables to search for: ', data.standard_names)
# print('By default, search for only metadata (only_meta): ', data.only_meta)
# print('Default data_types: ', data.data_types)

# Return metadata 
meta = data.ChooseSources()
meta

HTTPError: HTTP Error 404: 

#### Return datasets for default

U.S. region, 1 year up to now, all default variables, all data types.

In [764]:
%%time
data = Data()

# explore defaults
print('Default keyword arguments: ', data.kw)
print('Default variables to search for: ', data.standard_names)
print('By default, search for only metadata (only_meta): ', data.only_meta)
print('Default data_types: ', data.data_types)

# Return metadata for the default options
meta = data.ChooseSources()
meta

Default keyword arguments:  {'min_lon': -195, 'max_lon': -60, 'min_lat': 17, 'max_lat': 80, 'min_time': '2020-04-09', 'max_time': '2021-04-08'}
Default variables to search for:  ['sea_water_temperature', 'sea_water_practical_salinity', 'sea_water_speed', 'sea_water_velocity_to_direction', 'sea_surface_height', 'sea_surface_height_above_sea_level', 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide', 'surface_eastward_sea_water_velocity', 'surface_northward_sea_water_velocity', 'sea_ice_speed', 'direction_of_sea_ice_velocity', 'eastward_sea_ice_velocity', 'northward_sea_ice_velocity', 'sea_ice_extent', 'sea_ice_concentration']
By default, search for only metadata (only_meta):  True
Default data_types:  ['sensor', 'platform', 'grid']


  decode_timedelta=decode_timedelta,


HTTPError: HTTP Error 404: 

#### Return all possible datasets

No restrictions on time/space/type/variables beyond the defaults which are U.S. Need to change the time range to get more time than the default (1 year up to now).

In [702]:
%%time
meta = Data(kw=kw, only_meta=True).ChooseSources()


  decode_timedelta=decode_timedelta,


ValueError: 13 columns passed, passed data had 14 columns

In [703]:
meta

Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,keywords,id,infoUrl,institution,source,sourceUrl,data_type
ucsdHfrE2,https://coastwatch.pfeg.noaa.gov/erddap,21.7,46.49442,-97.88385,-57.19249,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"2km, circulation, coast, currents, dilution, E...",202104012100siohfruwlsrtvusegc2km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid
ucsdHfrE6,https://coastwatch.pfeg.noaa.gov/erddap,21.73596,46.49442,-97.88385,-57.23121,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"6km, circulation, coast, currents, dilution, E...",202104012100siohfruwlsrtvusegc6km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid
ucsdHfrE1,https://coastwatch.pfeg.noaa.gov/erddap,21.7,46.49442,-97.88385,-57.19249,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"1km, circulation, coast, currents, dilution, E...",202104012000siohfruwlsrtvusegc1km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid


In [625]:
meta[0]

Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,defaultDataQuery,id,infoUrl,institution
gov_usgs_waterdata_08164370,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_08076700,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_295744093303800,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_08079120,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_08164503,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
...,...,...,...,...,...,...,...,...,...,...,...
gov_noaa_water_csvt2,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_noaa_water_snpt2,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_noaa_water_egil1,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_07385765,http://erddap.sensors.ioos.us/erddap,33.7197,33.7197,-118.2728,-118.2728,2015-05-05T12:42:00Z,2021-04-14T12:23:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",17778,https://sensors.ioos.us/#metadata/17778/station,NOAA Center for Operational Oceanographic Prod...


# ARGO

In [383]:
! conda install -c conda-forge argopy --yes

Collecting package metadata (current_repodata.json): done
Solving environment: done

## Package Plan ##

  environment location: /Users/kthyng/miniconda3/envs/env

  added / updated specs:
    - argopy


The following packages will be downloaded:

    package                    |            build
    ---------------------------|-----------------
    aiohttp-3.7.4              |   py37h271585c_0         607 KB  conda-forge
    argopy-0.1.7               |     pyhd8ed1ab_0          69 KB  conda-forge
    async-timeout-3.0.1        |          py_1000          11 KB  conda-forge
    conda-4.10.0               |   py37hf985489_1         3.0 MB  conda-forge
    fsspec-0.8.3               |             py_0          64 KB  conda-forge
    multidict-5.1.0            |   py37hf967b71_1          60 KB  conda-forge
    scikit-learn-0.24.1        |   py37hbcf18d0_0         6.5 MB  conda-forge
    threadpoolctl-2.1.0        |     pyh5ca1d4c_0          15 KB  conda-forge
    typing-extensions-3.7.4.

In [698]:
def argo(kw, only_ids=False, standard_names=None, data_types=None, only_meta=False):
    
    # metedata loading
    from argopy import IndexFetcher as ArgoIndexFetcher
    index_loader = ArgoIndexFetcher()
    idx = index_loader.region([kw['min_lon'], kw['max_lon'], 
                               kw['min_lat'], kw['max_lat'], 
                               kw['min_time'], kw['max_time']])

    if only_meta:
        return idx.to_dataframe()

    # Data loading
    from argopy import DataFetcher as ArgoDataFetcher
    argo_loader = ArgoDataFetcher(parallel=True)
    pressure = [0, 10]  # db
    # using -180 in place of the larger min_lon bound I had for the US since argopy requires -180,180. 
    # this would need to be updated.
    ds = argo_loader.region([kw['min_lon'], kw['max_lon'], 
                             kw['min_lat'], kw['max_lat'], 
                             pressure[0], pressure[1], 
                             kw['min_time'], kw['max_time']]).to_xarray()

    return ds

In [386]:
ds

# NSIDC

## Extent

In [697]:
def seaice_extent(kw, standard_names=None, only_ids=False, data_types=None, only_meta=False):

    url = 'http://thredds.aoos.org/thredds/dodsC/MASIE.nc'
    ds = xr.open_dataset(url, chunks='auto').sel(time=slice(kw['min_time'],kw['max_time']))

    import pyproj
    proj = pyproj.Proj(ds.polar_stereographic.attrs['spatial_ref'])
    # lon, lat = proj(ds.x, ds.y, inverse=True) 
    x, y = proj([kw['min_lon'], kw['max_lon']], [kw['min_lat'], kw['max_lat']] )
    # # https://github.com/pyproj4/pyproj/issues/152
    # from pyproj import CRS
    # crs = CRS(ds.polar_stereographic.attrs['spatial_ref'])
    # proj = pyproj.Proj(crs)
    # lon, lat = proj(ds.x, ds.y, inverse=True)   

    # this condition defines the region of interest
    box = ((min(x) < ds.x) & (ds.x < max(x)) & (min(y) < ds.y) & (ds.y < max(y)))
    ds = ds.where(box, drop=True)

    if only_meta:
        return ds.attrs
    else:
        return ds


## Concentration

In [696]:
def seaice_con(kw, standard_names=None, only_ids=False, data_types=None, only_meta=False):
    url = 'http://thredds.aoos.org/thredds/dodsC/NSIDC_SEA_ICE_CON.nc#fillmismatch'
    ds = xr.open_dataset(url, chunks='auto').sel(time=slice(kw['min_time'],kw['max_time']))
    # this condition defines the region of interest
    box = ((kw['min_lon'] < ds.longitude) & (ds.longitude < kw['max_lon']) & (kw['min_lat'] < ds.latitude) & (ds.latitude < kw['max_lat']))
    ds = ds.where(box, drop=True)
    if only_meta:
        return ds.attrs
    else:
        return ds


# ERDDAP reader class

IS THIS WHERE A SINGLE DATASET ID READIN SHOULD OCCUR?

In [446]:
class ErddapReader(object):
    

    def __init__(self, known_server='ioos', protocol=None, server=None):
        
#         # run checks for KW 
#         self.kw = kw
        
        # either select a known server or input protocol and server string
        if known_server == 'ioos':
            protocol = 'tabledap'
            server = 'http://erddap.sensors.ioos.us/erddap'
        elif known_server == 'coastwatch':
            protocol = 'griddap'
            server = 'https://coastwatch.pfeg.noaa.gov/erddap/'
        else:
            statement = 'either select a known server or input protocol and server string'
            assert (protocol is not None) & (server is not None), statement
        
        self.e = ERDDAP(server=server)
        self.e.protocol = protocol
                  
        # columns for metadata
        self.columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'keywords',  # for hf radar
               'id', 'infoUrl', 'institution', 'featureType', 'source', 'sourceUrl']
        
# #         self.data_type = data_type
#         self.standard_names = standard_names
#         # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER
    
    
    @property
    def dataset_ids(self):
        '''Find dataset_ids for server.'''
        
        if not hasattr(self, '_dataset_ids'):
            
            # This should be a region search
            if (self.standard_names is not None) and (self._stations is None): 
        
                # find all the dataset ids which we will use to get the data
                # This limits the search to our keyword arguments in kw which should 
                # have min/max lon/lat/time values
                dataset_ids = []
                for standard_name in self.standard_names:

                    # find and save all dataset_ids associated with standard_name
                    # if standard_name is not found, this will return all search 
                    # results. Since we want to know if a standard_name was not 
                    # found, we also do the subsequent search that doesn't include
                    # the standard_name constraint, and compare the length of the 
                    # results. If they are the same length, then we conclude that 
                    # standard_name was not found in the search.
                    search_url = self.e.get_search_url(response="csv", **self.kw, 
                                                       standard_name=standard_name, 
                                                       items_per_page=10000)
                    search_url2 = self.e.get_search_url(response="csv", **self.kw, 
                                                       items_per_page=10000)

                    try:
                        search = pd.read_csv(search_url)
                        search2 = pd.read_csv(search_url2)
                        assert len(search) != len(search2), "standard_name was not found in the search, don't use these dataset_ids"

                        dataset_ids.extend(search["Dataset ID"])
                    except:
                        pass
                        # should go into logger
            #             print('standard_name %s not found' % standard_name)


                # only need a dataset id once since we will check them each for all standard_names
                self._dataset_ids = list(set(dataset_ids))
            
            # This should be a search for the station names
            elif self._stations is not None:
                
                # search by station name for each of stations
                dataset_ids = []
                for station in self._stations:
                    # if station has more than one word, AND will be put between to search for multiple 
                    # terms together
                    url = self.e.get_search_url(response="csv", items_per_page=5, search_for=station)

#                     try:
                    df = pd.read_csv(url)
    
                    # first try for exact station match
                    try:
                        dataset_id = [dataset_id for dataset_id in df['Dataset ID'] if station.lower() in dataset_id.lower().split('_')][0]

                    # if that doesn't work, trying for more general match and just take first returned option
                    except:
                        dataset_id = df.iloc[0]['Dataset ID']
        
#                         if 'tabs' in org_id:  # don't split
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()]
#                         else:
#                             axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0]
                
#                     except:
#                         dataset_id = None
                
                    dataset_ids.append(dataset_id)
                    
                self._dataset_ids = list(set(dataset_ids))
                
            
        return self._dataset_ids
        
    
    def meta_by_dataset(self, dataset_id):

        info_url = self.e.get_info_url(response="csv", dataset_id=dataset_id)
        info = pd.read_csv(info_url)

        items = []

        for col in self.columns:

            try:
                item = info[info['Attribute Name'] == col]['Value'].values[0]
                dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
            except:
                if col == 'featureType':
                    # this column is not present in HF Radar metadata but want it to
                    # map to data_type, so input 'grid' in that case.
                    item = 'grid'
                else:
                    item = 'NA'

            if dtype == 'String':
                pass
            elif dtype == 'double':
                item = float(item)
            elif dtype == 'int':
                item = int(item)
            items.append(item)
            
        if self.standard_names is not None:
            # In case the variable is named differently from the standard names, 
            # we back out the variable names here for each dataset. This also only 
            # returns those names for which there is data in the dataset.
            varnames = self.e.get_var_by_attr(
                dataset_id=dataset_id,
                standard_name=lambda v: v in self.standard_names
            )
        else:
            varnames = None

        ## include download link ##
        self.e.dataset_id = dataset_id
        if self.e.protocol == 'tabledap':
            if self.standard_names is not None:
                self.e.variables = ["time","longitude", "latitude", "station"] + varnames
            # set the same time restraints as before
            self.e.constraints = {'time<=': self.kw['max_time'], 'time>=': self.kw['min_time'],}
            download_url = self.e.get_download_url(response='csvp')

        elif self.e.protocol == 'griddap':
            # the search terms that can be input for tabledap do not work for griddap
            # in erddapy currently. Instead, put together an opendap link and then 
            # narrow the dataset with xarray.
            # get opendap link
            download_url = self.e.get_download_url(response='opendap')
        
        # add erddap server name
        return {dataset_id: [self.e.server, download_url] + items + [varnames]}
    
      
    @property
    def meta(self):
        
        if not hasattr(self, '_meta'):
            
            downloads = []
            for dataset_id in self.dataset_ids:
                downloads.append(self.meta_by_dataset(dataset_id))
            
#             # get metadata for datasets
#             # run in parallel to save time
#             num_cores = multiprocessing.cpu_count()
#             downloads = Parallel(n_jobs=num_cores)(
#                 delayed(self.meta_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
#             )

            # make dict from individual dicts
            from collections import ChainMap
            meta = dict(ChainMap(*downloads)) 

            # Make dataframe of metadata
            # variable names are the column names for the dataframe
            self._meta = pd.DataFrame.from_dict(meta, orient='index', 
                                                columns=['database','download_url'] \
                                                + self.columns + ['variable names'])
           
        return self._meta       
    
    
    def data_by_dataset(self, dataset_id):

        download_url = self.meta.loc[dataset_id, 'download_url']
        # data variables in ds that are not the variables we searched for
        varnames = self.meta.loc[dataset_id, 'variable names']

        if self.e.protocol == 'tabledap':

            try:

                # fetch metadata if not already present
                # found download_url from metadata and use
                dd = pd.read_csv(download_url, index_col=0, parse_dates=True)
                
                # Drop cols and rows that are only NaNs.
                dd = dd.dropna(axis='index', how='all').dropna(axis='columns', how='all')

                if varnames is not None:
                    # check to see if there is any actual data
                    # this is a bit convoluted because the column names are the varnames 
                    # plus units so can't match 1 to 1.
                    datacols = 0  # number of columns that represent data instead of metadata
                    for col in dd.columns:
                        datacols += [varname in col for varname in varnames].count(True)
                    # if no datacols, we can skip this one.
                    if datacols == 0:
                        dd = None
                    
            except Exception as e:
                # LOGGER
        #         print('no data to be read in for %s' % dataset_id)
                print(e)
                dd = None
        
        elif self.e.protocol == 'griddap':

            try:
                dd = xr.open_dataset(download_url, chunks='auto').sel(latitude=slice(self.kw['min_lat'],self.kw['max_lat']), 
                                              longitude=slice(self.kw['min_lon'],self.kw['max_lon']), 
                                              time=slice(self.kw['min_time'],self.kw['max_time']))

                # use variable names to drop other variables (should. Ido this?)
                l = set(dd.data_vars) - set(varnames)
                dd = dd.drop_vars(l)
                
            except Exception as e:
#                 print(e)
                dd = None
                #LOGGER
                
        return (dataset_id, dd)


    @property
    def data(self):
        
        if not hasattr(self, '_data'):
#             downloads = []
#             for dataset_id in self.dataset_ids:
#                 downloads.append(self.data_by_dataset(dataset_id))
            num_cores = multiprocessing.cpu_count()
            downloads = Parallel(n_jobs=num_cores)(
                delayed(self.data_by_dataset)(dataset_id) for dataset_id in self.dataset_ids
            )

#             if downloads is not None:
            dds = {dataset_id: dd for (dataset_id, dd) in downloads}
#             else:
#                 dds = None

            self._data = dds

        return self._data

    
    # Search for stations by region
    def region(self, kw, standard_names):
        
        self._stations = None
        
        # run checks for KW 
        # check for lon/lat values and time
        self.kw = kw
                          
#         self.data_type = data_type
        self.standard_names = standard_names
        # DOESN'T CURRENTLY LIMIT WHICH VARIABLES WILL BE FOUND ON EACH SERVER
        
        return self

    
    def stations(self, dataset_ids=None, stations=None, kw=None):
        ''''''
        
        self.standard_names = None
        
        if dataset_ids is not None:
            if not isinstance(dataset_ids, list):
                dataset_ids = [dataset_ids]
            self._dataset_ids = dataset_ids 
        
        if stations is not None:
            if not isinstance(stations, list):
                stations = [stations]
            self._stations = stations
            self.dataset_ids
            
        
        # CHECK FOR KW VALUES AS TIMES
        if kw is None:
            kw = {'min_time': '1900-01-01', 'max_time': '2100-12-31'}
            
        self.kw = kw
#         print(self.kwself.)
        
            
        return self

For a single station:
* need a special function to map a station id and institution to find it
* get metadata for a particular station (`meta_by_dataset`)
* get data for a particular station (`data by dataset`)
* if searching for a single station, may not want to choose which variables to use and instead use them all. This won't work currently.

Test case:
reader = ErddapReader(kw=kw, standard_names=['sea_water_practical_salinity'], known_server='ioos')
reader.data_by_dataset('gov_usgs_waterdata_073745257')

In [440]:
station = ErddapReader(known_server='ioos').stations(stations=['noaa_nos_co_ops_8771013','8771972','TABS B', 'NDBC 42020'], kw={'min_time': '2019-1-1', 'max_time': '2019-2-1'})
station.dataset_ids

['noaa_nos_co_ops_8771013', 'wmo_42020', 'noaa_nos_co_ops_8771972', 'tabs_b']

In [434]:
station = ErddapReader(known_server='ioos').stations(dataset_ids='noaa_nos_co_ops_8771013', kw={'min_time': '2019-1-1', 'max_time': '2019-2-1'})
station.dataset_ids

['noaa_nos_co_ops_8771013']

In [447]:
reg = ErddapReader(known_server='ioos').region(kw=kw, standard_names=['sea_water_practical_salinity'])
reg.meta

Unnamed: 0,database,download_url,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,defaultDataQuery,subsetVariables,keywords,id,infoUrl,institution,featureType,source,sourceUrl,variable names
noaa_nos_co_ops_8774230,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,28.2275,28.2275,-96.7964,-96.7964,2015-05-05T15:10:00Z,2021-04-16T13:44:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,,57560,https://sensors.ioos.us/#metadata/57560/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_295744093303800,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.962222,29.962222,-93.510556,-93.510556,2015-05-05T13:00:00Z,2021-04-09T17:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,31725,https://sensors.ioos.us/#metadata/31725/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_292859090004000,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.483056,29.483056,-90.011111,-90.011111,2015-05-05T12:00:00Z,2020-04-30T12:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,31970,https://sensors.ioos.us/#metadata/31970/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_07387040,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.713266,29.713266,-91.880398,-91.880398,2015-05-05T11:48:00Z,2021-04-07T16:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,32721,https://sensors.ioos.us/#metadata/32721/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_07387050,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.674444,29.674444,-92.135556,-92.135556,2015-05-05T13:00:00Z,2021-04-09T17:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,29528,https://sensors.ioos.us/#metadata/29528/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_08067252,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.812443,29.812443,-94.731309,-94.731309,2015-05-05T11:45:00Z,2021-04-09T18:15:00Z,"river_discharge,sea_water_electrical_conductiv...",,,32690,https://sensors.ioos.us/#metadata/32690/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_293809092361500,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.635833,29.635833,-92.604167,-92.604167,2015-05-05T12:00:00Z,2021-04-09T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,23981,https://sensors.ioos.us/#metadata/23981/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
noaa_nos_co_ops_8761724,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,29.2633,29.2633,-89.9567,-89.9567,2015-05-05T11:48:00Z,2021-04-16T03:46:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,,45576,https://sensors.ioos.us/#metadata/45576/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_08117300,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,28.898028,28.898028,-95.381608,-95.381608,2015-05-05T11:45:00Z,2021-04-09T18:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,31833,https://sensors.ioos.us/#metadata/31833/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]
gov_usgs_waterdata_0821150305,http://erddap.sensors.ioos.us/erddap,http://erddap.sensors.ioos.us/erddap/tabledap/...,27.89775,27.89775,-97.616167,-97.616167,2015-05-05T11:45:00Z,2021-04-09T17:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,,31319,https://sensors.ioos.us/#metadata/31319/station,USGS National Water Information System (NWIS),TimeSeries,,https://sensors.axds.co/api/,[sea_water_practical_salinity]


## HF Radar

In [881]:


def request_hfradar(e, dataset_id, standard_names):
    
    # In case the variable is named differently from the standard names, 
    # we back out the variable names here for each dataset. This also only 
    # returns those names for which there is data in the dataset.
    varnames = e.get_var_by_attr(
        dataset_id=dataset_id,
        standard_name=lambda v: v in standard_names
    )
    
    # the search terms that can be input for tabledap do not work for griddap
    # in erddapy currently. Instead, put together an opendap link and then 
    # narrow the dataset with xarray.
    e.dataset_id = dataset_id
    # get opendap link
    url = e.get_download_url(response='opendap')
    ds = xr.open_dataset(url).sel(latitude=slice(kw['min_lat'],kw['max_lat']), 
                                  longitude=slice(kw['min_lon'],kw['max_lon']), 
                                  time=slice(kw['min_time'],kw['max_time']))

    return (dataset_id, ds)


def request_metadata(e, dataset_id):
    '''Read select metadata from erddap.
    SHOULD BE ABLE TO COMBINE WITH SENSOR METADATA FUNCTION
    '''
    
    columns = ['geospatial_lat_min', 'geospatial_lat_max', 
           'geospatial_lon_min', 'geospatial_lon_max', 
           'time_coverage_start', 'time_coverage_end',
           'keywords',  
           'id', 'infoUrl', 'institution', 'source', 'sourceUrl']

    info_url = e.get_info_url(response="csv", dataset_id=dataset_id)
    info = pd.read_csv(info_url)
        
    items = []

    for col in columns:

        try:
            item = info[info['Attribute Name'] == col]['Value'].values[0]
            dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
        except:
            item = 'NA'

        if dtype == 'String':
            pass
        elif dtype == 'double':
            item = float(item)
        elif dtype == 'int':
            item = int(item)
        items.append(item)

    # add erddap server name and data_type
    return {dataset_id: [e.server] + items + ['grid']}


def hfradar(kw, standard_names=None, only_ids=False, data_types=None, only_meta=False):
    
    columns = ['geospatial_lat_min', 'geospatial_lat_max', 
           'geospatial_lon_min', 'geospatial_lon_max', 
           'time_coverage_start', 'time_coverage_end',
           'keywords',  
           'id', 'infoUrl', 'institution', 'source', 'sourceUrl']
    
    if standard_names is None:
        
        # use this default list if None input
        standard_names = ['surface_eastward_sea_water_velocity','surface_northward_sea_water_velocity']

    # make into a list if isn't already
    elif isinstance(standard_names, str):
        
        standard_names = list(standard_names)

    # only run this function if user is looking for velocity 
    # look for the only two possible HF Radar variables in the input standard_names
    east = ['surface_eastward_sea_water_velocity' in standard_name for standard_name in standard_names].count(True)
    north = ['surface_northward_sea_water_velocity' in standard_name for standard_name in standard_names].count(True)
    if east+north == 0:
        return None
    
    e = ERDDAP(server='https://coastwatch.pfeg.noaa.gov/erddap/', protocol='griddap')
    
    # search for datasets in our region of interest. We know for HF Radar which instutition we want 
    # to search for so we don't need to specify other terms.
    inst = 'Coastal Observing Research and Development Center, Scripps Institution of Oceanography'
    # just hard wire the velocity standard_name
    url = e.get_search_url(search_for=inst, response="csv", standard_name='surface_eastward_sea_water_velocity', **kw, items_per_page=10000)
    print(url)

    # Read in the search info to find the dataset_ids for datasets we will be reading data from.
    df = pd.read_csv(url)
    dataset_ids = df['Dataset ID']
    
    # only return dataset_ids if requested
    if only_ids:
        return dataset_ids

    
    # get metadata for datasets
    # run in parallel to save time
    num_cores = multiprocessing.cpu_count()
    downloads = Parallel(n_jobs=num_cores)(
        delayed(request_metadata)(e, dataset_id) for dataset_id in dataset_ids
    )

    # make dict from individual dicts
    from collections import ChainMap
    meta = dict(ChainMap(*downloads)) 
#     print(meta)
    # Make dataframe of metadata
    meta = pd.DataFrame.from_dict(meta, orient='index', columns=['database'] + columns + ['data_type'])
    
    if only_meta:
        return meta  
    
    num_cores = multiprocessing.cpu_count()
    downloads = Parallel(n_jobs=num_cores)(
        delayed(request_hfradar)(e, dataset_id, standard_names) for dataset_id in dataset_ids
    )

    dss = {dataset_id: ds for (dataset_id, ds) in downloads}

    return dss

# dss = hfradar(kw)

In [630]:
dataset_ids = hfradar(kw, only_ids=True)
dataset_ids

https://coastwatch.pfeg.noaa.gov/erddap/search/advanced.csv?page=1&itemsPerPage=10000&protocol=griddap&cdm_data_type=(ANY)&institution=(ANY)&ioos_category=(ANY)&keywords=(ANY)&long_name=(ANY)&standard_name=(ANY)&variableName=(ANY)&minLon=-99.0&maxLon=-88.0&minLat=20.0&maxLat=30.0&minTime=1436918400.0&maxTime=1438387200.0&searchFor=Coastal+Observing+Research+and+Development+Center%2C+Scripps+Institution+of+Oceanography


0    ucsdHfrE1
1    ucsdHfrE6
2    ucsdHfrE2
Name: Dataset ID, dtype: object

In [693]:
meta = hfradar(kw, only_meta=True)
meta

Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,keywords,id,infoUrl,institution,source,sourceUrl,data_type
ucsdHfrE2,https://coastwatch.pfeg.noaa.gov/erddap,21.7,46.49442,-97.88385,-57.19249,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"2km, circulation, coast, currents, dilution, E...",202104012100siohfruwlsrtvusegc2km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid
ucsdHfrE6,https://coastwatch.pfeg.noaa.gov/erddap,21.73596,46.49442,-97.88385,-57.23121,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"6km, circulation, coast, currents, dilution, E...",202104012100siohfruwlsrtvusegc6km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid
ucsdHfrE1,https://coastwatch.pfeg.noaa.gov/erddap,21.7,46.49442,-97.88385,-57.19249,2012-01-01T00:00:00Z,2021-04-07T19:00:00Z,"1km, circulation, coast, currents, dilution, E...",202104012000siohfruwlsrtvusegc1km,https://cordc.ucsd.edu/projects/mapping/,Coastal Observing Research and Development Cen...,surface ocean velocity field from hf-radar,https://hfrnet-tds.ucsd.edu/thredds/dodsC/HFR/...,grid


In [691]:
info_url = e.get_info_url(response="html", dataset_id='ucsdHfrE1')
info_url
# info = pd.read_csv(info_url)
# info

'https://coastwatch.pfeg.noaa.gov/erddap/info/ucsdHfrE1/index.html'

In [150]:
e = ERDDAP(server='https://coastwatch.pfeg.noaa.gov/erddap/', protocol='griddap')

In [151]:
standard_names = ['surface_eastward_sea_water_velocity','surface_northward_sea_water_velocity']

# search for datasets in our region of interest. We know for HF Radar which instutition we want 
# to search for so we don't need to specify other terms.
inst = 'Coastal Observing Research and Development Center, Scripps Institution of Oceanography'
url = e.get_search_url(search_for=inst, response="csv", **kw, items_per_page=10000)
print(url)

# Read in the search info to find the dataset_ids for datasets we will be reading data from.
df = pd.read_csv(url)
dataset_ids = df['Dataset ID']
print(dataset_ids.head())



https://coastwatch.pfeg.noaa.gov/erddap/search/advanced.csv?page=1&itemsPerPage=10000&protocol=griddap&cdm_data_type=(ANY)&institution=(ANY)&ioos_category=(ANY)&keywords=(ANY)&long_name=(ANY)&standard_name=(ANY)&variableName=(ANY)&minLon=-170&maxLon=-160&minLat=70&maxLat=75&minTime=1436918400.0&maxTime=1438387200.0&searchFor=Coastal+Observing+Research+and+Development+Center%2C+Scripps+Institution+of+Oceanography
0    ucsdHfrA6
Name: Dataset ID, dtype: object


In [152]:
# decide which datasets to actually use; presumably the highest res?
# maybe should depend on the model output?

In [156]:
def request(dataset_id, standard_names):
    
    # In case the variable is named differently from the standard names, 
    # we back out the variable names here for each dataset. This also only 
    # returns those names for which there is data in the dataset.
    varnames = e.get_var_by_attr(
        dataset_id=dataset_id,
        standard_name=lambda v: v in standard_names
    )
    
    # the search terms that can be input for tabledap do not work for griddap
    # in erddapy currently. Instead, put together an opendap link and then 
    # narrow the dataset with xarray.
    e.dataset_id = dataset_id
    # get opendap link
    url = e.get_download_url(response='opendap')
    ds = xr.open_dataset(url).sel(latitude=slice(kw['min_lat'],kw['max_lat']), 
                                  longitude=slice(kw['min_lon'],kw['max_lon']), 
                                  time=slice(kw['min_time'],kw['max_time']))

    return (dataset_id, ds)

In [157]:
%%time
    
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(request)(dataset_id, standard_names) for dataset_id in dataset_ids
)

dss = {dataset_id: ds for (dataset_id, ds) in downloads}

# (or can concat together the dss)
len(dss)

CPU times: user 5.31 ms, sys: 2.36 ms, total: 7.67 ms
Wall time: 4.27 s


1

In [158]:
dss

{'ucsdHfrA6': <xarray.Dataset>
 Dimensions:            (latitude: 75, longitude: 70, time: 235)
 Coordinates:
   * time               (time) datetime64[ns] 2015-07-20T20:00:00 ... 2015-08-...
   * latitude           (latitude) float32 70.05 70.1 70.16 ... 73.93 73.99 74.04
   * longitude          (longitude) float32 -170.0 -169.8 ... -160.2 -160.1
 Data variables:
     water_u            (time, latitude, longitude) float32 ...
     water_v            (time, latitude, longitude) float32 ...
     DOPx               (time, latitude, longitude) float32 ...
     DOPy               (time, latitude, longitude) float32 ...
     hdop               (time, latitude, longitude) float32 ...
     number_of_radials  (time, latitude, longitude) float32 ...
 Attributes:
     _CoordSysBuilder:           ucar.nc2.dataset.conv.CF1Convention
     cdm_data_type:              Grid
     Conventions:                COARDS, CF-1.6, ACDD-1.3
     creator_email:              hfrnet.administrators@sio.ucsd.edu
   

## Sensors

New approach using erddapy to do this

In [886]:



def request_sensor(e, dataset_id, standard_names):
    '''
    Note that a dataframe might be empty even though the datasets should
    only be made available to the search if in range.
    
    return dictionary with entry for each dataset_id:
    * organization name, website, and id; axiom id (if relevant, or wherever to find the data if not the first place?); 
    data itself (if requested, as combined dataframe or dataset); data type tag; time range; spatial location or box; depth(s)

    '''
    
    # In case the variable is named differently from the standard names, 
    # we back out the variable names here for each dataset. This also only 
    # returns those names for which there is data in the dataset.
    varnames = e.get_var_by_attr(
        dataset_id=dataset_id,
        standard_name=lambda v: v in standard_names
    )

    e.protocol = "tabledap"
    e.variables = ["time","longitude", "latitude", "station"] + varnames
    # set the same time restraints as before
    e.constraints = {'time<=': kw['max_time'], 'time>=': kw['min_time'],}
    e.dataset_id = dataset_id
    # Drop cols and rows that are only NaNs.
    try:
        df = e.to_pandas(response="csvp", index_col=0, parse_dates=True).dropna(axis='index', how='all').dropna(axis='columns', how='all')
        # check to see if there is any actual data
        # this is a bit convoluted because the column names are the varnames 
        # plus units so can't match 1 to 1.
        datacols = 0  # number of columns that represent data instead of metadata
        for col in df.columns:
            datacols += [varname in col for varname in varnames].count(True)
        # if no datacols, we can skip this one.
        if datacols == 0:
            df = None
    except:
#         print('no data to be read in for %s' % dataset_id)
        df = None
    return (dataset_id, df)


def request_metadata(e, dataset_id):
    '''Read select metadata from erddap.'''
    
    columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'id', 'infoUrl', 'institution', 'featureType']

    info_url = e.get_info_url(response="csv", dataset_id=dataset_id)
    info = pd.read_csv(info_url)
        
    items = []

    for col in columns:

        try:
            item = info[info['Attribute Name'] == col]['Value'].values[0]
            dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
        except:
            item = 'NA'

        if dtype == 'String':
            pass
        elif dtype == 'double':
            item = float(item)
        elif dtype == 'int':
            item = int(item)
        items.append(item)

    # add erddap server name and data_type
    return {dataset_id: [e.server] + items}


def sensors(kw, standard_names=None, only_meta=False, data_types=None, only_ids=False):
    '''Return dataframe for only_meta.
    
    NEEDS TO BE ABLE TO DISTINGUISH BETWEEN SENSORS AND PLATFORM
    '''
    
    # e = ERDDAP(server="http://erddap.sensors.axds.co/erddap")
    e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")  # includes gliders

    
    columns = ['geospatial_lat_min', 'geospatial_lat_max', 
               'geospatial_lon_min', 'geospatial_lon_max', 
               'time_coverage_start', 'time_coverage_end',
               'defaultDataQuery', 'subsetVariables',  # first works for timeseries sensors, 2nd for gliders
               'id', 'infoUrl', 'institution', 'featureType']
    
    
    # Not sure if standard_names should be allowed at this level or only in the class initiation
    if standard_names is None:
        
        # use this default list if None input
        standard_names = ['sea_water_temperature', 
                          'sea_water_practical_salinity', 
                          'sea_water_speed', 
                          'sea_water_velocity_to_direction', 
                          'sea_surface_height', 
                          'sea_surface_height_above_sea_level', 
                          'sea_surface_height_amplitude_due_to_geocentric_ocean_tide']
        
#         # Possibly add in the future but may not be any data associated with:
#         standard_names += ['sea_water_temperature_quality_flag', 
#                            'sea_water_temperature_status_flag', 
#                            'sea_water_practical_salinity_quality_flag', 
#                            'sea_water_practical_salinity_status_flag', 
#                            'sea_water_speed_quality_flag', 
#                            'sea_water_velocity_to_direction_quality_flag', 
#                            'sea_surface_height_above_sea_level_quality_flag', 
#                            'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_quality_flag', 
#                            'sea_surface_height_quality_flag']

    # make into a list if isn't already
    elif isinstance(standard_names, str):
        
        standard_names = list(standard_names)

#     # Vars is the base names to use for searching but not full standard_names
#     Vars = ['sea_water_temperature','sea_water_practical_salinity','sea_water_speed','sea_water_velocity_to_direction','sea_surface_height']

#     # Get all possible related standard_names
#     url = e.get_categorize_url(
#         categorize_by="standard_name",
#         response="csv"
#     )

#     cats = pd.read_csv(url)["Category"]

#     # get one big list of all the standard_names to search for
#     standard_names = []
#     for Var in Vars:
#         standard_names += [name for name in cats if name.startswith(Var)]
#         print(standard_names)

    # find all the dataset ids which we will use to get the data
    # This limits the search to our keyword arguments in kw which should 
    # have min/max lon/lat/time values
    dataset_ids = []
    for standard_name in standard_names:

        search_url = e.get_search_url(response="csv", **kw, standard_name=standard_name, items_per_page=10000)
        try:
            search = pd.read_csv(search_url)
            dataset_ids.extend(search["Dataset ID"])
        except:
            pass
#             print('standard_name %s not found' % standard_name)


    # only need a dataset id once since we will check them each for all standard_names
    dataset_ids = list(set(dataset_ids))
    
    if only_ids:
        return dataset_ids
    
    # get metadata for datasets
    # run in parallel to save time
    num_cores = multiprocessing.cpu_count()
    downloads = Parallel(n_jobs=num_cores)(
        delayed(request_metadata)(e, dataset_id) for dataset_id in dataset_ids
    )

    # make dict from individual dicts
    from collections import ChainMap
    meta = dict(ChainMap(*downloads)) 
#     print(meta)
    # Make dataframe of metadata
    meta = pd.DataFrame.from_dict(meta, orient='index', columns=['database'] + columns)
    
    if only_meta:
        return meta  
    
    # otherwise actually return data
    num_cores = multiprocessing.cpu_count()
    downloads = Parallel(n_jobs=num_cores)(
        delayed(request_sensor)(e, dataset_id, standard_names) for dataset_id in dataset_ids
    )
    
    dfs = {dataset_id: df for (dataset_id, df) in downloads}
    
    return dfs



In [667]:
dataset_ids = sensors(kw, only_ids=True)
dataset_ids

['noaa_nos_co_ops_8764227',
 'noaa_nos_co_ops_8762928',
 'gov_usgs_waterdata_08211200',
 'gov_usgs_waterdata_292952090565300',
 'noaa_nos_co_ops_8762482',
 'noaa_nos_co_ops_8762888',
 'noaa_nos_co_ops_8761826',
 'noaa_nos_co_ops_8773701',
 'gov_usgs_waterdata_08180850',
 'noaa_nos_co_ops_8774513',
 'noaa_nos_co_ops_8770559',
 'gov_usgs_waterdata_295124089542100',
 'noaa_nos_co_ops_8760889',
 'gov_usgs_waterdata_295826095082200',
 'noaa_nos_co_ops_8770475',
 'noaa_nos_co_ops_8771328',
 'noaa_nos_co_ops_tec4525',
 'noaa_nos_co_ops_8779750',
 'tcoon_rlit2',
 'noaa_nos_co_ops_8778490',
 'noaa_nos_co_ops_8760736',
 'gov_usgs_waterdata_292800090060000',
 'gov_usgs_waterdata_08067118',
 'gov_usgs_waterdata_0821150305',
 'gov_usgs_waterdata_07387050',
 'gov_usgs_waterdata_08188060',
 'gov_usgs_waterdata_295827090052800',
 'noaa_nos_co_ops_8773259',
 'gov_usgs_waterdata_290802098232901',
 'noaa_nos_co_ops_8760424',
 'gov_usgs_waterdata_08181800',
 'gov_usgs_waterdata_08181500',
 'noaa_nos_co_op

In [840]:
%%time
meta = sensors(kw, only_meta=True)
meta

CPU times: user 700 ms, sys: 53.9 ms, total: 754 ms
Wall time: 12.1 s


Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,defaultDataQuery,subsetVariables,id,infoUrl,institution,featureType
gov_usgs_waterdata_07381600,http://erddap.sensors.ioos.us/erddap,29.692819,29.692819,-91.211937,-91.211937,2015-05-05T11:30:00Z,2021-04-08T17:00:00Z,"sea_water_ph_reported_on_total_scale,river_dis...",,27723,https://sensors.ioos.us/#metadata/27723/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_073802512,http://erddap.sensors.ioos.us/erddap,29.398555,29.398555,-90.041184,-90.041184,2015-05-05T12:00:00Z,2021-04-08T16:15:00Z,"sea_water_ph_reported_on_total_scale,sea_water...",,25996,https://sensors.ioos.us/#metadata/25996/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_073745257,http://erddap.sensors.ioos.us/erddap,29.708266,29.708266,-89.719506,-89.719506,2015-05-05T13:00:00Z,2021-04-08T16:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,29913,https://sensors.ioos.us/#metadata/29913/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_295744093303800,http://erddap.sensors.ioos.us/erddap,29.962222,29.962222,-93.510556,-93.510556,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,31725,https://sensors.ioos.us/#metadata/31725/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_291929089562600,http://erddap.sensors.ioos.us/erddap,29.324722,29.324722,-89.940500,-89.940500,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,23003,https://sensors.ioos.us/#metadata/23003/station,USGS National Water Information System (NWIS),TimeSeries
...,...,...,...,...,...,...,...,...,...,...,...,...,...
noaa_nos_co_ops_8762482,http://erddap.sensors.ioos.us/erddap,29.776700,29.776700,-90.418300,-90.418300,2015-05-05T12:12:00Z,2021-04-08T17:30:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,45580,https://sensors.ioos.us/#metadata/45580/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries
gov_usgs_waterdata_292952090565300,http://erddap.sensors.ioos.us/erddap,29.497778,29.497778,-90.948056,-90.948056,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,28596,https://sensors.ioos.us/#metadata/28596/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_08211200,http://erddap.sensors.ioos.us/erddap,27.937796,27.937796,-97.775831,-97.775831,2015-05-05T12:15:00Z,2021-04-08T16:00:00Z,"sea_water_ph_reported_on_total_scale,river_dis...",,27072,https://sensors.ioos.us/#metadata/27072/station,USGS National Water Information System (NWIS),TimeSeries
noaa_nos_co_ops_8762928,http://erddap.sensors.ioos.us/erddap,29.245000,29.245000,-90.661700,-90.661700,2015-05-05T06:19:00Z,2021-04-15T04:44:00Z,sea_surface_height_amplitude_due_to_geocentric...,,47793,https://sensors.ioos.us/#metadata/47793/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries


In [838]:
%%time
meta = sensors(kw, only_meta=True)
meta

CPU times: user 748 ms, sys: 58.1 ms, total: 806 ms
Wall time: 12.1 s


Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,defaultDataQuery,subsetVariables,id,infoUrl,institution,featureType
gov_usgs_waterdata_07381600,http://erddap.sensors.ioos.us/erddap,29.692819,29.692819,-91.211937,-91.211937,2015-05-05T11:30:00Z,2021-04-08T17:00:00Z,"sea_water_ph_reported_on_total_scale,river_dis...",,27723,https://sensors.ioos.us/#metadata/27723/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_073802512,http://erddap.sensors.ioos.us/erddap,29.398555,29.398555,-90.041184,-90.041184,2015-05-05T12:00:00Z,2021-04-08T16:15:00Z,"sea_water_ph_reported_on_total_scale,sea_water...",,25996,https://sensors.ioos.us/#metadata/25996/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_073745257,http://erddap.sensors.ioos.us/erddap,29.708266,29.708266,-89.719506,-89.719506,2015-05-05T13:00:00Z,2021-04-08T16:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,29913,https://sensors.ioos.us/#metadata/29913/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_295744093303800,http://erddap.sensors.ioos.us/erddap,29.962222,29.962222,-93.510556,-93.510556,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,31725,https://sensors.ioos.us/#metadata/31725/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_291929089562600,http://erddap.sensors.ioos.us/erddap,29.324722,29.324722,-89.940500,-89.940500,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,23003,https://sensors.ioos.us/#metadata/23003/station,USGS National Water Information System (NWIS),TimeSeries
...,...,...,...,...,...,...,...,...,...,...,...,...,...
noaa_nos_co_ops_8762482,http://erddap.sensors.ioos.us/erddap,29.776700,29.776700,-90.418300,-90.418300,2015-05-05T12:12:00Z,2021-04-08T17:30:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,45580,https://sensors.ioos.us/#metadata/45580/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries
gov_usgs_waterdata_292952090565300,http://erddap.sensors.ioos.us/erddap,29.497778,29.497778,-90.948056,-90.948056,2015-05-05T13:00:00Z,2021-04-08T16:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,28596,https://sensors.ioos.us/#metadata/28596/station,USGS National Water Information System (NWIS),TimeSeries
gov_usgs_waterdata_08211200,http://erddap.sensors.ioos.us/erddap,27.937796,27.937796,-97.775831,-97.775831,2015-05-05T12:15:00Z,2021-04-08T16:00:00Z,"sea_water_ph_reported_on_total_scale,river_dis...",,27072,https://sensors.ioos.us/#metadata/27072/station,USGS National Water Information System (NWIS),TimeSeries
noaa_nos_co_ops_8762928,http://erddap.sensors.ioos.us/erddap,29.245000,29.245000,-90.661700,-90.661700,2015-05-05T06:19:00Z,2021-04-15T04:44:00Z,sea_surface_height_amplitude_due_to_geocentric...,,47793,https://sensors.ioos.us/#metadata/47793/station,NOAA Center for Operational Oceanographic Prod...,TimeSeries


In [682]:
# %%time

columns = ['geospatial_lat_min', 'geospatial_lat_max', 
           'geospatial_lon_min', 'geospatial_lon_max', 
           'time_coverage_start', 'time_coverage_end',
           'defaultDataQuery', 'subsetVariables',
           'id', 'infoUrl', 'institution']

meta = {}
for dataset_id in dataset_ids:
    print(dataset_id)
    info_url = e.get_info_url(response="csv", dataset_id=dataset_id)
    info = pd.read_csv(info_url)
#     print(info)
    items = []
    for col in columns:
    #     print(col)
        try:
            item = info[info['Attribute Name'] == col]['Value'].values[0]
            dtype = info[info['Attribute Name'] == col]['Data Type'].values[0]
        except:
            item = 'NA'
    #     print(dtype)
        if dtype == 'String':
            pass
        elif dtype == 'double':
            item = float(item)
        elif dtype == 'int':
            item = int(item)
        items.append(item)

    meta[dataset_id] = [e.server] + items

# Make dataframe of metadata
meta = pd.DataFrame.from_dict(meta, orient='index', columns=['database'] + columns)
meta

noaa_nos_co_ops_8764227
noaa_nos_co_ops_8762928
gov_usgs_waterdata_08211200
gov_usgs_waterdata_292952090565300
noaa_nos_co_ops_8762482
noaa_nos_co_ops_8762888
noaa_nos_co_ops_8761826
noaa_nos_co_ops_8773701
gov_usgs_waterdata_08180850
noaa_nos_co_ops_8774513
noaa_nos_co_ops_8770559
gov_usgs_waterdata_295124089542100
noaa_nos_co_ops_8760889
gov_usgs_waterdata_295826095082200
noaa_nos_co_ops_8770475
noaa_nos_co_ops_8771328
noaa_nos_co_ops_tec4525
noaa_nos_co_ops_8779750
tcoon_rlit2
noaa_nos_co_ops_8778490
noaa_nos_co_ops_8760736
gov_usgs_waterdata_292800090060000
gov_usgs_waterdata_08067118
gov_usgs_waterdata_0821150305
gov_usgs_waterdata_07387050
gov_usgs_waterdata_08188060
gov_usgs_waterdata_295827090052800
noaa_nos_co_ops_8773259
gov_usgs_waterdata_290802098232901
noaa_nos_co_ops_8760424
gov_usgs_waterdata_08181800
gov_usgs_waterdata_08181500
noaa_nos_co_ops_8761899
tabs_42047
tcoon_hist2
noaa_nos_co_ops_tec4507
gov_usgs_waterdata_07374527
noaa_nos_co_ops_8763719
noaa_nos_co_ops_87797

Unnamed: 0,database,geospatial_lat_min,geospatial_lat_max,geospatial_lon_min,geospatial_lon_max,time_coverage_start,time_coverage_end,defaultDataQuery,subsetVariables,id,infoUrl,institution
noaa_nos_co_ops_8764227,http://erddap.sensors.ioos.us/erddap,29.449200,29.449200,-91.338100,-91.338100,2015-05-05T12:36:00Z,2021-04-14T12:07:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,45582,https://sensors.ioos.us/#metadata/45582/station,NOAA Center for Operational Oceanographic Prod...
noaa_nos_co_ops_8762928,http://erddap.sensors.ioos.us/erddap,29.245000,29.245000,-90.661700,-90.661700,2015-05-05T06:19:00Z,2021-04-14T04:04:00Z,sea_surface_height_amplitude_due_to_geocentric...,,47793,https://sensors.ioos.us/#metadata/47793/station,NOAA Center for Operational Oceanographic Prod...
gov_usgs_waterdata_08211200,http://erddap.sensors.ioos.us/erddap,27.937796,27.937796,-97.775831,-97.775831,2015-05-05T12:15:00Z,2021-04-07T15:00:00Z,"sea_water_ph_reported_on_total_scale,river_dis...",,27072,https://sensors.ioos.us/#metadata/27072/station,USGS National Water Information System (NWIS)
gov_usgs_waterdata_292952090565300,http://erddap.sensors.ioos.us/erddap,29.497778,29.497778,-90.948056,-90.948056,2015-05-05T13:00:00Z,2021-04-07T15:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,28596,https://sensors.ioos.us/#metadata/28596/station,USGS National Water Information System (NWIS)
noaa_nos_co_ops_8762482,http://erddap.sensors.ioos.us/erddap,29.776700,29.776700,-90.418300,-90.418300,2015-05-05T12:12:00Z,2021-04-07T16:12:00Z,"sea_surface_height_above_sea_level_geoid_mllw,...",,45580,https://sensors.ioos.us/#metadata/45580/station,NOAA Center for Operational Oceanographic Prod...
...,...,...,...,...,...,...,...,...,...,...,...,...
gov_usgs_waterdata_291929089562600,http://erddap.sensors.ioos.us/erddap,29.324722,29.324722,-89.940500,-89.940500,2015-05-05T13:00:00Z,2021-04-07T15:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,23003,https://sensors.ioos.us/#metadata/23003/station,USGS National Water Information System (NWIS)
gov_usgs_waterdata_295744093303800,http://erddap.sensors.ioos.us/erddap,29.962222,29.962222,-93.510556,-93.510556,2015-05-05T13:00:00Z,2021-04-07T15:00:00Z,"sea_water_electrical_conductivity,sea_water_te...",,31725,https://sensors.ioos.us/#metadata/31725/station,USGS National Water Information System (NWIS)
gov_usgs_waterdata_073745257,http://erddap.sensors.ioos.us/erddap,29.708266,29.708266,-89.719506,-89.719506,2015-05-05T13:00:00Z,2021-04-07T15:15:00Z,"sea_water_electrical_conductivity,sea_water_te...",,29913,https://sensors.ioos.us/#metadata/29913/station,USGS National Water Information System (NWIS)
gov_usgs_waterdata_073802512,http://erddap.sensors.ioos.us/erddap,29.398555,29.398555,-90.041184,-90.041184,2015-05-05T12:00:00Z,2021-04-07T15:15:00Z,"sea_water_ph_reported_on_total_scale,sea_water...",,25996,https://sensors.ioos.us/#metadata/25996/station,USGS National Water Information System (NWIS)


In [678]:
dataset_id = 'unit_308_20150701t222927z'
info_url = e.get_info_url(response="html", dataset_id=dataset_id)
info_url
# info = pd.read_csv(info_url)
# info

'http://erddap.sensors.ioos.us/erddap/info/unit_308_20150701t222927z/index.html'

In [474]:
dataset_ids

['noaa_nos_co_ops_9410660',
 'noaa_nos_co_ops_8722588',
 'gov_usgs_cmgp_fi14_990',
 'gov_usgs_waterdata_412354086421001',
 'w_gmaine-1056',
 'sp011_20161130t1806',
 'noaa_nos_co_ops_9462782',
 'noaa_nos_co_ops_9454764',
 'gov_usgs_waterdata_03374100',
 'nerrs_kacsdwq',
 'ce_383_20170719t1900_delayed',
 'noaa_nos_co_ops_8535835',
 'noaa_nos_co_ops_8724991',
 'wmo_44072',
 'gov_usgs_waterdata_04136000',
 'gov_usgs_waterdata_14165500',
 'gov_usgs_waterdata_16213000',
 'noaa_nos_co_ops_8517401',
 'gov_usgs_waterdata_04199000',
 'gov_usgs_waterdata_01467042',
 'nerrs_elksmwq',
 'noaa_nos_co_ops_8578853',
 'noaa_nos_co_ops_8537374',
 'noaa_nos_co_ops_8532715',
 'noaa_nos_co_ops_9052000',
 'gov_usgs_waterdata_452012093412701',
 'noaa_nos_co_ops_8590111',
 'gov_noaa_nws_hads_17dd1000',
 'noaa_nos_co_ops_ohbc1',
 'gov_usgs_waterdata_09306255',
 'mbay_ltb-687',
 'noaa_nos_co_ops_9469237',
 'gov_usgs_waterdata_06601200',
 'gov_usgs_cmgp_pv_shelf07_844',
 'noaa_nos_co_ops_8416804',
 'noaa_nos_co_o

In [58]:
kw

{'min_lon': -170,
 'max_lon': -160,
 'min_lat': 70,
 'max_lat': 75,
 'min_time': '2015-7-15',
 'max_time': '2015-8-1'}

In [76]:
# e = ERDDAP(server="http://erddap.sensors.axds.co/erddap")
e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")

In [77]:
# Vars is the base names to use for searching but not full standard_names
Vars = ['sea_water_temperature','sea_water_practical_salinity','sea_water_speed','sea_water_velocity_to_direction','sea_surface_height']

# Get all possible related standard_names
url = e.get_categorize_url(
    categorize_by="standard_name",
    response="csv"
)

cats = pd.read_csv(url)["Category"]

# get one big list of all the standard_names to search for
standard_names = []
for Var in Vars:
    standard_names += [name for name in cats if name.startswith(Var)]
standard_names

['sea_water_temperature',
 'sea_water_temperature_quality_flag',
 'sea_water_temperature_status_flag',
 'sea_water_practical_salinity',
 'sea_water_practical_salinity_quality_flag',
 'sea_water_practical_salinity_status_flag',
 'sea_water_speed',
 'sea_water_speed_quality_flag',
 'sea_water_velocity_to_direction',
 'sea_water_velocity_to_direction_quality_flag',
 'sea_surface_height',
 'sea_surface_height_above_sea_level',
 'sea_surface_height_above_sea_level_quality_flag',
 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide',
 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_quality_flag',
 'sea_surface_height_quality_flag']

In [78]:
# find all the dataset ids which we will use to get the data
dataset_ids = []
for standard_name in standard_names:
#     kw['standard_name'] = standard_name
    
    search_url = e.get_search_url(response="csv", **kw, standard_name=standard_name, items_per_page=10000)
    try:
        search = pd.read_csv(search_url)
        dataset_ids.extend(search["Dataset ID"])
    except:
        print('standard_name %s not found' % standard_name)
        
    
# only need a dataset id once since we will check them each for all standard_names
dataset_ids = list(set(dataset_ids))
len(dataset_ids)

standard_name sea_water_temperature_quality_flag not found
standard_name sea_water_temperature_status_flag not found
standard_name sea_water_practical_salinity_quality_flag not found
standard_name sea_water_practical_salinity_status_flag not found
standard_name sea_water_speed_quality_flag not found
standard_name sea_water_velocity_to_direction_quality_flag not found
standard_name sea_surface_height not found
standard_name sea_surface_height_above_sea_level not found
standard_name sea_surface_height_above_sea_level_quality_flag not found
standard_name sea_surface_height_amplitude_due_to_geocentric_ocean_tide not found
standard_name sea_surface_height_amplitude_due_to_geocentric_ocean_tide_quality_flag not found
standard_name sea_surface_height_quality_flag not found


5

In [79]:
dataset_ids

['unit_191_20150711t2000',
 'com_shell_48214',
 'com_olgoonik_mob101',
 'com_olgoonik_mob103',
 'com_shell_48213']

In [140]:
def request(dataset_id, standard_names):
    
    # In case the variable is named differently from the standard names, 
    # we back out the variable names here for each dataset. This also only 
    # returns those names for which there is data in the dataset.
    varnames = e.get_var_by_attr(
        dataset_id=dataset_id,
        standard_name=lambda v: v in standard_names
    )
#     print(varnames)

#     e.constraints = None
    e.protocol = "tabledap"
    e.variables = ["time","longitude", "latitude", "station"] + varnames
    # set the same time restraints as before
    e.constraints = {'time<=': kw['max_time'], 'time>=': kw['min_time'],}
    e.dataset_id = dataset_id
    # Drop cols and rows that are only NaNs.
    try:
        df = e.to_pandas(response="csvp", index_col=0, parse_dates=True, skiprows=[1]).dropna(axis='index', how='all').dropna(axis='columns', how='all')
        # check to see if there is any actual data
        # this is a bit convoluted because the column names are the varnames 
        # plus units so can't match 1 to 1.
        datacols = 0  # number of columns that represent data instead of metadata
        for col in df.columns:
            datacols += [varname in col for varname in varnames].count(True)
        # if no datacols, we can skip this one.
        if datacols == 0:
            df = None
    except:
#         print('no data to be read in for %s' % dataset_id)
        df = None
    return (dataset_id, df)

In [141]:
downloads = []
for dataset_id in dataset_ids:
    downloads.append(request(dataset_id, standard_names))
    
dfs = {dataset_id: df for (dataset_id, df) in downloads}


In [139]:
dfs

{'unit_191_20150711t2000': None,
 'com_shell_48214': None,
 'com_olgoonik_mob101': None,
 'com_olgoonik_mob103': None,
 'com_shell_48213':                            longitude (degrees_east)  latitude (degrees_north)  \
 time (UTC)                                                                      
 2015-07-31 23:54:00+00:00                  -164.134                    71.502   
 2015-07-31 23:50:00+00:00                  -164.134                    71.502   
 2015-07-31 23:42:00+00:00                  -164.134                    71.502   
 2015-07-31 23:36:00+00:00                  -164.134                    71.502   
 2015-07-31 23:00:00+00:00                  -164.134                    71.502   
 ...                                             ...                       ...   
 2015-07-27 20:45:00+00:00                  -164.134                    71.502   
 2015-07-27 20:24:00+00:00                  -164.134                    71.502   
 2015-07-27 20:00:00+00:00                

In [121]:
list(dfs['com_shell_48214'].columns) #(["time","longitude", "latitude", "station"])

['longitude', 'latitude', 'station', 'sea_water_temperature']

In [None]:
list(dfs['com_shell_48214'].columns).pop

In [136]:
datacols = 0  # number of columns that represent data instead of metadata
for col in dfs['com_shell_48213'].columns:
    datacols += [varname in col for varname in varnames].count(True)

In [129]:
[varname in dfs['com_shell_48214'].columns for varname in varnames].count(True)

0

In [128]:
[varname in dfs['com_shell_48213'].columns for varname in varnames].count(True)

1

In [111]:
dfs['com_shell_48214'].columns

Index(['longitude (degrees_east)', 'latitude (degrees_north)', 'station'], dtype='object')

In [85]:
%%time
    
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(request)(dataset_id, standard_names) for dataset_id in dataset_ids
)

dfs = {dataset_id: df for (dataset_id, df) in downloads}
# dfs
# (or can concat together the dfs)
len(dfs)

CPU times: user 17.3 ms, sys: 3.57 ms, total: 20.9 ms
Wall time: 2.83 s


5

In [83]:
dfs['com_shell_48214']

Unnamed: 0_level_0,longitude (degrees_east),latitude (degrees_north),station
time (UTC),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2015-08-01 00:00:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-31 23:50:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-31 23:42:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-31 23:00:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-31 22:42:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
...,...,...,...
2015-07-21 03:36:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-21 03:24:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-21 03:20:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)
2015-07-21 03:00:00+00:00,-165.246,70.871,48214 - Klondike (MOB1)


# OOI

OOI has some relevant data. This should be added in in the future once the ERDDAP z issue (defaulting to 0) is fixed so that it can be searched to narrow the data by depth.

The ERDDAP server to use is: https://erddap.dataexplorer.oceanobservatories.org/erddap

Note that gliders can be separated out either because glider is in their name, it is a type under the IOOS defns, or because the `cdm_data_type` for them is TrajectoryProfile.

In [364]:
e = ERDDAP(server='https://erddap.dataexplorer.oceanobservatories.org/erddap')
standard_names = ['sea_water_practical_salinity']#,'eastward_sea_water_velocity']#, 'eastward_sea_water_velocity_quality_flag',
#                   'northward_sea_water_velocity', 'northward_sea_water_velocity_quality_flag']

In [365]:
dataset_ids = []
for standard_name in standard_names:

    search_url = e.get_search_url(response="csv", **kw, standard_name=standard_name, items_per_page=10000)
    try:
        search = pd.read_csv(search_url)
        dataset_ids.extend(search["Dataset ID"])
    except:
        pass


Check the depth range for a fixed-depth sensor with the following. Currently the default is for z to be (0, 0) as the following code snippet demonstrates. But in the future they will reflect the actual values and these could be queried to see if a dataset should be kept in a search based on its depth.

In [382]:
info_url = e.get_info_url(dataset_id='ooi-ce01issm-rid16-02-flortd000', response="csv")
df = pd.read_csv(info_url)
df[ (df['Variable Name'] == 'z') & (df['Attribute Name'] == 'actual_range') ].Value

86    0.0, 0.0
Name: Value, dtype: object

# --------------------------

# Look at CO-OPS ERDDAP

In [212]:
e = ERDDAP(server="https://opendap.co-ops.nos.noaa.gov/erddap")
df = pd.read_csv(e.get_search_url(response="csv", items_per_page=50, search_for='water level'))
df

Unnamed: 0,griddap,Subset,tabledap,Make A Graph,files,Title,Summary,FGDC,ISO 19115,Info,Background Info,RSS,Email,Institution,Dataset ID
0,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Preliminary One Minute,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Raw_One_Min_Water_Level
1,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Preliminary Six Minute,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Raw_Water_Level
2,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Verified Daily Mean,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Daily_Verified_Water_Level
3,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Verified High/Lows,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_High_Low_Verified_Water_Level
4,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Verified Hourly Heights,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Hourly_Height_Verified_Water_Level
5,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Verified Monthly Mean,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Monthly_Verified_Water_Level
6,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Water Level - Verified Six Minute,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_SixMin_Verified_Water_Level
7,,,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,https://opendap.co-ops.nos.noaa.gov/erddap/tab...,,Air Gap,These raw data have not been subjected to the ...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/met...,https://opendap.co-ops.nos.noaa.gov/erddap/inf...,https://opendap.co-ops.nos.noaa.gov/,https://opendap.co-ops.nos.noaa.gov/erddap/rss...,https://opendap.co-ops.nos.noaa.gov/erddap/sub...,NOAA NOS CO-OPS (Center for Operational Oceano...,IOOS_Air_Gap


In [218]:
e.protocol = "tabledap"
# e.variables = ["time","longitude", "latitude", "station"] + varnames
# set the same time restraints as before
# e.constraints = {'time<=': kw['max_time'], 'time>=': kw['min_time'],}
e.dataset_id = 'IOOS_SixMin_Verified_Water_Level'
# e.STATION_ID = '8770971'
# Drop cols and rows that are only NaNs.
df = e.to_pandas(response="csv", STATION_ID = '8770971')#, index_col=0, parse_dates=True).dropna(axis='index', how='all').dropna(axis='columns', how='all')
df

HTTPError: Error {
    code=500;
    message="Internal Server Error: ERROR from data source: com.sybase.jdbc4.jdbc.SybSQLException: Procedure GET_ERDDAP_WL_6MIN_VFD expects parameter @STATION_ID, which was not supplied.";
}


In [None]:
url = 'https://opendap.co-ops.nos.noaa.gov/ioos-dif-sos/SOS?service=SOS&request=GetObservation&version=1.0.0&observedProperty=sea_water_speed&direction_of_sea_water_velocity&offering=urn:ioos:station:NOAA.NOS.CO-OPS:g06010&procedure=urn:ioos:sensor:NOAA.NOS.CO-OPS:g06010:Nortek-ADP-34465:rtb
&responseFormat=text/csv'

In [221]:
url = 'https://opendap.co-ops.nos.noaa.gov/ioos-dif-sos/SOS?service=SOS&request=GetObservation&version=1.0.0&observedProperty=sea_water_speed&direction_of_sea_water_velocity&offering=urn:ioos:station:NOAA.NOS.CO-OPS:15319&responseFormat=text/csv'
pd.read_csv(url)

Unnamed: 0,"<?xml version=""1.0"" encoding=""UTF-8""?>"
0,"<ExceptionReport xmlns=""http://www.opengis.net..."
1,"<Exception exceptionCode=""InvalidParameterVa..."
2,<ExceptionText>The station ID of the param...
3,</Exception>
4,</ExceptionReport>


# Track stations for known organizations

In [235]:
e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")

def search(org_id):
    '''Search for organization ids in Axiom database.
    
    Return organization id and associated Axiom id, if it is found.
    '''
    axiom_ids = {}
    url = e.get_search_url(response="csv", items_per_page=5, search_for=org_id)
    try:
        df = pd.read_csv(url)
        if 'tabs' in org_id:  # don't split
            axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() == axiom_id.lower()]
        else:
            axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if org_id.lower() in axiom_id.lower().split('_')][0]
    except:
        axiom_id = None
    return org_id, axiom_id


In [9]:
def update_ndbc_ids(ndbc_ids, other_ids, name):
    '''Remove other organization ids from ndbcs_ids.
    
    Print summary statements and remove other ids from ndbc_ids 
    to keep an updated list of unique ids for ndbc_ids, which 
    otherwise overlaps with other organizations.
    '''
    
    print('Number of NDBC stations before removing %s: %i' % (name, len(ndbc_ids)))
    dupes = set(other_ids) & set(ndbc_ids)
    ndbc_ids = set(ndbc_ids) - set(other_ids)
    print('Number of NDBC stations after removing %s: %i' % (name, len(ndbc_ids)))
    print('Duplicates (by alias name) that were in NDBC list: ', dupes)
    return ndbc_ids

## NDBC

Have to scrape this from the online list: https://www.ndbc.noaa.gov/to_station.shtml.

In [157]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ndbc.noaa.gov/to_station.shtml'
soup = BeautifulSoup(requests.get(url).text, "lxml")

ndbc_ids = [str(station_line.contents[0]) for station_line in soup.findAll('a')[35:-11]]

print('Number of NDBC stations to start: ', len(ndbc_ids))


Number of NDBC stations to start:  1891


Only met stations:

In [271]:
url = 'https://www.ndbc.noaa.gov/ndbcmapstations.json'
df = pd.read_json(url, orient='index')


In [277]:
[station for station in df.loc['station'][0] if 'SXXA2'.lower() in station['id'].lower()]

[{'id': 'sxxa2',
  'lat': 55.315,
  'lon': -131.596,
  'elev': 5,
  'name': 'Saxman Landing, AK',
  'owner': 74,
  'program': 2,
  'status': 'E',
  'data': 'y',
  'type': 'fixed'}]

In [279]:
len([station['id'] for station in df.loc['station'][0] if station['owner'] == 74])

34

In [280]:
[station for station in df.loc['station'][0] if '46040'.lower() in station['id'].lower()]

[{'id': '46040',
  'lat': 44.8,
  'lon': -124.3,
  'elev': None,
  'name': '',
  'owner': 0,
  'program': 0,
  'status': 'D',
  'data': 'n',
  'type': 'buoy'}]

Find lon/lat of stations to only check US-based stations.

In [158]:

url = 'https://www.ndbc.noaa.gov/ndbcmapstations.json'
df = pd.read_json(url, orient='index')
lls = []
for ndbc_id in ndbc_ids:
    lls.append([(station['lon'],station['lat']) for station in df.loc['station'][0] if station['id'].lower() == ndbc_id.lower()])


# if lat==30 and lon==-90, it is a glider so we don't want at this point
glider_ids = [ndbc_id for ndbc_id,ll in zip(ndbc_ids,lls) 
                 if (ll[0][0] == -90) and (ll[0][1] == 30) ]
# remove glider ids from ndbc list
ndbc_ids = list(set(ndbc_ids) - set(glider_ids))

print('Number of NDBC stations after removing gliders: ', len(ndbc_ids))

# redo finding ll for new ndbc list
lls = []
for ndbc_id in ndbc_ids:
    lls.append([(station['lon'],station['lat']) for station in df.loc['station'][0] if station['id'].lower() == ndbc_id.lower()])

# Only keep ndbc stations if in the US, as defined by kw dict
ndbc_ids = [ndbc_id for ndbc_id,ll in zip(ndbc_ids,lls) 
             if (kw['min_lon'] < ll[0][0] < kw['max_lon']) and (kw['min_lat'] < ll[0][1] < kw['max_lat']) ]

print('Number of NDBC stations after removing non-US: ', len(ndbc_ids))


Number of NDBC stations after removing gliders:  1821
Number of NDBC stations after removing non-US:  1539


## CO-OPS

Extracted `url_coops` from this website: https://tidesandcurrents.noaa.gov/stations.html?type=Historic+Water+Levels. This list contains active stations too. Only use US-based stations.

Save alternative listing code too.

In [159]:
url_coops = 'https://api.tidesandcurrents.noaa.gov/mdapi/prod/webapi/stations.json?type=historicwl&expand=details'
df = pd.read_json(url_coops)
coops_ids = [(station['details']['id'], station['shefcode']) for station in df['stations'] 
             if (kw['min_lon'] < station['lng'] < kw['max_lon']) 
             and (kw['min_lat'] < station['lat'] < kw['max_lat']) ]
coops_ids = {coops_id: alt_id for (coops_id, alt_id) in coops_ids}

print('Number of CO-OPS stations: ', len(coops_ids))


Number of CO-OPS stations:  2721


Remove duplicates from NDBC list that overlap with CO-OPS list, either from the keys or values of the dictionary. This removes TCOON stations from NDBC list, so they are only in the CO-OPS list.

In [240]:
ndbc_ids2 = update_ndbc_ids(ndbc_ids, coops_ids.keys(), 'CO-OPS')

Number of NDBC stations before removing CO-OPS: 1270
Number of NDBC stations after removing CO-OPS: 1270
Duplicates (by alias name) that were in NDBC list:  set()


In [160]:
ndbc_ids = update_ndbc_ids(ndbc_ids, coops_ids.values(), 'CO-OPS')

Number of NDBC stations before removing CO-OPS: 1539
Number of NDBC stations after removing CO-OPS: 1279
Duplicates (by alias name) that were in NDBC list:  {'KWHH1', 'RLOT2', 'VCAT2', 'CPTR1', 'EREP1', 'AAMC1', 'ULRA2', 'SJSN4', 'BUFN6', 'GISL1', 'VCAF1', 'OLSA2', 'BDRN4', 'WYCM6', 'FPKG1', 'PMOA2', 'GNJT2', 'MYPF1', 'SDHN4', 'ICAC1', 'UNLA2', 'BRHC3', 'HCGN7', 'ALIA2', 'HBYC1', 'WASD2', 'WELM1', 'EPTT2', 'CRVA2', 'NIAN6', 'HRVC1', 'VCVA2', 'NEAW1', 'KGCA2', 'EINL1', 'RLIT2', 'NFDF1', 'THRO1', 'LCLL1', 'APAM2', 'GDMM5', 'FRCB6', 'RDYD1', 'WPTW1', 'KPTV2', 'FRDW1', 'TCBM2', 'TESL1', 'PRYC1', 'VAKF1', 'GRRT2', 'EBSW1', 'MGZP4', 'MRHO1', 'SWPM4', 'PMNT2', 'YABP4', 'AMRL1', 'KWNW3', 'CHAV3', 'LTRM4', 'KLIH1', 'LTBV3', 'CHSV3', 'APCF1', 'DULM5', 'RSJT2', 'FAIO1', 'BLTM2', 'PTIM4', 'MTKN6', 'NTBC1', 'FRDF1', 'MCGA1', 'TRDF1', 'PCGT2', 'NTKM3', 'SHBL1', 'FRVM3', 'PNLM4', 'ILOH1', 'MBRM4', 'ANVC1', 'ESPP4', 'HRBM4', 'CHYW1', 'CECC1', 'ELFA2', 'KECA2', 'MQTT2', 'NWHC3', 'MOKH1', 'MGIP4', 'NWPR

## PORTS

Lists of stations are available online!
https://tidesandcurrents.noaa.gov/cdata/StationList?type=Current+Data&filter=active

These are all US based.

CHECK FOR PROFILES and depth

In [162]:
url_historic = 'https://tidesandcurrents.noaa.gov/cdata/StationListFormat?type=Current+Data&filter=historic&format=csv'

# active are not a subset of historical
url_active = 'https://tidesandcurrents.noaa.gov/cdata/StationListFormat?type=Current+Data&filter=active&format=csv'

df_ports_historic = pd.read_csv(url_historic)
df_ports_active = pd.read_csv(url_active)

ports_ids = list(df_ports_historic['Station ID'].values) + list(df_ports_active['Station ID'].values)


Remove duplicates from NDBC list that overlap with PORTS. (None overlap)

In [163]:
ndbc_ids = update_ndbc_ids(ndbc_ids, ports_ids, 'PORTS')

Number of NDBC stations before removing PORTS: 1279
Number of NDBC stations after removing PORTS: 1279
Duplicates (by alias name) that were in NDBC list:  set()


## TABS

These are all US based.

In [164]:
# csv file with buoy attributes for TABS website
url = 'https://raw.githubusercontent.com/kthyng/tabswebsite/master/includes/buoys.csv'
buoys = pd.read_csv(url)
tabs = buoys[buoys['table1'] == 'ven']#['alias']
aliases = tabs['alias']
tabs_ids = list(buoys[buoys['table1'] == 'ven']['buoy'].values)
tabs_ids = ['tabs_' + tabs_id.lower() for tabs_id in tabs_ids]
tabs_ids = {tabs_id: alias for (tabs_id, alias) in zip(tabs_ids, aliases)}

Remove element from `ndbc_ids` if duplicate of `tabs_ids.values()` which are the station aliases.

In [166]:
ndbc_ids = update_ndbc_ids(ndbc_ids, tabs_ids.values(), 'TABS')

Number of NDBC stations before removing TABS: 1279
Number of NDBC stations after removing TABS: 1270
Duplicates (by alias name) that were in NDBC list:  {'42043', '42050', '42044', '42051', '42045', '42048', '42046', '42047', '42049'}


## DART

These are limited to US only.

It turns out that all but 3 US Dart buoys are included in the NDBC list, and the 3 that are not included previously drifted off from their location and so were renamed. So, we will not include DART buoys separately but just leave them in the NDBC list.

In [169]:
url = 'https://www.ndbc.noaa.gov/ndbcmapstations.json'
df = pd.read_json(url, orient='index')
df

Unnamed: 0,0
disclaimer,This undocumented resource was created for the...
created,2021-04-06T17:25:01UTC
count,1946
program,"[NDBC Meteorological/Ocean, International Part..."
owner,"[NDBC, Alaska Ocean Observing System, Amerada ..."
station,"[{'id': '00922', 'lat': 30, 'lon': -90, 'elev'..."
dartevents,{}


In [191]:
[station['id'] for station in df.loc['station'][0] if (station['id'] in ndbc_ids) and (station['type'] == 'dart')]

['41420',
 '41421',
 '41424',
 '41425',
 '42408',
 '42409',
 '42429',
 '44402',
 '44403',
 '46401',
 '46402',
 '46403',
 '46404',
 '46405',
 '46407',
 '46408',
 '46409',
 '46410',
 '46411',
 '46412',
 '46413',
 '46414',
 '46415',
 '46416',
 '46419',
 '46490',
 '51407']

In [None]:
[ndbc_id for ndbc_id,ll in zip(ndbc_ids,lls) 
                 if (ll[0][0] == -90) and (ll[0][1] == 30) ]

In [39]:
url = 'https://www.ndbc.noaa.gov/data/stations/station_table.txt'
dfall = pd.read_table(url, sep='|', index_col=0, skiprows=[1])

This is the number of DART buoys in the NDBC station list (not all active) that are in the U.S. (defined by `llbox`).

In [40]:
idart = [('DART' in str(payload)) and ('TEST' not in name.upper()) and 
         ('N' in loc) and ('W' in loc) and ((kw['min_lon'] < -float(loc.split('N')[1].split(' ')[1]) < kw['max_lon']) 
                                       and (kw['min_lat'] < float(loc.split('N')[0][:-1]) < kw['max_lat']))
         for payload, name, loc in zip(dfall[' PAYLOAD '].values, dfall[' NAME '].values, dfall[' LOCATION '])]

idart.count(True)

29

In [41]:
dart_ids = list(dfall[idart].index)
dart_ids[:5]

['41420', '41421', '41424', '41425', '41d21']

# Which known stations are already ingested by Axiom?

## NDBC

Looking through some of the missing buoys, looks like they are mostly either just met data and/or old discontinued stations.

In [236]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(ndbc_id) for ndbc_id in ndbc_ids
)
axiom_ndbc_ids = {ndbc_id: axiom_id for (ndbc_id, axiom_id) in downloads}

missing = list(axiom_ndbc_ids.values()).count(None)
total = len(axiom_ndbc_ids.keys())
words = '''Total U.S. NDBC buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)

Total U.S. NDBC buoys: 1270. Axiom has 614 of them and is missing 656.
CPU times: user 5.67 s, sys: 454 ms, total: 6.13 s
Wall time: 3min 59s


In [269]:
axiom_ndbc_ids

{'SKXA2': None,
 '46040': None,
 'KBBF': 'wmo_kbbf',
 '46079': None,
 '45138': 'ca_weather_45138',
 'KMIU': 'gov_noaa_nws_kmiu',
 'GRMM4': 'gov_noaa_nws_grmm4',
 '46010': None,
 'BSKF1': 'gov_nps_ever_bskf1',
 '45165': 'com_limno_45165',
 'APXA2': None,
 'SACV4': 'gov_epa_sacv4',
 'SECG1': None,
 '42904': 'com_anadarko_42904',
 'BRBN4': None,
 'CPVM2': 'noaa_nos_co_ops_cpvm2',
 'BGCF1': 'edu_usf_marine_comps_bgcf1',
 '42892': None,
 'BDXC1': None,
 'JPRN7': None,
 'GEXA2': None,
 'ELXC1': None,
 'GBCL1': None,
 '42079': None,
 'SAQG1': None,
 '42852': 'wmo_42852',
 'TKEA2': 'org_mxak_tkea2',
 'DRSD1': None,
 '46023': 'wmo_46023',
 'VERV4': 'gov_epa_verv4',
 'BLIA2': 'wmo_blia2',
 'ANCF1': None,
 'LBRF1': 'gov_nps_ever_lbrf1',
 '41112': None,
 '44040': 'edu_uconn_mysound_44040',
 'KSPR': 'wmo_kspr',
 '46063': None,
 'FBPS1': None,
 '42864': 'com_shell_42864',
 'CHII2': 'gov_glerl_chii2',
 '46220': None,
 '42875': 'com_llog_42875',
 '44059': None,
 'SETO3': 'org_cmop_seto3',
 'MHPA1': 'd

## CO-OPS

In [237]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(coops_id) for coops_id in coops_ids.keys()
)
axiom_coops_ids = {coops_id: axiom_id for (coops_id, axiom_id) in downloads}

missing = list(axiom_coops_ids.values()).count(None)
total = len(axiom_coops_ids.keys())
words = '''Total U.S. CO-OPS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. CO-OPS buoys: 2721. Axiom has 1891 of them and is missing 830.
CPU times: user 11.3 s, sys: 754 ms, total: 12.1 s
Wall time: 6min 20s


In [741]:
missing = []
for key, val in axiom_coops_ids.items():
    if val is None:
        missing.append(key)
missing

['1612479',
 '1612702',
 '8311018',
 '8311060',
 '8311070',
 '8414821',
 '8417087',
 '8417134',
 '8417144',
 '8417208',
 '8419751',
 '8422272',
 '8422301',
 '8423005',
 '8442417',
 '8447819',
 '8448151',
 '8448533',
 '8448725',
 '8448875',
 '8449375',
 '8449383',
 '8453572',
 '8455137',
 '8459449',
 '8459479',
 '8460751',
 '8465748',
 '8468191',
 '8469057',
 '8510321',
 '8517756',
 '8518962',
 '8530345',
 '8530464',
 '8530502',
 '8530538',
 '8530576',
 '8530802',
 '8531142',
 '8531630',
 '8533542',
 '8534320',
 '8534469',
 '8534772',
 '8534773',
 '8534774',
 '8534779',
 '8534780',
 '8534781',
 '8534782',
 '8534884',
 '8534885',
 '8535055',
 '8535445',
 '8535555',
 '8536831',
 '8536840',
 '8536915',
 '8536999',
 '8537103',
 '8537201',
 '8537363',
 '8537589',
 '8537689',
 '8538055',
 '8538086',
 '8538095',
 '8538414',
 '8538479',
 '8538568',
 '8538779',
 '8538831',
 '8539494',
 '8539808',
 '8542426',
 '8543025',
 '8543111',
 '8545534',
 '8554501',
 '8555461',
 '8557125',
 '8558101',
 '85

In [752]:
np.savetxt('coops_missing.txt', np.asarray((missing)).astype(int), '%7i')

In [281]:
axiom_coops_ids

{'1611347': 'noaa_nos_co_ops_1611347',
 '1611400': 'noaa_nos_co_ops_1611400',
 '1612340': 'noaa_nos_co_ops_1612340',
 '1612366': 'noaa_nos_co_ops_1612366',
 '1612404': 'noaa_nos_co_ops_1612404',
 '1612479': None,
 '1612480': 'noaa_nos_co_ops_1612480',
 '1612702': None,
 '1613198': 'noaa_nos_co_ops_1613198',
 '1614465': 'noaa_nos_co_ops_1614465',
 '1615680': 'noaa_nos_co_ops_1615680',
 '1617433': 'noaa_nos_co_ops_1617433',
 '1617760': 'noaa_nos_co_ops_1617760',
 '1619910': 'noaa_nos_co_ops_1619910',
 '2695535': 'noaa_nos_co_ops_2695535',
 '2695540': 'noaa_nos_co_ops_2695540',
 '8311018': None,
 '8311030': 'noaa_nos_co_ops_8311030',
 '8311060': None,
 '8311062': 'noaa_nos_co_ops_8311062',
 '8311070': None,
 '8410140': 'noaa_nos_co_ops_8410140',
 '8410714': 'noaa_nos_co_ops_8410714',
 '8410715': 'noaa_nos_co_ops_8410715',
 '8410834': 'noaa_nos_co_ops_8410834',
 '8410864': 'noaa_nos_co_ops_8410864',
 '8411060': 'noaa_nos_co_ops_8411060',
 '8411250': 'noaa_nos_co_ops_8411250',
 '8412581': '

## PORTS

In [238]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(ports_id) for ports_id in ports_ids
)
axiom_ports_ids = {ports_id: axiom_id for (ports_id, axiom_id) in downloads}

missing = list(axiom_ports_ids.values()).count(None)
total = len(axiom_ports_ids.keys())
words = '''Total U.S. PORTS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. PORTS buoys: 952. Axiom has 11 of them and is missing 941.
CPU times: user 3.96 s, sys: 273 ms, total: 4.23 s
Wall time: 3min 50s


## TABS

CAN GET OLD STATIONS FROM GCOOS OR MY TABS SITE

In [239]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(tabs_id) for tabs_id in tabs_ids
)
axiom_tabs_ids = {tabs_id: axiom_id for (tabs_id, axiom_id) in downloads}

missing = list(axiom_tabs_ids.values()).count(None)
total = len(axiom_tabs_ids.keys())
words = '''Total TABS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total TABS buoys: 19. Axiom has 9 of them and is missing 10.
CPU times: user 69.7 ms, sys: 6.7 ms, total: 76.4 ms
Wall time: 4.64 s


# Check for what stations already in databases

NEED TO MAKE SURE THE DATASET IDS FOUND ARE UNIQUE (can do this in axiom list)

SHOULD THESE BY CHECKED IN SPACE OR TIME AT ALL? RIGHT NOW US AND 20 YEARS
--> Should not be checked in time but yes space

## NDBC

### NOAA list

Have to scrape this from the online list: https://www.ndbc.noaa.gov/to_station.shtml.

In [7]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.ndbc.noaa.gov/to_station.shtml'
soup = BeautifulSoup(requests.get(url).text, "lxml")

ndbc_ids = [str(station_line.contents[0]) for station_line in soup.findAll('a')[35:]]

print('Number of NDBC stations to start: ', len(ndbc_ids))


Number of NDBC stations to start:  1902


### Axiom list

In [34]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(ndbc_id) for ndbc_id in ndbc_ids
)
axiom_ndbc_ids = {ndbc_id: axiom_id for (ndbc_id, axiom_id) in downloads}

missing = list(axiom_ndbc_ids.values()).count(None)
total = len(axiom_ndbc_ids.keys())
words = '''Total U.S. NDBC buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. NDBC buoys: 1902. Axiom has 473 of them and is missing 1429.
CPU times: user 7.74 s, sys: 537 ms, total: 8.28 s
Wall time: 4min 21s


In [35]:
axiom_ndbc_ids

{'21413': 'wmo_21413',
 '21414': 'wmo_21414',
 '21415': 'wmo_21415',
 '21416': 'wmo_21416',
 '21417': None,
 '21418': 'wmo_21418',
 '21419': 'wmo_21419',
 '21420': None,
 '32301': None,
 '32302': None,
 '32411': 'wmo_32411',
 '32412': 'wmo_32412',
 '32413': 'wmo_32413',
 '32745': None,
 '32746': None,
 '32D12': None,
 '41001': 'wmo_41001',
 '41002': 'wmo_41002',
 '41003': None,
 '41004': 'wmo_41004',
 '41005': None,
 '41006': None,
 '41007': None,
 '41008': 'university_of_georgia_uga_41008',
 '41009': 'wmo_41009',
 '41010': 'wmo_41010',
 '41011': None,
 '41012': None,
 '41013': 'wmo_41013',
 '41015': None,
 '41016': None,
 '41017': None,
 '41018': None,
 '41021': None,
 '41022': None,
 '41023': None,
 '41025': 'wmo_41025',
 '41035': None,
 '41036': None,
 '41040': 'wmo_41040',
 '41041': 'wmo_41041',
 '41043': 'wmo_41043',
 '41044': 'wmo_41044',
 '41046': 'wmo_41046',
 '41047': 'wmo_41047',
 '41048': 'wmo_41048',
 '41049': 'wmo_41049',
 '41420': 'wmo_41420',
 '41421': 'wmo_41421',
 '414

## CO-OPS

Extracted `url_coops` from this website: https://tidesandcurrents.noaa.gov/stations.html?type=Historic+Water+Levels. This list contains active stations too. Only use US-based stations.

### NOAA list

Save alternative listing code too.

In [228]:
url_coops = 'https://api.tidesandcurrents.noaa.gov/mdapi/prod/webapi/stations.json?type=historicwl&expand=details'
df = pd.read_json(url_coops)
coops_ids = [(station['details']['id'], station['shefcode']) for station in df['stations'] 
             if (kw['min_lon'] < station['lng'] < kw['max_lon']) 
             and (kw['min_lat'] < station['lat'] < kw['max_lat']) ]
print(len(coops_ids))
coops_ids = {coops_id: alt_id for (coops_id, alt_id) in coops_ids}


2721


Remove duplicates from NDBC list that overlap with CO-OPS list, either from the keys or values of the dictionary.

In [226]:
def update_ndbc_ids(ndbc_ids, other_ids, name):
    print('Number of NDBC stations before removing %s: %i' % (name, len(ndbc_ids)))
    dupes = set(other_ids) & set(ndbc_ids)
    ndbc_ids = set(ndbc_ids) - set(other_ids)
    print('Number of NDBC stations after removing %s: %i' % (name, len(ndbc_ids)))
    print('Duplicates (by alias name) that were in NDBC list: ', dupes)
    return ndbc_ids

In [227]:
ndbc_ids = update_ndbc_ids(ndbc_ids, coops_ids.values(), 'CO-OPS')

Number of NDBC stations before removing CO-OPS: 1642
Number of NDBC stations after removing CO-OPS: 1642
Duplicates (by alias name) that were in NDBC list:  set()


### Axiom list

In [4]:
# %%time
# e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")
# axiom_coops_ids = {}
# for coops_id in coops_ids[:100]:
#     url = e.get_search_url(response="csv", items_per_page=5, search_for=coops_id)
#     try:
#         df = pd.read_csv(url)
#         axiom_coops_ids[coops_id] = [axiom_id for axiom_id in df['Dataset ID'] if coops_id in axiom_id.split('_')]
#     except:
#         pass
    
# missing = list(axiom_coops_ids.values()).count([])
# total = len(axiom_coops_ids.keys())
# words = '''Total U.S. CO-OPS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
# print(words)

Total U.S. CO-OPS buoys: 85. Axiom has 85 of them and is missing 0.
CPU times: user 740 ms, sys: 74.8 ms, total: 815 ms
Wall time: 1min 33s


In [5]:
axiom_coops_ids

{'1611347': ['noaa_nos_co_ops_1611347'],
 '1611400': ['noaa_nos_co_ops_1611400'],
 '1612340': ['noaa_nos_co_ops_1612340'],
 '1612366': ['noaa_nos_co_ops_1612366'],
 '1612404': ['noaa_nos_co_ops_1612404'],
 '1612480': ['noaa_nos_co_ops_1612480'],
 '1613198': ['noaa_nos_co_ops_1613198'],
 '1614465': ['noaa_nos_co_ops_1614465'],
 '1615680': ['noaa_nos_co_ops_1615680'],
 '1617433': ['noaa_nos_co_ops_1617433'],
 '1617760': ['noaa_nos_co_ops_1617760'],
 '1619910': ['noaa_nos_co_ops_1619910'],
 '2695535': ['noaa_nos_co_ops_2695535'],
 '2695540': ['noaa_nos_co_ops_2695540'],
 '8311030': ['noaa_nos_co_ops_8311030'],
 '8311062': ['noaa_nos_co_ops_8311062'],
 '8410140': ['noaa_nos_co_ops_8410140'],
 '8410714': ['noaa_nos_co_ops_8410714'],
 '8410715': ['noaa_nos_co_ops_8410715'],
 '8410834': ['noaa_nos_co_ops_8410834'],
 '8410864': ['noaa_nos_co_ops_8410864'],
 '8411060': ['noaa_nos_co_ops_8411060'],
 '8411250': ['noaa_nos_co_ops_8411250'],
 '8412581': ['noaa_nos_co_ops_8412581'],
 '8413320': ['no

In [13]:
# e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")

# def search(coops_id):
#     axiom_ids = {}
#     url = e.get_search_url(response="csv", items_per_page=5, search_for=coops_id)
#     try:
#         df = pd.read_csv(url)
#         axiom_id = [axiom_id for axiom_id in df['Dataset ID'] if coops_id in axiom_id.split('_')][0]
#     except:
#         axiom_id = None
#     return coops_id, axiom_id



In [22]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(coops_id) for coops_id in coops_ids
)
axiom_coops_ids = {coops_id: axiom_id for (coops_id, axiom_id) in downloads}

missing = list(axiom_coops_ids.values()).count(None)
total = len(axiom_coops_ids.keys())
words = '''Total U.S. CO-OPS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. CO-OPS buoys: 2720. Axiom has 1891 of them and is missing 829.
CPU times: user 10.8 s, sys: 728 ms, total: 11.5 s
Wall time: 6min 1s


## PORTS/Currents from NOAA

In [None]:
STILL NEED TO DO OTHER CURRENTS??

### NOAA list

Lists are available online!
https://tidesandcurrents.noaa.gov/cdata/StationList?type=Current+Data&filter=active

In [224]:
url_historic = 'https://tidesandcurrents.noaa.gov/cdata/StationListFormat?type=Current+Data&filter=historic&format=csv'

# active are not a subset of historical
url_active = 'https://tidesandcurrents.noaa.gov/cdata/StationListFormat?type=Current+Data&filter=active&format=csv'

df_ports_historic = pd.read_csv(url_historic)
df_ports_active = pd.read_csv(url_active)

ports_ids = list(df_ports_historic['Station ID'].values) + list(df_ports_active['Station ID'].values)


Remove duplicates from NDBC list that overlap with PORTS.

In [228]:
ndbc_ids = update_ndbc_ids(ndbc_ids, ports_ids, 'PORTS')

Number of NDBC stations before removing PORTS: 1642
Number of NDBC stations after removing PORTS: 1642
Duplicates (by alias name) that were in NDBC list:  set()


### Axiom list

In [None]:
# %%time
# e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")
# axiom_ports_ids = {}
# for ports_id in ports_ids:
#     url = e.get_search_url(response="csv", items_per_page=5, search_for=ports_id)
#     try:
#         df = pd.read_csv(url)
#         axiom_ports_ids[ports_id] = [axiom_id for axiom_id in df['Dataset ID'] if ports_id in axiom_id.split('_')]
#     except:
#         pass
    
# missing = list(axiom_ports_ids.values()).count([])
# total = len(axiom_ports_ids.keys())
# words = '''Total U.S. CO-OPS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
# print(words)

In [30]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(ports_id) for ports_id in ports_ids
)
axiom_ports_ids = {ports_id: axiom_id for (ports_id, axiom_id) in downloads}

missing = list(axiom_ports_ids.values()).count(None)
total = len(axiom_ports_ids.keys())
words = '''Total U.S. PORTS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. PORTS buoys: 952. Axiom has 11 of them and is missing 941.
CPU times: user 4.49 s, sys: 323 ms, total: 4.81 s
Wall time: 4min 6s


In [31]:
axiom_ports_ids

{'cb0901': None,
 'cc0101': None,
 'db0201': None,
 'db0501': None,
 'db1935': None,
 'g02010': None,
 'g04010': None,
 'g05010': None,
 'g07010': None,
 'gp0101': None,
 'gp0201': None,
 'gp0301': None,
 'gp0401': None,
 'hb0101': 'noaa_nos_co_ops_hb0101',
 'hb0201': 'noaa_nos_co_ops_hb0201',
 'hb0301': None,
 'jx0101': None,
 'jx0201': None,
 'jx0301': None,
 'jx0501': None,
 'kb0101': None,
 'kb0201': None,
 'kb0401': None,
 'lb0101': None,
 'mb0101': None,
 'mb0301': 'noaa_nos_co_ops_mb0301',
 'mb0401': 'noaa_nos_co_ops_mb0401',
 'n01010': None,
 'n05010': None,
 'nb0101': None,
 'nb0201': None,
 'ps0201': None,
 'ps0301': None,
 'ps0401': 'noaa_nos_co_ops_ps0401',
 'BOS1101': None,
 'BOS1102': None,
 'BOS1103': None,
 'BOS1104': None,
 'BOS1105': None,
 'BOS1106': None,
 'BOS1107': None,
 'BOS1108': None,
 'BOS1109': None,
 'BOS1110': None,
 'BOS1111': None,
 'BOS1112': None,
 'BOS1113': None,
 'BOS1114': None,
 'BOS1115': None,
 'BOS1117': None,
 'BOS1118': None,
 'BOS1119': None

## DART

### NOAA list

In [39]:
url = 'https://www.ndbc.noaa.gov/data/stations/station_table.txt'
dfall = pd.read_table(url, sep='|', index_col=0, skiprows=[1])

This is the number of DART buoys in the NDBC station list (not all active) that are in the U.S. (defined by `llbox`).

In [40]:
idart = [('DART' in str(payload)) and ('TEST' not in name.upper()) and 
         ('N' in loc) and ('W' in loc) and ((kw['min_lon'] < -float(loc.split('N')[1].split(' ')[1]) < kw['max_lon']) 
                                       and (kw['min_lat'] < float(loc.split('N')[0][:-1]) < kw['max_lat']))
         for payload, name, loc in zip(dfall[' PAYLOAD '].values, dfall[' NAME '].values, dfall[' LOCATION '])]

idart.count(True)

29

In [41]:
dart_ids = list(dfall[idart].index)
dart_ids[:5]

['41420', '41421', '41424', '41425', '41d21']

It turns out that all but 3 US Dart buoys are included in the NDBC list, and the 3 that are not included previously drifted off from their location and so were renamed. So, we will not include DART buoys separately but just leave them in the NDBC list.

In [230]:
print(len(ndbc_ids), len(dart_ids))
print(len(set(ndbc_ids) - set(dart_ids)))
set(dart_ids) - set(ndbc_ids)  # THESE ARE ALL ADRIFT AND DON"T WANT TO USE THEM. SO< DON"T NEED DART BUOYS SEPARATELY>

1616 29
1616


{'41420',
 '41421',
 '41424',
 '41425',
 '41d21',
 '41x24',
 '42408',
 '42429',
 '44402',
 '44403',
 '44d03',
 '46401',
 '46402',
 '46403',
 '46404',
 '46405',
 '46407',
 '46408',
 '46409',
 '46410',
 '46411',
 '46412',
 '46413',
 '46414',
 '46415',
 '46416',
 '46419',
 '46490',
 '51407'}

### Axiom list

Was hard to find them by DART affiliation in metadata so just search for station name.

Loop through the `ndbc_dart_ids` we just found and for each do a list comprehension with the dart station name.

In [8]:
# e = ERDDAP(server="http://erddap.sensors.ioos.us/erddap")
# axiom_dart_ids = {}
# for ndbc_dart_id in ndbc_dart_ids:
#     url = e.get_search_url(response="csv", items_per_page=10000, search_for=ndbc_dart_id)
#     try:
#         df = pd.read_csv(url)
#         axiom_dart_ids[ndbc_dart_id] = [axiom_id for axiom_id in df['Dataset ID'] if ndbc_dart_id in axiom_id.split('_')]
#     except:
#         pass
    
# missing = list(axiom_dart_ids.values()).count([])
# total = len(axiom_dart_ids.keys())
# words = '''Total U.S. DART buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
# print(words)

NameError: name 'ndbc_dart_ids' is not defined

In [9]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(dart_id) for dart_id in dart_ids
)
axiom_dart_ids = {dart_id: axiom_id for (dart_id, axiom_id) in downloads}

missing = list(axiom_dart_ids.values()).count(None)
total = len(axiom_dart_ids.keys())
words = '''Total U.S. DART buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total U.S. DART buoys: 29. Axiom has 16 of them and is missing 13.
CPU times: user 169 ms, sys: 105 ms, total: 274 ms
Wall time: 6.22 s


## TABS

### TABS list

In [206]:
# csv file with buoy attributes for TABS website
url = 'https://raw.githubusercontent.com/kthyng/tabswebsite/master/includes/buoys.csv'
buoys = pd.read_csv(url)
tabs = buoys[buoys['table1'] == 'ven']#['alias']
aliases = tabs['alias']
tabs_ids = list(buoys[buoys['table1'] == 'ven']['buoy'].values)
tabs_ids = ['tabs_' + tabs_id.lower() for tabs_id in tabs_ids]
tabs_ids = {tabs_id: alias for (tabs_id, alias) in zip(tabs_ids, aliases)}
tabs_ids

{'tabs_b': '42043',
 'tabs_d': '42048',
 'tabs_f': '42050',
 'tabs_j': '42044',
 'tabs_k': '42045',
 'tabs_r': '42051',
 'tabs_v': '42047',
 'tabs_w': '42049',
 'tabs_x': nan,
 'tabs_a': nan,
 'tabs_c': nan,
 'tabs_e': nan,
 'tabs_g': nan,
 'tabs_h': nan,
 'tabs_l': nan,
 'tabs_m': nan,
 'tabs_n': '42046',
 'tabs_p': nan,
 'tabs_s': nan}

Remove element from `ndbc_ids` if duplicate of `tabs_ids.values()` which are the station aliases.

In [212]:
print('Number of NDBC stations before removing TABS: ', len(ndbc_ids))
dupes = set(tabs_ids.values()) & set(ndbc_ids)
ndbc_ids = set(ndbc_ids) - set(tabs_ids.values())
print('Number of NDBC stations after removing TABS: ', len(ndbc_ids))
print('Duplicates (by alias name) that were in NDBC list: ', dupes)

Number of NDBC stations before removing TABS:  1902
Number of NDBC stations after removing TABS:  1893
Duplicates that were in NDBC list:  {'42044', '42051', '42046', '42049', '42043', '42045', '42047', '42048', '42050'}


### Axiom list

In [205]:
%%time
num_cores = multiprocessing.cpu_count()
downloads = Parallel(n_jobs=num_cores)(
    delayed(search)(tabs_id) for tabs_id in tabs_ids
)
axiom_tabs_ids = {tabs_id: axiom_id for (tabs_id, axiom_id) in downloads}

missing = list(axiom_tabs_ids.values()).count(None)
total = len(axiom_tabs_ids.keys())
words = '''Total TABS buoys: %i. Axiom has %i of them and is missing %i.''' % (total, total-missing, missing)
print(words)    

Total TABS buoys: 19. Axiom has 9 of them and is missing 10.
CPU times: user 120 ms, sys: 119 ms, total: 240 ms
Wall time: 8.65 s


## Compare NOAA lists for different organizations

Looking for duplicates.

Remove from `ndbc_ids` duplicates from the values of `coops_ids`.

In [102]:
print(len(ndbc_ids))
ndbc_ids2 = list(set(ndbc_ids) - set(coops_ids.values()))

In [103]:
noaa_ids = ndbc_ids + list(coops_ids.keys()) + ports_ids + dart_ids
len(noaa_ids)

5344

In [107]:
print(len(ndbc_ids), len(coops_ids.keys()))
len(set(ndbc_ids) - set(coops_ids.keys()))

1642 2721


1642

In [110]:
print(len(ndbc_ids), len(ports_ids))
len(set(ndbc_ids) - set(ports_ids))

1642 952


1642

In [113]:
print(len(ndbc_ids), len(dart_ids))
print(len(set(ndbc_ids) - set(dart_ids)))
set(dart_ids) - set(ndbc_ids)  # THESE ARE ALL ADRIFT AND DON"T WANT TO USE THEM. SO< DON"T NEED DART BUOYS SEPARATELY>

1642 29
1616


{'41d21', '41x24', '44d03'}

In [112]:
1642-1616

26

What if there are multiple names for ids? How to correctly match those, both in noaa lists and Axiom lists?

In [55]:
'8770475' in coops_ids

True

In [57]:
'PORT2' in ndbc_ids

True

# ---------------

In [465]:

kw = {
    "standard_name": "sea_water_practical_salinity",
    "min_lon": -99.0,
    "max_lon": -88.0,
    "min_lat": 20.0,
    "max_lat": 30.0,
    "min_time": "2016-07-10T00:00:00Z",
    "max_time": "2017-02-10T00:00:00Z"
}


search_url = e.get_search_url(response="csv", **kw, items_per_page=10000)
# search_url
search = pd.read_csv(search_url)
search
# print(search["Dataset ID"][0])

# dataset_id = search["Dataset ID"][0]
# info_url = e.get_info_url(dataset_id=dataset_id, response="csv")
# info = pd.read_csv(info_url)
# info[100:150]

Unnamed: 0,griddap,Subset,tabledap,Make A Graph,wms,files,Title,Summary,FGDC,ISO 19115,Info,Background Info,RSS,Institution,Dataset ID
0,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,Aransas Wildlife Refuge (TCOON),Timeseries data from 'Aransas Wildlife Refuge ...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/noaa...,https://sensors.ioos.us/#metadata/57560/station,http://erddap.sensors.axds.co/erddap/rss/noaa_...,NOAA Center for Operational Oceanographic Prod...,noaa_nos_co_ops_8774230
1,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Baffin Bay, TX","Timeseries data from 'Baffin Bay, TX' (urn:ioo...",http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/noaa...,https://sensors.ioos.us/#metadata/45616/station,http://erddap.sensors.axds.co/erddap/rss/noaa_...,NOAA Center for Operational Oceanographic Prod...,noaa_nos_co_ops_8776604
2,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Barataria Bay N of Grand Isle, LA",Timeseries data from 'Barataria Bay N of Grand...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/gov_...,https://sensors.ioos.us/#metadata/24642/station,http://erddap.sensors.axds.co/erddap/rss/gov_u...,USGS National Water Information System (NWIS),gov_usgs_waterdata_07380251
3,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Barataria Bay near Grand Terre Island, LA",Timeseries data from 'Barataria Bay near Grand...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/gov_...,https://sensors.ioos.us/#metadata/23003/station,http://erddap.sensors.axds.co/erddap/rss/gov_u...,USGS National Water Information System (NWIS),gov_usgs_waterdata_291929089562600
4,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Barataria Pass at Grand Isle, LA",Timeseries data from 'Barataria Pass at Grand ...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/gov_...,https://sensors.ioos.us/#metadata/61772/station,http://erddap.sensors.axds.co/erddap/rss/gov_u...,USGS National Water Information System (NWIS),gov_usgs_waterdata_073802516
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Texas Point, Sabine Pass","Timeseries data from 'Texas Point, Sabine Pass...",http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/noaa...,https://sensors.ioos.us/#metadata/57559/station,http://erddap.sensors.axds.co/erddap/rss/noaa_...,NOAA Center for Operational Oceanographic Prod...,noaa_nos_co_ops_8770822
62,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Trinity Rv at Wallisville, TX",Timeseries data from 'Trinity Rv at Wallisvill...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/gov_...,https://sensors.ioos.us/#metadata/32690/station,http://erddap.sensors.axds.co/erddap/rss/gov_u...,USGS National Water Information System (NWIS),gov_usgs_waterdata_08067252
63,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"USS Lexington, TX","Timeseries data from 'USS Lexington, TX' (urn:...",http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/noaa...,https://sensors.ioos.us/#metadata/45612/station,http://erddap.sensors.axds.co/erddap/rss/noaa_...,NOAA Center for Operational Oceanographic Prod...,noaa_nos_co_ops_8775296
64,,,http://erddap.sensors.axds.co/erddap/tabledap/...,http://erddap.sensors.axds.co/erddap/tabledap/...,,,"Vermilion Bay near Cypremort Point, LA",Timeseries data from 'Vermilion Bay near Cypre...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/metadata/...,http://erddap.sensors.axds.co/erddap/info/gov_...,https://sensors.ioos.us/#metadata/32721/station,http://erddap.sensors.axds.co/erddap/rss/gov_u...,USGS National Water Information System (NWIS),gov_usgs_waterdata_07387040


In [427]:
info[info['Attribute Name'] == 'defaultDataQuery']['Value'].values

array(['sea_surface_height_above_sea_level_geoid_mllw,air_temperature,sea_water_electrical_conductivity,sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw,wind_speed_of_gust,sea_water_temperature,z,wind_speed,time,wind_from_direction,air_pressure,sea_water_practical_salinity&time>=max(time)-3days'],
      dtype=object)

In [436]:
info[info['Row Type'] == 'variable']['Variable Name'].values

array(['time', 'latitude', 'longitude', 'z', 'air_pressure',
       'sea_water_electrical_conductivity',
       'sea_water_practical_salinity', 'air_temperature',
       'sea_water_temperature',
       'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw',
       'sea_surface_height_above_sea_level_geoid_mllw',
       'wind_speed_of_gust', 'wind_speed', 'wind_from_direction',
       'station'], dtype=object)

In [419]:
e.get_var_by_attr(
    dataset_id=dataset_id,
    standard_name="sea_water_temperature"
)


['sea_water_temperature']

In [420]:
e.get_var_by_attr(
    dataset_id=dataset_id,
    standard_name="sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw"
)


[]

In [421]:
e.get_var_by_attr(dataset_id, axis=lambda v: v in ["X", "Y", "Z", "T"])

['longitude', 'time', 'latitude', 'z']

In [430]:
e.get_var_by_attr(
    dataset_id=dataset_id,
    standard_name=lambda v: v in ["sea_water_practical_salinity","sea_water_temperature",'sea_surface_height',
 'sea_surface_height_above_sea_level',
 'sea_surface_height_above_sea_level_quality_flag',
 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide',
 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_quality_flag',
 'sea_surface_height_quality_flag']
)


['sea_water_practical_salinity',
 'sea_surface_height_above_sea_level_geoid_mllw',
 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw',
 'sea_water_temperature']

In [422]:
e.get_var_by_attr(
    dataset_id=dataset_id,
    standard_name="sea_water_practical_salinity"
)


['sea_water_practical_salinity']

In [445]:
url = e.get_search_url(search_for="tabs", response="csv")

df = pd.read_csv(url)

In [449]:
info_url = e.get_info_url(dataset_id='tabs_b', response="csv")
info = pd.read_csv(info_url)
info

Unnamed: 0,Row Type,Variable Name,Attribute Name,Data Type,Value
0,attribute,NC_GLOBAL,cdm_data_type,String,TimeSeries
1,attribute,NC_GLOBAL,cdm_timeseries_variables,String,"station,longitude,latitude"
2,attribute,NC_GLOBAL,contributor_email,String,"None,feedback@axiomdatascience.com"
3,attribute,NC_GLOBAL,contributor_name,String,Gulf of Mexico Coastal Ocean Observing System ...
4,attribute,NC_GLOBAL,contributor_role,String,"funder,processor"
...,...,...,...,...,...
202,attribute,station,ioos_category,String,Identifier
203,attribute,station,ioos_code,String,urn:ioos:station:com.axiomdatascience:57437
204,attribute,station,long_name,String,TABS Buoy B
205,attribute,station,short_name,String,urn:ioos:station:tabs:B


In [450]:
info[info['Row Type'] == 'variable']['Variable Name'].values

array(['time', 'latitude', 'longitude', 'z', 'air_pressure',
       'sea_water_electrical_conductivity',
       'sea_water_velocity_to_direction', 'sea_water_speed',
       'relative_humidity', 'sea_water_practical_salinity',
       'air_temperature', 'sea_water_temperature', 'wind_speed_of_gust',
       'wind_speed', 'wind_from_direction', 'station'], dtype=object)

In [452]:
url = e.get_categorize_url(
    categorize_by="standard_name",
    response="csv"
)

cats = pd.read_csv(url)["Category"]
print([name for name in cats if 'sea_surface_height' in name])
print([name for name in cats if 'sea_water_practical_salinity' in name])
print([name for name in cats if 'sea_water_temperature' in name])
print([name for name in cats if 'sea_water_speed' in name])
print([name for name in cats if 'sea_water_velocity_to_direction' in name])


['sea_surface_height', 'sea_surface_height_above_sea_level', 'sea_surface_height_above_sea_level_quality_flag', 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide', 'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_quality_flag', 'sea_surface_height_quality_flag']
['sea_water_practical_salinity', 'sea_water_practical_salinity_quality_flag']
['difference_between_sea_water_temperature_and_freezing_point', 'sea_water_temperature', 'sea_water_temperature_quality_flag']
['sea_water_speed', 'sea_water_speed_quality_flag']
['sea_water_velocity_to_direction', 'sea_water_velocity_to_direction_quality_flag']


In [494]:
# e = ERDDAP(server="http://erddap.sensors.axds.co/erddap")
e.constraints = None
e.protocol = "tabledap"
# e.variables = list(info[info['Row Type'] == 'variable']['Variable Name'].values)
e.variables = ["time","longitude", "latitude", "station", "sea_water_temperature"]#, "sea_water_practical_salinity",'sea_surface_height_above_sea_level_geoid_mllw',
#  'sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw']
e.dataset_id = dataset_id
# Drop units in the first line and NaNs.
df = e.to_pandas(response="csvp").dropna(axis='index', how='all').dropna(axis='columns', how='all')
df

ValueError: I/O operation on closed file.

In [368]:
df

Unnamed: 0,longitude,latitude,sea_water_temperature
64471,-69.248,40.503,12.7
64473,-69.248,40.503,12.7
64475,-69.248,40.503,12.7
64477,-69.248,40.503,12.7
64479,-69.248,40.503,12.7
...,...,...,...
127974,-69.248,40.503,11.8
127976,-69.248,40.503,11.8
127978,-69.248,40.503,11.9
127980,-69.248,40.503,11.9


# What data does Axiom already have compared to what is out there

## NDBC

### Active

Get list of active stations. 

List also available here: https://www.ndbc.noaa.gov/activestations.xml

In [233]:
# https://unidata.github.io/siphon/latest/examples/ndbc/latest_request.html
df = NDBC.latest_observations()
stations_ndbc_active = df.station.values
df.head()

Unnamed: 0,station,latitude,longitude,wind_direction,wind_speed,wind_gust,wave_height,dominant_wave_period,average_wave_period,dominant_wave_direction,pressure,3hr_pressure_tendency,air_temperature,water_temperature,dewpoint,visibility,water_level_above_mean,time
0,14041,-8.0,55.0,229.0,1.0,,,,,,1011.2,,28.3,29.6,,,,2021-03-22 17:00:00+00:00
1,14047,-4.0,57.0,,,,,,,,,,30.6,30.4,,,,2021-03-22 17:00:00+00:00
2,22101,37.24,126.02,180.0,3.0,,,,,,,,5.2,5.2,,,,2021-03-22 17:00:00+00:00
3,22102,34.79,125.78,240.0,3.0,,,,,,,,8.1,8.1,,,,2021-03-22 17:00:00+00:00
4,22103,34.0,127.5,290.0,9.0,,,,,,,,10.4,14.3,,,,2021-03-22 17:00:00+00:00


### Full

Get list of all stations, which would have to be filtered to find appropriate historical data

In [144]:
url = 'https://www.ndbc.noaa.gov/data/stations/station_table.txt'
dfall = pd.read_table(url, sep='|', index_col=0)
dfall

Unnamed: 0_level_0,OWNER,TTYPE,HULL,NAME,PAYLOAD,LOCATION,TIMEZONE,FORECAST,NOTE
# STATION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
#,,,,,,,,,
00922,DU,Slocum Glider,,OTN201 - 4800922,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
00923,DU,Slocum Glider,,OTN200 - 4800923,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
01500,R,Spray Glider,,SP031 - 3801500,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",?,,
01502,UA,Slocum Glider,,Penobscot - 4801502,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
...,...,...,...,...,...,...,...,...,...
ygnn6,EA,GLOS Weather Station,,"Niagara Coast Guard Station, NY",,"43.262 N 79.064 W (43&#176;15'42"" N 79&#176;3'...",E,,"For Great Lakes marine forecasts, select: <a h..."
ykrv2,PT,Water Level Observation Network,,"8637611 - York River East Rear Range Light, VA",,"37.251 N 76.342 W (37&#176;15'5"" N 76&#176;20'...",E,FZUS51.KAKQ,
yktv2,O,Water Level Observation Network,,"8637689 - Yorktown USCG Training Center, VA",,"37.227 N 76.479 W (37&#176;13'36"" N 76&#176;28...",E,FZUS51.KAKQ,
yrsv2,NR,NERRS Weather Station,,"Taskinas Creek, Chesapeake Bay Reserve, VA",,"37.414 N 76.712 W (37&#176;24'51"" N 76&#176;42...",E,,Water Quality data for this Reserve are availa...


Filter full list of NDBC stations to get those that would match with the descriptions of the active stations, and also limit to the US.

In [294]:
# descriptions used in active buoy list
descs = ['buoy', 'platform', 'tower', 'station', 'Water Level Observation Network']

# lon lat box to determine if in US
# [just west of AK stations, just east of Maine stations, just south of Hawaii, north of AK historial stations]
llbox = [-192, -65, 15.5, 76.5]

# loop over full list of NDBC stations and limit to those that seem to match
# those in the active list (avoiding listings like gliders)
# for id, ttype in dfall[['# STATION_ID ',' TTYPE ']].values:
stations_ndbc_all = []
for id, (loc, ttype) in zip(dfall.index.values[1:], dfall[[' LOCATION ', ' TTYPE ']].values[1:]):
    inList, toKeep = False, False
    # search for description in NDBC 
    for desc in descs:
        if desc in str(ttype):
            inList = True
    # filter by lon/lat
    if inList and ('N' in loc) and ('W' in loc):  # don't need any in southern hemisphere
#         print(loc)
        lat = float(loc.split('N')[0][:-1])
        lon = -float(loc.split('N')[1].split(' ')[1])
#         print(lon,lat)
        if (llbox[0]<lon<llbox[1]) and (llbox[2] < lat < llbox[3]):
            toKeep = True
#             print(id)
            stations_ndbc_all.append(id)
    

In [295]:
len(stations_ndbc_all)

606

### Axiom list

In [296]:
# agent id is 18 for NDBC
url2 = 'https://oikos.axds.co/rest/sensor/stations/by/agent/18'
ds2 = pd.read_json(url2)
ds2.set_index('id', inplace=True)
ds2.sourceLabel = [label.lower() for label in ds2.sourceLabel.values]
ds2.head()

Unnamed: 0_level_0,label,platformTypeId,sourceLabel,sourceUrl,isactive,submitToNdbc,addToThredds,wmoId,source,owner,otherAffiliations,lat,lon,elevation,active,visible
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
49357,"46247 - San Francisco Offshore, CA (180)",10,46247,https://www.ndbc.noaa.gov/station_page.php?sta...,False,False,False,,"{'id': 51, 'label': 'NetCDF', 'type': 'source'}","{'id': 18, 'label': 'NOAA National Data Buoy C...",[],37.752617,-122.83313,0,False,False
41916,"SHPF1 - SHP - Shell Point, FL",6,shpf1,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,SHPF1,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 144, 'label': 'USF CMS - Coastal Ocean ...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",30.058,-84.29,0,True,True
41922,SSBN7 - Sunset Beach Nearshore Waves,6,ssbn7,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,SSBN7,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 154, 'label': 'UNCW - Coastal Ocean Res...","[{'id': 234, 'label': 'Southeast Coastal Ocean...",33.83,-78.48,0,True,True
57366,KATP - Green Canyon 787 / Atlantis (BP),6,katp,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,KATP,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 18, 'label': 'NOAA National Data Buoy C...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",27.195,-90.027,0,True,True
15683,"PILA2 - Pilot Rock, AK",10,pila2,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,PILA2,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 18, 'label': 'NOAA National Data Buoy C...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",59.742,-149.47,24,True,True


In [331]:
urlall = 'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?datasetID%2Caccessible%2Cinstitution%2CdataStructure%2Ccdm_data_type%2Cclass%2Ctitle%2CminLongitude%2CmaxLongitude%2ClongitudeSpacing%2CminLatitude%2CmaxLatitude%2ClatitudeSpacing%2CminAltitude%2CmaxAltitude%2CminTime%2CmaxTime%2CtimeSpacing%2Cgriddap%2Csubset%2Ctabledap%2CMakeAGraph%2Csos%2Cwcs%2Cwms%2Cfiles%2Cfgdc%2Ciso19115%2Cmetadata%2CsourceUrl%2CinfoUrl%2Crss%2Cemail%2CtestOutOfDate%2CoutOfDate%2Csummary&minLongitude%3E=-99&maxLongitude%3C=-88&minLatitude%3E=20&maxLatitude%3E=31&minTime%3E=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z.nc'

In [335]:
urlall = 'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?minLongitude%2CmaxLongitude%2CminLatitude%2CmaxLatitude%2CminTime%2CmaxTime&minLongitude%3E=-99&maxLongitude%3C=-88&minLatitude%3E=20&maxLatitude%3E=31&minTime%3E=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z.nc'

In [None]:
'https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_244.html'
'https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_244.html?time%2Csea_water_velocity_to_direction%2Csea_water_speed%2Csea_water_velocity_to_direction_qc_agg%2Csea_water_speed_qc_agg'
'https://erddap.sensors.ioos.us/erddap/tabledap/indian-river-lagoon-fort-pierce-.html?time%2Cair_pressure%2Cair_pressure_qc_agg'

In [339]:
'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?accessible%2Cinstitution%2CdataStructure%2Ccdm_data_type%2Cclass%2CminLongitude%2CmaxLongitude%2CminLatitude%2CmaxLatitude%2CminTime%2CmaxTime%2CtestOutOfDate&accessible=%22public%22&minLongitude%3E=-99&maxLongitude%3C=-89&minLatitude%3E=20&maxLatitude%3C=30&minTime%3E=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z&distinct().nc'

'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?accessible%2Cinstitution%2CdataStructure%2Ccdm_data_type%2Cclass%2CminLongitude%2CmaxLongitude%2CminLatitude%2CmaxLatitude%2CminTime%2CmaxTime%2CtestOutOfDate&accessible=%22public%22&minLongitude%3E=-99&maxLongitude%3C=-89&minLatitude%3E=20&maxLatitude%3C=30&minTime%3E=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z&distinct().nc'

In [348]:
urlall = 'https://erddap.sensors.ioos.us/erddap/tabledap/indian-river-lagoon-fort-pierce-.nc?time%2Cair_pressure%2Cair_pressure_qc_agg'

In [349]:
import xarray as xr
ds = xr.open_dataset(urlall)

OSError: [Errno -90] NetCDF: file not found: b'https://erddap.sensors.ioos.us/erddap/tabledap/indian-river-lagoon-fort-pierce-.nc?time%2Cair_pressure%2Cair_pressure_qc_agg'

In [353]:
url = 'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?minLongitude=-99&maxLongitude=-89&minLatitude=20&maxLatitude=30&minTime=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z'
ds = xr.open_dataset(url)

OSError: [Errno -75] NetCDF: Malformed or unexpected Constraint: b'https://erddap.dataexplorer.oceanobservatories.org/erddap/tabledap/allDatasets.nc?minLongitude=-99&maxLongitude=-89&minLatitude=20&maxLatitude=30&minTime=2010-01-01T00%3A00%3A00Z&maxTime%3C=2020-01-01T00%3A00%3A00Z'

In [347]:
url = 'https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_244.csv?time%2Csea_water_velocity_to_direction%2Csea_water_speed%2Csea_water_velocity_to_direction_qc_agg%2Csea_water_speed_qc_agg'
https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_132.csv?time%2Csea_water_velocity_to_direction%2Csea_water_speed%2Csea_water_velocity_to_direction_qc_agg%2Csea_water_speed_qc_agg
dft = pd.read_csv(url)
dft

Unnamed: 0,time,sea_water_velocity_to_direction,sea_water_speed,sea_water_velocity_to_direction_qc_agg,sea_water_speed_qc_agg
0,UTC,degrees,m.s-1,,
1,2019-02-16T01:03:45Z,,,,
2,2019-02-16T01:32:45Z,,,,
3,2019-02-16T01:33:45Z,,,,
4,2019-02-16T02:02:45Z,,,,
...,...,...,...,...,...
116294,2021-03-22T21:55:00Z,,,,
116295,2021-03-22T22:00:00Z,125.69230651855469,0.22499999403953552,1.0,1.0
116296,2021-03-22T22:05:00Z,,,,
116297,2021-03-22T22:10:00Z,116.02198028564453,0.2619999945163727,1.0,1.0


In [354]:
url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-134.69678953882718%2C33.10075517593025%5D%2C%5B-122.22549788393279%2C33.10075517593025%5D%2C%5B-122.22549788393279%2C43.67582565045436%5D%2C%5B-134.69678953882718%2C43.67582565045436%5D%2C%5B-134.69678953882718%2C33.10075517593025%5D%5D%5D%7D&startDateTime=1616140800&endDateTime=1616486399'

In [361]:
ds = pd.read_json(url, typ='series')
ds

ValueError: Expected object or value

In [371]:
import io
url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10&type=sensor_station'
res = io.StringIO(url)
# result.decode('utf-8')
# pd.read_json(url)

In [531]:
nc4.date2num?

[0;31mDocstring:[0m
date2num(dates, units, calendar=None)

Return numeric time values given datetime objects. The units
of the numeric time values are described by the **units** argument
and the **calendar** keyword. The datetime objects must
be in UTC with no time-zone offset.  If there is a
time-zone offset in **units**, it will be applied to the
returned numeric values.

**dates**: A datetime object or a sequence of datetime objects.
The datetime objects should not include a time-zone offset. They
can be either native python datetime instances (which use
the proleptic gregorian calendar) or cftime.datetime instances.

**units**: a string of the form **<time units> since <reference time>**
describing the time units. **<time units>** can be days, hours, minutes,
seconds, milliseconds or microseconds. **<reference time>** is the time
origin. **months_since** is allowed *only* for the **360_day** calendar.

**calendar**: describes the calendar to be used in the time calculations.
All 

In [529]:
import netCDF4 as nc4
nc4.num2date(1000186399, units='seconds since 1970-01-01')

cftime.DatetimeGregorian(2001, 9, 11, 5, 33, 19, 0)

In [532]:
nc4.date2num(pd.Timestamp('2010-01-01'), units='seconds since 1970-01-01')

1262304000

In [539]:
import urllib
import json
# url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22,%22coordinates%22%3A[[[-99,20],[-88,20],[-88,30],[-99,30],[-99,20]]]%7D&searchStartDateTime=1262304000&endDateTime=1616486399'
url = 'https://search.axds.co/v2/search?portalId=45&page=1&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-134.40150203577394%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C50.51342652633956%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&type=sensor_station&type=parameter_group&type=affiliate&pageSize=1'
# with specific variable
url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-134.40150203577394%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C50.51342652633956%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&tag=Parameter%20Group%3ATemperature%3A%20Water%20Temperature'
# with multiple variables
url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-100.52377679565701%2C16.29905101458183%5D%2C%5B-77.97366543708242%2C16.29905101458183%5D%2C%5B-77.97366543708242%2C32.84267363195431%5D%2C%5B-100.52377679565701%2C32.84267363195431%5D%2C%5B-100.52377679565701%2C16.29905101458183%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&tag=Parameter%20Group%3ATemperature%3A%20Water%20Temperature&tag=Parameter%20Group%3ACurrents'
result = urllib.request.urlopen(url)
result = result.readline()
res = result.decode('utf-8')
res = res[len('callback('):-1]
res
res = json.loads(res)
res
meta = pd.json_normalize(res)
meta

Unnamed: 0,error,accessMethods,types,results,totalHits,page,totalPages,searchTimeInMillis,totalTimeInMillis,portalStartDateTime,...,searchEndDateTime,successfulShards,failedShards,totalShards,shardSearchFailures,query,explanations,tags.Data Provider,tags.Parameter Group,tags.Affiliate
0,,[],"[{'id': 'sensor_station', 'label': 'Sensor Sta...","[{'data': {'hasNarrative': False, 'figures': [...",804,1,1,147,199,1918-10-12T01:00:00.000Z,...,2021-03-30T10:55:00.000Z,5,0,5,[],,,"[{'id': None, 'label': 'Amerada Hess Corporati...","[{'id': None, 'label': 'Atmospheric Pressure: ...","[{'id': None, 'label': 'Amerada Hess Corporati..."


In [540]:
meta.columns

Index(['error', 'accessMethods', 'types', 'results', 'totalHits', 'page',
       'totalPages', 'searchTimeInMillis', 'totalTimeInMillis',
       'portalStartDateTime', 'portalEndDateTime', 'searchStartDateTime',
       'searchEndDateTime', 'successfulShards', 'failedShards', 'totalShards',
       'shardSearchFailures', 'query', 'explanations', 'tags.Data Provider',
       'tags.Parameter Group', 'tags.Affiliate'],
      dtype='object')

In [541]:
meta[['portalStartDateTime', 'portalEndDateTime', 'searchStartDateTime',
       'searchEndDateTime']]

Unnamed: 0,portalStartDateTime,portalEndDateTime,searchStartDateTime,searchEndDateTime
0,1918-10-12T01:00:00.000Z,2021-03-30T17:53:00.000Z,1990-07-28T16:30:00.000Z,2021-03-30T10:55:00.000Z


In [495]:
meta[['portalStartDateTime', 'portalEndDateTime', 'searchStartDateTime',
       'searchEndDateTime']]

Unnamed: 0,portalStartDateTime,portalEndDateTime,searchStartDateTime,searchEndDateTime
0,1918-10-12T01:00:00.000Z,2021-03-30T16:54:00.000Z,1990-07-28T16:30:00.000Z,2021-03-30T10:55:00.000Z


In [459]:
len(meta['results'][0])

570

### Compare lists

These stations are on both the full NDBC list and Axiom's NDBC list

In [304]:
onbothlists = list(set(stations_ndbc_all) & set(list(ds2.sourceLabel.values)))

In [325]:
summary = '''There are:
             -%i buoys on the active NDBC buoy list,
             -%i buoys on the full NDBC list, 
             -%i buoys on the filtered full NDBC to try to catch only appropriate U.S. buoys, 
             -%i buoys on the Axiom NDBC list, and 
             -%i buoys that are on both the Axiom and filtered full NDBC list''' \
             % (len(stations_ndbc_active), len(dfall), len(stations_ndbc_all),
                len(ds2), len(onbothlists))
print(summary)

There are:
             -733 buoys on the active NDBC buoy list,
             -1935 buoys on the full NDBC list, 
             -606 buoys on the filtered full NDBC to try to catch only appropriate U.S. buoys, 
             -1001 buoys on the Axiom NDBC list, and 
             -236 buoys that are on both the Axiom and filtered full NDBC list


In [328]:
print('The following NDBC buoys are not on the Axiom list: ', list(set(stations_ndbc_all) - set(list(ds2.sourceLabel.values))))

The following NDBC buoys are not on the Axiom list:  ['tcbm2', 'scis1', 'atgm1', 'tcmw1', 'pxsc1', '41005', '42037', 'mcyf1', 'bdsp1', 'fptt2', '42006', 'neaw1', '46304', 'gtot2', 'sapf1', 'camm2', 'gctf1', 'mbet2', 'espp4', 'ppxc1', 'nwwh1', '42041', 'labl1', 'acyn4', 'amps3', 'gwpm6', 'dkcm6', 'mgpt2', 'eb31', '46416', 'ulra2', 'darts', 'pnlm4', '46037', 'pmoa2', 'optf1', '41021', 'msg10', 'pvdr1', 'pxac1', 'sblf1', 'eb43', 'nwcl1', 'chyw1', 'mzxc1', 'txpt2', 'mros1', 'psxc1', '44490', 'hrbm4', 'qptr1', 'lndc1', 'msg14', '46008', 'wahv2', '44038', '42009', 'crva2', 'jmpn7', '44026', 'pmaf1', '41011', '46flo', 'frvm3', 'kwhh1', 'tlbo3', 'dartl', 'dartn', 'smoc1', 'brhc3', 'oouh1', 'apcf1', 'cndo1', 'nfdf1', 'amrl1', 'bltm2', '42015', 'orin7', 'mnmm4', '41012', 'lpnm4', 'pfdc1', 'kptn6', 'fcgt2', 'bftn7', '44023', 'rplv2', 'cman4', 'tpaf1', 'ptit2', 'alxn6', 'casm1', 'dmbc1', 'pacf1', 'clbp4', '42025', 'capl1', 'mnpv2', 'frdf1', 'dpxc1', '41023', 'gom01', 'blif1', 'skcf1', '46107', 'sh

In [300]:
stations_ndbc_all

606

In [47]:
[('buoy' in str(ttype).lower()) or ('platform' in str(ttype).lower()) or ('station' in str(ttype).lower()) or 
 ('tower' in str(ttype).lower()) or ('Water Level Observation Network' in str(ttype).lower()) for ttype in allstations[' TTYPE ']].count(True)

1385

DON'T INCLUDE IF OUTSIDE US

In [125]:
stationlist = [('buoy' in str(ttype).lower()) or ('platform' in str(ttype).lower()) or ('station' in str(ttype).lower()) or 
 ('tower' in str(ttype).lower()) or ('Water Level Observation Network' in str(ttype).lower()) for ttype in allstations[' TTYPE ']]
stationlist = list(allstations.iloc[stationlist].index.values)
stationlist

['0y2w3',
 '13001',
 '13002',
 '13008',
 '13009',
 '13010',
 '14040',
 '14041',
 '14043',
 '14047',
 '15001',
 '15002',
 '15006',
 '15007',
 '15319',
 '18ci3',
 '18cy3',
 '20cm4',
 '21178',
 '21346',
 '21347',
 '21348',
 '21401',
 '21413',
 '21414',
 '21415',
 '21416',
 '21417',
 '21418',
 '21419',
 '21420',
 '21595',
 '21597',
 '21598',
 '21600',
 '21636',
 '21637',
 '21640',
 '21707',
 '21d13',
 '22101',
 '22102',
 '22103',
 '22104',
 '22105',
 '22106',
 '22107',
 '22108',
 '23001',
 '23003',
 '23004',
 '23006',
 '23007',
 '23008',
 '23009',
 '23010',
 '23011',
 '23012',
 '23013',
 '23014',
 '23015',
 '23016',
 '23017',
 '23020',
 '23217',
 '23218',
 '23219',
 '23220',
 '23223',
 '23225',
 '23226',
 '23227',
 '23228',
 '23401',
 '28401',
 '31001',
 '31002',
 '31003',
 '31004',
 '31005',
 '31006',
 '31007',
 '31051',
 '31052',
 '31053',
 '31201',
 '31260',
 '31261',
 '31478',
 '32012',
 '32066',
 '32067',
 '32068',
 '32069',
 '32301',
 '32302',
 '32401',
 '32402',
 '32403',
 '32404',


In [51]:
[station for station in allstations.index if ('buoy' in str(station[' TTYPE ']).lower()) or ('platform' in str(station[' TTYPE ']).lower()) or ('station' in str(station[' TTYPE ']).lower()) or 
 ('tower' in str(station[' TTYPE ']).lower()) or ('Water Level Observation Network' in str(station[' TTYPE ']).lower())]

TypeError: string indices must be integers

In [66]:
allstations.loc['00922'][' TTYPE ']

'Slocum Glider'

In [71]:
[station for station in allstations.index[1:] if 'buoy' in str(allstations.loc[station][' TTYPE '])]

['21413',
 '21414',
 '21415',
 '21416',
 '21417',
 '21418',
 '21419',
 '21420',
 '21d13',
 '32012',
 '32066',
 '32067',
 '32068',
 '32069',
 '32301',
 '32302',
 '32401',
 '32402',
 '32411',
 '32412',
 '32413',
 '32489',
 '32d12',
 '41001',
 '41002',
 '41003',
 '41004',
 '41005',
 '41006',
 '41007',
 '41008',
 '41009',
 '41010',
 '41011',
 '41012',
 '41013',
 '41015',
 '41016',
 '41017',
 '41018',
 '41021',
 '41022',
 '41023',
 '41025',
 '41035',
 '41036',
 '41040',
 '41041',
 '41043',
 '41044',
 '41046',
 '41047',
 '41048',
 '41049',
 '41060',
 '41061',
 '41420',
 '41421',
 '41424',
 '41425',
 '41a46',
 '41b41',
 '41d20',
 '41d21',
 '41s43',
 '41s46',
 '41x01',
 '41x24',
 '42001',
 '42002',
 '42003',
 '42004',
 '42005',
 '42006',
 '42007',
 '42009',
 '42012',
 '42015',
 '42016',
 '42017',
 '42018',
 '42019',
 '42020',
 '42025',
 '42035',
 '42036',
 '42037',
 '42038',
 '42039',
 '42040',
 '42041',
 '42042',
 '42053',
 '42054',
 '42055',
 '42056',
 '42057',
 '42058',
 '42059',
 '42060',


In [44]:
[allstations.loc[station.lower(),' TTYPE '] for station in df.station if station.lower() in allstations.index.values]

['Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Atlas Buoy',
 'Ocean Reference Station',
 '3-meter discus buoy',
 '3-meter foam buoy',
 '3-meter foam buoy',
 '3-meter foam buoy',
 'Moored Buoy',
 'Moored Buoy',
 'Moored Buoy',
 'Moored Buoy',
 'Moored Buoy',
 '3-meter foam buoy',
 '3-meter discus buoy',
 '3-meter discus buoy',
 '3-meter discus buoy',
 '3-meter foam buoy',
 '3-meter discus buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Waverider Buoy',
 'Atlas Buoy',
 'Waverider Buoy',
 'Ocean Reference Station',
 '3-meter discus buoy',
 '3-meter discus buoy',
 '3-meter discus buoy',
 'Offshore Buoy',
 '2.1-meter ionomer foam buoy',
 'Offshore Buoy',
 'Offshore Buoy',
 'Offshore Buoy',
 '3-meter discus buoy',
 '3-meter foam 

In [40]:
[station.lower() in allstations.index.values for station in df.station].count(True)

738

In [25]:
allstations['# STATION_ID '].values

array(['#', '00922', '00923', ..., 'yktv2', 'yrsv2', 'zbqn7'],
      dtype=object)

In [32]:
df.station

0      13001
1      13002
2      13008
3      14040
4      14041
       ...  
733    WWEF1
734    WYCM6
735    YATA2
736    YKRV2
737    YKTV2
Name: station, Length: 738, dtype: object

### Axiom NDBC station

In [213]:
url2 = 'https://oikos.axds.co/rest/sensor/stations/by/agent/18'
ds2 = pd.read_json(url2)
ds2.set_index('id', inplace=True)
ds2.sourceLabel = [label.lower() for label in ds2.sourceLabel.values]
ds2

Unnamed: 0_level_0,label,platformTypeId,sourceLabel,sourceUrl,isactive,submitToNdbc,addToThredds,wmoId,source,owner,otherAffiliations,lat,lon,elevation,active,visible
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
49357,"46247 - San Francisco Offshore, CA (180)",10,46247,https://www.ndbc.noaa.gov/station_page.php?sta...,False,False,False,,"{'id': 51, 'label': 'NetCDF', 'type': 'source'}","{'id': 18, 'label': 'NOAA National Data Buoy C...",[],37.752617,-122.833130,0,False,False
41916,"SHPF1 - SHP - Shell Point, FL",6,shpf1,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,SHPF1,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 144, 'label': 'USF CMS - Coastal Ocean ...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",30.058000,-84.290000,0,True,True
41922,SSBN7 - Sunset Beach Nearshore Waves,6,ssbn7,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,SSBN7,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 154, 'label': 'UNCW - Coastal Ocean Res...","[{'id': 234, 'label': 'Southeast Coastal Ocean...",33.830000,-78.480000,0,True,True
57366,KATP - Green Canyon 787 / Atlantis (BP),6,katp,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,KATP,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 18, 'label': 'NOAA National Data Buoy C...","[{'id': 2009, 'label': 'World Meteorological O...",27.195000,-90.027000,0,True,True
15683,"PILA2 - Pilot Rock, AK",10,pila2,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,PILA2,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 18, 'label': 'NOAA National Data Buoy C...","[{'id': 2009, 'label': 'World Meteorological O...",59.742000,-149.470000,24,True,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58090,T2N155W (51021),6,t2n155w,http://tao.ndbc.noaa.gov/refreshed/site.php?si...,True,False,False,51021,"{'id': 87, 'label': 'Ocean SITES', 'type': 'so...","{'id': 149, 'label': 'OceanSITES', 'type': 'Ow...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",6.639700,-146.119995,0,True,True
57574,T0N110W (32323),6,t0n110w,http://tao.ndbc.noaa.gov/refreshed/site.php?si...,True,False,False,32323,"{'id': 87, 'label': 'Ocean SITES', 'type': 'so...","{'id': 149, 'label': 'OceanSITES', 'type': 'Ow...","[{'id': 2009, 'label': 'World Meteorological O...",0.046600,-109.917999,0,True,True
58106,T5N155W (51020),6,t5n155w,http://tao.ndbc.noaa.gov/refreshed/site.php?si...,True,False,False,51020,"{'id': 87, 'label': 'Ocean SITES', 'type': 'so...","{'id': 149, 'label': 'OceanSITES', 'type': 'Ow...","[{'id': 2009, 'label': 'World Meteorological O...",4.966200,-154.947006,0,True,True
75575,"OCSM2 - Ocean City, MD",6,ocsm2,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 202, 'label': 'U.S. Army Corps of Engin...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",38.338000,-75.070000,0,True,True


In [215]:
len([station for station in df.station if (station.lower() in ds2.sourceLabel.values)])# or (station in str(ds2.index.values)) or (station in ds2.label.values)])

429

In [197]:
[station for station in stationlist if (station.lower() in ds2.sourceLabel.values) or (station.lower() in str(ds2.index.values))]

['21346',
 '21347',
 '21348',
 '21413',
 '21414',
 '21415',
 '21416',
 '21418',
 '21419',
 '21595',
 '21597',
 '21598',
 '21637',
 '23218',
 '23219',
 '23220',
 '23223',
 '23226',
 '23227',
 '23228',
 '23401',
 '32012',
 '32066',
 '32067',
 '32401',
 '32402',
 '32403',
 '32411',
 '32412',
 '32413',
 '32489',
 '34420',
 '41001',
 '41002',
 '41004',
 '41008',
 '41009',
 '41010',
 '41013',
 '41024',
 '41025',
 '41029',
 '41033',
 '41036',
 '41037',
 '41038',
 '41040',
 '41041',
 '41043',
 '41044',
 '41046',
 '41047',
 '41048',
 '41049',
 '41051',
 '41052',
 '41053',
 '41056',
 '41057',
 '41060',
 '41062',
 '41063',
 '41064',
 '41108',
 '41110',
 '41112',
 '41113',
 '41114',
 '41115',
 '41116',
 '41117',
 '41118',
 '41159',
 '41420',
 '41421',
 '41424',
 '42001',
 '42002',
 '42003',
 '42012',
 '42013',
 '42019',
 '42020',
 '42022',
 '42023',
 '42035',
 '42036',
 '42039',
 '42040',
 '42055',
 '42056',
 '42057',
 '42058',
 '42059',
 '42060',
 '42067',
 '42085',
 '42088',
 '42089',
 '42090',


In [217]:
len(set(stationlist) & set(list(ds2.sourceLabel.values)))

783

In [218]:
len(stationlist)

1385

In [149]:
len(ds2)

1001

What is a buoy that is present in Axiom list but not NDBC list?

In [219]:
notndbc = list(set(ds2.sourceLabel.values) - set(stationlist))
len(notndbc)

216

In [228]:
list(set(stationlist) - set(ds2.sourceLabel.values))

['62091',
 '42094',
 '53009',
 '45178',
 '53040',
 '32302',
 '45019',
 '52842',
 'caro3',
 '22107',
 '45183',
 '41005',
 '42037',
 '42010',
 '46245',
 '48677',
 'erxa2',
 '31003',
 '44142',
 'cdxa2',
 '41098',
 'gptw1',
 'nwst2',
 '42006',
 '46264',
 '46304',
 'shxa2',
 '42078',
 '22103',
 '91356',
 '31052',
 'lprp4',
 '22102',
 '42008',
 'jnga2',
 '46111',
 'lixa2',
 '42041',
 'dmno3',
 'dsln7',
 '21417',
 'amps3',
 '42024',
 '22106',
 '91222',
 '22104',
 'huqn6',
 'ildl1',
 'spag1',
 'eb31',
 'farp2',
 '46416',
 'darts',
 '41193',
 'mxxa2',
 '91352',
 '46016',
 '53401',
 '46037',
 '62052',
 'kcmb',
 'acmn4',
 '41021',
 'msg10',
 '44086',
 'fpxc1',
 'eb43',
 '48680',
 '42014',
 'eb92',
 '45017',
 '44076',
 '44488',
 'ehsc1',
 'mbnn7',
 '31007',
 'hssf1',
 '41933',
 '42391',
 '46269',
 '51212',
 '44490',
 '41027',
 '46250',
 'tibc1',
 'msg14',
 '46008',
 '44012',
 'svls1',
 'tdpc1',
 '44038',
 '42009',
 '44174',
 'dryf1',
 '23001',
 '46266',
 '46780',
 'hplm2',
 'gbcl1',
 '44026',
 'mb

In [227]:
ds2[ds2['sourceLabel'] == notndbc[-50]][['owner','sourceUrl']].values#['sourceUrl'].values

array([[{'id': 205, 'label': 'Amerada Hess Corporation', 'type': 'Owner'},
        'https://www.ndbc.noaa.gov/station_page.php?station=42919']],
      dtype=object)

In [192]:
ds2[ds2['sourceLabel'] == notndbc]

Unnamed: 0_level_0,label,platformTypeId,sourceLabel,sourceUrl,isactive,submitToNdbc,addToThredds,wmoId,source,owner,otherAffiliations,lat,lon,elevation,active,visible
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
13782,"AUGA2 - Augustine Island, AK",10,AUGA2,https://www.ndbc.noaa.gov/station_page.php?sta...,True,False,False,AUGA2,"{'id': 18, 'label': 'National Data Buoy Center...","{'id': 18, 'label': 'NOAA National Data Buoy C...","[{'id': 18, 'label': 'NOAA National Data Buoy ...",59.378,-153.348,0,True,True


In [193]:
NDBC.buoy_data_types('AUGA2')

{}

## Axiom CO-OPS stations

In [72]:
url1 = 'https://oikos.axds.co/rest/sensors/stations/by/agent/9'
url2 = 'https://oikos.axds.co/rest/sensor/stations/by/agent/9'

In [95]:
ds1 = pd.read_json(url1)
ds1

Unnamed: 0,id,label,slug,lat,lon,z,platformType,visible,publicNotice,hasNarrative,stationAgents,datasetId
0,52539,El Capitan Passage,noaa_nos_co_ops_9450997,56.163300,-133.330000,0.00,fixed,True,,False,"[{'id': 252, 'associationType': 'publisher', '...",noaa_nos_co_ops_9450997
1,15253,"Red Bluff Bay, Baranof Island",noaa_nos_co_ops_9451467,56.856700,-134.723000,0.00,fixed,True,,False,"[{'id': 258, 'associationType': 'owner', 'fore...",noaa_nos_co_ops_9451467
2,15629,Perry Island (South Bay),noaa_nos_co_ops_9454721,60.671700,-147.932000,0.00,fixed,True,,False,"[{'id': 272, 'associationType': 'owner', 'fore...",noaa_nos_co_ops_9454721
3,45388,"Beck Island, Clarence Strait",noaa_nos_co_ops_9450906,56.046700,-132.862000,0.00,fixed,True,,False,"[{'id': 250, 'associationType': 'owner', 'fore...",noaa_nos_co_ops_9450906
4,13822,"KDAA2 - 9457292- Kodiak Island, AK",noaa_nos_co_ops_kdaa2,57.730000,-152.514000,0.00,fixed,True,,False,"[{'id': 942, 'associationType': 'affiliate', '...",noaa_nos_co_ops_kdaa2
...,...,...,...,...,...,...,...,...,...,...,...,...
335,105538,"Aguchik Island, AK, Tide Station (9456901)",aguchik-island-ak-tide-station-9,58.294639,-154.265611,0.00,fixed,True,,True,"[{'id': 13284, 'associationType': 'publisher',...",aguchik-island-ak-tide-station-9
336,100007,"Chinitna Bay, AK, Tide Station (9456357) [Prel...",chinitna-bay-ak-tide-station-,59.842095,-152.992627,-16.36,fixed,False,,True,"[{'id': 34, 'associationType': 'owner', 'forei...",chinitna-bay-ak-tide-station-
337,100008,"Coal Point, Homer, AK, Tide Station (9455558)",coal-point-homer-ak-tide-stat,59.602639,-151.410306,-17.00,fixed,True,,True,"[{'id': 13094, 'associationType': 'sponsor', '...",coal-point-homer-ak-tide-stat
338,103695,"Gadsden Cut, Tampa Bay",gadsden-cut-tampa-bay,27.773528,-82.516861,0.00,fixed,True,,False,"[{'id': 8217, 'associationType': 'owner', 'for...",gadsden-cut-tampa-bay


Which active NDBC stations are found in url2?

In [97]:
ds2 = pd.read_json(url2)
ds2

Unnamed: 0,id,label,platformTypeId,sourceLabel,sourceUrl,isactive,submitToNdbc,addToThredds,wmoId,source,owner,otherAffiliations,lat,lon,elevation,active,visible
0,46477,"Dahlgren, Upper Machodoc Creek, Va.",10,8635027,https://tidesandcurrents.noaa.gov/noaatidepred...,False,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...","[{'id': 236, 'label': 'Mid-Atlantic Coastal Oc...",38.3200,-77.0367,0,False,True
1,15298,"Muir Inlet, Glacier Bay",10,9452584,https://tidesandcurrents.noaa.gov/noaatidepred...,False,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],58.9133,-136.1080,0,False,False
2,15656,"AKUTAN, ALASKA",10,9462694,https://tidesandcurrents.noaa.gov/noaatidepred...,False,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],54.1333,-165.7773,0,False,False
3,52545,"TEXAS POINT, SABINE PASS (TCOON)",10,8770822,https://tidesandcurrents.noaa.gov/noaatidepred...,False,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],29.6893,-93.8418,0,False,True
4,46191,BRANDYWINE SHOAL LIGHT,10,8555889,https://tidesandcurrents.noaa.gov/noaatidepred...,False,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...","[{'id': 236, 'label': 'Mid-Atlantic Coastal Oc...",38.9867,-75.1133,0,False,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3339,47827,Queen Isabella Causeway,10,8779724,https://tidesandcurrents.noaa.gov/noaatidepred...,True,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],26.0783,-97.1700,0,True,True
3340,47584,"Hungry Harbor, Wash.",10,9440563,https://tidesandcurrents.noaa.gov/noaatidepred...,True,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],46.2583,-123.8480,0,True,True
3341,47835,Ifalik Atoll,10,TPT2647,https://tidesandcurrents.noaa.gov/noaatidepred...,True,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],7.2500,144.4500,0,True,True
3342,46181,"Stone Harbor, Great Channel",10,8535581,https://tidesandcurrents.noaa.gov/noaatidepred...,True,False,False,,"{'id': 9, 'label': 'Center for Operational Oce...","{'id': 9, 'label': 'NOAA Center for Operationa...",[],39.0567,-74.7650,0,True,True


In [115]:
len([station for station in df.station if (station in ds2.sourceLabel.values) or (station in str(ds2.id.values))])

50

In [113]:
ds2.id.values

array([46477, 15298, 15656, ..., 47835, 46181, 46928])

In [109]:
[label for label in ds2.sourceLabel if label in df.station.values]

['SKTA2',
 'OVIA2',
 'CECC1',
 'LJAC1',
 'OHBC1',
 'PRDA2',
 'MGZP4',
 'VDZA2',
 'NMTA2',
 'SDBC1',
 'MHRN6',
 'ITKA2',
 'SWLA2',
 'ALIA2',
 'KECA2',
 'FREL1',
 'PRUR1',
 'PORO3',
 'CHAO3',
 'PFXC1',
 'RDDA2',
 'SNDA2',
 'NKTA2',
 'KGCA2',
 'PRJC1',
 'YATA2',
 'ELFA2',
 'RTYC1',
 'NTBC1',
 'UPBC1',
 'PCOC1',
 'OMHC1',
 'PRYC1',
 'SBEO3',
 'ANTA2',
 'OLSA2',
 'FTPC1',
 'PSLC1',
 'RCMC1',
 'ANVC1',
 'ICAC1',
 'HBYC1',
 'AAMC1',
 'CPVM2',
 'UNLA2',
 'PLXA2',
 'ATKA2',
 'ADKA2',
 'KDAA2']

In [126]:
[station for station in stationlist if (station in ds2.sourceLabel.values) or (station in str(ds2.id.values))]

['46181']

In [134]:
list(ds2.sourceLabel.values)

['8635027',
 '9452584',
 '9462694',
 '8770822',
 '8555889',
 '8774230',
 'SKTA2',
 '9450997',
 '9458849',
 '8740166',
 'OVIA2',
 'CECC1',
 '9452611',
 '9457283',
 '9469439',
 '9455204',
 'LJAC1',
 '9451124',
 '9491873',
 '9451263',
 '9455869',
 '9459465',
 '9457724',
 '9452368',
 '9455711',
 '9491253',
 'OHBC1',
 'PRDA2',
 '9466477',
 '9454616',
 '9457376',
 '9456901',
 '9457634',
 '9466931',
 '9454751',
 '9451335',
 '9454721',
 '9457152',
 '9451467',
 '9454652',
 '9458819',
 '9462723',
 '9462782',
 '9454777',
 '9452022',
 '9454757',
 '9450906',
 '9455159',
 '9462719',
 '9455437',
 '9454662',
 '9454794',
 '9462662',
 '9456173',
 '9450913',
 '9450998',
 '9462787',
 '9466298',
 '9450296',
 '9462705',
 '9451936',
 '9451422',
 '9451434',
 '9451005',
 '9465261',
 '9453208',
 '9466153',
 '9459163',
 '9455145',
 '9458779',
 '9452336',
 '9454949',
 '9454825',
 '9469338',
 '9451853',
 '9462711',
 '9490424',
 '9451317',
 '9459758',
 '9454755',
 '9466057',
 '9469239',
 '9451906',
 '9450970',
 '94

In [133]:
set(stationlist) & set(list(ds2.sourceLabel.values))

set()

In [139]:
list(ds2.id.values)

[46477,
 15298,
 15656,
 52545,
 46191,
 52549,
 13804,
 52539,
 15451,
 47744,
 13768,
 18403,
 15744,
 15411,
 15589,
 15366,
 16977,
 15194,
 19977,
 15196,
 15638,
 15465,
 15422,
 15295,
 15729,
 15733,
 16978,
 13839,
 15574,
 15326,
 15432,
 15440,
 15416,
 52542,
 15331,
 15206,
 15629,
 15401,
 15253,
 15627,
 15446,
 15873,
 15486,
 15355,
 15229,
 15353,
 45388,
 15365,
 15653,
 15375,
 15335,
 15351,
 15657,
 15394,
 15183,
 52540,
 15484,
 20277,
 15594,
 20665,
 15275,
 15208,
 15200,
 15141,
 15568,
 15305,
 15662,
 15650,
 15360,
 15448,
 15241,
 15350,
 15349,
 15588,
 15273,
 15655,
 19976,
 15198,
 15470,
 15356,
 15661,
 15587,
 15222,
 15142,
 15619,
 20662,
 15444,
 15134,
 15271,
 15673,
 15389,
 20663,
 15382,
 15114,
 15306,
 15232,
 45389,
 15129,
 15453,
 15218,
 15263,
 15443,
 15586,
 15408,
 15307,
 20664,
 15221,
 15438,
 20703,
 15746,
 15171,
 15321,
 15381,
 15660,
 15199,
 15244,
 15135,
 15745,
 15550,
 15342,
 15441,
 15487,
 15144,
 15391,
 15362,


## HF Radar

## Compare amount of available data

### NDBC

In [144]:
url = 'https://www.ndbc.noaa.gov/data/stations/station_table.txt'
dfall = pd.read_table(url, sep='|', index_col=0)
dfall

Unnamed: 0_level_0,OWNER,TTYPE,HULL,NAME,PAYLOAD,LOCATION,TIMEZONE,FORECAST,NOTE
# STATION_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
#,,,,,,,,,
00922,DU,Slocum Glider,,OTN201 - 4800922,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
00923,DU,Slocum Glider,,OTN200 - 4800923,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
01500,R,Spray Glider,,SP031 - 3801500,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",?,,
01502,UA,Slocum Glider,,Penobscot - 4801502,,"30.000 N 90.000 W (30&#176;0'0"" N 90&#176;0'0"" W)",E,,
...,...,...,...,...,...,...,...,...,...
ygnn6,EA,GLOS Weather Station,,"Niagara Coast Guard Station, NY",,"43.262 N 79.064 W (43&#176;15'42"" N 79&#176;3'...",E,,"For Great Lakes marine forecasts, select: <a h..."
ykrv2,PT,Water Level Observation Network,,"8637611 - York River East Rear Range Light, VA",,"37.251 N 76.342 W (37&#176;15'5"" N 76&#176;20'...",E,FZUS51.KAKQ,
yktv2,O,Water Level Observation Network,,"8637689 - Yorktown USCG Training Center, VA",,"37.227 N 76.479 W (37&#176;13'36"" N 76&#176;28...",E,FZUS51.KAKQ,
yrsv2,NR,NERRS Weather Station,,"Taskinas Creek, Chesapeake Bay Reserve, VA",,"37.414 N 76.712 W (37&#176;24'51"" N 76&#176;42...",E,,Water Quality data for this Reserve are availa...


Filter full list of NDBC stations to get those that would match with the descriptions of the active stations, and also limit to the US.

In [294]:
# descriptions used in active buoy list
descs = ['buoy', 'platform', 'tower', 'station', 'Water Level Observation Network']

# lon lat box to determine if in US
# [just west of AK stations, just east of Maine stations, just south of Hawaii, north of AK historial stations]
llbox = [-192, -65, 15.5, 76.5]

# loop over full list of NDBC stations and limit to those that seem to match
# those in the active list (avoiding listings like gliders)
# for id, ttype in dfall[['# STATION_ID ',' TTYPE ']].values:
stations_ndbc_all = []
for id, (loc, ttype) in zip(dfall.index.values[1:], dfall[[' LOCATION ', ' TTYPE ']].values[1:]):
    inList, toKeep = False, False
    # search for description in NDBC 
    for desc in descs:
        if desc in str(ttype):
            inList = True
    # filter by lon/lat
    if inList and ('N' in loc) and ('W' in loc):  # don't need any in southern hemisphere
#         print(loc)
        lat = float(loc.split('N')[0][:-1])
        lon = -float(loc.split('N')[1].split(' ')[1])
#         print(lon,lat)
        if (llbox[0]<lon<llbox[1]) and (llbox[2] < lat < llbox[3]):
            toKeep = True
#             print(id)
            stations_ndbc_all.append(id)
    

In [295]:
len(stations_ndbc_all)

606

Currently available in Axiom erddap:

In [305]:
url = 'http://erddap.sensors.axds.co/erddap/search/advanced.csv?page=1&itemsPerPage=10000&searchFor=ndbc'
df = pd.read_csv(url)
print(len(df))

884


In [None]:
DO A MORE CAREFUL MATCHING OF STATIONS TYPES?

### CO-OPS

CUrrently available in axiom erddap

In [306]:
url = 'http://erddap.sensors.axds.co/erddap/search/advanced.csv?page=1&itemsPerPage=10000&searchFor=co-ops'
df = pd.read_csv(url)
print(len(df))

3309


## Access data

In [None]:
startDate = '2020-01-01'
endDate = '2021-01-01'
minLon, maxLon = -99, -88
minLat, maxLat = 20, 30

In [267]:
# function to read in only the columns that overlap with variable names
def readfunc(col):
    for Var in Vars+indices+checks:
        if Var in col:
            return True

In [266]:
indices = ['time (UTC)','latitude (degrees_north)','longitude (degrees_east)','station']
Vars = ['sea_water_temperature','sea_water_practical_salinity','sea_water_speed','sea_surface_height']
checks = ['depth_reading_ocean (m)']

Cols = ['time (UTC)','latitude (degrees_north)','longitude (degrees_east)','station','sea_water_temperature (degree_Celsius)', 'sea_water_temperature_qc_agg', 'sea_water_temperature_ocean (degree_Celsius)', 'sea_water_practical_salinity (1e-3)', 'sea_water_speed (m.s-1)', 'sea_water_velocity_to_direction (degrees)','sea_surface_height_above_sea_level_geoid_mllw (m)']

# Var = Vars[0]
dfs = []
for Var in Vars:
    url_base = 'http://erddap.sensors.axds.co/erddap/search/advanced.csv?page=1&itemsPerPage=10000'
    url = '%s&searchFor=%s&maxLat=%f&minLon=%f&maxLon=%f&minLat=%f&minTime=%s&maxTime=%s' % (url_base,Var,maxLat,minLon,maxLon,minLat,pd.Timestamp(startDate).isoformat(), pd.Timestamp(endDate).isoformat())
    dfs.append(pd.read_csv(url))
# &variableName=
# Gather together all data urls, but drop duplicates
# will have to check for all Vars in each dataset
df = pd.concat(dfs, axis=0, ignore_index=True).drop_duplicates()

In [206]:
import numpy as np

LIST OF NAMES OF variables by source

TABS
z (Altitude, m) 
sea_water_velocity_to_direction (degrees)
sea_water_speed (Current Speed, m.s-1)
sea_water_practical_salinity (1e-3)


NDBC
z (Altitude, m)
sea_surface_height_above_sea_level_geoid_mllw (m)
sea_water_temperature (degree_Celsius)


CO-OPS
z (Altitude, m)
sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw (cm)
sea_water_practical_salinity (1e-3)
sea_water_temperature (degree_Celsius)
sea_surface_height_above_sea_level_geoid_mllw (m) (not always available)

Another COOPS
sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw (cm)
 sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw_qc_agg
 sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw_qc_tests
 sea_surface_height_above_sea_level_geoid_mllw (m)
 sea_surface_height_above_sea_level_geoid_mllw_qc_agg
 
 
NWIS
height_geoid_local_station_datum (m)
 sea_water_temperature (degree_Celsius)
 water_surface_height_above_reference_datum_geoid_localstationdatum (m)
 


ECOHAB_II - 621   RSS
Institution: 	USGS Coastal and Marine Geology Program (USGS-CMGP)   (Dataset ID: ecohab_ii-621)
sea_water_temperature_6211mc_a (degree_Celsius)




SEE WHAT STATIONS I AM NOT CATCHING WITH THESE INSTITUTIONS AND THEN FILL IN
NEED TO BE ABLE TO CALL VARIABLE COLUMNS BY NAME and don't want to catch weird bad data

In [303]:
%%time
print(len(df['tabledap']))
dfs = []
for url in df['tabledap']:
    dfnew = pd.read_csv(url + '.csvp', index_col=indices, parse_dates=['time (UTC)'],
            usecols = readfunc).dropna(axis='index', how='all').dropna(axis='columns', how='all')
    
    checkkey = 'depth_reading_ocean (m)'
    if checkkey in dfnew.columns:
        # check if depth changes and if it does, don't use dataset
        if not np.allclose(dfnew[checkkey].min(), dfnew[checkkey].max()):
            # DROP DATAFRAME
            dfnew = None
        # CHECK QA TOO
    
    dfs.append(dfnew)

252


  mask |= (ar1 == a)


CPU times: user 5min 25s, sys: 33.6 s, total: 5min 58s
Wall time: 51min 46s


In [304]:
%%time
dfall = pd.concat(dfs)

CPU times: user 10.7 s, sys: 5.43 s, total: 16.1 s
Wall time: 18.2 s


In [312]:
dfall.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,sea_water_temperature (degree_Celsius),sea_water_temperature_qc_agg,depth_reading_ocean (m),sea_water_temperature_ocean (degree_Celsius),height_geoid_local_station_datum (m),water_surface_height_above_reference_datum_geoid_localstationdatum (m),sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw (cm),sea_surface_height_above_sea_level_geoid_mllw (m),sea_water_practical_salinity (1e-3),sea_water_speed (m.s-1),sea_surface_height_above_sea_level_geoid_navd88 (m)
time (UTC),latitude (degrees_north),longitude (degrees_east),station,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2020-01-14 20:32:45+00:00,29.01665,-89.832433,,19.5,1.0,,,,,,,,,
2020-01-14 21:02:45+00:00,29.01665,-89.832433,,19.35,1.0,,,,,,,,,
2020-01-14 21:32:45+00:00,29.01665,-89.832433,,19.15,1.0,,,,,,,,,
2020-01-14 22:02:45+00:00,29.01665,-89.832433,,19.799999,1.0,,,,,,,,,
2020-01-14 22:32:45+00:00,29.01665,-89.832433,,20.049999,1.0,,,,,,,,,


In [319]:
dfall.reset_index(level=[0,1,2,3])

Unnamed: 0,time (UTC),latitude (degrees_north),longitude (degrees_east),station,sea_water_temperature (degree_Celsius),sea_water_temperature_qc_agg,depth_reading_ocean (m),sea_water_temperature_ocean (degree_Celsius),height_geoid_local_station_datum (m),water_surface_height_above_reference_datum_geoid_localstationdatum (m),sea_surface_height_amplitude_due_to_geocentric_ocean_tide_geoid_mllw (cm),sea_surface_height_above_sea_level_geoid_mllw (m),sea_water_practical_salinity (1e-3),sea_water_speed (m.s-1),sea_surface_height_above_sea_level_geoid_navd88 (m)
0,2020-01-14 20:32:45+00:00,29.01665,-89.832433,,19.500000,1.0,,,,,,,,,
1,2020-01-14 21:02:45+00:00,29.01665,-89.832433,,19.350000,1.0,,,,,,,,,
2,2020-01-14 21:32:45+00:00,29.01665,-89.832433,,19.150000,1.0,,,,,,,,,
3,2020-01-14 22:02:45+00:00,29.01665,-89.832433,,19.799999,1.0,,,,,,,,,
4,2020-01-14 22:32:45+00:00,29.01665,-89.832433,,20.049999,1.0,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42247296,2015-05-07 06:56:00+00:00,26.06830,-97.156700,"SOUTH PADRE ISLAND, BRAZOS SANTIAGO PASS",,,,,,,-9.0,,,,
42247297,2015-05-06 15:25:00+00:00,26.06830,-97.156700,"SOUTH PADRE ISLAND, BRAZOS SANTIAGO PASS",,,,,,,49.0,,,,
42247298,2015-05-06 06:16:00+00:00,26.06830,-97.156700,"SOUTH PADRE ISLAND, BRAZOS SANTIAGO PASS",,,,,,,-9.0,,,,
42247299,2015-05-05 14:32:00+00:00,26.06830,-97.156700,"SOUTH PADRE ISLAND, BRAZOS SANTIAGO PASS",,,,,,,46.0,,,,


In [281]:
# for url in df['tabledap']:
url = df['tabledap'][0]

# open a dataset
# pd.read_csv(url + '.csvp', index_col=0, parse_dates=[0],
#             usecols = lambda col:(col in Cols))
# df1 = pd.read_csv(url + '.csvp', index_col=[0,1,2,-1], parse_dates=[0],
#             usecols = lambda col: col in Cols).dropna()
df1 = pd.read_csv(url + '.csvp', index_col=indices, parse_dates=['time (UTC)'],
            usecols = readfunc).dropna(axis='index', how='all').dropna(axis='columns', how='all')


url = df['tabledap'][100]
df2 = pd.read_csv(url + '.csvp', index_col=indices, parse_dates=['time (UTC)'],
            usecols = readfunc).dropna(axis='index', how='all').dropna(axis='columns', how='all')
checkkey = 'depth_reading_ocean (m)'
if checkkey in df2.columns:
    # check if depth changes and if it does, don't use dataset
    if not np.allclose(df2[checkkey].min(), df2[checkkey].max()):
        # DROP DATAFRAME
        df2 = None
    # CHECK QA TOO

Use the following to find the relevant datasets for a time range and region

HERE I HAVE SKETCHED OUT HOW TO BE ABLE TO QUERY AND AGGREGATE DATA. Need to:

* see what data I am missing that isn't in the sensors database
* see about adding the missing stations from the data sources
* do I need to include something about QA/QC at this stage? How to do that?

THIS IS THE AGGREGATED CELL OF WORK CURRENTLY:

In [125]:
import requests

# user input
startDate = '2020-03-23 00:00'
endDate = '2021-03-23'
minLon, maxLon = -99, -88
minLat, maxLat = 20, 30

# convert input datetime to seconds since 1970 which is almost correct but isn't quite what the 
# system does apparently
startDateTime = (pd.Timestamp(startDate) - pd.Timestamp("1970-01-01 00:00")) // pd.Timedelta('1s')
endDateTime = (pd.Timestamp(endDate) - pd.Timestamp("1970-01-01 00:00")) // pd.Timedelta('1s')

search_headers = {'Accept': 'application/json'}

Vars = {'temp': {'metaName': 'Temperature: Water Temperature', 'downloadName': 'sea_water_temperature'},
        'salt': {'metaName': 'Salinity', 'downloadName': 'sea_water_practical_salinity'},
        'currents': {'metaName': 'Currents', 'downloadName': 'sea_water_velocity_to_direction,sea_water_speed'},
        'ssh': {'metaName': 'Sea Surface Height', 'downloadName': 'sea_surface_height_geoid_navd88_9751hwl_a'},
        'waterLevel': {'metaName': 'Water Level', 'downloadName': 'sea_surface_height_above_sea_level_geoid_mllw'},
        'u_seaice': {'metaName': 'Sea Ice: Northward Velocity', 'downloadName': 'eastward_sea_ice_velocity'},
        'v_seaice': {'metaName': 'Sea Ice: Eastward Velocity', 'downloadName': 'northward_sea_ice_velocity'}}

# initialize downloadUrls as list for each data type
for key in Vars.keys():
    Vars[key]['downloadUrls'] = []

url_meta_base = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10000&type=sensor_station&geom={"type":"Polygon","coordinates":'
url = '%s[[[%f,%f],[%f,%f],[%f,%f],[%f,%f],[%f,%f]]]}&startDateTime=%i&endDateTime=%i' % (url_meta_base,minLon,minLat,maxLon,minLat,maxLon,maxLat,minLon,maxLat,minLon,minLat,startDateTime,endDateTime)
url_download_base = 'https://erddap.sensors.ioos.us/erddap/tabledap'

# Find url for dataset search for each data type
for key in Vars.keys():
    Vars[key]['url_meta'] = url + '&tag=Parameter Group:' + Vars[key]['metaName']

    search_results = requests.get(Vars[key]['url_meta'], headers = search_headers).json()

    # Find download link for each data type
    for result in search_results['results']:
        # https://erddap.sensors.ioos.us/erddap/tabledap/mares-mooring-m1-2017-2019.csv?time%2Ceastward_sea_ice_velocity%2Cz&time%3E%3D2017-11-07T11%3A30%3A33Z&time%3C%3D2019-02-11T16%3A13%3A47Z
#         url_csv = '%s/%s.csvp?time,%s&time>=2017-11-07T11:30:33Z&time<=2019-02-11T16:13:47Z' % (url_download_base,result['data']['datasetId'], Vars[key]['downloadName'])
        url_csv = '%s/%s.csvp?time,%s&time>=%sZ&time<=%sZ' % (url_download_base,result['data']['datasetId'], Vars[key]['downloadName'],pd.Timestamp(startDate).isoformat(),pd.Timestamp(endDate).isoformat())
        Vars[key]['downloadUrls'].append(url_csv)


# Vars
# then can download with `pd.read_csv`

In [127]:
len(Vars['waterLevel']['downloadUrls'])

50

In [72]:
url_csv

'https://erddap.sensors.ioos.us/erddap/tabledap/com_chevron_42934.csvp?time,sea_water_temperature&time>=2020-03-23T00:00:00Z&time<=2021-03-23T00:00:00Z'

In [71]:
Vars['temp']['url_meta']

'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10000&type=sensor_station&geom={"type":"Polygon","coordinates":[[[-99.000000,20.000000],[-88.000000,20.000000],[-88.000000,30.000000],[-99.000000,30.000000],[-99.000000,20.000000]]]}&startDateTime=1584921600&endDateTime=1616457600&tag=Parameter Group:Temperature: Water Temperature'

In [78]:
search_results['results'][0]#['data']['figures'][0]['plots'][0]['subPlots'][0]['datasetVariableId']

{'data': {'hasNarrative': False,
  'figures': [{'annotations': [],
    'label': 'Conductivity',
    'parameterGroupId': 23,
    'plots': [{'subPlots': [{'plotLabel': '[default]',
        'parameterId': 49,
        'endDate': '2021-03-17T05:00:00Z',
        'maxZ': 0.0,
        'medianTimeIntervalSecs': None,
        'numObservations': None,
        'qcConfigId': None,
        'instrument': {},
        'label': 'Conductivity',
        'units': 'mS.cm-1',
        'availableZBins': [],
        'hasQc': False,
        'deviceId': 482550,
        'parameterGroupId': 23,
        'sensorParameterId': 49,
        'maxVal': None,
        'minVal': None,
        'discriminant': None,
        'unitId': 28,
        'feeds': [482550],
        'datasetVariableId': 'sea_water_electrical_conductivity',
        'minZ': 0.0,
        'startDate': '2017-01-15T04:00:00Z',
        'availableZ': []}],
      'label': '[default]'}]},
   {'annotations': [],
    'label': 'Salinity',
    'parameterGroupId': 14,
 

In [66]:
!pip install nested-lookup

Collecting nested-lookup
  Downloading nested-lookup-0.2.22.tar.gz (10 kB)
Building wheels for collected packages: nested-lookup
  Building wheel for nested-lookup (setup.py) ... [?25ldone
[?25h  Created wheel for nested-lookup: filename=nested_lookup-0.2.22-py3-none-any.whl size=7684 sha256=216337c7177a7f015982050c6c7c19ea0b598420e0e7507f73ad890558046021
  Stored in directory: /Users/kthyng/Library/Caches/pip/wheels/51/ce/8c/92df23b3564f2d3e84de93b535ef062d3159d68a10fead7742
Successfully built nested-lookup
Installing collected packages: nested-lookup
Successfully installed nested-lookup-0.2.22


In [70]:
from nested_lookup import nested_lookup
print(nested_lookup('datasetVariableId', search_results['results'][0])) # THEN NEED TO SELECT OUT THE ONE WITH TEMP IN IT FOR EXAMPLE

['sea_water_electrical_conductivity', 'sea_water_practical_salinity', 'height_geoid_local_station_datum', 'sea_water_temperature', 'water_surface_height_above_reference_datum_geoid_localstationdatum']


In [61]:
from functools import reduce
from operator import getitem
def get_nested_item(data, keys):
    return reduce(getitem, keys, data)

In [64]:
def get_nested_item(data, keys):
    return reduce(lambda seq, key: seq[key], keys, data)

In [65]:
get_nested_item(search_results, ['datasetVariableId'])

KeyError: 'datasetVariableId'

In [60]:
any(key.startswith('datasetVariableId') for key in search_results)

False

In [20]:
%%time

df = pd.DataFrame()

for url in Vars['salt']['downloadUrls']:
    dfnew = pd.read_csv(url, index_col=0, parse_dates=True).sort_index()
    datasetId = url.split('/')[-1].split('.')[0]
    dfnew.rename(columns={dfnew.columns[0]: '%s: %s' % (dfnew.columns[0], datasetId)}, inplace=True)

    df = df.join(dfnew, how='outer')

CPU times: user 12.5 s, sys: 1.19 s, total: 13.7 s
Wall time: 4min 11s


In [21]:
print(len(Vars['salt']['downloadUrls']))

63


In [24]:
Vars['salt']['df'] = df

In [None]:
https://erddap.sensors.ioos.us/erddap/tabledap/wmo_42876.csv?time%2Csea_water_temperature_ocean%2Cz&time%3E%3D2017-04-25T00%3A37%3A00Z&time%3C%3D2021-03-09T18%3A37%3A00Z

In [28]:
# pd.read_csv(url)
url

'https://erddap.sensors.ioos.us/erddap/tabledap/wmo_42876.csvp?time,sea_water_temperature&time>=2020-03-23T00:00:00Z&time<=2021-03-23T00:00:00Z'

In [25]:
%%time

# loop over all variables
for key in Vars.keys():
    df = pd.DataFrame()
    for url in Vars[key]['downloadUrls']:
        dfnew = pd.read_csv(url, index_col=0, parse_dates=True).sort_index()
        datasetId = url.split('/')[-1].split('.')[0]
        dfnew.rename(columns={dfnew.columns[0]: '%s: %s' % (dfnew.columns[0], datasetId)}, inplace=True)

        df = df.join(dfnew, how='outer')
    
    # add resultant dataframe to Vars dict
    Vars[key]['df'] = df

HTTPError: HTTP Error 400: 

In [607]:
tz = 'US/Central'
date = pd.Timestamp('2020-03-23 00:00').tz_localize(tz)
pd.Timedelta(str(1584939600 - (date - pd.Timestamp("1970-01-01 00:00").tz_localize(tz)) // pd.Timedelta('1s')) + 's')
# (date - pd.Timestamp("1970-01-01 00:00").tz_localize(tz)) // pd.Timedelta('1s')


Timedelta('0 days 06:00:00')

In [727]:
import urllib
import json
# url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22,%22coordinates%22%3A[[[-99,20],[-88,20],[-88,30],[-99,30],[-99,20]]]%7D&searchStartDateTime=1262304000&endDateTime=1616486399'
# url = 'https://search.axds.co/v2/search?portalId=45&page=1&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-134.40150203577394%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C50.51342652633956%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&type=sensor_station&type=parameter_group&type=affiliate&pageSize=1'
# with specific variable
# url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=10&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B-134.40150203577394%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C50.51342652633956%5D%2C%5B-126.91416037374725%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C55.3541353102106%5D%2C%5B-134.40150203577394%2C50.51342652633956%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&tag=Parameter%20Group%3ATemperature%3A%20Water%20Temperature'
# with multiple variables


# user input
startDate = '2020-03-23 00:00'
endDate = '2021-03-23'
minLon, maxLon = -99, -88
minLat, maxLat = 20, 30

# convert input datetime to seconds since 1970 which is almost correct but isn't quite what the 
# system does apparently
startDateTime = (pd.Timestamp(startDate) - pd.Timestamp("1970-01-01 00:00")) // pd.Timedelta('1s')
endDateTime = (pd.Timestamp(endDate) - pd.Timestamp("1970-01-01 00:00")) // pd.Timedelta('1s')


# Also need sea ice coverage. Do I need to search another database?
# find these by looking at the searches that come up under developer tools and network
# e.g. https://sensors.ioos.us/#search/advanced?type_group=sensor_stations&mapBounds=-193.33040437082403%2C61.93895042666063%2C-116.871433045657%2C82.85338229176081&page=1
# do we also need to download QA/QC information?
vars_meta = ['Temperature: Water Temperature','Currents','Salinity',
             'Sea Surface Height', 'Water Level', 
             'Sea Ice: Northward Velocity','Sea Ice: Eastward Velocity']

# Vars = {'temp': {'meta': 'Temperature: Water Temperature', 'download': 'sea_water_temperature'},
#         'salt': {'meta'}}
                

# url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom=%7B%22type%22%3A%22Polygon%22%2C%22coordinates%22%3A%5B%5B%5B%f%2CminLat%5D%2C%5BmaxLon%2CminLat%5D%2C%5BmaxLon%2CmaxLat%5D%2C%5BminLon%2CmaxLat%5D%2C%5BminLon%2CminLat%5D%5D%5D%7D&startDateTime=1584939600&endDateTime=1616561999&tag=Parameter%20Group%3ATemperature%3A%20Water%20Temperature&tag=Parameter%20Group%3ACurrents' % (minLon, )
# url = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom={"type":"Polygon","coordinates":[[[%f,%f],[%f,%f],[%f,%f],[%f,%f],[%f,%f]]]}&startDateTime=1584939600&endDateTime=1616561999&tag=Parameter Group:Temperature: Water Temperature&tag=Parameter Group:Currents' % (minLon,minLat,maxLon,minLat,maxLon,maxLat,minLon,maxLat,minLon,minLat)#,startDateTime,endDateTime)
urlbase = 'https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom={"type":"Polygon","coordinates":'
url = '%s[[[%f,%f],[%f,%f],[%f,%f],[%f,%f],[%f,%f]]]}&startDateTime=%i&endDateTime=%i' % (urlbase,minLon,minLat,maxLon,minLat,maxLon,maxLat,minLon,maxLat,minLon,minLat,startDateTime,endDateTime)




# for var_meta in vars_meta:
var_meta = vars_meta[0]
url += '&tag=Parameter Group:' + var_meta
print(url)
search_headers = {'Accept': 'application/json'}
import requests
search_results = requests.get(url, headers = search_headers).json()

https://search.axds.co/v2/search?portalId=45&page=1&pageSize=1000&type=sensor_station&geom={"type":"Polygon","coordinates":[[[-99.000000,20.000000],[-88.000000,20.000000],[-88.000000,30.000000],[-99.000000,30.000000],[-99.000000,20.000000]]]}&startDateTime=1584921600&endDateTime=1616457600&tag=Parameter Group:Temperature: Water Temperature


In [728]:
len(search_results['results'])

190

In [822]:
df1 = pd.read_csv(Vars['salt']['downloadUrls'][0], index_col=0, parse_dates=True).sort_index()
datasetId = Vars['salt']['downloadUrls'][0].split('/')[-1].split('.')[0]
df1.rename(columns={df1.columns[0]: '%s: %s' % (df1.columns[0], datasetId)}, inplace=True)

df2 = pd.read_csv(Vars['salt']['downloadUrls'][1], index_col=0, parse_dates=True).sort_index()
datasetId = Vars['salt']['downloadUrls'][1].split('/')[-1].split('.')[0]
df2.rename(columns={df2.columns[0]: '%s: %s' % (df2.columns[0], datasetId)}, inplace=True)

df3 = df1.join(df2, how='outer')

In [749]:
for result in search_results['results']:
    
#     print(result['data']['id'])
#     if 'edu_ucsd_cdip_132' in result:
    print(result['data']['datasetId'])  # works for TABS
    print(result['data']['~uuid'])
    
    url_base = 'https://erddap.sensors.ioos.us/erddap/tabledap'
    url_csv = '%s/%s.csvp?time,sea_water_temperature' % (url_base,result['data']['datasetId'])
#     url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csv?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z'
    pd.read_csv(url_csv)
    
#

gov_usgs_waterdata_294045092492300
urn:ioos:station:gov.usgs.waterdata:294045092492300
nerrs_marcwwq
urn:ioos:station:nerrs:marcwwq
wmo_42876
urn:ioos:station:wmo:42876


HTTPError: HTTP Error 400: 

In [None]:
https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.html?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z
https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csv?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z
https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.nc?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z
https://sensors.axds.co/stationsensorservice/getExcelSheet?sensorids=14&sensorid=14&stationid=57544&stationids=57544&appregion=all&realtimeonly=false&next_gen_api=false

In [735]:
# url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/urn:ioos:station:com.chevron:42934.csvp?time%2Csea_water_temperature%2Csea_water_temperature_qc_agg'
# url_csv = 'https://sensors.axds.co/stationsensorservice/getExcelSheet?sensorsids=7&sensorid=7&stationids=57541&stationid=57541&appregion=all&realtimeonly=false&next_gen_api=false'
url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csv?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z'
pd.read_csv(url_csv)

# url_netcdf = 'https://sensors.axds.co/stationsensorservice/getSensorNetcdf?sensorsids=7&sensorid=7&stationids=57541&stationid=57541&appregion=all&realtimeonly=false&next_gen_api=false'
# xr.open_dataset(url_netcdf)

Unnamed: 0,time,sea_water_practical_salinity,z
0,UTC,0.001000,m
1,2020-11-19T13:00:00Z,0.032716,0.0
2,2020-11-19T12:30:00Z,0.032724,0.0
3,2020-11-19T12:00:00Z,0.032734,0.0
4,2020-11-19T11:30:00Z,0.032741,0.0
...,...,...,...
51016,2016-01-14T14:30:00Z,36.450000,0.0
51017,2016-01-14T14:00:00Z,36.450000,0.0
51018,2016-01-14T13:30:00Z,36.450000,0.0
51019,2016-01-14T13:00:00Z,36.450000,0.0


In [748]:
url_base = 'https://erddap.sensors.ioos.us/erddap/tabledap'
url_csv = '%s/%s.csvp?time,sea_water_temperature' % (url_base,'tabs_w')
# url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csv?time%2Csea_water_practical_salinity%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z'
# https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csv?time%2Csea_water_temperature%2Cz&time%3E%3D2016-01-14T12%3A00%3A00Z&time%3C%3D2020-11-19T14%3A00%3A00Z
print(url_csv)
pd.read_csv(url_csv)


https://erddap.sensors.ioos.us/erddap/tabledap/tabs_w.csvp?sea_water_temperature


Unnamed: 0,sea_water_temperature (degree_Celsius)
0,22.7
1,22.6
2,22.6
3,22.7
4,22.7
...,...
51015,23.1
51016,23.1
51017,23.1
51018,23.1


In [560]:
url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_132.csvp?time%2Csea_water_temperature%2Csea_water_temperature_qc_agg'
url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/edu_ucsd_cdip_132.csvp?time%2Csea_water_velocity_to_direction%2Csea_water_speed%2Csea_water_velocity_to_direction_qc_agg%2Csea_water_speed_qc_agg'
url_csv = 'https://erddap.sensors.ioos.us/erddap/tabledap/indian-river-lagoon-fort-pierce-.csv?time%2Csea_water_temperature%2Csea_water_temperature_qc_agg'
pd.read_csv(url_csv)

Unnamed: 0,time (UTC),sea_water_velocity_to_direction (degrees),sea_water_speed (m.s-1),sea_water_velocity_to_direction_qc_agg,sea_water_speed_qc_agg
0,2006-02-09T15:25:45Z,,,,
1,2006-02-09T15:54:45Z,,,,
2,2006-02-09T15:55:45Z,,,,
3,2006-02-09T16:24:45Z,,,,
4,2006-02-09T16:25:45Z,,,,
...,...,...,...,...,...
532802,2021-03-23T19:25:00Z,,,,
532803,2021-03-23T19:30:00Z,227.461533,0.139,1.0,1.0
532804,2021-03-23T19:35:00Z,,,,
532805,2021-03-23T19:40:00Z,220.516479,0.101,1.0,1.0
