<img style='float: left' width="150px" src="http://bostonlightswim.org/wp/wp-content/uploads/2011/08/BLS-front_4-color.jpg">
<br><br>

## [The Boston Light Swim](http://bostonlightswim.org/)

### Fetch Sea Surface Temperature time-series data

In [1]:
import os
import sys
import time
import warnings

# Put the directory two levels above the working directory on the import
# path; the `pytools` package imported by later cells is presumably there.
pytools_path = os.path.abspath(os.path.join(os.path.pardir, os.path.pardir))
sys.path.append(pytools_path)

# Suppressing warnings for a "pretty output."
# Remove this line to debug any possible issues.
warnings.simplefilter("ignore")

# Wall-clock start; the last cell uses it to log the total run time.
start_time = time.time()

### Configuration

In [2]:
%%writefile config.yaml

# Time window used for the catalog search and the data requests.
date:
    start: 2016-8-11 00:00:00
    stop: 2016-8-21 00:00:00

# Name of the sub-directory where the results and the log file are saved.
run_name: 'latest'

region:
    # Boston harbor.
    # The values look like [lon_min, lat_min, lon_max, lat_max] -- TODO confirm.
    bbox: [-71.3, 42.03, -70.57, 42.63]
    crs: 'urn:ogc:def:crs:OGC:1.3:CRS84'

# Variable requested from the SOS collectors.
sos_name: 'sea_water_temperature'

# Names matched against the catalog records and used when loading model data.
cf_names:
    - sea_water_temperature
    - sea_surface_temperature
    - sea_water_potential_temperature
    - equivalent_potential_temperature
    - sea_water_conservative_temperature
    - pseudo_equivalent_potential_temperature

# Units passed to the model cube processing step.
units: 'celsius'

titles:
    BTMPB: 'http://oos.soest.hawaii.edu/thredds/dodsC/hioos/tide_pac'
    CBOFS: 'http://opendap.co-ops.nos.noaa.gov/thredds/dodsC/CBOFS/fmrc/Aggregated_7_day_CBOFS_Fields_Forecast_best.ncd'
    COAWST_4: 'http://geoport.whoi.edu/thredds/dodsC/coawst_4/use/fmrc/coawst_4_use_best.ncd'
    ESPRESSO: 'http://tds.marine.rutgers.edu/thredds/dodsC/roms/espresso/2013_da/his_Best/ESPRESSO_Real-Time_v2_History_Best_Available_best.ncd'
    ESTOFS: 'http://geoport-dev.whoi.edu/thredds/dodsC/estofs/atlantic'
    HYCOM: 'http://oos.soest.hawaii.edu/thredds/dodsC/pacioos/hycom/global'
    NECOFS_GOM3_FVCOM: 'http://www.smast.umassd.edu:8080/thredds/dodsC/FVCOM/NECOFS/Forecasts/NECOFS_GOM3_FORECAST.nc'
    NECOFS_GOM3_WAVE: 'http://www.smast.umassd.edu:8080/thredds/dodsC/FVCOM/NECOFS/Forecasts/NECOFS_WAVE_FORECAST.nc'
    SABGOM: 'http://omgsrv1.meas.ncsu.edu:8080/thredds/dodsC/fmrc/sabgom/SABGOM_Forecast_Model_Run_Collection_best.ncd'
    SABGOM_ARCHIVE: 'http://omgarch1.meas.ncsu.edu:8080/thredds/dodsC/fmrc/sabgom/SABGOM_Forecast_Model_Run_Collection_best.ncd'
    TBOFS: 'http://opendap.co-ops.nos.noaa.gov/thredds/dodsC/TBOFS/fmrc/Aggregated_7_day_TBOFS_Fields_Forecast_best.ncd'
    USEAST: 'http://omgsrv1.meas.ncsu.edu:8080/thredds/dodsC/fmrc/us_east/US_East_Forecast_Model_Run_Collection_best.ncd'
    USF_FVCOM: 'http://crow.marine.usf.edu:8080/thredds/dodsC/FVCOM-Nowcast-Agg.nc'
    USF_ROMS: 'http://crow.marine.usf.edu:8080/thredds/dodsC/WFS_ROMS_NF_model/USF_Ocean_Circulation_Group_West_Florida_Shelf_Daily_ROMS_Nowcast_Forecast_Model_Data_best.ncd'
    USF_SWAN: 'http://crow.marine.usf.edu:8080/thredds/dodsC/WFS_SWAN_NF_model/USF_Ocean_Circulation_Group_West_Florida_Shelf_Daily_SWAN_Nowcast_Forecast_Wave_Model_Data_best.ncd'

Overwriting config.yaml


In [3]:
from datetime import datetime
from pytools.ioos import parse_config

# Parse the YAML file written by the previous cell.
config_file = 'config.yaml'
config = parse_config(config_file)

# Results go to a `run_name` sub-directory next to the configuration file.
save_dir = os.path.join(os.path.abspath(os.path.dirname(config_file)),
                        config['run_name'])

def _reload_log():
    """Return the standard `logging` module after reloading it.

    IPython workaround: reloading resets the module state (root logger and
    handlers), presumably so that `basicConfig` takes effect again when the
    notebook is re-run in a live kernel.
    """
    import logging as log
    try:
        # `imp` is deprecated and was removed in Python 3.12.
        from importlib import reload
    except ImportError:  # Python 2 fallback.
        from imp import reload
    reload(log)
    return log

def start_log(save_dir):
    """Recreate `save_dir` and send log records to `<save_dir>/log.txt`.

    WARNING: an existing `save_dir` is deleted with everything inside it.

    Parameters
    ----------
    save_dir : str
        Directory that will hold the log file (and the run results).

    Returns
    -------
    module
        The reloaded `logging` module, configured at INFO level with
        warnings captured into the log.
    """
    import shutil
    log = _reload_log()
    if os.path.exists(save_dir):
        shutil.rmtree(save_dir)
    os.makedirs(save_dir)

    log.captureWarnings(True)
    log_filename = os.path.join(save_dir, 'log.txt')
    formatter = '%(asctime)s %(levelname)s: %(message)s'
    # 24-hour clock: '%I' (12-hour) without an AM/PM marker made the
    # timestamps ambiguous.
    log.basicConfig(filename=log_filename,
                    filemode='w',
                    format=formatter,
                    datefmt='%H:%M:%S',
                    level=log.INFO)
    return log

# NOTE: start_log() deletes and recreates `save_dir` before logging starts.
log = start_log(save_dir)
# Helper that centers a text in a 64-character line padded with '*'.
fmt = '{:*^64}'.format
log.info(fmt('Saving data inside directory {}'.format(save_dir)))
log.info(fmt(' Run information '))
# The run date is in UTC; start/stop come straight from the configuration.
log.info('Run date: {:%Y-%m-%d %H:%M:%S}'.format(datetime.utcnow()))
log.info('Start: {:%Y-%m-%d %H:%M:%S}'.format(config['date']['start']))
log.info('Stop: {:%Y-%m-%d %H:%M:%S}'.format(config['date']['stop']))
log.info('Bounding box: {0:3.2f}, {1:3.2f},'
         '{2:3.2f}, {3:3.2f}'.format(*config['region']['bbox']))

### Create the data filter

In [4]:
def make_filter(config):
    """Build the CSW constraint list from the run configuration.

    The single `And` filter combines the bounding box, the start/stop
    date filters, a match on any of `config['cf_names']`, and the
    exclusion of records mentioning "Averages".

    Returns a one-element list holding that filter.
    """
    from owslib import fes
    from pytools.ioos import fes_date_filter

    like_options = dict(wildCard='*', escapeChar='\\',
                        singleChar='?', propertyname='apiso:AnyText')

    # Match any record whose text mentions one of the configured names.
    name_filters = []
    for cf_name in config['cf_names']:
        name_filters.append(
            fes.PropertyIsLike(literal=('*%s*' % cf_name), **like_options))
    any_name = fes.Or(name_filters)

    # Exclude ROMS Averages files (only "Averages" is matched here).
    averages = fes.PropertyIsLike(literal='*Averages*', **like_options)
    no_averages = fes.Not([averages])

    dates = config['date']
    begin, end = fes_date_filter(dates['start'], dates['stop'])

    region = config['region']
    bbox_crs = fes.BBox(region['bbox'], crs=region['crs'])

    return [fes.And([bbox_crs, begin, end, any_name, no_averages])]

filter_list = make_filter(config)

In [5]:
from pytools.ioos import service_urls
from owslib.csw import CatalogueServiceWeb

# Logging info.
fmt = '{:*^64}'.format
log.info(fmt(' Catalog information '))
log.info(fmt(' CSW '))

# http://data.ioos.us/csw is too old and does not support CRS.
endpoints = ['http://www.ngdc.noaa.gov/geoportal/csw',
             'http://geoport.whoi.edu/csw']

# Service-type strings used to pick the OPeNDAP and SOS links of a record.
opendap = ['OPeNDAP:OPeNDAP',
           'urn:x-esri:specification:ServiceType:odp:url']
sos = ['urn:x-esri:specification:ServiceType:sos:url']

# Query every catalog with the same filter and accumulate the URLs found.
dap_urls = []
sos_urls = []
for endpoint in endpoints:
    log.info("URL: {}".format(endpoint))

    csw = CatalogueServiceWeb(endpoint, timeout=60)
    csw.getrecords2(constraints=filter_list, maxrecords=1000, esn='full')
    # Check for the strings in: https://raw.githubusercontent.com/OSGeo/Cat-Interop/master/LinkPropertyLookupTable.csv
    dap_urls.extend(service_urls(csw.records, services=opendap))
    # Look up the SOS links with the `sos` service strings, not with the
    # (initially empty) `sos_urls` result list.
    sos_urls.extend(service_urls(csw.records, services=sos))

    log.info("CSW version: {}".format(csw.version))
    log.info("Number of datasets available: {}".format(len(csw.records.keys())))

    for rec, item in csw.records.items():
        log.info('{}'.format(item.title))
    # The two lists are cumulative, so URLs from earlier endpoints are
    # logged again here.
    log.info(fmt(' SOS '))
    for url in sos_urls:
        log.info('{}'.format(url))
    log.info(fmt(' DAP '))
    for url in dap_urls:
        log.info('{}.html'.format(url))

# Get only unique endpoints.
dap_urls = list(set(dap_urls))
# FIXME: Review the SOS service string if this is still empty.
sos_urls = list(set(sos_urls))

In [6]:
from pytools.ioos import is_station

# Filter out some station endpoints.
# URLs that cannot be accessed are logged and dropped as well.
non_stations = []
for url in dap_urls:
    try:
        if not is_station(url):
            non_stations.append(url)
    except RuntimeError as e:
        # `log.warn` is a deprecated alias; use `log.warning` like the
        # rest of the notebook.
        log.warning("Could not access URL {}. {!r}".format(url, e))

dap_urls = non_stations

log.info(fmt(' Filtered DAP '))
for url in dap_urls:
    log.info('{}.html'.format(url))

### NdbcSos

In [7]:
from pyoos.collectors.ndbc.ndbc_sos import NdbcSos

# NDBC SOS collector restricted to the configured region, time window,
# and variable.
collector_ndbc = NdbcSos()

collector_ndbc.set_bbox(config['region']['bbox'])
collector_ndbc.end_time = config['date']['stop']
collector_ndbc.start_time = config['date']['start']
collector_ndbc.variables = [config['sos_name']]

# Log the server title and the number of offerings it lists.
ofrs = collector_ndbc.server.offerings
title = collector_ndbc.server.identification.title
log.info(fmt(' NDBC Collector offerings '))
log.info('{}: {} offerings'.format(title, len(ofrs)))

In [8]:
import pandas as pd
from owslib.ows import ExceptionReport

def collector2table(collector):
    """
    Build the station table of a collector as a DataFrame.

    The table has one row per station (the first record returned for
    it), is indexed by the station identifier, and holds the columns
    station, sensor, lon, and lat.

    """
    # This accepts only 1-day request but since we only want the
    # stations available that is OK.
    import copy
    from io import BytesIO

    request = copy.copy(collector)
    try:
        raw_csv = request.raw(responseFormat="text/csv")
    except ExceptionReport:
        # Retry with the end of the request set to its start time.
        narrowed = request.filter(end=request.start_time)
        raw_csv = narrowed.raw(responseFormat="text/csv")

    table = pd.read_csv(BytesIO(raw_csv), parse_dates=True)
    table = table.rename(columns={'sensor_id': 'sensor',
                                  'station_id': 'station',
                                  'latitude (degree)': 'lat',
                                  'longitude (degree)': 'lon'})
    # Keep only the last ':'-separated token of each identifier.
    for column in ('sensor', 'station'):
        table[column] = [urn.rsplit(':', 1)[-1] for urn in table[column]]

    by_station = table[['station', 'sensor', 'lon', 'lat']].groupby('station')
    first_rows = {station: by_station.get_group(station).iloc[0]
                  for station in by_station.groups.keys()}
    return pd.DataFrame.from_dict(first_rows).T


def get_ndbc_longname(station):
    """
    Get long_name for specific station from NOAA NDBC.

    Parameters
    ----------
    station : int or str
        Station identifier, sent as the `station` query parameter.

    Returns
    -------
    str
        Title-cased text found after the first ' - ' and before the
        first comma of the first <h1> element of the station page.

    Raises
    ------
    requests.HTTPError
        If the station page request returns an error status.
    ValueError
        If the page has no <h1> element or its text has no ' - '.

    Examples
    --------
    >>> str(get_ndbc_longname(31005))
    'Sw Extension'
    >>> str(get_ndbc_longname(44013))
    'Boston 16 Nm East Of Boston'

    """
    import requests
    from bs4 import BeautifulSoup

    url = "http://www.ndbc.noaa.gov/station_page.php"
    params = dict(station=station)
    # Bounded wait so an unresponsive server cannot hang the notebook.
    r = requests.get(url, params=params, timeout=60)
    r.raise_for_status()
    soup = BeautifulSoup(r.content, "lxml")
    # NOTE: Should be only one!
    headings = soup.find_all("h1")
    if not headings:
        raise ValueError(
            'No <h1> heading in the NDBC page for station {}'.format(station))
    parts = headings[0].text.split(' - ')
    if len(parts) < 2:
        # ValueError is what the notebook catches to fall back to the
        # station id.
        raise ValueError(
            'Cannot parse the NDBC page heading for station {}: '
            '{!r}'.format(station, headings[0].text))
    long_name = parts[1].strip()
    long_name = long_name.split(',')[0].strip()
    return long_name.title()

In [9]:
from pytools.ioos import to_html

# Station table for the NDBC collector.
ndbc = collector2table(collector=collector_ndbc)

if not ndbc.empty:
    # Look up a readable name for each station; fall back to the station
    # id when the lookup raises ValueError.
    # NOTE(review): other failures (e.g. HTTP errors) are not caught here
    # and will stop the cell.
    names = []
    for s in ndbc['station']:
        try:
            name = get_ndbc_longname(s)
        except ValueError:
            name = s
        names.append(name)

    ndbc['name'] = names

    ndbc.set_index('name', inplace=True)
    # NOTE(review): this call is inside the `if` block, so its value is
    # presumably not displayed as the cell output -- confirm what
    # `to_html` returns.
    to_html(ndbc.head())

### CoopsSos

In [10]:
from pyoos.collectors.coops.coops_sos import CoopsSos

# CO-OPS SOS collector restricted to the configured region, time window,
# and variable (same setup as the NDBC collector).
collector_coops = CoopsSos()

collector_coops.set_bbox(config['region']['bbox'])
collector_coops.end_time = config['date']['stop']
collector_coops.start_time = config['date']['start']
collector_coops.variables = [config['sos_name']]

# Log the server title and the number of offerings it lists.
# NOTE: `ofrs` and `title` overwrite the values set for the NDBC collector.
ofrs = collector_coops.server.offerings
title = collector_coops.server.identification.title
log.info(fmt(' Collector offerings '))
log.info('{}: {} offerings'.format(title, len(ofrs)))

In [11]:
from pytools.ioos import get_coops_metadata

# Station table for the CO-OPS collector.
coops = collector2table(collector=collector_coops)

if not coops.empty:
    # Use the first item of the station metadata as the name; fall back
    # to the station id when the lookup raises ValueError.
    names = []
    for s in coops['station']:
        try:
            name = get_coops_metadata(s)[0]
        except ValueError:
            name = s
        names.append(name)

    coops['name'] = names

    coops.set_index('name', inplace=True)
    # NOTE(review): this call is inside the `if` block, so its value is
    # presumably not displayed as the cell output -- confirm what
    # `to_html` returns.
    to_html(coops.head())

### Join CoopsSos and NdbcSos

In [12]:
# Stack the CO-OPS and NDBC station tables into one table.
all_obs = pd.concat([coops, ndbc])

to_html(all_obs.head())

Unnamed: 0_level_0,station,sensor,lon,lat
name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Boston, MA",8443970,E1,-71.0534,42.3548
Boston 16 Nm East Of Boston,44013,watertemp1,-70.69,42.35
Buoy A01,44029,ct1,-70.57,42.52


In [13]:
# Save the station table as `<run_name>-all_obs.csv` inside `save_dir`.
fname = '{}-all_obs.csv'.format(config['run_name'])
fname = os.path.join(save_dir, fname)
all_obs.to_csv(fname)

### Download the observed data series

In [14]:
import iris
from pytools.ioos import pyoos2df
from pytools.tardis import save_timeseries

iris.FUTURE.netcdf_promote = True

log.info(fmt(' Observations '))
# The observations are saved later as `<run_name>-OBS_DATA.nc` in `save_dir`.
outfile = os.path.join(save_dir,
                       '{}-OBS_DATA.nc'.format(config['run_name']))


log.info(fmt(' Downloading to file {} '.format(outfile)))
data = {}
col = 'sea_water_temperature (C)'
# Ask NDBC first and fall back to CO-OPS when NDBC returns nothing.
# Stations that raise an ExceptionReport are logged and skipped.
for station in all_obs.index:
    try:
        idx = all_obs['station'][station]
        df = pyoos2df(collector_ndbc, idx, df_name=station)
        if df.empty:
            df = pyoos2df(collector_coops, idx, df_name=station)
        data[idx] = df[col]
    except ExceptionReport as e:
        log.warning("[{}] {}:\n{}".format(idx, station, e))

### Uniform 1-hour time base for model/data comparison

In [15]:
# Hourly time base spanning the configured window, without time zone info.
window_start = config['date']['start'].replace(tzinfo=None)
window_stop = config['date']['stop'].replace(tzinfo=None)
index = pd.date_range(start=window_start, end=window_stop, freq='1H')

# Put every series on the hourly base, taking the nearest value.
for k in list(data):
    v = data[k]
    data[k] = v.reindex(index=index, limit=1, method='nearest')

obs_data = pd.DataFrame.from_dict(data)

In [16]:
# Metadata for the observations file written by `save_timeseries`.
# NOTE(review): `url` is not defined in this cell; it is whatever value the
# last `for url in ...` loop of an earlier cell left behind (hidden state),
# and it is undefined if those loops never ran -- confirm the intended URL.
comment = "Several stations from http://opendap.co-ops.nos.noaa.gov"
kw = dict(longitude=all_obs.lon,
          latitude=all_obs.lat,
          station_attr=dict(cf_role="timeseries_id"),
          cube_attr=dict(featureType='timeSeries',
                         Conventions='CF-1.6',
                         standard_name_vocabulary='CF-1.6',
                         cdm_data_type="Station",
                         comment=comment,
                         url=url))

# Write the hourly observations to `outfile`.
save_timeseries(obs_data, outfile=outfile,
                standard_name=config['sos_name'], **kw)

to_html(obs_data.head())

Unnamed: 0,44013,44029,8443970
2016-08-11 00:00:00,19.4,18.8,19.8
2016-08-11 01:00:00,19.4,18.9,19.9
2016-08-11 02:00:00,19.3,18.6,19.9
2016-08-11 03:00:00,19.1,18.2,20.3
2016-08-11 04:00:00,19.3,18.0,20.5


### Loop discovered models and save the nearest time-series

In [17]:
from iris.exceptions import (CoordinateNotFoundError, ConstraintMismatchError,
                             MergeError)
from pytools.ioos import get_model_name
from pytools.tardis import quick_load_cubes, proc_cube, is_model, get_surface

log.info(fmt(' Models '))
cubes = {}
# Errors that make a URL unusable; they are logged and the URL is skipped.
load_errors = (RuntimeError, ValueError,
               ConstraintMismatchError, CoordinateNotFoundError,
               IndexError)
for k, url in enumerate(dap_urls):
    log.info('\n[Reading url {}/{}]: {}'.format(k+1, len(dap_urls), url))
    try:
        cube = quick_load_cubes(url, config['cf_names'],
                                callback=None, strict=True)
        # Only model output is processed further.
        if not is_model(cube):
            log.warning("[Not model data]: {}".format(url))
            continue
        window = (config['date']['start'], config['date']['stop'])
        cube = proc_cube(cube,
                         bbox=config['region']['bbox'],
                         time=window,
                         units=config['units'])
        cube = get_surface(cube)
        mod_name, model_full_name = get_model_name(cube, url, config['titles'])
        cubes[mod_name] = cube
    except load_errors as e:
        log.warning('Cannot get cube for: {}\n{}'.format(url, e))

In [18]:
from iris.pandas import as_series
from pytools.tardis import (make_tree, get_nearest_water,
                            add_station, ensure_timeseries, remove_ssh)

# For every model, extract the series nearest to each observed station and
# save them together as `<run_name>-<mod_name>.nc` inside `save_dir`.
for mod_name, cube in cubes.items():
    fname = '{}-{}.nc'.format(config['run_name'], mod_name)
    fname = os.path.join(save_dir, fname)
    log.info(fmt(' Downloading to file {} '.format(fname)))
    try:
        tree, lon, lat = make_tree(cube)
    except CoordinateNotFoundError:
        log.warning('Cannot make KDTree for: {}'.format(mod_name))
        continue
    # Get model series at observed locations.
    raw_series = dict()
    kw = dict(k=10, max_dist=0.08, min_var=0.01)
    for station, obs in all_obs.iterrows():
        args = cube, tree, obs.lon, obs.lat
        try:
            series, dist, idx = get_nearest_water(*args, **kw)
        except RuntimeError as e:
            log.info('Cannot download {!r}.\n{}'.format(cube, e))
            series = None
        except ValueError:
            status = "No Data"
            log.info('[{}] {}'.format(status, obs.name))
            continue
        # Explicit `None` check instead of relying on cube truthiness.
        if series is None:
            status = "Land   "
        else:
            raw_series.update({obs['station']: series})
            # NOTE(review): the converted series is not used afterwards;
            # kept because the conversion presumably loads the data.
            series = as_series(series)
            status = "Water  "
        log.info('[{}] {}'.format(status, obs.name))
    if raw_series:  # Save cube.
        # A separate loop name keeps the model `cube` from being clobbered.
        # As before, the helpers' return values are not stored, so this
        # presumably relies on them changing the cubes in place.
        for station, station_cube in raw_series.items():
            station_cube = add_station(station_cube, station)
            station_cube = remove_ssh(station_cube)
        try:
            merged = iris.cube.CubeList(raw_series.values()).merge_cube()
        except MergeError as e:
            # Do not save anything: the original fell through and saved a
            # single-station cube under the model's file name.
            log.warning(e)
        else:
            ensure_timeseries(merged)
            iris.save(merged, fname)
            del merged
    log.info('Finished processing [{}]'.format(mod_name))

In [19]:
# Log the total run time, measured from the first cell.
elapsed = time.time() - start_time
log.info('{:.2f} minutes'.format(elapsed/60.))
log.info('EOF')

# Read the log from `save_dir`, where it was created, instead of
# re-deriving the path relative to the current working directory.
with open(os.path.join(save_dir, 'log.txt')) as f:
    print(f.read())

04:08:41 INFO: Saving data inside directory /home/filipe/IOOS/notebooks_demos/notebooks/boston_light_swim/latest
04:08:41 INFO: *********************** Run information ************************
04:08:41 INFO: Run date: 2016-08-16 19:08:41
04:08:41 INFO: Start: 2016-08-11 00:00:00
04:08:41 INFO: Stop: 2016-08-21 00:00:00
04:08:41 INFO: Bounding box: -71.30, 42.03,-70.57, 42.63
04:08:41 INFO: ********************* Catalog information **********************
04:08:41 INFO: ***************************** CSW ******************************
04:08:41 INFO: URL: http://www.ngdc.noaa.gov/geoportal/csw
04:08:42 INFO: CSW version: 2.0.2
04:08:42 INFO: Number of datasets available: 1
04:08:42 INFO: HYbrid Coordinate Ocean Model (HYCOM): Global
04:08:42 INFO: ***************************** SOS ******************************
04:08:42 INFO: ***************************** DAP ******************************
04:08:42 INFO: http://oos.soest.hawaii.edu/thredds/dodsC/pacioos/hycom/global.html
04:08:42 INFO: URL