Solar Anywhere

In [None]:
from collections import OrderedDict
import glob
import os
import re
import sys

from bokeh.models import WMTSTileSource
from cartopy import crs as ccrs
from collections import defaultdict
from dask.distributed import Client
from holoviews.operation import decimate
from holoviews.operation.datashader import aggregate, shade, datashade, dynspread
from pyproj import Proj, transform
import dask
import dask.dataframe as dd
import datashader as ds
import datashader.transfer_functions as tf
import geoviews as gv
import holoviews as hv
import numpy as np
import pandas as pd
import rasterio as rio
import xarray as xr
import numpy as np
import holoviews as hv
import datashader as ds

# Activate the bokeh backend for HoloViews output in this notebook.
hv.notebook_extension('bokeh')
# Class-level settings on the HoloViews operations imported above.
decimate.max_samples=1000
dynspread.max_px=20
dynspread.threshold=0.5

# Dask distributed client, constructed with no arguments.
# NOTE(review): presumably this starts/uses a local cluster — confirm for the
# environment the notebook runs in.
client = Client()

In [None]:
# Upper bound on how many stations are kept when STATION_COMBOS is built.
NUM_STATIONS = 4 # adjust to limit to subset of SOLAR_FILES

In [None]:
SOLAR_FNAME_PATTERN = os.path.join('data', '72*', '*solar.csv')
# glob returns matches in arbitrary, filesystem-dependent order; sort them so
# the stations selected below (and anything derived from "the first station")
# are the same on every run and every machine.
SOLAR_FILES = sorted(glob.glob(SOLAR_FNAME_PATTERN))
META_FILE = os.path.join('data', 'NSRDB_StationsMeta.csv')


def get_station_yr(fname):
    """Return the first two '_'-separated fields of the base name as ints.

    The loop below unpacks the pair as (station key, value stored with the
    file name).
    """
    return tuple(map(int, os.path.basename(fname).split('_')[:2]))


# station key -> list of [second file-name field, file name] pairs
STATION_COMBOS = defaultdict(list)
for fname in SOLAR_FILES:
    k, v = get_station_yr(fname)
    STATION_COMBOS[k].append([v, fname])
# Keep only the first NUM_STATIONS keys (insertion order) and convert to a
# plain dict so that looking up an unknown station raises KeyError instead of
# silently creating an empty entry.
STATION_COMBOS = {k: STATION_COMBOS[k] for k in tuple(STATION_COMBOS)[:NUM_STATIONS]}


def files_for_station(station):
    """Return the file names recorded for `station` in STATION_COMBOS."""
    return [x[1] for x in STATION_COMBOS[station]]

In [None]:
def clean_col_names(dframe):
    """Normalize the column labels of `dframe` in place and return it.

    Each label is transformed as follows:
      1. every '%' becomes '_pcent_';
      2. every run of '/', ':', '(', ')', '_', whitespace, '^' or '-' is
         collapsed to a single '_';
      3. one trailing '_' (if any) is removed;
      4. the result is lower-cased.

    Parameters
    ----------
    dframe : pandas.DataFrame
        Frame whose `columns` attribute is overwritten.

    Returns
    -------
    pandas.DataFrame
        The same object that was passed in.
    """
    # Raw strings: the pattern contains backslash escapes (\(, \), \s) that are
    # invalid in an ordinary string literal and trigger a warning there.
    separators = r'[/:\(\)_\s^-]+'
    trailing_underscore = r'_$'
    cols = [re.sub(trailing_underscore, '',
                   re.sub(separators, '_', col.replace('%', '_pcent_'))).lower()
            for col in dframe.columns]
    dframe.columns = cols
    return dframe

In [None]:
# Station metadata read from META_FILE, indexed by its 'USAF' column, with
# column names normalized by clean_col_names.
meta_df = clean_col_names(pd.read_csv(META_FILE, index_col='USAF'))

In [None]:
# Show the metadata rows for the stations kept in STATION_COMBOS.
meta_df.loc[list(STATION_COMBOS)]

In [None]:
@dask.delayed
def read_one_fname(usaf_station, fname):
    """Read one solar CSV and return a trimmed frame for `usaf_station`.

    The CSV's column names are normalized with `clean_col_names`; the cleaned
    file must provide `yyyy_mm_dd` and `hh_mm_lst` columns. The station's
    `nsrdb_lat_dd` / `nsrdb_lon_dd` values are looked up in the module-level
    `meta_df`.

    Parameters
    ----------
    usaf_station
        Index label of the station in `meta_df`; also stored in the 'usaf'
        column of the result.
    fname : str
        Path of the CSV file to read.

    Returns
    -------
    pandas.DataFrame (wrapped by `dask.delayed`)
        Columns 'date', 'y', 'x', 'julian_hr', 'year', 'usaf' followed by every
        CSV column whose name contains 'metstat' or 'suny' and does not contain
        'flg'. Numeric values <= -999 are replaced by NaN.
    """
    dframe = clean_col_names(pd.read_csv(fname))
    station_data = meta_df.loc[usaf_station]

    # Derived columns always come first; data columns are picked from the CSV's
    # own columns. A CSV column that happens to share a derived name is not
    # listed a second time (that would duplicate the label in the output).
    keep_cols = ['date', 'y', 'x', 'julian_hr', 'year', 'usaf']
    keep_cols += [col for col in dframe.columns
                  if ('metstat' in col or 'suny' in col)
                  and 'flg' not in col]

    # Hour part of 'H:MM' / 'HH:MM' as a timedelta, computed for the whole column.
    hours = dframe.hh_mm_lst.str.split(':').str[0].astype(int)
    dframe['date'] = pd.to_datetime(dframe.yyyy_mm_dd) + pd.to_timedelta(hours, unit='h')
    dframe['usaf'] = usaf_station
    dframe['y'], dframe['x'] = station_data.nsrdb_lat_dd, station_data.nsrdb_lon_dd
    dframe['julian_hr'] = dframe.date.dt.hour + (dframe.date.dt.dayofyear - 1) * 24
    dframe['year'] = dframe.date.dt.year

    # Blank out values <= -999 in numeric columns only: comparing the string
    # and datetime columns with an int raises TypeError on current pandas.
    # DataFrame.mask fills with NaN, so the removed `np.NaN` alias is not needed.
    numeric = dframe.select_dtypes(include=[np.number])
    dframe[numeric.columns] = numeric.mask(numeric <= -999)
    return dframe.loc[:, keep_cols]

def read_one_station(station):
    """Build one dask dataframe from all files of `station` and compute it.

    One delayed `read_one_fname` call is created per file returned by
    `files_for_station(station)`; the combined result of `.compute()` is
    returned.
    """
    delayed_parts = []
    for path in files_for_station(station):
        delayed_parts.append(read_one_fname(station, path))
    lazy_frame = dd.from_delayed(delayed_parts)
    return lazy_frame.compute()

In [None]:
# Use the first station key in STATION_COMBOS as the worked example and load
# all of its files into one frame.
example_usaf = tuple(STATION_COMBOS)[0]
df = read_one_station(example_usaf)

In [None]:
# Preview the first rows of the example station's frame.
df.head()

In [None]:
# Summary of the 'date' column for the example station.
df.date.describe()

In [None]:
def get_station_quantiles(station=None, grouper='julian_hr', usaf_data=None):
    """Return per-group median, 25% and 75% quantiles joined side by side.

    Parameters
    ----------
    station : optional
        Station to load with `read_one_station` when `usaf_data` is not given.
        When provided, it is also written to the 'usaf' column of the result.
    grouper : str
        Column to group by; it is re-added to the result as a regular column
        holding the group index values.
    usaf_data : optional
        An already-grouped object exposing `.quantile()` and `.median()`.
        When given, no data is read.

    Returns
    -------
    pandas.DataFrame
        Median columns under their original names, with the 25% quantile
        columns suffixed '_low' and the 75% quantile columns suffixed '_hi'.
    """
    if usaf_data is None:
        usaf_data = read_one_station(station).groupby(grouper)
    low = usaf_data.quantile(0.25)
    median = usaf_data.median()
    hi = usaf_data.quantile(0.75)
    median[grouper] = median.index.values
    # Only stamp the station id when one was supplied; with precomputed
    # `usaf_data` and no `station`, writing None would erase the 'usaf' values
    # that came out of the aggregation.
    if station is not None:
        median['usaf'] = station
    return median.join(low, rsuffix='_low').join(hi, rsuffix='_hi')

In [None]:
# Quantile summary for the example station, grouped by the default 'julian_hr'.
julian_summary = get_station_quantiles(station=example_usaf)
julian_summary.head()

In [None]:
direct, dif_h, glo_h = ('Direct Normal',
                        'Diffuse Horizontal',
                        'Global Horizontal',)
# Columns whose name contains 'wh_m_2', excluding the 'suny' ones.
watt_hrs_m2_cols = [col for col in df.columns if 'wh_m_2' in col and 'suny' not in col]
# One human-readable label per selected column, in the same order.
labels = []
for col in watt_hrs_m2_cols:
    # No trailing space here: the ' - ' separator below supplies the spacing.
    word1 = "Clear Sky" if 'csky' in col else "Measured"
    if '_dir_' in col:
        word2 = direct
    elif '_dif_' in col:
        # The membership test matters: a bare '_dif_' string is always truthy,
        # which would label every non-direct column as diffuse.
        word2 = dif_h
    else:
        word2 = glo_h
    labels.append('{} - {}'.format(word1, word2))
watt_hrs_m2_cols, labels

In [None]:
def plot_gen():
    """Build one overlay of 25% / median / 75% curves per energy column.

    Reads the module-level `julian_summary`, `watt_hrs_m2_cols` and `labels`.
    The x-axis is an hourly index starting at 2001-01-01 with one entry per
    row of `julian_summary`.

    Returns
    -------
    dict
        Maps a tuple key (the column name with 'metstat_' and '_wh_m_2'
        removed, split on '_') to the overlay `low * median * hi`.
    """
    kw = dict(style=dict(s=2,alpha=0.5))
    # The index is the same for every column, so build it once.
    # pd.date_range replaces the DatetimeIndex(start=..., periods=...)
    # constructor form, which newer pandas no longer accepts.
    dates = pd.date_range(start=pd.Timestamp('2001-01-01'),
                          freq='H',
                          periods=julian_summary.shape[0])
    curves = {}
    for col, label in zip(watt_hrs_m2_cols, labels):
        median_col = julian_summary[col]
        low_col = julian_summary[col + '_low']
        hi_col = julian_summary[col + '_hi']
        hi = hv.Curve((dates, hi_col), label=label + ' (75%)')(**kw)
        low = hv.Curve((dates, low_col),label=label + ' (25%)')(**kw)
        median = hv.Curve((dates, median_col), label=label)(**kw)
        key = tuple(col.replace('metstat_', '').replace('_wh_m_2', '').split('_'))
        curves[key] = low * median * hi
    return curves

In [None]:
# Dict of curve overlays keyed by tuples derived from the column names.
plots = plot_gen()

In [None]:
# List the available keys of `plots`.
list(plots)

In [None]:
%%opts Curve [width=700 height=500]
%%opts Layout [sublabel_format="" tight=True]
# Display the overlay stored under the ('csky', 'dir') key.
plots[('csky', 'dir')]

In [None]:
%%opts Curve [width=700 height=500]
%%opts Layout [sublabel_format="" tight=True]
# Display the overlay stored under the ('dir',) key.
plots[('dir',)]

In [None]:
%%opts Curve [width=700 height=500]
%%opts Layout [sublabel_format="" tight=True]
# Combine the ('dif',) and ('csky', 'dif') overlays with `+` and display them.
plots[('dif',)] + plots[('csky', 'dif',)]