# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Ground-predictions" data-toc-modified-id="Ground-predictions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Ground predictions</a></div><div class="lev2 toc-item"><a href="#PVLib-Clearsky" data-toc-modified-id="PVLib-Clearsky-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>PVLib Clearsky</a></div>

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree

import pytz
import itertools
import visualize
import utils
import pydotplus
import xgboost as xgb

from sklearn import metrics

import pvlib
import cs_detection
# import visualize
# from bokeh.plotting import output_notebook
# output_notebook()

import visualize_plotly as visualize

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

# Ground predictions

## PVLib Clearsky

Ground predictions are made using only the PVLib clearsky model and the statistical model; the NSRDB clearsky model is not available for ground measurements.

In [2]:
# Load the pickled NSRDB dataset and convert its DataFrame index to MST.
# NOTE(review): read_pickle presumably unpickles the file; only load trusted files.
nsrdb = cs_detection.ClearskyDetection.read_pickle('srrl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')

In [3]:
# Presumably derives time-from-solar-noon from the 'Clearsky GHI' column and
# stores it as 'tfn' (listed in feature_cols) -- confirm in cs_detection.
nsrdb.time_from_solar_noon('Clearsky GHI', 'tfn')

In [4]:
# Load the pickled ground-measurement dataset and convert its DataFrame index to MST.
# NOTE(review): read_pickle presumably unpickles the file; only load trusted files.
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

In [5]:
# Display the ground index; the recorded output shows 1-minute frequency in MST.
ground.df.index

DatetimeIndex(['2001-08-04 00:00:00-07:00', '2001-08-04 00:01:00-07:00',
               '2001-08-04 00:02:00-07:00', '2001-08-04 00:03:00-07:00',
               '2001-08-04 00:04:00-07:00', '2001-08-04 00:05:00-07:00',
               '2001-08-04 00:06:00-07:00', '2001-08-04 00:07:00-07:00',
               '2001-08-04 00:08:00-07:00', '2001-08-04 00:09:00-07:00',
               ...
               '2014-12-31 23:50:00-07:00', '2014-12-31 23:51:00-07:00',
               '2014-12-31 23:52:00-07:00', '2014-12-31 23:53:00-07:00',
               '2014-12-31 23:54:00-07:00', '2014-12-31 23:55:00-07:00',
               '2014-12-31 23:56:00-07:00', '2014-12-31 23:57:00-07:00',
               '2014-12-31 23:58:00-07:00', '2014-12-31 23:59:00-07:00'],
              dtype='datetime64[ns, MST]', length=7053120, freq='T')

We will reduce the frequency of the ground-based measurements to match that of the NSRDB data.

In [6]:
# Restrict the ground data to the NSRDB timestamps.
# NOTE(review): the return value is unused, so this presumably mutates ground
# in place -- confirm in cs_detection.
ground.intersection(nsrdb.df.index)

In [7]:
# NOTE(review): presumably rescales the 'Clearsky GHI pvlib' model against
# measured 'GHI' using the 'sky_status' labels -- confirm in cs_detection.
nsrdb.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

In [8]:
# Compute the feature columns for the NSRDB (training) data from measured GHI
# and the PVLib clearsky model.
# NOTE(review): the meaning of the positional arguments 3 and 30 is not visible
# here (window size / sampling interval?) -- confirm in cs_detection.
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)

In [9]:
# Compute the same feature columns for the ground data, with the same
# arguments as the NSRDB call so both frames share feature definitions.
# NOTE(review): the meaning of the positional arguments 3 and 30 is not visible
# here (window size / sampling interval?) -- confirm in cs_detection.
ground.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)

In [10]:
# Feature-column names are assembled from base names plus suffixes instead of
# being spelled out one by one; the resulting list and its order are unchanged.
ratio_diff_bases = (
    'abs_ideal_ratio_diff',
    'abs_ideal_ratio_diff gradient',
    'abs_ideal_ratio_diff gradient second',
)
gradient_ratio_bases = (
    'GHI Clearsky GHI pvlib gradient ratio',
    'GHI Clearsky GHI pvlib gradient second ratio',
)

feature_cols = ['tfn']

# Ratio-diff families: the base column followed by its suffixed variants
# (note: 'max' comes before 'min' here).
for base in ratio_diff_bases:
    feature_cols.append(base)
    feature_cols.extend(
        base + ' ' + suffix for suffix in ('mean', 'std', 'max', 'min', 'range')
    )

# Gradient-ratio families: same layout, but 'min' comes before 'max'.
for base in gradient_ratio_bases:
    feature_cols.append(base)
    feature_cols.extend(
        base + ' ' + suffix for suffix in ('mean', 'std', 'min', 'max', 'range')
    )

# Line-length ratio columns have no suffixed variants.
feature_cols.extend([
    'GHI Clearsky GHI pvlib line length ratio',
    'GHI Clearsky GHI pvlib line length ratio gradient',
    'GHI Clearsky GHI pvlib line length ratio gradient second',
])

# The 'abs_ideal_ratio_diff pct_change' family (plain, mean, std, max, min,
# range) is currently disabled and therefore not part of feature_cols.

# Label column used as the training target.
target_cols = ['sky_status']

In [11]:
# Limit the ground data to the evaluation window starting 2010-10-01.
# NOTE(review): the return value is unused, so this presumably trims in place;
# the recorded prediction output covers 2010-10-01 through 2010-10-07, which
# suggests the end date is exclusive -- confirm in cs_detection.
ground.trim_dates('10-01-2010', '10-08-2010')

In [12]:
# Presumably derives time-from-solar-noon from the 'Clearsky GHI pvlib' column
# and stores it as 'tfn' (listed in feature_cols) -- confirm in cs_detection.
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [13]:
# Wrap the NSRDB frame as the training set and the ground frame as the test set.
# NOTE(review): whether ClearskyDetection copies the DataFrame is not visible
# here; if it does not, later in-place operations on train/test would also
# affect nsrdb/ground -- confirm in cs_detection.
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(ground.df)

In [14]:
from sklearn import ensemble, linear_model

# Alternative classifiers tried during experimentation, kept for reference.
# clf = ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=100, max_leaf_nodes=40)  # max_leaf_nodes=30, n_estimators=100)
# clf = tree.DecisionTreeClassifier(min_samples_leaf=.001)
# clf = linear_model.LogisticRegression(C=.05)
# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.01, n_estimators=24, n_jobs=-1)
# The random forest below used to be constructed and then immediately
# overwritten by the next assignment, so it is listed as an alternative instead.
# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.00275, n_estimators=64, n_jobs=-1)

# Gradient-boosting baseline. Note that a later cell reassigns clf to an
# xgb.XGBClassifier before fitting, so this estimator is not the one trained.
clf = ensemble.GradientBoostingClassifier(learning_rate=.01, n_estimators=100)

In [15]:
import xgboost as xgb

In [16]:
# Classifier actually used for training: this assignment replaces any clf
# created in earlier cells. The commented line is a previously tried setting.
# clf = xgb.XGBClassifier(max_depth=5, n_estimators=300, learning_rate=.0075, nthread=4, min_child_weight=1)
clf = xgb.XGBClassifier(max_depth=4, n_estimators=325, learning_rate=.01, nthread=4, min_child_weight=1)

In [17]:
# nsrdb.df = nsrdb.df[nsrdb.df['GHI'] > 0]

In [18]:
# train.df = train.df[train.df['Clearsky GHI pvlib'] > 0]

In [19]:
# Train on the NSRDB features; the single target column ('sky_status') is
# flattened to a 1-D label array.
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=4, min_child_weight=1, missing=None, n_estimators=325,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [20]:
# Predict sky status for the ground data one day at a time (the recorded output
# lists each processed date) instead of a single clf.predict call.
# NOTE(review): the meaning of the trailing positional argument 3 is not visible
# here -- confirm in cs_detection. The recorded output reports
# "Scaling did not converge." for 2010-10-01.
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3)
# Cast to bool so pred can be used (and negated with ~) as a mask when plotting.
pred = pred.astype(bool)

2010-10-01



Scaling did not converge.



2010-10-02
2010-10-03
2010-10-04
2010-10-05
2010-10-06
2010-10-07


In [21]:
# Restrict the training (NSRDB) data to the test timestamps so that
# train.df['sky_status'] can be combined with pred in the plotting cell.
# NOTE(review): the return value is unused, so this presumably mutates train
# in place -- confirm in cs_detection.
train.intersection(test.df.index)

In [22]:
# Fresh visualizer for the NSRDB-vs-ML comparison plot
# (visualize is the visualize_plotly module imported under that alias).
vis = visualize.Visualizer()

In [23]:
# Overlay measured GHI and the PVLib clearsky curve, then mark the points
# according to how the classifier output (pred) and the NSRDB labels
# (train.df['sky_status']) agree.
nsrdb_not_clear = train.df['sky_status'] == 0
nsrdb_clear = train.df['sky_status'] == 1

vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')

for mask, label in [
    (nsrdb_not_clear & pred, 'ML clear only'),
    (nsrdb_clear & ~pred, 'NSRDB clear only'),
    (nsrdb_clear & pred, 'ML+NSRDB clear only'),
]:
    vis.add_circle_ser(test.df[mask]['GHI'], label)

In [24]:
# Render the NSRDB-vs-ML comparison figure.
vis.show()

In [25]:
# Reload the ground data from the pickle for a second run; unlike the first
# run, it is not intersected with the NSRDB index afterwards.
# NOTE(review): read_pickle presumably unpickles the file; only load trusted files.
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')

In [26]:
# Convert the reloaded ground index to MST.
ground.df.index = ground.df.index.tz_convert('MST')

In [27]:
# Limit the reloaded ground data to the two-week window starting 2010-10-01.
# NOTE(review): the recorded prediction output covers 2010-10-01 through
# 2010-10-14, which suggests the end date is exclusive -- confirm in cs_detection.
ground.trim_dates('10-01-2010', '10-15-2010')

In [28]:
# Presumably derives time-from-solar-noon from the 'Clearsky GHI pvlib' column
# and stores it as 'tfn' (listed in feature_cols) -- confirm in cs_detection.
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [29]:
# test now refers to the same object as ground (an alias, not a copy).
test = ground

In [30]:
# Predict sky status for the reloaded ground data one day at a time (the
# recorded output lists each processed date).
# NOTE(review): the meaning of the positional argument 11 and of smooth/tol is
# not visible here -- confirm in cs_detection. The recorded output reports
# "Scaling did not converge." for 2010-10-01.
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 11, smooth=False, tol=1.0e-6)
# Cast to bool so pred can be used (and negated with ~) as a mask when plotting.
pred = pred.astype(bool)

2010-10-01



Scaling did not converge.



2010-10-02
2010-10-03
2010-10-04
2010-10-05
2010-10-06
2010-10-07
2010-10-08
2010-10-09
2010-10-10
2010-10-11
2010-10-12
2010-10-13
2010-10-14


In [31]:
# Fresh visualizer for the PVLib-vs-ML comparison plot
# (visualize is the visualize_plotly module imported under that alias).
vis = visualize.Visualizer()

In [32]:
# Overlay measured GHI and the PVLib clearsky curve, then mark the points
# according to how the classifier output (pred) and the 'sky_status pvlib'
# column of the test frame agree.
pvlib_not_clear = test.df['sky_status pvlib'] == 0
pvlib_clear = test.df['sky_status pvlib'] == 1

vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')

for mask, label in [
    (pvlib_not_clear & pred, 'ML clear only'),
    (pvlib_clear & ~pred, 'PVLib clear only'),
    (pvlib_clear & pred, 'ML+PVLib clear only'),
]:
    vis.add_circle_ser(test.df[mask]['GHI'], label)

In [33]:
# Render the PVLib-vs-ML comparison figure.
vis.show()

In [34]:
# Print every feature name next to its importance in the fitted classifier,
# in feature_cols order.
for feature_name, importance in zip(feature_cols, clf.feature_importances_):
    print(feature_name, importance)

tfn 0.0857733
abs_ideal_ratio_diff 0.226318
abs_ideal_ratio_diff mean 0.0
abs_ideal_ratio_diff std 0.0268688
abs_ideal_ratio_diff max 0.0258353
abs_ideal_ratio_diff min 0.0792284
abs_ideal_ratio_diff range 0.0275577
abs_ideal_ratio_diff gradient 0.0
abs_ideal_ratio_diff gradient mean 0.0
abs_ideal_ratio_diff gradient std 0.0186014
abs_ideal_ratio_diff gradient max 0.000688942
abs_ideal_ratio_diff gradient min 0.00413365
abs_ideal_ratio_diff gradient range 0.00620048
abs_ideal_ratio_diff gradient second 0.0
abs_ideal_ratio_diff gradient second mean 0.00206683
abs_ideal_ratio_diff gradient second std 0.0110231
abs_ideal_ratio_diff gradient second max 0.00447813
abs_ideal_ratio_diff gradient second min 0.0310024
abs_ideal_ratio_diff gradient second range 0.00172236
GHI Clearsky GHI pvlib gradient ratio 0.0103341
GHI Clearsky GHI pvlib gradient ratio mean 0.0585601
GHI Clearsky GHI pvlib gradient ratio std 0.0
GHI Clearsky GHI pvlib gradient ratio min 0.122976
GHI Clearsky GHI pvlib gradie