# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Ground-predictions" data-toc-modified-id="Ground-predictions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Ground predictions</a></div><div class="lev2 toc-item"><a href="#PVLib-Clearsky" data-toc-modified-id="PVLib-Clearsky-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>PVLib Clearsky</a></div>

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree

import pytz
import itertools
import visualize
import utils
import pydotplus

from sklearn import metrics
from sklearn import ensemble
from sklearn import linear_model

import pvlib
import cs_detection
# import visualize
# from bokeh.plotting import output_notebook
# output_notebook()

import visualize_plotly as visualize

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

# Ground predictions

## PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model.  NSRDB model won't be available to ground measurements.

In [2]:
nsrdb_srrl = cs_detection.ClearskyDetection.read_pickle('srrl_nsrdb_1.pkl.gz')
nsrdb_srrl.df.index = nsrdb_srrl.df.index.tz_convert('MST')
nsrdb_srrl.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
nsrdb_abq = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')
nsrdb_abq.df.index = nsrdb_abq.df.index.tz_convert('MST')
nsrdb_abq.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')
nsrdb_ornl = cs_detection.ClearskyDetection.read_pickle('ornl_nsrdb_1.pkl.gz')
nsrdb_ornl.df.index = nsrdb_ornl.df.index.tz_convert('EST')
nsrdb_ornl.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

We will reduce the frequency of ground based measurements to match NSRDB.

In [3]:
nsrdb_srrl.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
nsrdb_abq.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')
nsrdb_ornl.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

In [4]:
nsrdb_srrl.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)
nsrdb_abq.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)
nsrdb_ornl.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)

In [5]:
feature_cols = [
'GHI',
'Clearsky GHI pvlib',
'tfn',
'GHI mean',
'GHI std',
'GHI max',
'GHI min',
'GHI range',
'Clearsky GHI pvlib mean',
'Clearsky GHI pvlib std',
'Clearsky GHI pvlib max',
'Clearsky GHI pvlib min',
'Clearsky GHI pvlib range',
'GHI gradient',
'GHI gradient mean',
'GHI gradient std',
'GHI gradient max',
'GHI gradient min',
'GHI gradient range',
'GHI gradient second',
'GHI gradient second mean',
'GHI gradient second std',
'GHI gradient second max',
'GHI gradient second min',
'GHI gradient second range',
'Clearsky GHI pvlib gradient',
'Clearsky GHI pvlib gradient mean',
'Clearsky GHI pvlib gradient std',
'Clearsky GHI pvlib gradient max',
'Clearsky GHI pvlib gradient min',
'Clearsky GHI pvlib gradient second',
'Clearsky GHI pvlib gradient second mean',
'Clearsky GHI pvlib gradient second std',
'Clearsky GHI pvlib gradient second max',
'Clearsky GHI pvlib gradient second min',
'abs_ideal_ratio_diff',
'abs_ideal_ratio_diff mean',
'abs_ideal_ratio_diff std',
'abs_ideal_ratio_diff max',
'abs_ideal_ratio_diff min',
'abs_ideal_ratio_diff range',
'abs_ideal_ratio_diff gradient',
'abs_ideal_ratio_diff gradient mean',
'abs_ideal_ratio_diff gradient std',
'abs_ideal_ratio_diff gradient max',
'abs_ideal_ratio_diff gradient min',
'abs_ideal_ratio_diff gradient range',
'abs_ideal_ratio_diff gradient second',
'abs_ideal_ratio_diff gradient second mean',
'abs_ideal_ratio_diff gradient second std',
'abs_ideal_ratio_diff gradient second max',
'abs_ideal_ratio_diff gradient second min',
'abs_ideal_ratio_diff gradient second range',
'abs_diff',
'abs_diff mean',
'abs_diff std',
'abs_diff max',
'abs_diff min',
'abs_diff range',
'abs_diff gradient',
'abs_diff gradient mean',
'abs_diff gradient std',
'abs_diff gradient max',
'abs_diff gradient min',
'abs_diff gradient range',
'abs_diff gradient second',
'abs_diff gradient second mean',
'abs_diff gradient second std',
'abs_diff gradient second max',
'abs_diff gradient second min',
'abs_diff gradient second range',
'GHI line length',
'Clearsky GHI pvlib line length',
'GHI Clearsky GHI pvlib line length ratio',
# 'GHI bpct change',
# 'GHI bpct change mean', 'GHI bpct change std', 'GHI bpct change max', 'GHI bpct change min', 'GHI bpct change range',
# 'Clearsky GHI pvlib bpct change',
# 'Clearsky GHI pvlib bpct change mean', 'Clearsky GHI pvlib bpct change std', 'Clearsky GHI pvlib bpct change max', 'Clearsky GHI pvlib bpct change min', 'Clearsky GHI pvlib bpct change range',
# 'GHI Clearsky GHI pvlib bpct change ratio', 
# 'GHI Clearsky GHI pvlib bpct change ratio mean', 'GHI Clearsky GHI pvlib bpct change ratio std', 'GHI Clearsky GHI pvlib bpct change ratio max', 'GHI Clearsky GHI pvlib bpct change ratio min', 'GHI Clearsky GHI pvlib bpct change ratio range' 
]

target_cols = ['sky_status']

In [6]:
# clf = ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=64, min_samples_leaf=0.0035, n_jobs=-1)
clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=0.0035)

# from sklearn import model_selection
# gscv = model_selection.GridSearchCV(clf, {'min_samples_leaf': np.arange(.001, .011, .001)}, scoring='f1')

In [7]:
X = np.vstack((nsrdb_srrl.df[feature_cols].values, 
               nsrdb_abq.df[feature_cols].values,
               nsrdb_ornl.df[feature_cols].values))
y = np.vstack((nsrdb_srrl.df[target_cols].values, 
               nsrdb_abq.df[target_cols].values,
               nsrdb_ornl.df[target_cols].values))

In [8]:
clf.fit(X, y.flatten())

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=None, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=0.0035,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [9]:
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')

In [10]:
ground.df.index = ground.df.index.tz_convert('MST')

In [11]:
ground.trim_dates('10-01-2010', '10-08-2010')

In [12]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [13]:
test = ground

In [14]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)

1 1
2 0.940292243692
3 0.984904536537
4 0.99832636242
5 0.99980149983
6 0.999895749932
7 1.0
1 1
2 0.953177572675
3 0.981333553721
4 0.998465983879
5 1.0
1 1
2 0.986654968393
3 0.996291140069
4 0.999025000561
5 1.0
1 1
2 0.985012292249
3 0.981438952214
4 0.9882103522
5 0.994808989704
6 0.999070250213
7 1.0
1 1
2 0.953358143179
3 0.994815775595
4 0.999278698716
5 1.0
1 1
2 0.999070099451
3 1.0
1 1
2 0.959583715835
3 0.995672291759
4 0.998961268397
5 0.99951446771
6 0.999791886688
7 0.999756274204
8 1.0


In [15]:
vis = visualize.Visualizer()

In [16]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')

In [17]:
vis.show()

In [18]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')

In [19]:
ground.df.index = ground.df.index.tz_convert('MST')

In [20]:
ground.trim_dates('10-01-2015', '10-08-2015')

In [21]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [22]:
test = ground

In [23]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)

1 1
2 0.951466320962
3 0.996446983778
4 0.999656928002
5 1.0
1 1
2 1.01203723115
3 1.00659349013
4 1.00398843391
5 1.00220879162
6 1.0015296388
7 1.00091277327
8 1.00138268413
9 1.0
1 1
2 0.966824933366
3 0.994839001422
4 0.999892010538
5 1.0
1 1
2 0.985683287907
3 0.975492248171
4 0.972981951558
5 0.979068652373
6 0.983556955914
7 0.999740340646
8 1.0
1 1
2 1.0048725587
3 1.02557783582
4 1.01842403868
5 1.02567040373
6 1.01555360509
7 1.00776979168
8 1.00313398565
9 1.00298424472
10 1.0
1 1
2 1.00125683951
3 1.00141558033
4 1.00102172378
5 1.00044952479
6 1.00094945609
7 1.00073749212
8 1.00073104314
9 1.0004095619
10 1.00073287968
11 1.00126915121
12 1.00258343151
13 1.0049691264
14 1.00398879949
15 1.00241624679
16 1.00488718778
17 1.00353200251
18 1.00276204901
19 1.00115611748
20 1.00056438447



Scaling did not converge.



1 1
2 0.987289745452
3 0.991404144362
4 0.988732195527
5 0.99389132879
6 0.993138673714
7 0.994738133716
8 0.991544296555
9 0.992401637307
10 0.997551252878
11 1.0


In [24]:
vis = visualize.Visualizer()

In [25]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')

In [26]:
vis.show()

In [27]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')

In [28]:
ground.df.index.tz

<StaticTzInfo 'EST'>

In [29]:
ground.trim_dates('05-01-2008', '05-08-2008')

In [30]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [31]:
test = ground

In [32]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)

1 1
2 0.993651245561
3 0.994826773334
4 0.997572708322
5 0.998905673723
6 0.999419383017
7 1.0
1 1
2 0.992667127209
3 0.998839432025
4 0.999224632765
5 1.0
1 1
2 1.01463234072
3 1.00684214611
4 1.0
1 1
2 1.03008170042
3 1.00008567002
4 1.0
1 1
2 1.02292980915
3 1.00393108355
4 1.00040469245
5 1.00010977773
6 1.00011189467
7 1.0
1 1
2 1.00805075865
3 1.00054219913
4 1.0
1 1
2 0.996283632151
3 0.998567428275
4 0.999205946484
5 0.999679562527
6 1.0


In [33]:
vis = visualize.Visualizer()

In [34]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')

In [35]:
vis.show()

In [36]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')

In [37]:
ground.df.index.tz

<StaticTzInfo 'EST'>

In [38]:
ground.trim_dates('05-01-2008', '05-08-2008')

In [39]:
ground.df = ground.df.resample('5min').mean()

In [40]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [41]:
test = ground

In [42]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 6, smooth=True)
pred = pred.astype(bool)

1 1
2 0.990470313727
3 0.991524100986
4 0.993527037986
5 0.999255448506
6 1.0
1 1
2 0.991274652048
3 0.996530189036
4 0.998864998475
5 1.0
1 1
2 1.00308548042
3 1.0
1 1
2 1.02970662949
3 1.00058122739
4 1.0
1 1
2 1.02125449321
3 1.00421458119
4 1.00054594846
5 1.0
1 1
2 1.00668659594
3 1.00013271324
4 1.0
1 1
2 0.991140670995
3 0.996879641211
4 0.999729509014
5 1.0


In [43]:
vis = visualize.Visualizer()

In [44]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(pred)]['GHI'], 'ML+PVLib clear only')

In [45]:
vis.show()

In [46]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')

In [47]:
ground.df.index.tz

<StaticTzInfo 'EST'>

In [48]:
ground.trim_dates('05-01-2008', '05-08-2008')

In [49]:
ground.df = ground.df.resample('10min').mean()

In [50]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [51]:
test = ground

In [52]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 6, smooth=True)
pred = pred.astype(bool)

1 1
2 0.984129492563
3 0.993676212712
4 0.997752176638
5 1.0
1 1
2 0.992952402262
3 0.995207498765
4 1.0
1 1
2 1.01027127746
3 1.00395831606
4 1.0
1 1
2 1.02970762674
3 1.00035223914
4 1.0
1 1
2 1.02068404889
3 1.00413143309
4 1.00111941688
5 1.0
1 1
2 1.0046768422
3 1.00131961537
4 1.0
1 1
2 0.985323085734
3 0.997223348882
4 1.0


In [53]:
vis = visualize.Visualizer()

In [54]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(pred)]['GHI'], 'ML+PVLib clear only')

In [55]:
vis.show()

In [56]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')

In [57]:
ground.df.index.tz

<StaticTzInfo 'EST'>

In [58]:
ground.trim_dates('05-01-2008', '05-08-2008')

In [59]:
ground.df = ground.df.resample('15min').mean()

In [60]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [61]:
test = ground

In [62]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 6, smooth=True)
pred = pred.astype(bool)

1 1
2 0.99240129233
3 0.994147158442
4 1.0
1 1
2 0.989540300766
3 1.0
1 1
2 1.00121012593
3 1.0
1 1
2 1.02989161713
3 1.0
1 1
2 1.02306299084
3 1.00160439581
4 1.0
1 1
2 1.00720866012
3 1.0
1 1
2 0.982084607162
3 0.994782082067
4 1.0


In [63]:
vis = visualize.Visualizer()

In [64]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(pred)]['GHI'], 'ML+PVLib clear only')

In [65]:
vis.show()

In [66]:
ground = cs_detection.ClearskyDetection.read_pickle('ornl_ground_1.pkl.gz')

In [67]:
ground.df.index.tz

<StaticTzInfo 'EST'>

In [68]:
ground.trim_dates('05-01-2008', '05-08-2008')

In [69]:
ground.df = ground.df.resample('30min').mean()

In [70]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [71]:
test = ground

In [72]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3, smooth=True)
pred = pred.astype(bool)

1 1
2 0.989265139321
3 0.993433452575
4 0.993114549383
5 1.0
1 1
2 0.98930321575
3 0.992724085085
4 1.0
1 1
2 1.00635317907
3 1.04009291648
4 1.0
1 1
2 1.02965882854
3 1.0004895806
4 1.0
1 1
2 1.01187801823
3 1.00608473593
4 1.00474602093
5 1.00296447944
6 1.0
1 1
2 1.00718587231
3 1.0
1 1
2 0.971891204646
3 0.992298998325
4 1.0


In [73]:
vis = visualize.Visualizer()

In [74]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
# vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(pred)]['GHI'], 'ML+PVLib clear only')

In [75]:
vis.show()

In [76]:
vis = visualize.Visualizer()
vis.add_bar(feature_cols, clf.feature_importances_)
vis.show()

In [77]:
len(feature_cols)

74

In [78]:
tree.export_graphviz(clf, 'dt.dot', feature_names=feature_cols, class_names=['cloudy', 'clear'], filled=True)