# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Ground-predictions" data-toc-modified-id="Ground-predictions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Ground predictions</a></div><div class="lev2 toc-item"><a href="#PVLib-Clearsky" data-toc-modified-id="PVLib-Clearsky-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>PVLib Clearsky</a></div>

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable
from sklearn import tree

import pytz
import itertools
import visualize
import utils
import pydotplus

from sklearn import metrics

import pvlib
import cs_detection
# import visualize
# from bokeh.plotting import output_notebook
# output_notebook()

import visualize_plotly as visualize

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

# Ground predictions

## PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model.  NSRDB model won't be available to ground measurements.

In [2]:
nsrdb = cs_detection.ClearskyDetection.read_pickle('abq_nsrdb_1.pkl.gz')

In [3]:
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')

In [4]:
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [5]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')

In [6]:
ground.df.index = ground.df.index.tz_convert('MST')

We will reduce the frequency of ground based measurements to match NSRDB.

In [7]:
ground.intersection(nsrdb.df.index)

In [8]:
nsrdb.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

In [9]:
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', 
                              ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)

In [10]:
ground.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', 
                               ratio_label='ratio', abs_ratio_diff_label='abs_diff_ratio', overwrite=True)

In [11]:
feature_cols = ['GHI', 'Clearsky GHI pvlib', 'tfn',
                'abs_diff', 'abs_diff mean', 'abs_diff max', 'abs_diff min',
                'abs_diff gradient', 'abs_diff gradient mean', 'abs_diff gradient max', 'abs_diff gradient min',
                'abs_diff_ratio', 
                'GHI mean', 'GHI std', 'GHI max', 'GHI min',
                'Clearsky GHI pvlib mean', 'Clearsky GHI pvlib std',
                'Clearsky GHI pvlib max', 'Clearsky GHI pvlib min',
                'abs_diff_ratio mean', 'abs_diff_ratio std', 'abs_diff_ratio max',
                'abs_diff_ratio min',  
                'GHI gradient',
                'GHI gradient mean', 'GHI gradient std', 'GHI gradient max',
                'GHI gradient min',  
                'Clearsky GHI pvlib gradient',
                'Clearsky GHI pvlib gradient mean', 'Clearsky GHI pvlib gradient std',
                'Clearsky GHI pvlib gradient max', 'Clearsky GHI pvlib gradient min',
                'abs_diff_ratio gradient',
                'abs_diff_ratio gradient mean', 'abs_diff_ratio gradient std',
                'abs_diff_ratio gradient max', 'abs_diff_ratio gradient min',
                'Clearsky GHI pvlib line length', 'GHI line length', 
                'abs_ratio_diff line length', 
                'GHI gradient second', 'Clearsky GHI pvlib gradient second', 'abs_diff_ratio gradient second', 'abs_diff gradient',
                'GHI gradient second mean', 'GHI gradient second std', 'GHI gradient second max', 'GHI gradient second min',
                'Clearsky GHI pvlib gradient second mean', 'Clearsky GHI pvlib gradient second std', 
                'Clearsky GHI pvlib gradient second max', 'Clearsky GHI pvlib gradient second min',
                'abs_diff_ratio gradient second mean', 'abs_diff_ratio gradient second std', 
                'abs_diff_ratio gradient second max', 'abs_diff_ratio gradient second min',
                'abs_diff gradient second mean', 'abs_diff gradient second std', 
                'abs_diff gradient second max', 'abs_diff gradient second min',
                'GHI diff', 'Clearsky GHI pvlib diff', 'GHI Clearsky GHI pvlib ratio diff']
target_cols = ['sky_status']

In [12]:
ground.trim_dates('10-01-2015', '10-08-2015')

In [13]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [14]:
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(ground.df)

In [15]:
# clf = tree.DecisionTreeClassifier(class_weight='balanced', max_leaf_nodes=30, max_depth=8)
from sklearn import ensemble
# clf = ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=100, max_leaf_nodes=40)  # max_leaf_nodes=30, n_estimators=100)
# clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=.005)

# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.005, n_estimators=100)

clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=.005, max_depth=10)

# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.15, max_leaf_nodes=20, n_estimators=100)#, max_leaf_nodes=30)#, max_leaf_nodes=22, max_depth=5, n_estimators=100)
# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.1, min_samples_split=.1, n_estimators=100)#, max_leaf_nodes=30)#, max_leaf_nodes=22, max_depth=5, n_estimators=100)

# clf = ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=100, max_depth=10, min_samples_leaf=4, n_jobs=-1)
# clf = ensemble.RandomForestClassifier(class_weight={0: .05, 1: .95}, max_leaf_nodes=20, n_estimators=100)

# from sklearn import linear_model
# clf = linear_model.SGDClassifier(loss='log', alpha=100)

In [16]:
# nsrdb.df = nsrdb.df[nsrdb.df['GHI'] > 0]

In [17]:
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())

DecisionTreeClassifier(class_weight='balanced', criterion='gini',
            max_depth=10, max_features=None, max_leaf_nodes=None,
            min_impurity_split=1e-07, min_samples_leaf=0.005,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')

In [18]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3)
pred = pred.astype(bool)

1 1
2 0.95307486755
3 0.999866427061
4 1.0
1 1
2 1.00353805392
3 1.0
1 1
2 0.957658158725
3 1.0
1 1
2 2.61803396034
1 1
2 2.61803396034
1 1
2 1.03171094208
3 1.00301278624
4 1.0
1 1
2 0.642748716928
3 1.02968627534
4 1.0


In [19]:
train.intersection(test.df.index)

In [20]:
cm = metrics.confusion_matrix(train.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

In [21]:
metrics.accuracy_score(train.df['sky_status'].values, pred)

0.91666666666666663

In [22]:
vis = visualize.Visualizer()

In [23]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')

In [24]:
vis.show()

In [25]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')

In [26]:
ground = cs_detection.ClearskyDetection.read_pickle('abq_ground_1.pkl.gz')

In [27]:
ground.df.index = ground.df.index.tz_convert('MST')

In [28]:
ground.trim_dates('10-01-2015', '10-08-2015')

In [29]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [30]:
test = ground

In [31]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)

1 1
2 0.952917100679
3 0.996854529396
4 0.999656852155
5 0.999903609161
6 1.0
1 1
2 1.01094448436
3 1.00755545206
4 1.00668905442
5 1.00454329122
6 1.00165761057
7 1.00119000532
8 1.00060149503
9 1.0
1 1
2 0.972317720472
3 0.988672227119
4 0.998128020519
5 0.999401958981
6 1.0
1 1
2 0.987693317143
3 0.999185237548
4 1.0
1 1
2 0.991114037478
3 0.996647292074
4 1.0
1 1
2 1.00931003492
3 1.00559735
4 1.0041009485
5 1.00656138108
6 1.01018827237
7 1.01140392017
8 1.00793956372
9 1.00449086464
10 1.00174663082
11 1.00122466466
12 1.00177942702
13 1.0006008184
14 1.0
1 1
2 0.985317577843
3 0.989667860524
4 0.993465413722
5 1.0


In [32]:
vis = visualize.Visualizer()

In [33]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')

In [34]:
vis.show()

In [35]:
tree.export_graphviz(clf, 'clf2.dot', feature_names=feature_cols, class_names=['cloudy', 'clear'], filled=True)