# Table of Contents
 <p><div class="lev1 toc-item"><a href="#Ground-predictions" data-toc-modified-id="Ground-predictions-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Ground predictions</a></div><div class="lev2 toc-item"><a href="#PVLib-Clearsky" data-toc-modified-id="PVLib-Clearsky-11"><span class="toc-item-num">1.1&nbsp;&nbsp;</span>PVLib Clearsky</a></div>

In [1]:
import pandas as pd
import numpy as np
import os
import datetime
import matplotlib
import matplotlib.pyplot as plt
from mpl_toolkits.axes_grid1 import make_axes_locatable

import pytz
import itertools
import visualize
import utils
import pydotplus

from sklearn import tree
from sklearn import preprocessing
from sklearn import metrics
from sklearn import decomposition
from sklearn import linear_model

import pvlib
import cs_detection
# import visualize
# from bokeh.plotting import output_notebook
# output_notebook()

import visualize_plotly as visualize

from IPython.display import Image

%load_ext autoreload
%autoreload 2

np.set_printoptions(precision=4)
%matplotlib notebook

# Ground predictions

## PVLib Clearsky

Only making ground predictions using PVLib clearsky model and statistical model.  NSRDB model won't be available to ground measurements.

In [2]:
nsrdb = cs_detection.ClearskyDetection.read_pickle('srrl_nsrdb_1.pkl.gz')
nsrdb.df.index = nsrdb.df.index.tz_convert('MST')

In [3]:
nsrdb.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [4]:
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')
ground.df.index = ground.df.index.tz_convert('MST')

We will reduce the frequency of ground based measurements to match NSRDB.

In [5]:
ground.intersection(nsrdb.df.index)

In [6]:
nsrdb.scale_model('GHI', 'Clearsky GHI pvlib', 'sky_status')

In [7]:
nsrdb.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)

In [8]:
ground.calc_all_window_metrics(3, 30, col1='GHI', col2='Clearsky GHI pvlib', overwrite=True)

In [9]:
feature_cols = [
'GHI',
# 'Clearsky GHI pvlib',
'tfn',
'GHI mean',
'GHI std',
'GHI max',
'GHI min',
'GHI range',
# 'Clearsky GHI pvlib mean',
# 'Clearsky GHI pvlib std',
# 'Clearsky GHI pvlib max',
# 'Clearsky GHI pvlib min',
# 'Clearsky GHI pvlib range',
'GHI gradient',
'GHI gradient mean',
'GHI gradient std',
'GHI gradient max',
'GHI gradient min',
'GHI gradient range',
'GHI gradient second',
'GHI gradient second mean',
'GHI gradient second std',
'GHI gradient second max',
'GHI gradient second min',
'GHI gradient second range',
# 'Clearsky GHI pvlib gradient',
# 'Clearsky GHI pvlib gradient mean',
# 'Clearsky GHI pvlib gradient std',
# 'Clearsky GHI pvlib gradient max',
# 'Clearsky GHI pvlib gradient min',
# 'Clearsky GHI pvlib gradient second',
# 'Clearsky GHI pvlib gradient second mean',
# 'Clearsky GHI pvlib gradient second std',
# 'Clearsky GHI pvlib gradient second max',
# 'Clearsky GHI pvlib gradient second min',
'abs_ideal_ratio_diff',
'abs_ideal_ratio_diff mean',
'abs_ideal_ratio_diff std',
'abs_ideal_ratio_diff max',
'abs_ideal_ratio_diff min',
'abs_ideal_ratio_diff range',
'abs_ideal_ratio_diff gradient',
'abs_ideal_ratio_diff gradient mean',
'abs_ideal_ratio_diff gradient std',
'abs_ideal_ratio_diff gradient max',
'abs_ideal_ratio_diff gradient min',
'abs_ideal_ratio_diff gradient range',
'abs_ideal_ratio_diff gradient second',
'abs_ideal_ratio_diff gradient second mean',
'abs_ideal_ratio_diff gradient second std',
'abs_ideal_ratio_diff gradient second max',
'abs_ideal_ratio_diff gradient second min',
'abs_ideal_ratio_diff gradient second range',
'abs_diff',
'abs_diff mean',
'abs_diff std',
'abs_diff max',
'abs_diff min',
'abs_diff range',
'abs_diff gradient',
'abs_diff gradient mean',
'abs_diff gradient std',
'abs_diff gradient max',
'abs_diff gradient min',
'abs_diff gradient range',
'abs_diff gradient second',
'abs_diff gradient second mean',
'abs_diff gradient second std',
'abs_diff gradient second max',
'abs_diff gradient second min',
'abs_diff gradient second range',
'GHI line length',
# 'Clearsky GHI pvlib line length',
'GHI Clearsky GHI pvlib line length ratio']

target_cols = ['sky_status']

In [10]:
ground.trim_dates('10-01-2010', '10-08-2010')

In [11]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [12]:
train = cs_detection.ClearskyDetection(nsrdb.df)
test = cs_detection.ClearskyDetection(ground.df)

In [13]:
from sklearn import ensemble
# clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=.01)
clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.0025, n_estimators=100)

from sklearn import ensemble
# clf = ensemble.RandomForestClassifier(class_weight='balanced', n_estimators=100, max_leaf_nodes=40)  # max_leaf_nodes=30, n_estimators=100)
clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=.0025)
clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.0025, n_estimators=100, max_depth=10)

# clf = ensemble.RandomForestClassifier(class_weight='balanced', min_samples_leaf=.05, n_estimators=100)
# clf = tree.DecisionTreeClassifier(class_weight='balanced', min_samples_leaf=.005, max_depth=10)


In [14]:
# nsrdb.df = nsrdb.df[nsrdb.df['GHI'] > 0]

In [15]:
clf.fit(train.df[feature_cols].values, train.df[target_cols].values.flatten())

RandomForestClassifier(bootstrap=True, class_weight='balanced',
            criterion='gini', max_depth=10, max_features='auto',
            max_leaf_nodes=None, min_impurity_split=1e-07,
            min_samples_leaf=0.0025, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [16]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 3)
pred = pred.astype(bool)

1 1
2 0.933655137563
3 0.990913077208
4 0.99860687245
5 1.0
1 1
2 0.945809059191
3 0.991974048887
4 0.998402227235
5 1.0
1 1
2 0.973590340511
3 0.993835105732
4 1.0
1 1
2 0.978478327058
3 0.989475648551
4 0.9968309732
5 1.0
1 1
2 0.947776570689
3 1.0
1 1
2 2.61803396034
1 1
2 0.952376395017
3 1.00002412393
4 1.0


In [17]:
train.intersection(test.df.index)

In [18]:
cm = metrics.confusion_matrix(train.df['sky_status'].values, pred)
vis = visualize.Visualizer()
vis.plot_confusion_matrix(cm, labels=['cloudy', 'clear'])

In [19]:
metrics.f1_score(train.df['sky_status'].values, pred)

0.83870967741935498

In [20]:
vis = visualize.Visualizer()

In [21]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (~pred)]['GHI'], 'NSRDB clear only')
vis.add_circle_ser(test.df[(train.df['sky_status'] == 1) & (pred)]['GHI'], 'ML+NSRDB clear only')

In [22]:
vis.show()

In [23]:
ground = cs_detection.ClearskyDetection.read_pickle('srrl_ground_1.pkl.gz')

In [24]:
ground.df.index = ground.df.index.tz_convert('MST')

In [25]:
ground.trim_dates('10-01-2010', '10-15-2010')

In [26]:
ground.time_from_solar_noon('Clearsky GHI pvlib', 'tfn')

In [27]:
test = ground

In [28]:
# pred = clf.predict(test.df[feature_cols].values)
pred = test.iter_predict_daily(feature_cols, 'GHI', 'Clearsky GHI pvlib', clf, 10, smooth=True)
pred = pred.astype(bool)

1 1
2 0.938510144344
3 0.985968174926
4 0.999585759627
5 0.999999890834
6 1.0
1 1
2 0.949723520277
3 0.983530200086
4 0.998818549336
5 0.999949069737
6 1.0
1 1
2 0.983611284451
3 0.997185130006
4 0.999917102976
5 1.0
1 1
2 0.983576968787
3 0.98589170069
4 0.989746454695
5 0.994271373924
6 0.998191432741
7 0.998171212334
8 0.998671914078
9 0.999834304853
10 1.00001146978
11 1.0
1 1
2 0.952945858049
3 0.993615734754
4 0.999570522703
5 1.0
1 1
2 1.00399604225
3 1.0083273601
4 1.00616429379
5 1.0
1 1
2 0.961667197464
3 0.992584322592
4 0.998189315834
5 0.999334812644
6 0.999916136551
7 0.999946092776
8 1.0
1 1
2 0.983983554642
3 0.995608924929
4 0.999853572923
5 1.0
1 1
2 0.996794798435
3 0.998198503948
4 1.0
1 1
2 1.00916695225
3 1.00965194618
4 1.0
1 1
2 0.946083933505
3 0.983416685248
4 0.99809137608
5 1.00006644627
6 0.999978484758
7 1.0
1 1
2 1.00215012271
3 1.00524005619
4 1.00575537965
5 1.00920480791
6 1.0126237802
7 1.00790404341
8 1.00559024249
9 1.00103229144
10 1.00023219051
11

In [29]:
pred.value_counts()

False    17099
True      3061
Name: sky_status iter, dtype: int64

In [30]:
vis = visualize.Visualizer()

In [31]:
vis.add_line_ser(test.df['GHI'], 'GHI')
vis.add_line_ser(test.df['Clearsky GHI pvlib'], 'GHI_cs')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 0) & (pred)]['GHI'], 'ML clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (~pred)]['GHI'], 'PVLib clear only')
vis.add_circle_ser(test.df[(test.df['sky_status pvlib'] == 1) & (pred)]['GHI'], 'ML+PVLib clear only')

In [32]:
vis.show()