In [86]:
import pandas as pd
import numpy as np

In [87]:
# Load the merged Newark dataset. The 'Unnamed: 0' column of the CSV holds the
# hourly timestamps and is used as the row index.
newark_df = pd.read_csv(
    './data/newark_merged.csv',
    index_col='Unnamed: 0',
)
newark_df.head()

Unnamed: 0,apparentTemperature,cloudCover,dewPoint,humidity,icon,precipAccumulation,precipIntensity,precipProbability,precipType,pressure,...,Glo Mod (Wh/m^2),Glo Mod Unc (%),Dir Mod (Wh/m^2),Dir Mod Unc (%),Dif Mod (Wh/m^2),Dif Mod Unc (%),clear_sky,yesterday,avg_last_week,last_week_median
2006-01-02 00:00:00,36.81,0.82,31.4,0.81,partly-cloudy-night,,0.0,0.0,,1022.75,...,0,0,0,0,0,0,0.0,0.0,,0.0
2006-01-02 01:00:00,35.79,0.22,30.8,0.82,clear-night,,0.0,0.0,,1022.8,...,0,0,0,0,0,0,0.0,0.0,,0.0
2006-01-02 02:00:00,35.67,0.32,30.43,0.81,partly-cloudy-night,,0.0,0.0,,1023.57,...,0,0,0,0,0,0,0.0,0.0,,0.0
2006-01-02 03:00:00,35.49,0.5,30.64,0.82,partly-cloudy-night,,0.0,0.0,,1023.57,...,0,0,0,0,0,0,0.0,0.0,,0.0
2006-01-02 04:00:00,35.66,0.88,31.25,0.84,partly-cloudy-night,,0.0,0.0,,1023.31,...,0,0,0,0,0,0,0.0,0.0,,0.0


In [88]:
# Show the complete column list (the head() preview elides the middle columns with "...").
newark_df.columns

Index(['apparentTemperature', 'cloudCover', 'dewPoint', 'humidity', 'icon',
       'precipAccumulation', 'precipIntensity', 'precipProbability',
       'precipType', 'pressure', 'summary', 'temperature', 'uvIndex',
       'visibility', 'windBearing', 'windGust', 'windSpeed', 'ETR (Wh/m^2)',
       'ETRN (Wh/m^2)', 'Glo Mod (Wh/m^2)', 'Glo Mod Unc (%)',
       'Dir Mod (Wh/m^2)', 'Dir Mod Unc (%)', 'Dif Mod (Wh/m^2)',
       'Dif Mod Unc (%)', 'clear_sky', 'yesterday', 'avg_last_week',
       'last_week_median'],
      dtype='object')

In [89]:
# Remove the columns that are not used as model features.
# In the head() preview, precipAccumulation / precipType are NaN and icon is a
# string column. The Dir Mod / Dif Mod columns look numeric in that preview, so
# they are presumably dropped for a different reason (e.g. to avoid leaking the
# Glo Mod target) -- TODO confirm.
unused_columns = [
    "precipAccumulation", "precipType", "windGust", "icon", "summary",
    "ETR (Wh/m^2)", "ETRN (Wh/m^2)", "Dir Mod (Wh/m^2)", "Dif Mod (Wh/m^2)",
]
newark_df = newark_df.drop(columns=unused_columns)

In [90]:
# Count missing values per column. The expectation is that avg_last_week is
# the only column that still contains NaNs.
newark_df.isna().sum()

apparentTemperature      0
cloudCover               0
dewPoint                 0
humidity                 0
precipIntensity          0
precipProbability        0
pressure                 0
temperature              0
uvIndex                  0
visibility               0
windBearing              0
windSpeed                0
Glo Mod (Wh/m^2)         0
Glo Mod Unc (%)          0
Dir Mod Unc (%)          0
Dif Mod Unc (%)          0
clear_sky                0
yesterday                0
avg_last_week          145
last_week_median         0
dtype: int64

In [91]:
# Preview the frame again now that the unused columns have been dropped.
newark_df.head()

Unnamed: 0,apparentTemperature,cloudCover,dewPoint,humidity,precipIntensity,precipProbability,pressure,temperature,uvIndex,visibility,windBearing,windSpeed,Glo Mod (Wh/m^2),Glo Mod Unc (%),Dir Mod Unc (%),Dif Mod Unc (%),clear_sky,yesterday,avg_last_week,last_week_median
2006-01-02 00:00:00,36.81,0.82,31.4,0.81,0.0,0.0,1022.75,36.81,0,10.0,246,1.75,0,0,0,0,0.0,0.0,,0.0
2006-01-02 01:00:00,35.79,0.22,30.8,0.82,0.0,0.0,1022.8,35.79,0,10.0,234,1.86,0,0,0,0,0.0,0.0,,0.0
2006-01-02 02:00:00,35.67,0.32,30.43,0.81,0.0,0.0,1023.57,35.67,0,10.0,216,1.55,0,0,0,0,0.0,0.0,,0.0
2006-01-02 03:00:00,35.49,0.5,30.64,0.82,0.0,0.0,1023.57,35.49,0,9.74,225,0.81,0,0,0,0,0.0,0.0,,0.0
2006-01-02 04:00:00,35.66,0.88,31.25,0.84,0.0,0.0,1023.31,35.66,0,9.68,216,0.63,0,0,0,0,0.0,0.0,,0.0


In [92]:
from sklearn.model_selection import train_test_split

# Discard the rows that still contain NaNs before building the model inputs.
newark_df = newark_df.dropna()

# Target: modelled global irradiance. It is kept as a single-column DataFrame;
# every remaining column is used as a feature.
target_col = "Glo Mod (Wh/m^2)"
y = newark_df[[target_col]]
X = newark_df.drop(columns=target_col)

# Step 1: hold out 20% of the rows as the test set. X / y are rebound to the
# remaining 80%.
X, X_test, y, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
# Step 2: carve 10% of that remainder off as the validation set.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.1, random_state=1)

In [93]:
from sklearn.ensemble import RandomForestRegressor

# Baseline random forest with default hyperparameters.
# random_state is fixed so the scores printed below are reproducible.
rfr = RandomForestRegressor(random_state=1)
# y_train is a single-column DataFrame; flatten it to 1-D so scikit-learn does
# not emit a DataConversionWarning about a column-vector y.
rfr.fit(X_train, np.ravel(y_train))

# For regressors, score() returns the coefficient of determination (R^2).
rfr_train_score = rfr.score(X_train, y_train)
rfr_val_score = rfr.score(X_val, y_val)

print('Train Score: ', rfr_train_score)
print('Validation Score: ', rfr_val_score)



Train Score:  0.9851148673876251
Validation Score:  0.9071328806981406


To see which features drive the top-level (most influential) splits, let's restrict the depth of the decision trees and visualize a single tree with graphviz.

In [95]:
# Shallow forest used only for visualisation: max_depth=4 keeps each tree small
# enough to read. random_state is fixed so that the tree extracted below (and
# therefore the exported graph) is the same on every run.
rfr_limited_depth = RandomForestRegressor(max_depth=4, random_state=1)
# Flatten the single-column target to 1-D to avoid scikit-learn's
# column-vector DataConversionWarning.
rfr_limited_depth.fit(X_train, np.ravel(y_train))

# Extract a single fitted tree from the ensemble (index 5 is an arbitrary choice).
estimator = rfr_limited_depth.estimators_[5]

  from ipykernel import kernelapp as app


Luckily, we can copy the DOT source and visualize the tree on Webgraphviz (http://www.webgraphviz.com/). Running the following cell prints a fairly long output -- follow the link, then copy and paste that output into the text box to get a visualization of the decision tree we fit!

In [96]:
import graphviz
from sklearn import tree

# With out_file=None, export_graphviz returns the DOT source as a string
# instead of writing a file. feature_names is passed as a plain list (some
# scikit-learn releases reject a pandas Index here) and is taken from X_train,
# the frame the tree was actually fit on.
dot_source = tree.export_graphviz(
    estimator,
    feature_names=list(X_train.columns),
    out_file=None,
)
print(dot_source)

digraph Tree {
node [shape=box] ;
0 [label="uvIndex <= 1.5\nmse = 61656.694\nsamples = 19781\nvalue = 164.312"] ;
1 [label="avg_last_week <= 100.857\nmse = 8452.649\nsamples = 14349\nvalue = 43.274"] ;
0 -> 1 [labeldistance=2.5, labelangle=45, headlabel="True"] ;
2 [label="avg_last_week <= 32.143\nmse = 484.852\nsamples = 11642\nvalue = 6.927"] ;
1 -> 2 ;
3 [label="avg_last_week <= 8.214\nmse = 27.558\nsamples = 10660\nvalue = 1.312"] ;
2 -> 3 ;
4 [label="mse = 1.737\nsamples = 10104\nvalue = 0.309"] ;
3 -> 4 ;
5 [label="mse = 143.58\nsamples = 556\nvalue = 19.759"] ;
3 -> 5 ;
6 [label="avg_last_week <= 67.214\nmse = 1418.69\nsamples = 982\nvalue = 67.272"] ;
2 -> 6 ;
7 [label="mse = 655.153\nsamples = 542\nvalue = 51.905"] ;
6 -> 7 ;
8 [label="mse = 1708.162\nsamples = 440\nvalue = 86.674"] ;
6 -> 8 ;
9 [label="cloudCover <= 0.855\nmse = 12518.63\nsamples = 2707\nvalue = 200.236"] ;
1 -> 9 ;
10 [label="clear_sky <= 738.733\nmse = 10689.959\nsamples = 1873\nvalue = 237.702"] ;
9 -> 10 

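In this tree, uvIndex, avg_last_week, cloudCover, and clear_sky are the features chosen for the top splits, which suggests they are among the most important features. Note that this is a single depth-limited tree out of the forest; the fitted forest's `feature_importances_` attribute gives a more reliable ranking.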

In [103]:
# Flatten the single-column target frame into a 1-D array.
np.ravel(y_train)

array([  0, 594, 269, ..., 182,   0,   0])

In [None]:
from sklearn.model_selection import RandomizedSearchCV

# --- Search space -----------------------------------------------------------
# Number of trees in the forest: 1, 26, 51, 76, 101
n_estimators = [int(x) for x in np.linspace(start=1, stop=101, num=5)]
# Number of features to consider at every split. 1.0 means "all features",
# which is what 'auto' meant for regressors; 'auto' itself has been removed
# from recent scikit-learn releases.
max_features = [1.0, 'sqrt']
# Maximum number of levels in each tree (None = no depth limit)
max_depth = list(range(1, 10)) + [None]
# Whether each tree is trained on a bootstrap sample of the training rows
bootstrap = [True, False]

# min_samples_split and min_samples_leaf are left at their defaults.
param_dict = {
    'n_estimators': n_estimators,
    'max_features': max_features,
    'max_depth': max_depth,
    'bootstrap': bootstrap,
}

# --- Randomized search ------------------------------------------------------
# Samples 50 of the 200 possible combinations, each scored with 10-fold CV.
# A fresh, seeded base estimator is used so the search does not depend on
# state left over from earlier cells and is reproducible.
rf_random = RandomizedSearchCV(
    estimator=RandomForestRegressor(random_state=1),
    param_distributions=param_dict,
    n_iter=50,
    cv=10,
    random_state=42,
)
# Fit the random search model (the target is flattened to 1-D for scikit-learn).
rf_random.fit(X_train, np.ravel(y_train))

print(rf_random.best_params_)

In [None]:
# Refit a forest with the best hyperparameters found by the randomized search
# (rf_random.best_params_ holds n_estimators, max_features, max_depth and
# bootstrap). random_state is fixed so the scores are reproducible.
rfr_tree = RandomForestRegressor(random_state=1, **rf_random.best_params_)
# Flatten the single-column target to 1-D to avoid scikit-learn's
# column-vector DataConversionWarning.
rfr_tree.fit(X_train, np.ravel(y_train))

# For regressors, score() returns the coefficient of determination (R^2).
rfr_train_score = rfr_tree.score(X_train, y_train)
rfr_val_score = rfr_tree.score(X_val, y_val)

print('Train Score: ', rfr_train_score)
print('Validation Score: ', rfr_val_score)