# Test various models for performance on a six-month sample

## Determine all of the parameters

### Specify the locations of the files for X_train, X_test, and y_train. Also, there is a file that contains information about the individual stations that can be useful for models that learn for each station.

In [1]:
# Step one directory up so that the relative 'solar/...' data paths and the
# `solar` package imports used by the later cells resolve.
# NOTE(review): presumably the notebook is stored one level below the project
# root -- confirm. Re-running this cell moves up another level each time.
import os
os.chdir('..')

In [2]:
# Locations of the Kaggle solar inputs, relative to the working directory.
# Both directory paths end with a trailing slash so they are spelled the same
# way and stay safe if a consumer joins them to a file name by plain string
# concatenation.
# NOTE(review): the loader that consumes these paths is not visible here --
# confirm it does not depend on the slash-less spelling of the test directory.
Xtrain_dir = 'solar/data/kaggle_solar/train/'
Xtest_dir = 'solar/data/kaggle_solar/test/'
ytrain_file = 'solar/data/kaggle_solar/train.csv'
station_file = 'solar/data/kaggle_solar/station_info.csv'

### Import the various files. This is mostly done so that any file updates during testing are carried over to this notebook.

In [3]:
import solar.wrangle.wrangle
import solar.wrangle.subset
import solar.wrangle.engineer
import solar.analyze.model
import solar.report.submission
import numpy as np

### Use all of the variables but only over six months to reduce processing time. The test data is not important for this because nothing will be submitted.

In [4]:
# Stations to include (up to 98). Leaving the station unspecified selects every station that falls
# inside the given lats and longs; the special value 'all' selects every station regardless of them.
station = ['all']

# Training window. With no dates given, the entire set from 1994-01-01 until 2007-12-31 is used
# (i.e. ['1994-01-01', '2007-12-31']); here it is cut down to six months.
train_dates = ['2000-01-01', '2000-06-30']

# Window of test X values to produce. Anything short of the full range (2008-01-01 through
# 2012-11-30, which is what omitting the dates gives) has no practical purpose other than testing.
test_dates = ['2008-01-01', '2008-01-05']

# The last parameter that is not specifically involved in feature selection is the layout to be
# used for training.
#   True  -> every station gets its own row (5113 dates X 98 stations of training rows).
#   False -> one row per date (5113 training rows).
# Training is now almost always done per individual station. The switch is kept because the
# benchmark is still not being beaten, and laying the benchmark out with a row for each station
# would make the file grow too large.
station_layout = True

### First, just duplicate the functionality of the basic grid analysis

In [5]:
# Pull in every variable; a narrower selection would look like ['uswrf_sfc', 'dswrf_sfc'].
var = ['all']

# Keep models 0 (the default model) and 7, each as a column for each of the variables
# (aggregated over the other dimensions).
# NOTE: later cells rebind the name `model` to a fitted model, so re-run this cell before
# rebuilding the grid.
model = [0, 7]

# Aggregate over all times.
times = ['all']

# A single 'relative' grid feature described by the axes chosen above.
default_grid = {
    'type': 'relative',
    'axes': {
        'var': var,
        'models': model,
        'times': times,
        'station': station,
    },
}
just_grid = [default_grid]


### Run data extraction

In [6]:
# Feature spec that adds the station names themselves as one more feature.
stat_names = dict(type='station_names')
stat_feats = [stat_names]

In [7]:
# Feature spec of type 'frac_dist', plus a one-element list wrapping it.
frac_dist = dict(type='frac_dist')
dist_feats = [frac_dist]

In [8]:
# Calendar feature specs: days from the solstice and days from the coldest day.
days_solstice = dict(type='days_from_solstice')
days_cold = dict(type='days_from_coldest')
days_feats = [days_solstice, days_cold]

In [9]:
# Every feature spec defined in the preceding cells, in the order they are handed to the loader.
all_feats = [
    frac_dist,
    days_cold,
    days_solstice,
    default_grid,
    stat_names,
]

In [10]:
# Reload the wrangle modules first so that edits made to their source while
# this kernel is running are picked up (`reload` is the Python 2 builtin).
# Then build the data set under the IPython %prun profiler, passing the paths,
# date windows, station selection, layout switch and feature list defined in
# the earlier cells.
# NOTE: the trailing backslashes keep the %prun statement on one logical line.
reload(solar.wrangle.wrangle)
reload(solar.wrangle.subset)
reload(solar.wrangle.engineer)
from solar.wrangle.wrangle import SolarData

%prun all_feats_data = SolarData.load(Xtrain_dir, ytrain_file, Xtest_dir, station_file, \
                                  train_dates, test_dates, station, \
                                  station_layout, all_feats)

INFO:solar.wrangle.wrangle:Started building test and training data
INFO:solar.wrangle.wrangle:Features: [{'type': 'frac_dist'}, {'type': 'days_from_coldest'}, {'type': 'days_from_solstice'}, {'axes': {'var': ['all'], 'models': [0, 7], 'station': ['all'], 'times': ['all']}, 'type': 'relative'}, {'type': 'station_names'}]
INFO:solar.wrangle.wrangle:Train dates: ['2000-01-01', '2000-06-30']
INFO:solar.wrangle.wrangle:Test dates: ['2008-01-01', '2008-01-05']
INFO:solar.wrangle.wrangle:Stations: ['all']
INFO:solar.wrangle.wrangle:Finished building test and training data


 

In [11]:
# LinearRegression on the pickled input 'input_2016-02-15-10-30-00.p', with
# fit_intercept offered as both True and False, cv_splits=10 and the
# 'mean_absolute_error' error formula; profiled with %prun.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
reload(solar.analyze.model)
from solar.analyze.model import Model

from sklearn.linear_model import LinearRegression

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', LinearRegression, {'fit_intercept': [True, False]}, cv_splits, \
                    error_formula, njobs, write)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [12]:
# Same LinearRegression setup as the previous cell, but run against the
# pickled input 'input_2016-02-20-13-12-51.p'.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
reload(solar.analyze.model)
from solar.analyze.model import Model

from sklearn.linear_model import LinearRegression

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', LinearRegression, {'fit_intercept': [True, False]}, cv_splits, \
                    error_formula, njobs, write)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [13]:
# Ridge on the pickled input 'input_2016-02-15-10-30-00.p': alpha takes 10
# log-spaced values from 1 to 100, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (normalize, random_state) are presumably
# forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Ridge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', Ridge, {'alpha':np.logspace(0,2,10,base=10)}, cv_splits, \
                    error_formula, njobs, write, normalize=True, random_state=1)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [15]:
# Ridge on the pickled input 'input_2016-02-20-13-12-51.p': alpha takes 10
# log-spaced values from 0.001 to 10, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (normalize, random_state) are presumably
# forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Ridge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', Ridge, {'alpha':np.logspace(-3,1,10,base=10)}, cv_splits, \
                    error_formula, njobs, write, normalize=True, random_state=1)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [None]:
# Lasso on the pickled input 'input_2016-02-20-13-12-51.p': alpha takes 10
# log-spaced values from 0.01 to 100, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (normalize, random_state, selection) are
# presumably forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Lasso
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', Lasso, {'alpha':np.logspace(-2,2,10,base=10)}, cv_splits, \
                    error_formula, njobs, write, normalize=True, random_state=1, selection='random')

In [None]:
# Lars on the pickled input 'input_2016-02-20-13-12-51.p': n_nonzero_coefs
# takes the values 1 through 10, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing normalize=True is presumably forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Lars
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', Lars, {'n_nonzero_coefs': range(1,11)}, cv_splits, \
                    error_formula, njobs, write, normalize=True)

In [None]:
# LassoLars on the pickled input 'input_2016-02-20-13-12-51.p': alpha takes 10
# log-spaced values from 1e3 to 1e4, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing normalize=True is presumably forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import LassoLars
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', LassoLars, {'alpha':np.logspace(3,4,10,base=10)}, cv_splits, \
                    error_formula, njobs, write, normalize=True)

In [None]:
# OrthogonalMatchingPursuit on the pickled input 'input_2016-02-15-10-30-00.p':
# n_nonzero_coefs takes the integer parts of 10 log-spaced values from 1 to 100.
# NOTE(review): int() truncation maps the first two points (1.0 and ~1.67) to
# 1, so the value 1 appears twice in the list -- consider de-duplicating.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing normalize=True is presumably forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import OrthogonalMatchingPursuit
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', OrthogonalMatchingPursuit, {'n_nonzero_coefs':[int(x) for x in np.logspace(0,2,10,base=10)]}, cv_splits, \
                    error_formula, njobs, write, normalize=True)

In [None]:
# BayesianRidge on the pickled input 'input_2016-02-15-10-30-00.p': alpha_1
# takes 5 log-spaced values from 1e-8 to 1e-4; alpha_2, lambda_1 and lambda_2
# each get the single value 1e-6 (np.logspace with num=1 returns only the
# start point, so the -7 endpoint is never used).
# This cell does not import numpy itself; `np` comes from an earlier cell.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing verbose=True is presumably forwarded to the estimator -- confirm.
from solar.analyze.model import Model

from sklearn.linear_model import BayesianRidge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', BayesianRidge, {'alpha_1': np.logspace(-8,-4,5), \
                                                                                 'alpha_2': np.logspace(-6,-7,1), \
                                                                                 'lambda_1': np.logspace(-6,-7,1), \
                                                                                 'lambda_2': np.logspace(-6,-7,1)}, \
                                      cv_splits, error_formula, njobs, write, verbose=True)

In [None]:
# ARDRegression on the pickled input 'input_2016-02-15-10-30-00.p': alpha_1
# takes 5 log-spaced values from 1e-8 to 1e-4; alpha_2, lambda_1 and lambda_2
# each get the single value 1e-6 (np.logspace with num=1 returns only the
# start point, so the -7 endpoint is never used).
# This cell does not import numpy itself; `np` comes from an earlier cell.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing verbose=True is presumably forwarded to the estimator -- confirm.
from solar.analyze.model import Model

from sklearn.linear_model import ARDRegression
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', ARDRegression, {'alpha_1': np.logspace(-8,-4,5), \
                                                                                 'alpha_2': np.logspace(-6,-7,1), \
                                                                                 'lambda_1': np.logspace(-6,-7,1), \
                                                                                 'lambda_2': np.logspace(-6,-7,1)}, \
                                      cv_splits, error_formula, njobs, write, verbose=True)

In [17]:
import cPickle as pickle

In [18]:
# Load the pickled input once so the quick single-estimator cells can call
# fit() on it directly; they pass test_data[0] and test_data[1] to fit().
# The context manager closes the file handle as soon as the load finishes
# instead of leaving it open for the lifetime of the kernel.
# NOTE: unpickling can execute arbitrary code -- only load files that this
# project wrote itself.
with open('solar/data/kaggle_solar/inputs/input_2016-02-15-10-30-00.p', 'rb') as input_file:
    test_data = pickle.load(input_file)

In [None]:
# Quick profiled fit of a single KernelRidge (rbf kernel) on the data loaded
# into test_data, without going through Model.model_from_pickle.
# NOTE(review): several other quick-fit cells pass np.ravel(test_data[1]);
# confirm whether the target should be flattened here as well.
from sklearn.kernel_ridge import KernelRidge
model = KernelRidge(kernel='rbf')
%prun model.fit(test_data[0],test_data[1])

In [None]:
# KernelRidge on the pickled input 'input_2016-02-15-10-30-00.p', restricted
# to the 'rbf' kernel with gamma=0.1 and cv_splits=3.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing gamma=0.1 is presumably forwarded to the estimator -- confirm.
from solar.analyze.model import Model

from sklearn.kernel_ridge import KernelRidge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 3
njobs = 1
write = 'local'
# Disabled variant that would try several kernels.
# NOTE(review): 'RBF' (upper case) and 'exponential' do not look like valid
# scikit-learn kernel names -- verify before re-enabling.
#%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', KernelRidge, { \
#        'kernel': ['linear', 'RBF', 'laplacian', 'polynomial', 'exponential', 'chi2', 'sigmoid']}, \
#                                      cv_splits, error_formula, njobs, write)
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', KernelRidge, { \
        'kernel': ['rbf']}, \
                                      cv_splits, error_formula, njobs, write, gamma=0.1)

In [None]:
from sklearn.svm import SVR
model = SVR()
%prun model.fit(test_data[0],test_data[1])

In [None]:
# SVR on the pickled input 'input_2016-02-15-10-30-00.p' with cv_splits=3.
# The only entry in the dict is verbose=[True], so a single configuration is
# offered.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
from solar.analyze.model import Model

from sklearn.svm import SVR
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 3
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', SVR, {'verbose': [True]}, \
                                      cv_splits, error_formula, njobs, write)

In [None]:
from sklearn.svm import NuSVR
model = NuSVR()
%prun model.fit(test_data[0],test_data[1])

In [None]:
# NuSVR on the pickled input 'input_2016-02-15-10-30-00.p' with cv_splits=3.
# The only entry in the dict is verbose=[True], so a single configuration is
# offered.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
from solar.analyze.model import Model

from sklearn.svm import NuSVR
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 3
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', NuSVR, {'verbose': [True]}, \
                                      cv_splits, error_formula, njobs, write)

In [None]:
# Quick profiled fit of a default DecisionTreeRegressor on the data loaded
# into test_data, without going through Model.model_from_pickle.
# NOTE(review): several other quick-fit cells pass np.ravel(test_data[1]);
# confirm whether the target should be flattened here as well.
from sklearn.tree import DecisionTreeRegressor
model = DecisionTreeRegressor()
%prun model.fit(test_data[0],test_data[1])

In [None]:
# DecisionTreeRegressor on the pickled input 'input_2016-02-15-10-30-00.p':
# splitter in {'best', 'random'} crossed with max_features in
# {'auto', 'sqrt', 'log2'}, with cv_splits=10.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
from solar.analyze.model import Model

from sklearn.tree import DecisionTreeRegressor
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10
njobs = 1
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', DecisionTreeRegressor, {'splitter': ['best', 'random'], 'max_features': ['auto', 'sqrt', 'log2']}, \
                                      cv_splits, error_formula, njobs, write)

In [None]:
# Quick profiled fit of a default RandomForestRegressor on the data loaded
# into test_data; the target is flattened to 1-D with np.ravel before fitting.
from sklearn.ensemble import RandomForestRegressor
model = RandomForestRegressor()
%prun model.fit(test_data[0],np.ravel(test_data[1]))

In [16]:
# RandomForestRegressor on the pickled input 'input_2016-02-20-13-12-51.p':
# n_estimators takes the values 1, 6, 11, 16, 21 and 26, with cv_splits=3 and
# njobs=3.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (random_state, verbose) are presumably forwarded
# to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.ensemble import RandomForestRegressor
from sklearn import metrics
cv_splits = 3

error_formula = 'mean_absolute_error'
njobs = 3
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', RandomForestRegressor, \
                                      {'n_estimators':range(1,31,5)}, \
                                      cv_splits, error_formula, njobs, write,\
                                      random_state=0, verbose=True)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
                                  (u'values', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [None]:
from sklearn.ensemble import ExtraTreesRegressor
model = ExtraTreesRegressor()
%prun model.fit(test_data[0], test_data[1])

In [None]:
# ExtraTreesRegressor on the pickled input 'input_2016-02-15-10-30-00.p' with
# a single n_estimators value (50), cv_splits=3 and njobs=3.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (random_state, verbose) are presumably forwarded
# to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.ensemble import ExtraTreesRegressor
from sklearn import metrics
cv_splits = 3

error_formula = 'mean_absolute_error'
njobs = 3
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', ExtraTreesRegressor, \
                                      {'n_estimators':[50]}, \
                                      cv_splits, error_formula, njobs, write,\
                                      random_state=0, verbose=True)

In [None]:
# Quick profiled fit of a default AdaBoostRegressor on the data loaded into
# test_data; the target is flattened to 1-D with np.ravel before fitting.
from sklearn.ensemble import AdaBoostRegressor
model = AdaBoostRegressor()
%prun model.fit(test_data[0], np.ravel(test_data[1]))

In [None]:
# AdaBoostRegressor on the pickled input 'input_2016-02-15-10-30-00.p':
# n_estimators takes the integer parts of 5 log-spaced values between 10**0
# and 10**2.6, with cv_splits=3 and njobs=4.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# trailing keyword arguments (random_state, learning_rate) are presumably
# forwarded to the estimator -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.ensemble import AdaBoostRegressor
from sklearn import metrics
cv_splits = 3

error_formula = 'mean_absolute_error'
njobs = 4
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', AdaBoostRegressor, \
                                      {'n_estimators':[int(x) for x in np.logspace(0,2.6,5)]}, \
                                      cv_splits, error_formula, njobs, write,\
                                      random_state=0, learning_rate=0.1)

In [None]:
# Quick profiled fit of a GradientBoostingRegressor (verbose=10,
# learning_rate=0.1) on the data loaded into test_data; the target is
# flattened to 1-D with np.ravel before fitting.
from sklearn.ensemble import GradientBoostingRegressor
model = GradientBoostingRegressor(verbose=10, learning_rate=0.1)
%prun model.fit(test_data[0],np.ravel(test_data[1]))

In [13]:
# GradientBoostingRegressor on the pickled input 'input_2016-02-20-13-12-51.p':
# n_estimators takes the integer parts of 5 log-spaced values between 10**0
# and 10**2.6, with cv_splits=3 and njobs=4. The recorded log shows the extra
# keyword arguments (random_state, learning_rate, verbose, loss) on the best
# estimator.
# NOTE(review): the recorded output for this cell lists 'relative' columns,
# which the other cells show for the 02-15 input rather than the 02-20 one --
# the output may be stale; re-run to confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
cv_splits = 3

error_formula = 'mean_absolute_error'
njobs = 4
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-20-13-12-51.p', GradientBoostingRegressor, \
                                      {'n_estimators':[int(x) for x in np.logspace(0,2.6,5)]}, \
                                      cv_splits, error_formula, njobs, write, \
                                      random_state=0, learning_rate=.1, verbose=10, loss='lad')

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

      Iter       Train Loss   Remaining Time 
         1     6378948.4306           38.66s      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
      Iter       Train Loss   Remaining Time 
         1     6513357.4632            0.00s         1     5463694.0984            0.00s         1     6049682.9194            0.00s         1     6513357.4632            6.90s



      Iter       Train Loss   Remaining Time       Iter       Train Loss   Remaining Time       Iter       Train Loss   Remaining Time          2     6320867.5244            4.20s



         1     5463694.0984            5.56s         1     6049682.9194            5.60s         1     6513357.4632           34.12s         3     6179241.2006            2.01s



         2     5259301.2543            3.60s         2     5796951.6048            3.63s         2     6320867.5244           32.14s         4     6018093.9227            0.00s



   

INFO:solar.analyze.model:Best score: -6698815.16575
INFO:solar.analyze.model:Best params: {'n_estimators': 19}
INFO:solar.analyze.model:Best estimator: GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='lad',
             max_depth=3, max_features=None, max_leaf_nodes=None,
             min_samples_leaf=1, min_samples_split=2,
             min_weight_fraction_leaf=0.0, n_estimators=19, presort='auto',
             random_state=0, subsample=1.0, verbose=10, warm_start=False)



 

In [19]:
# Quick profiled fit of a KNeighborsRegressor (10 neighbours, uniform weights)
# on the data loaded into test_data; the target is flattened to 1-D with
# np.ravel before fitting.
from sklearn.neighbors import KNeighborsRegressor
model = KNeighborsRegressor(n_neighbors=10, weights='uniform')
%prun model.fit(test_data[0],np.ravel(test_data[1]))

 

In [22]:
# KNeighborsRegressor on the pickled input 'input_2016-02-15-10-30-00.p':
# n_neighbors takes the integer parts of 10 log-spaced values from 1 to 1000,
# crossed with 'uniform' and 'distance' weights, with cv_splits=3 and njobs=4.
# NOTE(review): Model.model_from_pickle is project code not visible here; the
# dict is presumably a parameter grid to search -- confirm.
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.neighbors import KNeighborsRegressor
from sklearn import metrics
cv_splits = 3

error_formula = 'mean_absolute_error'
njobs = 4
write = 'local'
%prun model = Model.model_from_pickle('input_2016-02-15-10-30-00.p', KNeighborsRegressor, \
                                      {'n_neighbors':[int(x) for x in np.logspace(0,3,10)], 'weights': ['uniform', 'distance']}, \
                                      cv_splits, error_formula, njobs, write)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [24]:
# Load a previously saved model for inspection. The context manager closes
# the file handle as soon as the load finishes instead of leaving it open for
# the lifetime of the kernel.
# NOTE: unpickling can execute arbitrary code -- only load files that this
# project wrote itself.
with open('solar/data/kaggle_solar/models/model_2016-02-18-12-26-18.p', 'rb') as model_file:
    var_import_model = pickle.load(model_file)

In [31]:
# Display the train_score_ array of the loaded model's best estimator.
# NOTE(review): on scikit-learn gradient-boosting estimators train_score_ is
# the per-stage training loss; the unpickled model is presumably one of
# those -- confirm.
var_import_model.best_estimator_.train_score_

array([ 6378948.43058982,  6152719.31464454,  5923591.4955259 ,
        5748383.34328998,  5596869.41937363,  5453301.81395847,
        5326992.65101031,  5211199.29800708,  5109748.27297102,
        5043401.07123363,  4988726.72480982,  4934717.74951796,
        4881188.76117125,  4796177.41606346,  4727116.09248912,
        4656984.18289107,  4606678.2143224 ,  4556384.02639415,
        4524864.50017443])