# Analysis Walkthrough

## Determine all of the parameters

### Specify the locations of the files for X_train, X_test, and y_train. Also, there is a file that contains information about the individual stations that can be useful for models that learn for each station.

In [1]:
# Move the working directory up one level; presumably so that the relative 'solar/...' paths and the
# `solar` package imports used below resolve — confirm against where the notebook lives.
# NOTE: the move is relative, so re-running this cell climbs one more directory each time.
import os
os.chdir('..')

In [2]:
# Input locations, relative to the current working directory.
# NOTE(review): Xtrain_dir ends with a slash but Xtest_dir does not — confirm the loader accepts both forms.
Xtrain_dir = 'solar/data/kaggle_solar/train/'
Xtest_dir = 'solar/data/kaggle_solar/test'
ytrain_file = 'solar/data/kaggle_solar/train.csv'
station_file = 'solar/data/kaggle_solar/station_info.csv'

### Import the various files. This is mostly done so that any file updates during testing are carried over to this notebook.

In [3]:
import solar.wrangle.wrangle
import solar.wrangle.subset
import solar.wrangle.engineer
import solar.analyze.model
import solar.report.submission
import numpy as np

# `reload` is called in later cells to pick up edits to the solar modules without restarting the kernel.
# The `imp` module is deprecated and removed in Python 3.12; importlib.reload is its replacement.
try:
    from importlib import reload
except ImportError:
    # Python 2: importlib has no `reload`, but the builtin `reload` is already in scope.
    pass

### Set the parameters that will be used to set up the data. Some parameters determine the size and shape of the data but have effects beyond setting up feature columns. These include the dates that are used for testing and training, the stations considered, and whether each X row corresponds to a date or to a specific date/station combination.

In [4]:
# Choose up to 98 stations. Not specifying a station means to use all that fall within the given lats and longs.
# If the parameter 'all' is given, then all stations are used no matter the provided lats and longs.

#station = ['all']
station = ['ACME', 'BEAV']

# Determine which dates will be used to train the model. No specified date means use the entire set from
# 1994-01-01 until 2007-12-31.

#train_dates = ['1994-01-01','2007-12-31']
train_dates = ['1994-01-01','1994-03-01']

# Determine the test X values to produce. There is no practical purpose to use fewer than all of the points other
# than for testing. Again, not choosing a date will use 2008-01-01 through 2012-11-30.

#test_dates = ['2008-01-01','2012-11-30']
test_dates = ['2008-01-01', '2008-01-05']

# The last parameter that is not specifically involved in feature selection is the layout to be used for training.
# I have switched to almost always training for each individual station rather than having a single row for a date.
# However, I am still not beating the benchmark, and the file would grow too large to have the benchmark laid out
# with a row for each station, so I'll keep the switch. True means that each station has a row (5113 dates X 98
# stations to train the data). False means that there are 5113 rows that are being used to train the data.
station_layout = True

### First, just duplicate the functionality of the basic grid analysis

In [5]:
# Restrict to two variables; the commented-out ['all'] would use every variable

#var = ['all']
var = ['uswrf_sfc', 'dswrf_sfc', ]

# Models 0 through 3 are listed here (not just model 0, the default model)
# NOTE(review): the name `model` is rebound to the fitted estimator in a later cell; default_grid below keeps
# its own reference to this list, so it is unaffected by that rebinding.
model = [0, 1, 2, 3]

# Only three times are listed; the commented-out list is presumably the full set
#times = [12, 15, 18, 21, 24]
times = [12, 15, 21]

# A single 'relative' feature definition built from the selections above, wrapped in a one-element feature list
default_grid = {'type':'relative', 'axes':{'var':var, 'models':model, 'times':times,
                                          'station':station}}
just_grid = [default_grid]

### Run data extraction

In [6]:
# Reload the solar modules so that any edits made to their source since the first import are picked up
reload(solar.wrangle.wrangle)
reload(solar.wrangle.subset)
reload(solar.wrangle.engineer)
from solar.wrangle.wrangle import SolarData

# Build the data for the just_grid feature list under the IPython profiler (%prun).
# The result is indexed in later cells as input_data[0], input_data[1] and input_data[2].
%prun input_data = SolarData.load(Xtrain_dir, ytrain_file, Xtest_dir, station_file, \
                                  train_dates, test_dates, station, \
                                  station_layout, just_grid, 'local')

 

In [7]:
input_data[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values,values
Unnamed: 0_level_1,var,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,...,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc,uswrf_sfc,dswrf_sfc
Unnamed: 0_level_2,times,12,12,12,12,12,12,12,12,15,15,...,15,15,21,21,21,21,21,21,21,21
Unnamed: 0_level_3,lat_longs,1_SE,1_SE,2_SW,2_SW,3_NE,3_NE,4_NW,4_NW,1_SE,1_SE,...,4_NW,4_NW,1_SE,1_SE,2_SW,2_SW,3_NE,3_NE,4_NW,4_NW
Unnamed: 0_level_4,models,0,0,0,0,0,0,0,0,0,0,...,3,3,3,3,3,3,3,3,3,3
train_dates,station,Unnamed: 2_level_5,Unnamed: 3_level_5,Unnamed: 4_level_5,Unnamed: 5_level_5,Unnamed: 6_level_5,Unnamed: 7_level_5,Unnamed: 8_level_5,Unnamed: 9_level_5,Unnamed: 10_level_5,Unnamed: 11_level_5,Unnamed: 12_level_5,Unnamed: 13_level_5,Unnamed: 14_level_5,Unnamed: 15_level_5,Unnamed: 16_level_5,Unnamed: 17_level_5,Unnamed: 18_level_5,Unnamed: 19_level_5,Unnamed: 20_level_5,Unnamed: 21_level_5,Unnamed: 22_level_5
1994-01-01,ACME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,30.0,...,9.0,30.0,116.0,540.0,110.0,530.0,114.0,520.0,108.0,520.0
1994-01-01,BEAV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,20.0,...,5.0,20.0,123.0,520.0,118.0,510.0,105.0,490.0,104.0,480.0
1994-01-02,ACME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,30.0,...,8.0,30.0,115.0,540.0,107.0,520.0,105.0,480.0,83.0,430.0
1994-01-02,BEAV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,10.0,...,5.0,20.0,123.0,520.0,112.0,490.0,104.0,490.0,78.0,400.0
1994-01-03,ACME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,30.0,...,9.0,30.0,117.0,540.0,113.0,540.0,116.0,530.0,110.0,530.0
1994-01-03,BEAV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,20.0,...,4.0,20.0,123.0,520.0,118.0,510.0,103.0,490.0,98.0,470.0
1994-01-04,ACME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,9.0,30.0,...,9.0,30.0,121.0,560.0,114.0,550.0,119.0,540.0,112.0,530.0
1994-01-04,BEAV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,20.0,...,5.0,20.0,123.0,520.0,120.0,520.0,103.0,490.0,106.0,500.0
1994-01-05,ACME,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,30.0,...,8.0,30.0,116.0,550.0,112.0,540.0,113.0,520.0,109.0,530.0
1994-01-05,BEAV,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,20.0,...,4.0,20.0,107.0,480.0,112.0,500.0,105.0,500.0,104.0,490.0


### Run through the full analysis

In [8]:
# Reload the model module to pick up any source edits, then fit a Ridge model through the project's Model helper
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Ridge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10

# alpha candidates: 8 log-spaced values from 1e-3 to 1e1
# NOTE(review): this rebinds `model`, which until now held the list [0, 1, 2, 3] used to build default_grid
model = Model.model(input_data, Ridge, {'alpha':np.logspace(-3,1,8,base=10)}, cv_splits, 
                    error_formula, 1, 'local', normalize=True)

                       values                                          \
var                 uswrf_sfc dswrf_sfc uswrf_sfc dswrf_sfc uswrf_sfc   
times                      12        12        12        12        12   
lat_longs                1_SE      1_SE      2_SW      2_SW      3_NE   
models                      0         0         0         0         0   
train_dates station                                                     
1994-01-01  ACME          0.0       0.0       0.0       0.0       0.0   
            BEAV          0.0       0.0       0.0       0.0       0.0   
1994-01-02  ACME          0.0       0.0       0.0       0.0       0.0   
            BEAV          0.0       0.0       0.0       0.0       0.0   
1994-01-03  ACME          0.0       0.0       0.0       0.0       0.0   
            BEAV          0.0       0.0       0.0       0.0       0.0   
1994-01-04  ACME          0.0       0.0       0.0       0.0       0.0   
            BEAV          0.0       0.0       0.0  

In [9]:
# Reload the submission module to pick up any source edits, then build the submission from the fitted model
reload(solar.report.submission)
from solar.report.submission import Submission

# NOTE(review): the recorded output of this cell ends with
# "TypeError: a bytes-like object is required, not 'str'". Other calls in this notebook pass a single True
# instead of {'grid'}, 'local' as the trailing arguments — confirm the expected signature.
preds = Submission.make_submission_file(model, input_data[1], input_data[2], {'grid'}, 'local')

                      values                                          \
var                uswrf_sfc dswrf_sfc uswrf_sfc dswrf_sfc uswrf_sfc   
times                     12        12        12        12        12   
lat_longs               1_SE      1_SE      2_SW      2_SW      3_NE   
models                     0         0         0         0         0   
test_dates station                                                     
2008-01-01 ACME          0.0       0.0       0.0       0.0       0.0   
           BEAV          0.0       0.0       0.0       0.0       0.0   
2008-01-02 ACME          0.0       0.0       0.0       0.0       0.0   
           BEAV          0.0       0.0       0.0       0.0       0.0   
2008-01-03 ACME          0.0       0.0       0.0       0.0       0.0   
           BEAV          0.0       0.0       0.0       0.0       0.0   
2008-01-04 ACME          0.0       0.0       0.0       0.0       0.0   
           BEAV          0.0       0.0       0.0       0.0      

TypeError: a bytes-like object is required, not 'str'

In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline



ImportError: No module named 'seaborn'

In [None]:
# Figure defaults: 8x6 figures with a font size of 14, followed by seaborn's "white" style in the "talk" context
plt.rcParams.update({'figure.figsize': (8, 6), 'font.size': 14})
sns.set(style="white", context="talk")

In [None]:
y_pred = model.predict(input_data[0])

In [None]:
# Absolute difference between y_pred and input_data[1], divided by input_data[1]
# NOTE(review): assumes input_data[1] holds the training targets and contains no zeros — confirm
errors = abs(y_pred - input_data[1])/input_data[1]

In [None]:
#sns.distplot(errors, rug=True)

In [None]:
#input_data[0].shape

In [None]:
#errors.shape

In [None]:
#input_data[0][(errors > 0.8).values]

In [None]:
#input_data[1][(errors > 0.8).values]

In [None]:
#pd.DataFrame(y_pred, index=input_data[1].index, columns=input_data[1].columns)[(errors > 0.8).values]

In [None]:
#max(errors.values)

In [None]:
#unstacked = input_data[0].stack('time').stack('model').stack('variable').stack('lat_longs').reset_index('model').reset_index('time').reset_index('variable').reset_index('lat_longs')

In [None]:
#dswrf = unstacked[(unstacked['lat_longs'] == 'NE') & (unstacked['variable'] == 'dswrf_sfc') & (unstacked['time'] == 21)
#         & (unstacked['model'] == 0)]['relative']

In [None]:
#pd.DataFrame(dswrf).reset_index('station').rename(columns={'station':'location'}).unstack('location').set_index('location')

In [None]:
#dswrf

In [None]:
#pd.concat((errors,dswrf))

In [None]:
#pd.concat([dswrf,errors])

In [None]:
#sns.jointplot("total_solar","dswrf", data=drinks[(drinks.beer < 100) & (drinks.wine < 30)] , kind = "kde")

In [None]:
#import netCDF4 as nc
#X = nc.Dataset('solar/data/kaggle_solar/train/dswrf_sfc_latlon_subset_19940101_20071231.nc','r+').variables.values()
#X[-1][0:10,0,2:3,:,:]

In [6]:
# This just uses the station_names as another feature
stat_names = {'type':'station_names'}
stat_feats = [stat_names]

### Next, we start to lay out the features to include. The two most important (and complicated) are 'absolute', which just reports the weather variables at specific GEFS, times, and models, and 'relative', which uses a grid to identify nearby GEFS for weather measurements based on the location of the station. The second option makes the most sense when using the station_layout above, but it will work with either layout.

In [None]:
# A very simple model would just take the average value of all variables at all locations, using all models over the
# course of the day. Here, only the var parameter and one value of the model are expanded.
# All of the other axes are aggregated using an aggregation function (for example, the mean value).
# This will provide 15 aggregated columns for model 0 and 15 aggregated columns for the mean of models 1 through 10.
# In this case, setting station_layout to false would make the most sense because the measurements will be repeated
# for each station. However, for consistency in this walkthrough, I will just keep it in the station_layout.

# Dimensions without aggregation

# Use all variables
var = ['all']

# Keep model 0 (the default model) as a column for each of the variables (aggregated over other dimensions)
model1 = [0]

# Dimensions with aggregation

# Aggregate over all other models (excluding model 0, which is used directly)
model2 = range(1,11)

# Aggregate over all times
times = ['all']

# Aggregate over all latitudes which surround Mesonet stations (exclude those that are outside of the main grid)
lats = range(33,38)

# Same as for lats
longs = range(257,267)

# Each agg_axes entry pairs the values to aggregate over with the list of aggregation functions to apply
all_avgs = {'type':'absolute', 'full_axes':{'var':var, 'models':model1}, 
            'agg_axes':{'models':[model2,[np.mean]], 'times':[times, [np.mean, np.sum]], 'lats':[lats,[np.median]],
                        'longs':[longs,[np.median]]}}

avgs_feats = [all_avgs]

In [None]:
# A similar example using a surrounding grid for each station. There are no lat or long options for this type of
# feature set.

# Dimensions without aggregation

# Use all variables
var = ['all']

# Keep model 0 (the default model) as a column for each of the variables (aggregated over other dimensions)
model1 = [0]

# Dimensions with aggregation

# Aggregate over all other models (excluding model 0, which is used directly)
model2 = range(1,11)

# Aggregate over all times
times = ['all']

# Create a column for each member of the grid. It is all or nothing for gefs for now; individual members could be
# specified, but I currently see no need for it. We could also take an aggregate measure of the gefs (including
# interpolation). That doesn't work for the other dimensions.

gefs = ['all']

# 'gefs' sits in full_axes here, i.e. it is listed with the dimensions that are not aggregated
grid_avgs = {'type':'relative', 'full_axes':{'var':var, 'models':model1, 'gefs':gefs}, 
            'agg_axes':{'models':[model2,[np.mean]], 'times':[times, [np.mean, np.sum]]}}

grid_feats = [grid_avgs]

In [31]:
# A similar example using a surrounding grid for each station. Now, just average over the grid.

# Dimensions without aggregation

# Use all variables
var = ['all']

# Keep model 0 (the default model) as a column for each of the variables (aggregated over other dimensions)
model1 = [0]

# Dimensions with aggregation

# Aggregate over all other models (excluding model 0, which is used directly)
model2 = range(1,11)

# Aggregate over all times
times = ['all']

# Use every member of the grid. Unlike a per-member layout, 'gefs' is placed in agg_axes below with np.mean,
# so the grid members are averaged rather than each being given its own column.

gefs = ['all']

# NOTE(review): this rebinds the name grid_avgs; a grid_feats list built from an earlier grid_avgs keeps the old dict
grid_avgs = {'type':'relative', 'full_axes':{'var':var, 'models':model1}, 
            'agg_axes':{'models':[model2,[np.mean]], 'times':[times, [np.mean, np.sum]], 'gefs':[gefs, [np.mean]]}}

gefs_mean_feats = [grid_avgs]

In [36]:
# This just uses the station_names as another feature
stat_names = {'type':'station_names'}
stat_feats = [stat_names]

In [37]:
# Feature list containing only the 'frac_dist' feature type
frac_dist = {'type':'frac_dist'}
dist_feats = [frac_dist]

In [52]:
# Reload the solar modules so that any edits made to their source since the first import are picked up
reload(solar.wrangle.wrangle)
reload(solar.wrangle.subset)
reload(solar.wrangle.engineer)
from solar.wrangle.wrangle import SolarData

# Build the data using only the dist_feats feature list
dist_data = SolarData.load(Xtrain_dir, ytrain_file, Xtest_dir, station_file, \
                                  train_dates, test_dates, station, \
                                  station_layout, dist_feats)

INFO:solar.wrangle.wrangle:Started building test and training data
INFO:solar.wrangle.wrangle:Features: [{'type': 'frac_dist'}]
INFO:solar.wrangle.wrangle:Finished building test and training data


log/log_2016-02-06-16-04-32.log


In [38]:
# Feature list with the 'days_from_solstice' and 'days_from_coldest' feature types
days_solstice = {'type':'days_from_solstice'}
days_cold = {'type':'days_from_coldest'}
days_feats = [days_solstice, days_cold]

In [None]:
# Reload the solar modules so that any edits made to their source since the first import are picked up
reload(solar.wrangle.wrangle)
reload(solar.wrangle.subset)
reload(solar.wrangle.engineer)
from solar.wrangle.wrangle import SolarData

# Build the data using only the days_feats feature list.
# The result is stored as `days_data`, separate from `dist_data`, so that the frac_dist data loaded for
# dist_feats is not silently overwritten by this load.
days_data = SolarData.load(Xtrain_dir, ytrain_file, Xtest_dir, station_file,
                           train_dates, test_dates, station,
                           station_layout, days_feats)

In [39]:
# Combine the frac_dist, days-from, default grid and station-name feature definitions into one feature list
all_feats = [frac_dist, days_cold, days_solstice, default_grid, stat_names]

In [62]:
# Reload the solar modules so that any edits made to their source since the first import are picked up
reload(solar.wrangle.wrangle)
reload(solar.wrangle.subset)
reload(solar.wrangle.engineer)
from solar.wrangle.wrangle import SolarData

# Build the data using the combined all_feats feature list.
# The result is indexed in later cells as all_feats_data[0], all_feats_data[1] and all_feats_data[2].
all_feats_data = SolarData.load(Xtrain_dir, ytrain_file, Xtest_dir, station_file, \
                                  train_dates, test_dates, station, \
                                  station_layout, all_feats)

INFO:solar.wrangle.wrangle:Started building test and training data
INFO:solar.wrangle.wrangle:Features: [{'type': 'frac_dist'}, {'type': 'days_from_coldest'}, {'type': 'days_from_solstice'}, {'axes': {'var': ['all'], 'models': [0], 'station': ['all'], 'times': [12, 15, 18, 21, 24]}, 'type': 'relative'}, {'type': 'station_names'}]
INFO:solar.wrangle.wrangle:Train dates: ['1994-01-01', '1994-03-01']
INFO:solar.wrangle.wrangle:Test dates: ['2008-01-01', '2008-01-05']
INFO:solar.wrangle.wrangle:Stations: ['all']
INFO:solar.wrangle.wrangle:Finished building test and training data


In [63]:
all_feats_data[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,"(frac_dist, lat_dist)","(frac_dist, long_dist)","(days_from_coldest, from_coldest)","(days_from_solstice, from_solstice)","(relative, 0)","(relative, 0)","(relative, 0)","(relative, 0)","(relative, 0)","(relative, 0)",...,stat_TISH,stat_VINI,stat_WASH,stat_WATO,stat_WAUR,stat_WEAT,stat_WEST,stat_WILB,stat_WIST,stat_WOOD
train_dates,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
1994-01-01,ACME,0.80833,0.97675,3,11,0,261.018982,0,348.760559,253.599884,5.800000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,ADAX,0.79851,0.33091,3,11,0,277.018982,0,344.760559,230.599884,8.700000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,ALTU,0.58722,0.66192,3,11,0,255.018982,0,350.760559,257.599884,5.900000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,APAC,0.91418,0.70784,3,11,0,261.018982,0,348.760559,253.599884,5.800000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,ARNE,0.07204,0.09692,3,11,0,240.018982,0,331.760559,248.599884,6.800000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,BEAV,0.80253,0.46988,3,11,0,234.018982,0,323.760559,245.599884,5.900000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,BESS,0.40185,0.94153,3,11,0,247.018982,0,342.760559,253.599884,6.500000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,BIXB,0.96305,0.13379,3,11,0,278.018982,0,338.760559,208.599884,7.900000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,BLAC,0.75443,0.74548,3,11,0,249.018982,0,339.760559,248.599884,7.300000,...,0,0,0,0,0,0,0,0,0,0
1994-01-01,BOIS,0.69256,0.50287,3,11,0,220.018982,0,310.760559,237.599884,4.800000,...,0,0,0,0,0,0,0,0,0,0


In [64]:
# Reload the model module to pick up any source edits, then fit a Ridge model on the all_feats data
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Ridge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10

write = 'local'
njobs = 1

# alpha candidates: 8 log-spaced values from 1e-5 to 1e-2
model = Model.model(all_feats_data, Ridge, {'alpha':np.logspace(-5,-2,8,base=10)}, cv_splits, 
                    error_formula, njobs, write, normalize=True, random_state=1)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

In [67]:
# Reload the model module to pick up any source edits, then fit a gradient boosting model on the all_feats data
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.ensemble import GradientBoostingRegressor
from sklearn import metrics
cv_splits = 10

error_formula = 'mean_absolute_error'
njobs = 1
write = 'local'
# Run under the IPython profiler (%prun); the only grid value is n_estimators=300, and the remaining keyword
# arguments (loss, max_depth, random_state, learning_rate) are passed through Model.model
%prun model = Model.model(all_feats_data, GradientBoostingRegressor, {'n_estimators':[300]}, \
                    cv_splits, error_formula, njobs, write, loss='ls', max_depth=1, random_state=0, learning_rate=.001)

INFO:solar.analyze.model:Started building model
INFO:solar.analyze.model:Train input columns: Index([              (u'frac_dist', u'lat_dist'),
                    (u'frac_dist', u'long_dist'),
         (u'days_from_coldest', u'from_coldest'),
       (u'days_from_solstice', u'from_solstice'),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
                                (u'relative', 0),
       ...
                                    u'stat_TISH',
                                    u'stat_VINI',
                                    u'stat_WASH',
                                    u'stat_WATO',
                                    u'stat_WAUR',
                                    u'stat_WEAT',
                                    u'stat_WEST',
                                    u'stat_WI

 

In [61]:
# Reload the model module to pick up any source edits, then fit a Ridge model from a previously pickled input file
reload(solar.analyze.model)
import numpy as np
from solar.analyze.model import Model

from sklearn.linear_model import Ridge
from sklearn import metrics

error_formula = 'mean_absolute_error'
cv_splits = 10

# alpha candidates: 10 log-spaced values from 1e5 to 1e10
# NOTE(review): the recorded run failed with "No such file or directory" for this input file under
# solar/data/kaggle_solar/inputs/ — confirm the file name before re-running.
model = Model.model_from_pickle('input_2016-02-06-15-38-35.p', Ridge, {'alpha':np.logspace(5,10,10,base=10)}, cv_splits, 
                    error_formula, normalize=True, random_state=1)

IOError: [Errno 2] No such file or directory: 'solar/data/kaggle_solar/inputs/input_2016-02-06-15-38-35.p'

In [60]:
# Reload the submission module to pick up any source edits, then build the submission from the current model
reload(solar.report.submission)
from solar.report.submission import Submission

# Uses elements [1] and [2] of the all_feats data; the meaning of the trailing True is not visible here
preds = Submission.make_submission_file(model, all_feats_data[1], all_feats_data[2], True)

INFO:solar.report.submission:Started building submission file
INFO:solar.report.submission:Finished building submission file


                   frac_dist           days_from_coldest days_from_solstice  \
dist_ind            lat_dist long_dist      from_coldest      from_solstice   
test_dates station                                                            
2008-01-01 ACME      0.80833   0.97675                 3                 11   
           ADAX      0.79851   0.33091                 3                 11   
           ALTU      0.58722   0.66192                 3                 11   
           APAC      0.91418   0.70784                 3                 11   
           ARNE      0.07204   0.09692                 3                 11   
           BEAV      0.80253   0.46988                 3                 11   
           BESS      0.40185   0.94153                 3                 11   
           BIXB      0.96305   0.13379                 3                 11   
           BLAC      0.75443   0.74548                 3                 11   
           BOIS      0.69256   0.50287              

In [63]:
# Same submission call as the cell before it, re-run against whatever `model` currently refers to
reload(solar.report.submission)
from solar.report.submission import Submission

preds = Submission.make_submission_file(model, all_feats_data[1], all_feats_data[2], True)

INFO:solar.report.submission:Started building submission file
INFO:solar.report.submission:Finished building submission file


                   frac_dist           days_from_coldest days_from_solstice  \
dist_ind            lat_dist long_dist      from_coldest      from_solstice   
test_dates station                                                            
2008-01-01 ACME      0.80833   0.97675                 3                 11   
           ADAX      0.79851   0.33091                 3                 11   
           ALTU      0.58722   0.66192                 3                 11   
           APAC      0.91418   0.70784                 3                 11   
           ARNE      0.07204   0.09692                 3                 11   
           BEAV      0.80253   0.46988                 3                 11   
           BESS      0.40185   0.94153                 3                 11   
           BIXB      0.96305   0.13379                 3                 11   
           BLAC      0.75443   0.74548                 3                 11   
           BOIS      0.69256   0.50287              

In [64]:
import solar.report.submission
reload(solar.report.submission)
from solar.report.submission import Submission
from solar.analyze.model import Model

# cPickle only exists on Python 2; fall back to the standard pickle module on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('solar/data/kaggle_solar/models/model_2016-02-21-16-11-01.p', 'rb') as model_file:
    model = pickle.load(model_file)

In [66]:
model.best_estimator_.feature_importances_

array([ 0.        ,  0.        ,  0.01798295,  0.04040199,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.01585286,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.01245387,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.01208461,  0.        ,  0.        ,  0.        ,  0.00557763,
        0.        ,  0.        ,  0.        ,  0.        ,  0.0165461 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.01336939,  0.        ,  0.        ,  0.  

In [76]:
import numpy as np

# Rank the features of the fitted estimator by absolute importance, largest first.
# Each entry of var_import is (absolute importance, column position); a later cell reads var_import.
coefs = model.best_estimator_.feature_importances_.ravel()
var_import = sorted(((abs(val), col) for col, val in enumerate(coefs)), reverse=True)

# Print "rank entry" lines. str.format is used instead of the Python 2 print statement so the cell runs
# on both Python 2 and Python 3.
for rank, entry in enumerate(var_import, start=1):
    print('{0} {1}'.format(rank, entry))

1 (0.049477846554571825, 499)
2 (0.04756906447284228, 289)
3 (0.045787987915036396, 651)
4 (0.040401990174583381, 3)
5 (0.033065838140069748, 229)
6 (0.026877830373224734, 269)
7 (0.025231823153174772, 293)
8 (0.023962145423843468, 139)
9 (0.023959812390356758, 328)
10 (0.02387688407345134, 484)
11 (0.02365410548240118, 324)
12 (0.023176512068311343, 574)
13 (0.018861678759893231, 350)
14 (0.018524554008903592, 156)
15 (0.018128435428584767, 199)
16 (0.017982947825859758, 2)
17 (0.017701298909982181, 254)
18 (0.017674970929760132, 244)
19 (0.017655940796584963, 161)
20 (0.017566170581311727, 111)
21 (0.017065337961658777, 277)
22 (0.016546101935391168, 59)
23 (0.015852861826440193, 29)
24 (0.01552991516527288, 294)
25 (0.014108997751528921, 274)
26 (0.014096421753054334, 154)
27 (0.013707874287751545, 529)
28 (0.013564974189330791, 466)
29 (0.013369392552833146, 66)
30 (0.013315149742729328, 299)
31 (0.012914731003345314, 171)
32 (0.012697451532124657, 174)
33 (0.012453867432696364, 44

In [84]:
# Load the pickled input data; later cells index it as my_input[0] and my_input[1].
# SECURITY: unpickling can execute arbitrary code — only load files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('solar/data/kaggle_solar/inputs/input_2016-02-21-09-27-29.p', 'rb') as input_file:
    my_input = pickle.load(input_file)

In [81]:
# Map the column positions of the 20 most important features to a coarser group index.
# NOTE(review): presumably 4 leading non-grid columns followed by 15 variables x 4 grid points x 5 times
# per model — confirm against the column layout of the input frame.
# Floor division (//) keeps the results integral on both Python 2 and Python 3, and the loop variable is not
# called `var`, because on Python 2 a comprehension variable leaks and would overwrite the `var` feature list.
[(((col - 4) // 15) // 4) // 5 for _, col in var_import[:20]]

[1, 0, 2, -1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, -1, 0, 0, 0, 0]

In [99]:
my_input[0]

Unnamed: 0_level_0,Unnamed: 1_level_0,"(frac_dist, lat_dist)","(frac_dist, long_dist)","(days_from_coldest, from_coldest)","(days_from_solstice, from_solstice)","(values, 0)","(values, 0)","(values, 0)","(values, 0)","(values, 0)","(values, 0)",...,stat_TISH,stat_VINI,stat_WASH,stat_WATO,stat_WAUR,stat_WEAT,stat_WEST,stat_WILB,stat_WIST,stat_WOOD
train_dates,station,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
2000-01-01,ACME,0.80833,0.97675,3,11,0,274,0,351.347,252.952,12.7524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,ADAX,0.79851,0.33091,3,11,0,270,0,349.347,254.952,15.0524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,ALTU,0.58722,0.66192,3,11,0,278,0,355.347,252.952,12.7524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,APAC,0.91418,0.70784,3,11,0,274,0,351.347,252.952,12.7524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,ARNE,0.07204,0.09692,3,11,0,262,0,344.347,239.952,13.6524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,BEAV,0.80253,0.46988,3,11,0,267,0,345.347,231.952,13.2524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,BESS,0.40185,0.94153,3,11,0,267,0,344.347,247.952,12.8524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,BIXB,0.96305,0.13379,3,11,0,265,0,346.347,253.952,12.0524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,BLAC,0.75443,0.74548,3,11,0,256,0,340.347,247.952,12.3524,...,0,0,0,0,0,0,0,0,0,0
2000-01-01,BOIS,0.69256,0.50287,3,11,0,251,0,330.347,224.952,10.8524,...,0,0,0,0,0,0,0,0,0,0


In [63]:
my_input[0].iloc[0:10,4+8+15*4]

train_dates  station
2000-01-01   ACME       101283
             ADAX       101496
             ALTU       101159
             APAC       101283
             ARNE       100877
             BEAV       100779
             BESS       101082
             BIXB       101525
             BLAC       101138
             BOIS       100849
Name: (values, 0), dtype: object

In [74]:
# cPickle only exists on Python 2; fall back to the standard pickle module on Python 3.
try:
    import cPickle as pickle
except ImportError:
    import pickle

# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('./solar/data/kaggle_solar/models/model_2016-02-21-16-11-01.p', 'rb') as model_file:
    model = pickle.load(model_file)

In [75]:
model.best_estimator_.feature_importances_

array([ 0.        ,  0.        ,  0.01798295,  0.04040199,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.01585286,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.01245387,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.01208461,  0.        ,  0.        ,  0.        ,  0.00557763,
        0.        ,  0.        ,  0.        ,  0.        ,  0.0165461 ,
        0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
        0.        ,  0.01336939,  0.        ,  0.        ,  0.  

In [108]:
from sklearn.metrics import mean_absolute_error

In [109]:
# Baseline: score a constant prediction of 16526096 for every row of my_input[1]
# NOTE(review): the constant is presumably the overall mean of the target — confirm
mean_absolute_error(my_input[1], [16526096]*len(my_input[1]))

6661931.484862077

In [107]:
# Total absolute deviation from the constant 16526096 over the first 98 rows of my_input[1]
# NOTE(review): 98 presumably corresponds to one row per station for the first date — confirm
np.sum((abs(my_input[1].iloc[0:98]-16526096)).values)

521578408

In [110]:
# Hard-coded list of average values used as a baseline prediction.
# NOTE(review): presumably one mean target value per station, in the same station order as the rows of
# the input data — confirm where these numbers came from.
averages = [16877461.91,16237533.98,17119188.79,17010565.32,17560172.95,17612143.11,17304074.16,15969634.24,16061706.56,
 18688943.33,16034080.62,16655128.52,16014657.87,17304512.26,16104884.18,16438717.4,17331768.39,16597616.48,17515474.77,16055064.62,
 15993253.01,17015639.69,17484171.18,16541992.87,15486166.08,15656934.05,15667279.87,15896897.03,16348532.74,
 16707117.53,17440381.55,15718914.29,16444023.44,16082322.25,17182922.64,16936510.22,17943529.54,16526024.35,16066904.41,
 16924262.95,17184734.64,17988183.75,18041482.55,16184172.02,15849509.66,15679748.78,18700559.48,16715044.14,16741635.54,15952446.49,
 16367520.26,17439983.73,16440133.58,16786662.48,15709795.78,16240623.78,17040788.48,15766297.46,16626777.31,14795920.57,16228119.75,
 16659807.74,15986616.42,16009687.23,15959177.06,15622161.7,16284096.52,16192237.83,16322837.71,15657444.98,16890165.69,16355147.03,
 17303377.43,17170127.35,15721493.12,16952355.28,16359889.91,15904617.52,17262330.16,16441963.35,15716436.06,16101732.84,15903216.55,
 16688078.17,15826361.38,15590790.71,17739361.48,15941223.45,15737219.14,16648738.63,16959415.48,17039770.7,17074429.12,15331710.15,
 16000551.34,15657185.09,17423402.11,16085012.47]

In [112]:
# Baseline: tile the `averages` list to the length of the target and score it.
# The repeat count is derived from the data instead of being hard-coded (it was 182 for the recorded input),
# so the cell keeps working when the input covers a different number of dates.
# NOTE(review): assumes the rows of my_input[1] repeat in the same order as `averages` — confirm.
mean_absolute_error(my_input[1], averages * (len(my_input[1]) // len(averages)))

6618281.0173525456

In [113]:
# Load a previously pickled model as lin_model.
# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('./solar/data/kaggle_solar/models/model_2016-02-22-11-01-56.p', 'rb') as model_file:
    lin_model = pickle.load(model_file)

In [116]:
mean_absolute_error(my_input[1], lin_model.predict(my_input[0]))

2231368.1034811228

In [117]:
# Load a previously pickled model as ridge_model.
# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('./solar/data/kaggle_solar/models/model_2016-02-22-12-03-14.p', 'rb') as model_file:
    ridge_model = pickle.load(model_file)

In [118]:
mean_absolute_error(my_input[1], ridge_model.predict(my_input[0]))

2314578.601704394

In [119]:
# Load a previously pickled model as dt_model.
# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('./solar/data/kaggle_solar/models/model_2016-02-22-12-11-36.p', 'rb') as model_file:
    dt_model = pickle.load(model_file)

In [120]:
dt_model

GridSearchCV(cv=3, error_score='raise',
       estimator=RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=0,
           verbose=True, warm_start=False),
       fit_params={}, iid=True, n_jobs=3,
       param_grid={'n_estimators': [50]}, pre_dispatch='2*n_jobs',
       refit=True, scoring='mean_absolute_error', verbose=0)

In [123]:
mean_absolute_error(my_input[1], dt_model.predict(my_input[0]))

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished


447373.93607535324

In [124]:
dt_model.predict(my_input[0]).shape

[Parallel(n_jobs=1)]: Done  49 tasks       | elapsed:    0.4s
[Parallel(n_jobs=1)]: Done  50 out of  50 | elapsed:    0.4s finished


(17836,)

In [125]:
# Load a previously pickled model as gbm_model.
# SECURITY: unpickling can execute arbitrary code — only load model files that you produced yourself.
# The context manager closes the file handle even if unpickling fails.
with open('./solar/data/kaggle_solar/models/model_2016-02-22-12-23-26.p', 'rb') as model_file:
    gbm_model = pickle.load(model_file)

In [126]:
mean_absolute_error(my_input[1], gbm_model.predict(my_input[0]))

2839787.6824809983