In [44]:
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os
from copy import copy

import numpy as np
import pandas as pd

from random import choice

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import RandomizedSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle
from utils.generic_utils import pickle_out,pickle_in
import utils.preprocessing_utils as prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [45]:
data_folder = os.path.join("../","datasets","initial_data_split")

train = pd.read_csv(os.path.join(data_folder,"train.csv"))
valid = pd.read_csv(os.path.join(data_folder,"valid.csv"))

X_train = train.drop(["count","registered","casual"], axis=1)
Y_train = train[["count"]].values.ravel()

X_valid = valid.drop(["count","registered","casual"], axis=1)
Y_valid = valid[["count"]].values.ravel()

print X_train.shape,Y_train.shape
print X_valid.shape,Y_valid.shape
print X_train.head()

(7452, 9) (7452,)
(1722, 9) (1722,)
              datetime  season  holiday  workingday  weather  temp   atemp  \
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3  2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4  2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  
0        81        0.0  
1        80        0.0  
2        80        0.0  
3        75        0.0  
4        75        0.0  


In [46]:
rolling_lags = [5,10,20,50,300,1000]
edge_gap_varnames = ['edge_lag_%s'%l for l in rolling_lags]
edge_gap_steps = [('add_edge_gap_lag_%s'%lag,prep.AddTimeGaps(lag=lag))for lag in rolling_lags]

max_lag=10
numerical_lagging_variables = ['atemp','humidity','windspeed']
categorical_lagging_variables = ['weather','holiday','workingday','season']
categorical_lagging_varnames = ['%s_lag_%s'%(var,l) for var in categorical_lagging_variables for l in range(1,max_lag,1)]
all_lagging_variables = numerical_lagging_variables + categorical_lagging_variables

lagging_values_steps = [('%s_lag_%s'%(var,lag),
                         prep.LaggingValues(colname = var, lag = max_lag))for var in all_lagging_variables]

stat_calculation_modes = ["whole","day","night"]
median_steps = [('median_%s_%s_%s'%(var,mode,lag),
                 prep.LaggingMedian(colname =var,
                                    lag = lag,
                                    mode=mode)) for lag in rolling_lags
                for var in numerical_lagging_variables 
                for mode in stat_calculation_modes]
max_steps = [('max_%s_%s_%s'%(var,mode,lag),
                 prep.LaggingMax(colname =var,
                                    lag = lag,
                                    mode=mode)) for lag in rolling_lags
                for var in numerical_lagging_variables 
                for mode in stat_calculation_modes]
min_steps =[('min_%s_%s_%s'%(var,mode,lag),
                 prep.LaggingMin(colname =var,
                                    lag = lag,
                                    mode=mode)) for lag in rolling_lags
                for var in numerical_lagging_variables 
                for mode in stat_calculation_modes]
mode_steps =[('mode_%s_%s_%s'%(var,mode,lag),
                 prep.LaggingMode(colname =var,
                                    lag = lag,
                                    mode=mode)) for lag in rolling_lags
                for var in categorical_lagging_variables 
                for mode in stat_calculation_modes]

lagging_mode_varnames = ["%s_%s_mode_%s"%(var,mode,lag) for lag in rolling_lags
                for var in categorical_lagging_variables 
                for mode in stat_calculation_modes]

encode_variables = ['season','weather','holiday','workingday','date_year','day_night']
encode_variables.extend(edge_gap_varnames)
encode_variables.extend(categorical_lagging_varnames)
encode_variables.extend(lagging_mode_varnames)

encoding_steps = [[('encode_label_%s'%var,prep.PandasLabelEncoder(colname=var)),\
        ('encode_one_hot_%s'%var,prep.PandasOneHotEncoder(colname=var))] for var in encode_variables]
encoding_steps = [en_st for sublist in encoding_steps for en_st in sublist ]


prep_steps = [('extract_times', prep.ExtractTimes())]+\
            edge_gap_steps+lagging_values_steps+\
            median_steps+max_steps+min_steps+mode_steps+\
            encoding_steps
#                [('drop_columns', prep.DropColumns(colnames = ["datetime"])),
#               ('feature_selection',SelectKBest(k=100))
#              ]
              
prep_pipe = Pipeline(prep_steps)
X_train = prep_pipe.fit_transform(X_train,Y_train)
X_valid = prep_pipe.transform(X_valid)
print X_train.shape,X_valid.shape

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  min_periods=1).median()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  min_periods=1).max()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  min_periods=1).min()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pan

(7452, 635) (1722, 635)


In [47]:
pickle_out(os.path.join("../","datasets","generated_features","train_lagging_all.pkl"),(X_train,Y_train))
pickle_out(os.path.join("../","datasets","generated_features","valid_lagging_all.pkl"),(X_valid,Y_valid))

In [54]:
(X_train,Y_train) = pickle_in(os.path.join("../","datasets","generated_features","train_lagging_all.pkl"))
(X_valid,Y_valid) = pickle_in(os.path.join("../","datasets","generated_features","valid_lagging_all.pkl"))

In [49]:
drop_variables = ['datetime','season','weather','holiday','workingday','date_year','day_night']
X_train=X_train.drop(drop_variables,axis=1)
X_valid=X_valid.drop(drop_variables,axis=1)

In [55]:
print X_train.shape,Y_train.shape,X_valid.shape,Y_valid.shape

(7452, 635) (7452,) (1722, 635) (1722,)


In [26]:
selection_steps = [("rf_selector",prep.RandomForestFeatureSelector(n_estimators = 100,
                                                                   drop_rate = 1,
                                                                   feature_threshold = 10,
                                                                   max_error_increase = 0.01
                                                                  ))
                  ]

select_pipe = Pipeline(selection_steps)
X_train = select_pipe.fit_transform(X_train,Y_train)
X_valid = select_pipe.transform(X_valid)
print X_train.shape,X_valid.shape

Number of features: 14
-9.60706110952
current error: 0.39293889048
Number of features: 13
-0.0173598590601
current error: 0.37557903142
Number of features: 12
0.00777558246123
current error: 0.383354613881
Number of features: 11
-0.00871980776973
current error: 0.374634806111
(7452, 10) (1722, 10)


In [27]:
print X_train.columns.values

['time_hour' 'day_night_1' 'day_night_0' 'day_night' 'atemp_night_max_300'
 'workingday_0' 'date_year' 'humidity' 'temp_day_median_5'
 'atemp_day_median_300']


In [51]:
rf =  RandomForestRegressor(n_jobs=3,oob_score=True,
                            n_estimators=200,
#                             max_features=100,
#                             min_samples_split = 3,
#                             min_samples_leaf=3,
#                             max_depth=25
                           )
rf.fit(X_train,Y_train)

JoblibValueError: JoblibValueError
___________________________________________________________________________
Multiprocessing exception:
...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x7f7205bcc8b0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/jakubc...python2.7/site-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x7f7205bcc8b0, file "/...2.7/site-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/home/jakubc...python2.7/site-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    584         
    585         If a global instance already exists, this reinitializes and starts it
    586         """
    587         app = cls.instance(**kwargs)
    588         app.initialize(argv)
--> 589         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    590 
    591 #-----------------------------------------------------------------------------
    592 # utility functions, for convenience
    593 #-----------------------------------------------------------------------------

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    400         
    401         if self.poller is not None:
    402             self.poller.start()
    403         self.kernel.start()
    404         try:
--> 405             ioloop.IOLoop.instance().start()
    406         except KeyboardInterrupt:
    407             pass
    408 
    409 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    255         if self.control_stream:
    256             self.control_stream.on_recv(self.dispatch_control, copy=False)
    257 
    258         def make_dispatcher(stream):
    259             def dispatcher(msg):
--> 260                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    261             return dispatcher
    262 
    263         for s in self.shell_streams:
    264             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {'allow_stdin': True, 'code': 'rf =  RandomForestRegressor(n_jobs=3,oob_score=T...                        )\nrf.fit(X_train,Y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-08-22T16:11:43.298454', 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'session': '1C5EF7641C7D4BBFAF234FCE2418ECB1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'parent_header': {}})
    207             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    208         else:
    209             self.log.debug("%s: %s", msg_type, msg)
    210             self.pre_handler_hook()
    211             try:
--> 212                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['1C5EF7641C7D4BBFAF234FCE2418ECB1']
        msg = {'buffers': [], 'content': {'allow_stdin': True, 'code': 'rf =  RandomForestRegressor(n_jobs=3,oob_score=T...                        )\nrf.fit(X_train,Y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-08-22T16:11:43.298454', 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'session': '1C5EF7641C7D4BBFAF234FCE2418ECB1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'parent_header': {}}
    213             except Exception:
    214                 self.log.error("Exception in message handler:", exc_info=True)
    215             finally:
    216                 self.post_handler_hook()

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['1C5EF7641C7D4BBFAF234FCE2418ECB1'], parent={'buffers': [], 'content': {'allow_stdin': True, 'code': 'rf =  RandomForestRegressor(n_jobs=3,oob_score=T...                        )\nrf.fit(X_train,Y_train)', 'silent': False, 'stop_on_error': True, 'store_history': True, 'user_expressions': {}}, 'header': {'date': '2016-08-22T16:11:43.298454', 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'session': '1C5EF7641C7D4BBFAF234FCE2418ECB1', 'username': 'username', 'version': '5.0'}, 'metadata': {}, 'msg_id': 'AB5F3F995C194890BE2C7E24DABD6D13', 'msg_type': 'execute_request', 'parent_header': {}})
    365         if not silent:
    366             self.execution_count += 1
    367             self._publish_execute_input(code, parent, self.execution_count)
    368 
    369         reply_content = self.do_execute(code, silent, store_history,
--> 370                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    371 
    372         # Flush output before sending the reply.
    373         sys.stdout.flush()
    374         sys.stderr.flush()

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'rf =  RandomForestRegressor(n_jobs=3,oob_score...                      )\nrf.fit(X_train,Y_train)', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    170 
    171         reply_content = {}
    172         # FIXME: the shell calls the exception handler itself.
    173         shell._reply_content = None
    174         try:
--> 175             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'rf =  RandomForestRegressor(n_jobs=3,oob_score...                      )\nrf.fit(X_train,Y_train)'
        store_history = True
        silent = False
    176         except:
    177             status = u'error'
    178             # FIXME: this code right now isn't being used yet by default,
    179             # because the run_cell() call above directly fires off exception

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'rf =  RandomForestRegressor(n_jobs=3,oob_score...                      )\nrf.fit(X_train,Y_train)', store_history=True, silent=False, shell_futures=True)
   2897                 self.displayhook.exec_result = result
   2898 
   2899                 # Execute the user code
   2900                 interactivity = "none" if silent else self.ast_node_interactivity
   2901                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2902                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2903 
   2904                 # Reset this so later displayed values do not modify the
   2905                 # ExecutionResult
   2906                 self.displayhook.exec_result = None

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.Assign object>, <_ast.Expr object>], cell_name='<ipython-input-51-0aaeb19fbe78>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3007                     return True
   3008 
   3009             for i, node in enumerate(to_run_interactive):
   3010                 mod = ast.Interactive([node])
   3011                 code = compiler(mod, cell_name, "single")
-> 3012                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7f71cd3932b0, file "<ipython-input-51-0aaeb19fbe78>", line 8>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   3013                     return True
   3014 
   3015             # Flush softspace
   3016             if softspace(sys.stdout, 0):

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7f71cd3932b0, file "<ipython-input-51-0aaeb19fbe78>", line 8>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   3061         outflag = 1  # happens in more places, so it's easier as default
   3062         try:
   3063             try:
   3064                 self.hooks.pre_run_code_hook()
   3065                 #rprint('Running code', repr(code_obj)) # dbg
-> 3066                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7f71cd3932b0, file "<ipython-input-51-0aaeb19fbe78>", line 8>
        self.user_global_ns = {'In': ['', u'get_ipython().magic(u\'load_ext autoreload\')\...kle_in\nimport utils.preprocessing_utils as prep', u'data_folder = os.path.join("../","datasets","i..._valid.shape,Y_valid.shape\nprint X_train.head()', u'get_ipython().magic(u\'load_ext autoreload\')\...kle_in\nimport utils.preprocessing_utils as prep', u'data_folder = os.path.join("../","datasets","i..._valid.shape,Y_valid.shape\nprint X_train.head()', u'rolling_lags = [3,5,7,10,20,50,300,1000]\nedge...form(X_valid)\nprint X_train.shape,X_valid.shape', u'pickle_out(os.path.join("../","datasets","gene...res","valid_lagging_all.pkl"),(X_valid,Y_valid))', u'X_train,Y_train = pickle_in(os.path.join("../"...","generated_features","valid_lagging_all.pkl"))', u'X_train = X_train.drop(["datetime"],axis=1)\nX_valid = X_valid.drop(["datetime"],axis=1)', u'print X_train.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'rf =  RandomForestRegressor(n_jobs=3,oob_score...                      )\nrf.fit(X_train,Y_train)', u'Y_train_pred = rf.predict(X_train)\nY_valid_pr...int train_error,valid_error\nprint rf.oob_score_', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', ...], 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {15: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 28: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 31:                      time_hour  day_night_1  day...               17.4250  

[1722 rows x 9 columns], 32: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 40:                       temp   atemp  humidity  wi...                1.0  

[1722 rows x 1059 columns]}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'RandomForestRegressor': <class 'sklearn.ensemble.forest.RandomForestRegressor'>, 'RandomizedSearchCV': <class 'sklearn.grid_search.RandomizedSearchCV'>, 'SelectKBest': <class 'sklearn.feature_selection.univariate_selection.SelectKBest'>, 'X_train':                       temp   atemp  humidity  wi...                 1.0  

[1722 rows x 628 columns], 'X_valid':                                datetime  season ...                 1.0  

[1722 rows x 635 columns], 'Y_train': array([ 16,  40,  32, ..., 213, 148, 120]), ...}
        self.user_ns = {'In': ['', u'get_ipython().magic(u\'load_ext autoreload\')\...kle_in\nimport utils.preprocessing_utils as prep', u'data_folder = os.path.join("../","datasets","i..._valid.shape,Y_valid.shape\nprint X_train.head()', u'get_ipython().magic(u\'load_ext autoreload\')\...kle_in\nimport utils.preprocessing_utils as prep', u'data_folder = os.path.join("../","datasets","i..._valid.shape,Y_valid.shape\nprint X_train.head()', u'rolling_lags = [3,5,7,10,20,50,300,1000]\nedge...form(X_valid)\nprint X_train.shape,X_valid.shape', u'pickle_out(os.path.join("../","datasets","gene...res","valid_lagging_all.pkl"),(X_valid,Y_valid))', u'X_train,Y_train = pickle_in(os.path.join("../"...","generated_features","valid_lagging_all.pkl"))', u'X_train = X_train.drop(["datetime"],axis=1)\nX_valid = X_valid.drop(["datetime"],axis=1)', u'print X_train.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'rf =  RandomForestRegressor(n_jobs=3,oob_score...                      )\nrf.fit(X_train,Y_train)', u'Y_train_pred = rf.predict(X_train)\nY_valid_pr...int train_error,valid_error\nprint rf.oob_score_', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', u'selection_steps = [("rf_selector",prep.RandomF...form(X_valid)\nprint X_train.shape,X_valid.shape', ...], 'OneHotEncoder': <class 'sklearn.preprocessing.data.OneHotEncoder'>, 'Out': {15: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 28: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 31:                      time_hour  day_night_1  day...               17.4250  

[1722 rows x 9 columns], 32: RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), 40:                       temp   atemp  humidity  wi...                1.0  

[1722 rows x 1059 columns]}, 'Pipeline': <class 'sklearn.pipeline.Pipeline'>, 'RandomForestRegressor': <class 'sklearn.ensemble.forest.RandomForestRegressor'>, 'RandomizedSearchCV': <class 'sklearn.grid_search.RandomizedSearchCV'>, 'SelectKBest': <class 'sklearn.feature_selection.univariate_selection.SelectKBest'>, 'X_train':                       temp   atemp  humidity  wi...                 1.0  

[1722 rows x 628 columns], 'X_valid':                                datetime  season ...                 1.0  

[1722 rows x 635 columns], 'Y_train': array([ 16,  40,  32, ..., 213, 148, 120]), ...}
   3067             finally:
   3068                 # Reset our crash handler in place
   3069                 sys.excepthook = old_excepthook
   3070         except SystemExit as e:

...........................................................................
/home/jakubczakon/projects/bike_sharing_codilime/notebooks/<ipython-input-51-0aaeb19fbe78> in <module>()
      3 #                             max_features=100,
      4 #                             min_samples_split = 3,
      5 #                             min_samples_leaf=3,
      6 #                             max_depth=25
      7                            )
----> 8 rf.fit(X_train,Y_train)
      9 
     10 
     11 
     12 

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/sklearn/ensemble/forest.py in fit(self=RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), X=array([[  5.73999977,   6.05999994,  59.        ...       0.        ,   1.        ]], dtype=float32), y=array([[  16.],
       [  40.],
       [  32.],
...
       [ 213.],
       [ 148.],
       [ 120.]]), sample_weight=None)
    285             trees = Parallel(n_jobs=self.n_jobs, verbose=self.verbose,
    286                              backend="threading")(
    287                 delayed(_parallel_build_trees)(
    288                     t, self, X, y, sample_weight, i, len(trees),
    289                     verbose=self.verbose, class_weight=self.class_weight)
--> 290                 for i, t in enumerate(trees))
        i = 199
    291 
    292             # Collect newly grown trees
    293             self.estimators_.extend(trees)
    294 

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=3), iterable=<generator object <genexpr>>)
    807             if pre_dispatch == "all" or n_jobs == 1:
    808                 # The iterable was consumed all at once by the above for loop.
    809                 # No need to wait for async callbacks to trigger to
    810                 # consumption.
    811                 self._iterating = False
--> 812             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=3)>
    813             # Make sure that we get a last message telling us we are done
    814             elapsed_time = time.time() - self._start_time
    815             self._print('Done %3i out of %3i | elapsed: %s finished',
    816                         (len(self._output), len(self._output),

---------------------------------------------------------------------------
Sub-process traceback:
---------------------------------------------------------------------------
ValueError                                         Mon Aug 22 16:11:44 2016
PID: 8739             Python 2.7.11: /home/jakubczakon/anaconda2/bin/python
...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self=<sklearn.externals.joblib.parallel.BatchedCalls object>)
     67     def __init__(self, iterator_slice):
     68         self.items = list(iterator_slice)
     69         self._size = len(self.items)
     70 
     71     def __call__(self):
---> 72         return [func(*args, **kwargs) for func, args, kwargs in self.items]
     73 
     74     def __len__(self):
     75         return self._size
     76 

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/sklearn/ensemble/forest.pyc in _parallel_build_trees(tree=DecisionTreeRegressor(criterion='mse', max_depth...          random_state=55558179, splitter='best'), forest=RandomForestRegressor(bootstrap=True, criterion=...ate=None,
           verbose=0, warm_start=False), X=array([[  5.73999977,   6.05999994,  59.        ...       0.        ,   1.        ]], dtype=float32), y=array([[  16.],
       [  40.],
       [  32.],
...
       [ 213.],
       [ 148.],
       [ 120.]]), sample_weight=None, tree_idx=0, n_trees=200, verbose=0, class_weight=None)
    111                 warnings.simplefilter('ignore', DeprecationWarning)
    112                 curr_sample_weight *= compute_sample_weight('auto', y, indices)
    113         elif class_weight == 'balanced_subsample':
    114             curr_sample_weight *= compute_sample_weight('balanced', y, indices)
    115 
--> 116         tree.fit(X, y, sample_weight=curr_sample_weight, check_input=False)
    117     else:
    118         tree.fit(X, y, sample_weight=sample_weight, check_input=False)
    119 
    120     return tree

...........................................................................
/home/jakubczakon/anaconda2/lib/python2.7/site-packages/sklearn/tree/tree.pyc in fit(self=DecisionTreeRegressor(criterion='mse', max_depth...          random_state=55558179, splitter='best'), X=array([[  5.73999977,   6.05999994,  59.        ...       0.        ,   1.        ]], dtype=float32), y=array([[  16.],
       [  40.],
       [  32.],
...
       [ 213.],
       [ 148.],
       [ 120.]]), sample_weight=array([ 0.,  0.,  1., ...,  5.,  1.,  0.]), check_input=False, X_idx_sorted=None)
    235 
    236         self.max_features_ = max_features
    237 
    238         if len(y) != n_samples:
    239             raise ValueError("Number of labels=%d does not match "
--> 240                              "number of samples=%d" % (len(y), n_samples))
    241         if self.min_samples_split <= 0:
    242             raise ValueError("min_samples_split must be greater than zero.")
    243         if self.min_samples_leaf <= 0:
    244             raise ValueError("min_samples_leaf must be greater than zero.")

ValueError: Number of labels=7452 does not match number of samples=1722
___________________________________________________________________________

In [None]:
Y_train_pred = rf.predict(X_train)
Y_valid_pred = rf.predict(X_valid)
train_error = rmsle(Y_train_pred,Y_train)
valid_error = rmsle(Y_valid_pred,Y_valid)
print train_error,valid_error
print rf.oob_score_

In [30]:
param_dist = {"max_depth": [15,20,25,30,35,None],
              "max_features": range(1,9,1),
              "min_samples_split": range(1,3,1),
              "min_samples_leaf": range(1,3,1)
              }

rolling_hyperparams = []
with open(os.path.join("../","models","random_forest_simple_hyperparams.txt"),"wb") as f:
    for i in range(50):
        print "Iteration:%s\n"%i
        md = choice(param_dist["max_depth"])
        print "max_depth:%s"%md
        mf = choice(param_dist["max_features"])
        print "max_features:%s"%mf
        ms = choice(param_dist["min_samples_split"])
        print "min_samples_split:%s"%ms
        ml = choice(param_dist["min_samples_leaf"])
        print "min_samples_leaf:%s\n"%ml

        rf = RandomForestRegressor(n_estimators=200,n_jobs=3,verbose=0,
                max_depth = md,
                max_features=mf,
                min_samples_split=ms,
                min_samples_leaf=ml,
               )
        rf.fit(X_train,Y_train)

        Y_train_pred = rf.predict(X_train)
        Y_valid_pred = rf.predict(X_valid)
        train_error = rmsle(Y_train_pred,Y_train)
        valid_error = rmsle(Y_valid_pred,Y_valid)
        print "Train rmsle:%s"%train_error
        print "Train rmsle:%s\n"%valid_error
        f.write("iteration:%s max_depth:%s max_features:%s min_samples_split:%s "\
        "min_samples_leaf:%s "\
        "train_error:%s valid_error:%s\n"%(i,md,mf,ms,ml,train_error,valid_error))
        
        rolling_hyperparams.append([md,mf,ms,ml,train_error,valid_error])
rolling_hyperparams = rolling_hyperparams.vstack(rolling_hyperparams)
pickle_out(os.path.join("../","models","random_forest_simple_hyperparams.pkl"),rolling_hyperparams)

Iteration:0

max_depth:25
max_features:6
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.251414805789
Train rmsle:0.436118448016

Iteration:1

max_depth:15
max_features:8
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.2406344701
Train rmsle:0.423872534794

Iteration:2

max_depth:20
max_features:1
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.4313074541
Train rmsle:0.651272035327

Iteration:3

max_depth:15
max_features:2
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.406570403119
Train rmsle:0.597207053841

Iteration:4

max_depth:None
max_features:3
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.338777142417
Train rmsle:0.535690873446

Iteration:5

max_depth:None
max_features:5
min_samples_split:1
min_samples_leaf:1

Train rmsle:0.178352178661
Train rmsle:0.445361293465

Iteration:6

max_depth:35
max_features:7
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.240863210202
Train rmsle:0.427386749421

Iteration:7

max_depth:25
max_features:7
min_sam

AttributeError: 'list' object has no attribute 'vstack'

In [82]:
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]

for nr in number_of_trees:
    rf = RandomForestRegressor(n_estimators=nr,
#             max_features = 9,
#             min_samples_split = 1,
#             min_samples_leaf = 1,
#             max_depth = 20
           )
    rf.fit(X_train, Y_train)
    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    print nr, valid_score

1 0.524246358104
2 0.434469563019
3 0.395799150779
5 0.380644031519
10 0.351612307916
50 0.356765725173
100 0.347071524506
200 0.341765233624
300 0.34752623493
500 0.344099122685


KeyboardInterrupt: 

In [218]:
encode_variables = ['season','weather','holiday','workingday','date_year','date_weekday','day_night',
                    'date_month']
encoding_steps = [[('encode_label_%s'%var,prep.PandasLabelEncoder(colname=var)),\
        ('encode_one_hot_%s'%var,prep.PandasOneHotEncoder(colname=var))] for var in encode_variables]
encoding_steps = [en_st for sublist in encoding_steps for en_st in sublist ]

final_steps = [('extract_times', prep.ExtractTimes())]+\
                encoding_steps+\
             [('drop_columns', prep.DropColumns(colnames = ["datetime"])),
             ('random_forest',RandomForestRegressor(n_jobs=3,oob_score=True,
                                                    n_estimators=500,
                                                    max_features=49,
                                                    min_samples_split = 3,
                                                    min_samples_leaf=3,
                                                    max_depth=25))]

final_pipe = Pipeline(final_steps)
final_pipe.fit(X_train,Y_train)

Y_train_pred = final_pipe.predict(X_train)
Y_valid_pred = final_pipe.predict(X_valid)
result_train = rmsle(Y_train_pred,Y_train)
result_valid = rmsle(Y_valid_pred,Y_valid)
print result_train
print result_valid

0.231232339306
0.281295493245


In [219]:
model_filepath = os.path.join("../","models","random_forest_weekdays_pipeline.pkl")
pickle_out(model_filepath,final_pipe,compresion_mode=5)