In [1]:
import pdb
import numpy as np
import gcp.bigquery as bq
import gcp.storage as storage
from sklearn.metrics import make_scorer
from sklearn.pipeline import Pipeline
try:
   import cPickle as pickle
except:
   import pickle
EST_PICKLE_FILENAME = 'baseline_final_estimator.pkl'

# Put all categorical data first for easier implementation of One Hot Encoding.
fields_str = """
gap	day_in_week	weather_1_slots_ago	weather_2_slots_ago	weather_3_slots_ago	busy_time	
tj_level1_1_slots_ago	tj_level2_1_slots_ago	tj_level3_1_slots_ago	tj_level4_1_slots_ago	
tj_level1_2_slots_ago	tj_level2_2_slots_ago	tj_level3_2_slots_ago	tj_level4_2_slots_ago	
tj_level1_3_slots_ago	tj_level2_3_slots_ago	tj_level3_3_slots_ago	tj_level4_3_slots_ago	
temperature_1_slots_ago	pm25_1_slots_ago	
temperature_2_slots_ago	pm25_2_slots_ago	
temperature_3_slots_ago	pm25_3_slots_ago	
gap_1_slots_ago	sum_price_1_slots_ago	
gap_2_slots_ago	sum_price_2_slots_ago	
gap_3_slots_ago	sum_price_3_slots_ago	
f1	f11	f11_1	f11_2	f11_3	f11_4	f11_5	f11_6	f11_7	
f11_8	f13_4	f13_8	f14	f14_1	f14_10	f14_2	f14_3	f14_6	f14_8	f15	f15_1	
f15_2	f15_3	f15_4	f15_6	f15_7	f15_8	f16	f16_1	f16_10	f16_11	f16_12	f16_3	
f16_4	f16_6	f17	f17_2	f17_3	f17_4	f17_5	f19	f19_1	f19_2	f19_3	f19_4	f1_1	
f1_10	f1_11	f1_2	f1_3	f1_4	f1_5	f1_6	f1_7	f1_8	f20	f20_1	f20_2	
f20_4	f20_5	f20_6	f20_7	f20_8	f20_9	f21_1	f21_2	f22	f22_1	f22_2	f22_3	
f22_4	f22_5	f23	f23_1	f23_2	f23_3	f23_4	f23_5	f23_6	f24	f24_1	f24_2	f24_3	
f25	f25_1	f25_3	f25_7	f25_8	f25_9	f2_1	f2_10	f2_11	f2_12	f2_13	f2_2	
f2_4	f2_5	f2_6	f2_7	f2_8	f3_1	f3_2	f3_3	f4	f4_1	f4_10	f4_11	
f4_13	f4_14	f4_16	f4_17	f4_18	f4_2	f4_3	f4_5	f4_6	f4_7	f4_8	f4_9	
f5	f5_1	f5_3	f5_4	f6	f6_1	f6_2	f6_4	f7	f8	f8_1	f8_2	f8_3	f8_4	
f8_5
"""
fields = map(lambda x: x.strip(), fields_str.split('\t'))
features = fields[1:]

# Use this instead of len(features) since this variable can change
# e.g. when one hot encoding is used and/or new features are added.
n_features = len(features)

# Scorer Creation (MAPE)

In [8]:
def mape(y, predictions):
#   num_timeslots = 43
#   num_districts = 66
  if len(y.shape) == 1:
    y = np.asmatrix(y)
  if len(predictions.shape) == 1:
    predictions = np.asmatrix(predictions)
  y = y.astype(float)
  predictions = predictions.astype(float)
  return np.sum(np.absolute((y-predictions)/y)) / (y.shape[0]*predictions.shape[0])

# from keras import backend as K

# def mape(y, predictions):
#   return K.mean(K.abs(y-predictions/K.clip(K.abs(y), K.epsilon(), np.inf)), axis=-1)

mape_scorer = make_scorer(mape, greater_is_better=False)

Testing MAPE

In [9]:
from sklearn.linear_model import LogisticRegression

predictions = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')
y = np.array([1.0, 2.0, 3.0, 4.0]).astype('float32')

# Should return 0.0
print mape(y, predictions)

# Should return higher score
predictions = np.array([1.0, 2.0, 2.0, 3.0]).astype('float32')
print(mape(y, predictions))

# Should return highest score
predictions = np.array([1000.0, 22.0, 11.0, 31.0]).astype('float32')
print(mape(y, predictions))

# est = LogisticRegression()
# X = np.random.rand(10,4)
# y = X.sum(axis=1)
# est.fit(X, y)
# predictions = est.predict(X)
# print(mape(y, predictions))

0.0
0.583333333333
1018.41666667


# Feature Selection

In [10]:
%%sql --module q_all

SELECT *, HASH(CAST(district_id AS STRING) +timeslot) AS hash_value,
  IF(ABS(HASH(CAST(district_id AS STRING) + timeslot)) % 2 == 1, 'True', 'False')
    AS included_in_sample, IF(timeofday_slot >= 50 AND timeofday_slot <= 53, 1, 0) AS busy_time
FROM [datalab-projects-1331:xjk_algo_comp.gaps]
WHERE gap > 0
LIMIT 5000

# The above query randomizes its outputs.

In [11]:
query = bq.Query(q_all)
tableresult = query.results()

all_data = np.zeros((tableresult.length, len(fields)))
print 'there are {} rows'.format(tableresult.length)
for rcounter, row in enumerate(tableresult):
  for fcounter, field in enumerate(fields):
    all_data[rcounter, fcounter] = row[field]
  if rcounter % 5000 == 0:
    print 'processed {} rows'.format(rcounter)
all_data_original = np.copy(all_data)

there are 5000 rows
processed 0 rows


In [12]:
# This chunk does further wrangling to dataset to produce training and test sets.

# Useful code to check NaN and Inf values. This is needed since these values would
# cause "Input contains NaN, infinity or a value too large for dtype('float32')
# errors when left unchecked.
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Impute all NaN with numbers (not sure what to replace inf yet)
all_data[np.isnan(all_data)] = 0
# all_data[np.isinf(all_data)] = 0

# See that NaN and Inf values replaced
print "Checkinf for NaN and Inf"
print "np.nan=", np.where(np.isnan(all_data))
print "is.inf=", np.where(np.isinf(all_data))
print "np.max=", np.max(abs(all_data))

# Split the data into train and test sets.
data_size = all_data.shape[0]
training_size = data_size * 80/100
indices = np.random.permutation(data_size)
training_idx, test_idx = indices[:training_size], indices[training_size:]
all_data_train, all_data_test = all_data[training_idx,:], all_data[test_idx,:]

data_train = all_data_train[:,1:]
targets_train = all_data_train[:,0]
data_test = all_data_test[:,1:]
targets_test = all_data_test[:,0]
data_train_original = np.copy(data_train)
data_test_original = np.copy(data_test)

Checkinf for NaN and Inf
np.nan= (array([  25,   25,   32, ..., 4993, 4993, 4993]), array([24, 25,  6, ..., 21, 22, 23]))
is.inf= (array([], dtype=int64), array([], dtype=int64))
np.max= nan
Checkinf for NaN and Inf
np.nan= (array([], dtype=int64), array([], dtype=int64))
is.inf= (array([], dtype=int64), array([], dtype=int64))
np.max= 461563.0


In [13]:
from sklearn.preprocessing import OneHotEncoder
# one_hot = OneHotEncoder(categorical_features=[0, 1, 14, 17, 20], n_values='auto')
one_hot = OneHotEncoder(categorical_features=[0, 1, 2, 3], sparse=False)
one_hot.fit(data_train)

OneHotEncoder(categorical_features=[0, 1, 2, 3], dtype=<type 'float'>,
       handle_unknown='error', n_values='auto', sparse=True)

In [14]:
one_hot.n_values_

array([ 7, 10, 10, 10])

In [15]:
one_hot.feature_indices_

array([ 0,  7, 17, 27, 37])

In [16]:
data_train = one_hot.transform(data_train_original).todense()
data_test = one_hot.transform(data_test_original).todense()
n_features = data_train.shape[1]
print 'new number of features: {}'.format(n_features)

new number of features: 192


# Building and Testing Algorithm(s)

In [17]:
from sklearn.base import BaseEstimator, RegressorMixin
from scipy.sparse import coo_matrix, hstack

class CustomRegressor(BaseEstimator, RegressorMixin):
  def __init__(self):
    pass

  def fit(self, X, y):
#     self.classes_, indices = np.unique(["foo", "bar", "foo"],
#                                     return_inverse=True)
#     self.majority_ = np.argmax(np.bincount(indices))
    return self

  def predict(self, X):
    # 56: gap_1_slots_ago
    # 58: gap_2_slots_ago
    # 60: gap_3_slots_ago
#     X = X.tocsr()
#     v1 = coo_matrix(np.asmatrix(np.ones(X.shape[0])).T)
    v1 = np.asmatrix(np.ones(X.shape[0]))
    v2 = np.asmatrix((X[:, 23]*0.65+X[:, 25]*0.25+X[:, 27]*0.15)/2)
    predictions = np.asarray(np.concatenate((v1, v2), axis=0).max(axis=0))
    
    return predictions
  
custom_est = CustomRegressor()
custom_est.fit(data_train_original, data_test_original)
custom_predictions = custom_est.predict(data_test_original)
print(mape(targets_test, custom_predictions))

423.136508905


In [18]:
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_classif
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import Imputer, OneHotEncoder
from sklearn.grid_search import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV
from sklearn.decomposition import PCA

steps = [
  ('one_hot', OneHotEncoder(categorical_features=[0, 1, 2, 3], n_values='auto')),
#   ('impute', Imputer(0)),
#   ('feature_selection', SelectKBest(f_classif)),
  ('pca', PCA(n_components=120)),
  ('estimate', DecisionTreeRegressor())
]

est = Pipeline(steps)

params = {
#   'one_hot__n_values': [7, 10, 20],
#   "feature_selection__k": [i for i in range(1, n_features - 1)]
  'estimate__max_features': [i for i in range(110, 120)],
#   'estimate__learning_rate': [0.1, 0.5, 1, 5, 10],
#   'estimate__n_estimators': [i for i in range(110, 120, 2)],
#   'estimate__loss': ['linear', 'square', 'exponential']
}
# cross_validation_iter = StratifiedShuffleSplit(y=targets_train, test_size=0.3,
#                                                random_state=RANDOM_STATE, n_iter=10)
# search_params = RandomizedSearchCV(
#   estimator=est,
#   param_distributions=params,
#   cv=5,
#   scoring=mape_scorer,
#   n_jobs=2,
#   verbose=1
# )

search_params = GridSearchCV(
  estimator=est,
  param_grid=params,
  cv=5,
  scoring=mape_scorer,
  n_jobs=5,
  verbose=3
)

search_params.fit(data_train_original, targets_train)
print(search_params.grid_scores_)
print(search_params.best_params_)
print(search_params.best_score_)
search_params.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] estimate__max_features=110 ......................................
[CV] estimate__max_features=110 ......................................
[CV] estimate__max_features=110 ......................................
[CV] estimate__max_features=110 ......................................
[CV] estimate__max_features=110 ......................................
[CV] estimate__max_features=111 ......................................
[CV] estimate__max_features=111 ......................................
[CV] estimate__max_features=111 ......................................
[CV] estimate__max_features=111 ......................................


JoblibTypeError: JoblibTypeError
___________________________________________________________________________
Multiprocessing exception:
    ...........................................................................
/usr/lib/python2.7/runpy.py in _run_module_as_main(mod_name='ipykernel.__main__', alter_argv=1)
    157     pkg_name = mod_name.rpartition('.')[0]
    158     main_globals = sys.modules["__main__"].__dict__
    159     if alter_argv:
    160         sys.argv[0] = fname
    161     return _run_code(code, main_globals, None,
--> 162                      "__main__", fname, loader, pkg_name)
        fname = '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py'
        loader = <pkgutil.ImpLoader instance>
        pkg_name = 'ipykernel'
    163 
    164 def run_module(mod_name, init_globals=None,
    165                run_name=None, alter_sys=False):
    166     """Execute a module's code without importing it

...........................................................................
/usr/lib/python2.7/runpy.py in _run_code(code=<code object <module> at 0x7fe0039a1330, file "/...2.7/dist-packages/ipykernel/__main__.py", line 1>, run_globals={'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.pyc'>}, init_globals=None, mod_name='__main__', mod_fname='/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', mod_loader=<pkgutil.ImpLoader instance>, pkg_name='ipykernel')
     67         run_globals.update(init_globals)
     68     run_globals.update(__name__ = mod_name,
     69                        __file__ = mod_fname,
     70                        __loader__ = mod_loader,
     71                        __package__ = pkg_name)
---> 72     exec code in run_globals
        code = <code object <module> at 0x7fe0039a1330, file "/...2.7/dist-packages/ipykernel/__main__.py", line 1>
        run_globals = {'__builtins__': <module '__builtin__' (built-in)>, '__doc__': None, '__file__': '/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py', '__loader__': <pkgutil.ImpLoader instance>, '__name__': '__main__', '__package__': 'ipykernel', 'app': <module 'ipykernel.kernelapp' from '/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.pyc'>}
     73     return run_globals
     74 
     75 def _run_module_code(code, init_globals=None,
     76                     mod_name=None, mod_fname=None,

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/__main__.py in <module>()
      1 
      2 
----> 3 
      4 if __name__ == '__main__':
      5     from ipykernel import kernelapp as app
      6     app.launch_new_instance()
      7 
      8 
      9 
     10 

...........................................................................
/usr/local/lib/python2.7/dist-packages/traitlets/config/application.py in launch_instance(cls=<class 'ipykernel.kernelapp.IPKernelApp'>, argv=None, **kwargs={})
    591         
    592         If a global instance already exists, this reinitializes and starts it
    593         """
    594         app = cls.instance(**kwargs)
    595         app.initialize(argv)
--> 596         app.start()
        app.start = <bound method IPKernelApp.start of <ipykernel.kernelapp.IPKernelApp object>>
    597 
    598 #-----------------------------------------------------------------------------
    599 # utility functions, for convenience
    600 #-----------------------------------------------------------------------------

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelapp.py in start(self=<ipykernel.kernelapp.IPKernelApp object>)
    437         
    438         if self.poller is not None:
    439             self.poller.start()
    440         self.kernel.start()
    441         try:
--> 442             ioloop.IOLoop.instance().start()
    443         except KeyboardInterrupt:
    444             pass
    445 
    446 launch_new_instance = IPKernelApp.launch_instance

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    157             PollIOLoop.configure(ZMQIOLoop)
    158         return PollIOLoop.current(*args, **kwargs)
    159     
    160     def start(self):
    161         try:
--> 162             super(ZMQIOLoop, self).start()
        self.start = <bound method ZMQIOLoop.start of <zmq.eventloop.ioloop.ZMQIOLoop object>>
    163         except ZMQError as e:
    164             if e.errno == ETERM:
    165                 # quietly return on ETERM
    166                 pass

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/ioloop.py in start(self=<zmq.eventloop.ioloop.ZMQIOLoop object>)
    878                 self._events.update(event_pairs)
    879                 while self._events:
    880                     fd, events = self._events.popitem()
    881                     try:
    882                         fd_obj, handler_func = self._handlers[fd]
--> 883                         handler_func(fd_obj, events)
        handler_func = <function null_wrapper>
        fd_obj = <zmq.sugar.socket.Socket object>
        events = 1
    884                     except (OSError, IOError) as e:
    885                         if errno_from_exception(e) == errno.EPIPE:
    886                             # Happens when the client closes the connection
    887                             pass

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py in null_wrapper(*args=(<zmq.sugar.socket.Socket object>, 1), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = (<zmq.sugar.socket.Socket object>, 1)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _handle_events(self=<zmq.eventloop.zmqstream.ZMQStream object>, fd=<zmq.sugar.socket.Socket object>, events=1)
    435             # dispatch events:
    436             if events & IOLoop.ERROR:
    437                 gen_log.error("got POLLERR event on ZMQStream, which doesn't make sense")
    438                 return
    439             if events & IOLoop.READ:
--> 440                 self._handle_recv()
        self._handle_recv = <bound method ZMQStream._handle_recv of <zmq.eventloop.zmqstream.ZMQStream object>>
    441                 if not self.socket:
    442                     return
    443             if events & IOLoop.WRITE:
    444                 self._handle_send()

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _handle_recv(self=<zmq.eventloop.zmqstream.ZMQStream object>)
    467                 gen_log.error("RECV Error: %s"%zmq.strerror(e.errno))
    468         else:
    469             if self._recv_callback:
    470                 callback = self._recv_callback
    471                 # self._recv_callback = None
--> 472                 self._run_callback(callback, msg)
        self._run_callback = <bound method ZMQStream._run_callback of <zmq.eventloop.zmqstream.ZMQStream object>>
        callback = <function null_wrapper>
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    473                 
    474         # self.update_state()
    475         
    476 

...........................................................................
/usr/local/lib/python2.7/dist-packages/zmq/eventloop/zmqstream.py in _run_callback(self=<zmq.eventloop.zmqstream.ZMQStream object>, callback=<function null_wrapper>, *args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    409         close our socket."""
    410         try:
    411             # Use a NullContext to ensure that all StackContexts are run
    412             # inside our blanket exception handler rather than outside.
    413             with stack_context.NullContext():
--> 414                 callback(*args, **kwargs)
        callback = <function null_wrapper>
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    415         except:
    416             gen_log.error("Uncaught exception, closing connection.",
    417                           exc_info=True)
    418             # Close the socket on an uncaught exception from a user callback

...........................................................................
/usr/local/lib/python2.7/dist-packages/tornado/stack_context.py in null_wrapper(*args=([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],), **kwargs={})
    270         # Fast path when there are no active contexts.
    271         def null_wrapper(*args, **kwargs):
    272             try:
    273                 current_state = _state.contexts
    274                 _state.contexts = cap_contexts[0]
--> 275                 return fn(*args, **kwargs)
        args = ([<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>],)
        kwargs = {}
    276             finally:
    277                 _state.contexts = current_state
    278         null_wrapper._wrapped = True
    279         return null_wrapper

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in dispatcher(msg=[<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>])
    271         if self.control_stream:
    272             self.control_stream.on_recv(self.dispatch_control, copy=False)
    273 
    274         def make_dispatcher(stream):
    275             def dispatcher(msg):
--> 276                 return self.dispatch_shell(stream, msg)
        msg = [<zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>, <zmq.sugar.frame.Frame object>]
    277             return dispatcher
    278 
    279         for s in self.shell_streams:
    280             s.on_recv(make_dispatcher(s), copy=False)

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in dispatch_shell(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, msg={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-16T19:07:27.115341', u'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', u'msg_type': u'execute_request', u'session': u'3AD45FA1ABDD44C181DDC60AF66E1EAA', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', 'msg_type': u'execute_request', 'parent_header': {}})
    223             self.log.error("UNKNOWN MESSAGE TYPE: %r", msg_type)
    224         else:
    225             self.log.debug("%s: %s", msg_type, msg)
    226             self.pre_handler_hook()
    227             try:
--> 228                 handler(stream, idents, msg)
        handler = <bound method IPythonKernel.execute_request of <ipykernel.ipkernel.IPythonKernel object>>
        stream = <zmq.eventloop.zmqstream.ZMQStream object>
        idents = ['3AD45FA1ABDD44C181DDC60AF66E1EAA']
        msg = {'buffers': [], 'content': {u'allow_stdin': True, u'code': u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-16T19:07:27.115341', u'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', u'msg_type': u'execute_request', u'session': u'3AD45FA1ABDD44C181DDC60AF66E1EAA', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', 'msg_type': u'execute_request', 'parent_header': {}}
    229             except Exception:
    230                 self.log.error("Exception in message handler:", exc_info=True)
    231             finally:
    232                 self.post_handler_hook()

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/kernelbase.py in execute_request(self=<ipykernel.ipkernel.IPythonKernel object>, stream=<zmq.eventloop.zmqstream.ZMQStream object>, ident=['3AD45FA1ABDD44C181DDC60AF66E1EAA'], parent={'buffers': [], 'content': {u'allow_stdin': True, u'code': u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_', u'silent': False, u'stop_on_error': True, u'store_history': True, u'user_expressions': {}}, 'header': {'date': '2016-06-16T19:07:27.115341', u'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', u'msg_type': u'execute_request', u'session': u'3AD45FA1ABDD44C181DDC60AF66E1EAA', u'username': u'username', u'version': u'5.0'}, 'metadata': {}, 'msg_id': u'A645D5D0A45A45B48145C4BCFD70CA5C', 'msg_type': u'execute_request', 'parent_header': {}})
    386         if not silent:
    387             self.execution_count += 1
    388             self._publish_execute_input(code, parent, self.execution_count)
    389 
    390         reply_content = self.do_execute(code, silent, store_history,
--> 391                                         user_expressions, allow_stdin)
        user_expressions = {}
        allow_stdin = True
    392 
    393         # Flush output before sending the reply.
    394         sys.stdout.flush()
    395         sys.stderr.flush()

...........................................................................
/usr/local/lib/python2.7/dist-packages/ipykernel/ipkernel.py in do_execute(self=<ipykernel.ipkernel.IPythonKernel object>, code=u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_', silent=False, store_history=True, user_expressions={}, allow_stdin=True)
    194 
    195         reply_content = {}
    196         # FIXME: the shell calls the exception handler itself.
    197         shell._reply_content = None
    198         try:
--> 199             shell.run_cell(code, store_history=store_history, silent=silent)
        shell.run_cell = <bound method ZMQInteractiveShell.run_cell of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_'
        store_history = True
        silent = False
    200         except:
    201             status = u'error'
    202             # FIXME: this code right now isn't being used yet by default,
    203             # because the run_cell() call above directly fires off exception

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_cell(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, raw_cell=u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_', store_history=True, silent=False, shell_futures=True)
   2718                 self.displayhook.exec_result = result
   2719 
   2720                 # Execute the user code
   2721                 interactivity = "none" if silent else self.ast_node_interactivity
   2722                 self.run_ast_nodes(code_ast.body, cell_name,
-> 2723                    interactivity=interactivity, compiler=compiler, result=result)
        interactivity = 'last_expr'
        compiler = <IPython.core.compilerop.CachingCompiler instance>
   2724 
   2725                 # Reset this so later displayed values do not modify the
   2726                 # ExecutionResult
   2727                 self.displayhook.exec_result = None

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_ast_nodes(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, nodelist=[<_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.ImportFrom object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Assign object>, <_ast.Expr object>, <_ast.Print object>, <_ast.Print object>, <_ast.Print object>, <_ast.Expr object>], cell_name='<ipython-input-18-a95ff53b18b8>', interactivity='last', compiler=<IPython.core.compilerop.CachingCompiler instance>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2820 
   2821         try:
   2822             for i, node in enumerate(to_run_exec):
   2823                 mod = ast.Module([node])
   2824                 code = compiler(mod, cell_name, "exec")
-> 2825                 if self.run_code(code, result):
        self.run_code = <bound method ZMQInteractiveShell.run_code of <ipykernel.zmqshell.ZMQInteractiveShell object>>
        code = <code object <module> at 0x7fdfcc9020b0, file "<ipython-input-18-a95ff53b18b8>", line 49>
        result = <IPython.core.interactiveshell.ExecutionResult object>
   2826                     return True
   2827 
   2828             for i, node in enumerate(to_run_interactive):
   2829                 mod = ast.Interactive([node])

...........................................................................
/usr/local/lib/python2.7/dist-packages/IPython/core/interactiveshell.py in run_code(self=<ipykernel.zmqshell.ZMQInteractiveShell object>, code_obj=<code object <module> at 0x7fdfcc9020b0, file "<ipython-input-18-a95ff53b18b8>", line 49>, result=<IPython.core.interactiveshell.ExecutionResult object>)
   2880         outflag = 1  # happens in more places, so it's easier as default
   2881         try:
   2882             try:
   2883                 self.hooks.pre_run_code_hook()
   2884                 #rprint('Running code', repr(code_obj)) # dbg
-> 2885                 exec(code_obj, self.user_global_ns, self.user_ns)
        code_obj = <code object <module> at 0x7fdfcc9020b0, file "<ipython-input-18-a95ff53b18b8>", line 49>
        self.user_global_ns = {'AdaBoostRegressor': <class 'sklearn.ensemble.weight_boosting.AdaBoostRegressor'>, 'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CustomRegressor': <class '__main__.CustomRegressor'>, 'DecisionTreeRegressor': <class 'sklearn.tree.tree.DecisionTreeRegressor'>, 'EST_PICKLE_FILENAME': 'baseline_final_estimator.pkl', 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'Imputer': <class 'sklearn.preprocessing.imputation.Imputer'>, 'In': ['', u'import pdb\nimport numpy as np\nimport gcp.big... features are added.\nn_features = len(features)', u'# def mape(y, predictions):\n#   num_timeslots...rer = make_scorer(mape, greater_is_better=False)', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u'def mape(y, predictions):\n#   num_timeslots =...rer = make_scorer(mape, greater_is_better=False)', u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u'get_ipython().run_cell_magic(u\'sql\', u\'--mo...\n\\n# The above query randomizes its outputs.")', u"query = bq.Query(q_all)\ntableresult = query.r...rcounter)\nall_data_original = np.copy(all_data)", u'# This chunk does further wrangling to dataset..._train)\ndata_test_original = np.copy(data_test)', u"from sklearn.preprocessing import OneHotEncode...2, 3], n_values='auto')\none_hot.fit(data_train)", u'one_hot.n_values_', u'one_hot.feature_indices_', u"data_train = one_hot.transform(data_train_orig... 'new number of features: {}'.format(n_features)", u'from sklearn.base import BaseEstimator, Regres...)\nprint(mape(targets_test, custom_predictions))', u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_'], 'K': <module 'keras.backend' from '/usr/local/lib/python2.7/dist-packages/keras/backend/__init__.pyc'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, ...}
        self.user_ns = {'AdaBoostRegressor': <class 'sklearn.ensemble.weight_boosting.AdaBoostRegressor'>, 'BaseEstimator': <class 'sklearn.base.BaseEstimator'>, 'CustomRegressor': <class '__main__.CustomRegressor'>, 'DecisionTreeRegressor': <class 'sklearn.tree.tree.DecisionTreeRegressor'>, 'EST_PICKLE_FILENAME': 'baseline_final_estimator.pkl', 'GridSearchCV': <class 'sklearn.grid_search.GridSearchCV'>, 'Imputer': <class 'sklearn.preprocessing.imputation.Imputer'>, 'In': ['', u'import pdb\nimport numpy as np\nimport gcp.big... features are added.\nn_features = len(features)', u'# def mape(y, predictions):\n#   num_timeslots...rer = make_scorer(mape, greater_is_better=False)', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u'from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))', u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u'def mape(y, predictions):\n#   num_timeslots =...rer = make_scorer(mape, greater_is_better=False)', u"from sklearn.linear_model import LogisticRegre... = est.predict(X)\n# print(mape(y, predictions))", u'get_ipython().run_cell_magic(u\'sql\', u\'--mo...\n\\n# The above query randomizes its outputs.")', u"query = bq.Query(q_all)\ntableresult = query.r...rcounter)\nall_data_original = np.copy(all_data)", u'# This chunk does further wrangling to dataset..._train)\ndata_test_original = np.copy(data_test)', u"from sklearn.preprocessing import OneHotEncode...2, 3], n_values='auto')\none_hot.fit(data_train)", u'one_hot.n_values_', u'one_hot.feature_indices_', u"data_train = one_hot.transform(data_train_orig... 'new number of features: {}'.format(n_features)", u'from sklearn.base import BaseEstimator, Regres...)\nprint(mape(targets_test, custom_predictions))', u'from sklearn.feature_selection import SelectKB...rams.best_score_)\nsearch_params.best_estimator_'], 'K': <module 'keras.backend' from '/usr/local/lib/python2.7/dist-packages/keras/backend/__init__.pyc'>, 'LogisticRegression': <class 'sklearn.linear_model.logistic.LogisticRegression'>, ...}
   2886             finally:
   2887                 # Reset our crash handler in place
   2888                 sys.excepthook = old_excepthook
   2889         except SystemExit as e:

...........................................................................
/content/TeguhWPurwanto@gmail.com/Xiaojukeji Algorithm Competition/<ipython-input-18-a95ff53b18b8> in <module>()
     44   scoring=mape_scorer,
     45   n_jobs=5,
     46   verbose=3
     47 )
     48 
---> 49 search_params.fit(data_train_original, targets_train)
     50 print(search_params.grid_scores_)
     51 print(search_params.best_params_)
     52 print(search_params.best_score_)
     53 search_params.best_estimator_

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.py in fit(self=GridSearchCV(cv=5, error_score='raise',
       e...scorer(mape, greater_is_better=False), verbose=3), X=array([[  3.00000000e+00,   4.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]]), y=array([  8.,   1.,   1., ...,  12.,   1.,   2.]))
    727         y : array-like, shape = [n_samples] or [n_samples, n_output], optional
    728             Target relative to X for classification or regression;
    729             None for unsupervised learning.
    730 
    731         """
--> 732         return self._fit(X, y, ParameterGrid(self.param_grid))
        self._fit = <bound method GridSearchCV._fit of GridSearchCV(...corer(mape, greater_is_better=False), verbose=3)>
        X = array([[  3.00000000e+00,   4.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]])
        y = array([  8.,   1.,   1., ...,  12.,   1.,   2.])
        self.param_grid = {'estimate__max_features': [110, 111, 112, 113, 114, 115, 116, 117, 118, 119]}
    733 
    734 
    735 class RandomizedSearchCV(BaseSearchCV):
    736     """Randomized search on hyper parameters.

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/grid_search.py in _fit(self=GridSearchCV(cv=5, error_score='raise',
       e...scorer(mape, greater_is_better=False), verbose=3), X=array([[  3.00000000e+00,   4.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]]), y=array([  8.,   1.,   1., ...,  12.,   1.,   2.]), parameter_iterable=<sklearn.grid_search.ParameterGrid object>)
    500         )(
    501             delayed(_fit_and_score)(clone(base_estimator), X, y, self.scorer_,
    502                                     train, test, self.verbose, parameters,
    503                                     self.fit_params, return_parameters=True,
    504                                     error_score=self.error_score)
--> 505                 for parameters in parameter_iterable
        parameters = undefined
        parameter_iterable = <sklearn.grid_search.ParameterGrid object>
    506                 for train, test in cv)
    507 
    508         # Out is a list of triplet: score, estimator, n_test_samples
    509         n_fits = len(out)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/externals/joblib/parallel.py in __call__(self=Parallel(n_jobs=5), iterable=<itertools.islice object>)
    661             if pre_dispatch == "all" or n_jobs == 1:
    662                 # The iterable was consumed all at once by the above for loop.
    663                 # No need to wait for async callbacks to trigger to
    664                 # consumption.
    665                 self._iterating = False
--> 666             self.retrieve()
        self.retrieve = <bound method Parallel.retrieve of Parallel(n_jobs=5)>
    667             # Make sure that we get a last message telling us we are done
    668             elapsed_time = time.time() - self._start_time
    669             self._print('Done %3i out of %3i | elapsed: %s finished',
    670                         (len(self._output),

    ---------------------------------------------------------------------------
    Sub-process traceback:
    ---------------------------------------------------------------------------
    TypeError                                          Thu Jun 16 19:07:27 2016
PID: 7252                                     Python 2.7.9: /usr/bin/python
...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator=Pipeline(steps=[('one_hot', OneHotEncoder(catego...random_state=None,
           splitter='best'))]), X=array([[  3.00000000e+00,   4.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]]), y=array([  8.,   1.,   1., ...,  12.,   1.,   2.]), scorer=make_scorer(mape, greater_is_better=False), train=array([ 800,  801,  802, ..., 3997, 3998, 3999]), test=array([  0,   1,   2,   3,   4,   5,   6,   7,  ..., 792,
       793, 794, 795, 796, 797, 798, 799]), verbose=3, parameters={'estimate__max_features': 110}, fit_params={}, return_train_score=False, return_parameters=True, error_score='raise')
   1454 
   1455     try:
   1456         if y_train is None:
   1457             estimator.fit(X_train, **fit_params)
   1458         else:
-> 1459             estimator.fit(X_train, y_train, **fit_params)
   1460 
   1461     except Exception as e:
   1462         if error_score == 'raise':
   1463             raise

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.pyc in fit(self=Pipeline(steps=[('one_hot', OneHotEncoder(catego...random_state=None,
           splitter='best'))]), X=array([[  0.00000000e+00,   2.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]]), y=array([  5.,   6.,   3., ...,  12.,   1.,   2.]), **fit_params={})
    135             pipeline.
    136         y : iterable, default=None
    137             Training targets. Must fulfill label requirements for all steps of
    138             the pipeline.
    139         """
--> 140         Xt, fit_params = self._pre_transform(X, y, **fit_params)
    141         self.steps[-1][-1].fit(Xt, y, **fit_params)
    142         return self
    143 
    144     def fit_transform(self, X, y=None, **fit_params):

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/pipeline.pyc in _pre_transform(self=Pipeline(steps=[('one_hot', OneHotEncoder(catego...random_state=None,
           splitter='best'))]), X=array([[  0.00000000e+00,   2.00000000e+00,   4....200000e+03,   1.90900000e+03,   0.00000000e+00]]), y=array([  5.,   6.,   3., ...,  12.,   1.,   2.]), **fit_params={})
    116             step, param = pname.split('__', 1)
    117             fit_params_steps[step][param] = pval
    118         Xt = X
    119         for name, transform in self.steps[:-1]:
    120             if hasattr(transform, "fit_transform"):
--> 121                 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
    122             else:
    123                 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
    124                               .transform(Xt)
    125         return Xt, fit_params_steps[self.steps[-1][0]]

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/decomposition/pca.pyc in fit_transform(self=PCA(copy=True, n_components=120, whiten=False), X=<3200x192 sparse matrix of type '<type 'numpy.fl...with 419806 stored elements in COOrdinate format>, y=array([  5.,   6.,   3., ...,  12.,   1.,   2.]))
    233         Returns
    234         -------
    235         X_new : array-like, shape (n_samples, n_components)
    236 
    237         """
--> 238         U, S, V = self._fit(X)
    239         U = U[:, :self.n_components_]
    240 
    241         if self.whiten:
    242             # X_new = X * V / S * sqrt(n_samples) = U * sqrt(n_samples)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/decomposition/pca.pyc in _fit(self=PCA(copy=True, n_components=120, whiten=False), X=<3200x192 sparse matrix of type '<type 'numpy.fl...with 419806 stored elements in COOrdinate format>)
    260         -------
    261         U, s, V : ndarrays
    262             The SVD of the input data, copied and centered when
    263             requested.
    264         """
--> 265         X = check_array(X)
    266         n_samples, n_features = X.shape
    267         X = as_float_array(X, copy=self.copy)
    268         # Center data
    269         self.mean_ = np.mean(X, axis=0)

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in check_array(array=<3200x192 sparse matrix of type '<type 'numpy.fl...with 419806 stored elements in COOrdinate format>, accept_sparse=None, dtype=None, order=None, copy=False, force_all_finite=True, ensure_2d=True, allow_nd=False, ensure_min_samples=1, ensure_min_features=1)
    329 
    330     if sp.issparse(array):
    331         if dtype_numeric:
    332             dtype = None
    333         array = _ensure_sparse_format(array, accept_sparse, dtype, order,
--> 334                                       copy, force_all_finite)
        array = <3200x192 sparse matrix of type '<type 'numpy.fl...with 419806 stored elements in COOrdinate format>
    335     else:
    336         if ensure_2d:
    337             array = np.atleast_2d(array)
    338         if dtype_numeric:

...........................................................................
/usr/local/lib/python2.7/dist-packages/sklearn/utils/validation.pyc in _ensure_sparse_format(spmatrix=<3200x192 sparse matrix of type '<type 'numpy.fl...with 419806 stored elements in COOrdinate format>, accept_sparse=None, dtype=None, order=None, copy=False, force_all_finite=True)
    234     -------
    235     spmatrix_converted : scipy sparse matrix.
    236         Matrix that is ensured to have an allowed type.
    237     """
    238     if accept_sparse is None:
--> 239         raise TypeError('A sparse matrix was passed, but dense '
        dtype = None
    240                         'data is required. Use X.toarray() to '
    241                         'convert to a dense numpy array.')
    242     sparse_type = spmatrix.format
    243     if dtype is None:

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
___________________________________________________________________________

Test data's prediction MAPE score:

In [None]:
final_est = search_params.best_estimator_
test_predictions = final_est.predict(data_test_original)
print(mape(targets_test, test_predictions))

In [None]:
pickle.dump(final_est, open(EST_PICKLE_FILENAME, "w") )

Run "Process Final Test Data With Final Algorithm" to use pickled final algorithm against final test data to produce csv required by this competition.

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Just testing Imputer. Turns out somehow Imputer causes number of features reduced, weird.

# imputer = Imputer()
est = DecisionTreeRegressor(max_features=len(features))

data_train_i = np.copy(data_train)
print(data_train.shape)
print(data_train[0:10])
# data_train_i = imputer.fit_transform(data_train)
data_train_i[np.isnan(data_train_i)] = 0
data_train_i.astype('float32')
print(data_train_i.shape)
print(data_train_i[0:10])
est.fit(data_train_i, targets_train)
predictions = est.predict(data_test)
print(mape(data_test, predictions, targets_test))

# Results

- DecisionTreeRegressor + all features (31951 training data): 2.141
- DecisionTreeRegressor + all features (44202 training data): 3.065
- DecisionTreeRegressor + all + one hot encoded features (81832 training data): 5.726
- AdaBoostRegressor + all + one hot encoded features + PCA (102592 training data): 