In [43]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os
from copy import copy

import numpy as np
import pandas as pd

from matplotlib import pylab as plt
import seaborn as sns

from random import choice

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from sklearn.grid_search import RandomizedSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error as mse
from sklearn.feature_selection import SelectKBest

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle,log_pandas,inv_log_pandas
from utils.generic_utils import pickle_out
import utils.preprocessing_utils as prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [44]:
# Folder holding the pre-split train / valid / test CSV files.
data_folder = os.path.join("../", "datasets", "initial_data_split")

train = pd.read_csv(os.path.join(data_folder, "train.csv"))
valid = pd.read_csv(os.path.join(data_folder, "valid.csv"))
test = pd.read_csv(os.path.join(data_folder, "test.csv"))


def _split_features_targets(frame):
    """Derive the feature frame and the target frames from one data split.

    Returns a 5-tuple ``(X, Y, Y_count, Y_casual, Y_registered)``:
    ``X`` is ``frame`` without the "registered" and "casual" columns,
    ``Y`` is the untransformed "count" column, and the last three are the
    "count", "casual" and "registered" columns passed through ``log_pandas``.

    NOTE(review): only "registered" and "casual" are dropped, so "count" is
    still a column of ``X``; confirm the downstream column extraction always
    removes it before a model sees the features.
    """
    features = frame.drop(["registered", "casual"], axis=1)
    raw_count = frame[["count"]]
    log_count, log_casual, log_registered = [
        frame[[target]].apply(log_pandas)
        for target in ("count", "casual", "registered")
    ]
    return features, raw_count, log_count, log_casual, log_registered


(X_train, Y_train,
 Y_train_count, Y_train_casual, Y_train_registered) = _split_features_targets(train)

(X_valid, Y_valid,
 Y_valid_count, Y_valid_casual, Y_valid_registered) = _split_features_targets(valid)

(X_test, Y_test,
 Y_test_count, Y_test_casual, Y_test_registered) = _split_features_targets(test)

# Sanity check: each feature frame must have as many rows as its target frame.
print("%s %s" % (X_train.shape, Y_train.shape))
print("%s %s" % (X_valid.shape, Y_valid.shape))
print("%s %s" % (X_test.shape, Y_test.shape))

(7452, 10) (7452, 1)
(1722, 10) (1722, 1)
(2286, 10) (2286, 1)


In [45]:
# Continuous variables to discretise, each paired with its bin edges.
vars_to_bin = ['atemp', 'humidity', 'windspeed']
bins_per_var = [[0, 10, 20, 30, 100], [0, 20, 40, 60, 120], [0, 30, 1000]]
binning_params = list(zip(vars_to_bin, bins_per_var))

# One named binning step per variable.
binning_steps = []
for var, b in binning_params:
    binner = prep.BinVariable(colname=var, bins=b, drop_column=True)
    binning_steps.append(('binning_%s' % var, binner))

# Column names the encoders below are pointed at for the binned variables.
# NOTE(review): presumably BinVariable writes its result to "<var>_binned" - confirm.
binned_varnames = ["%s_binned" % var for var in vars_to_bin]

# Columns handed to the column-extraction step.
get_variables = ['weather', 'date_year', 'workingday', 'season', 'holiday', 'time_hour',
                 'atemp', 'humidity', 'date_weekday', 'date_month', 'windspeed']

# Columns that get a label-encoding step followed by a one-hot-encoding step.
encode_variables = ['weather', 'date_year', 'workingday', 'season', 'holiday'] + binned_varnames

encoding_steps = []
for var in encode_variables:
    encoding_steps.append(('encode_label_%s' % var,
                           prep.PandasLabelEncoder(colname=var)))
    encoding_steps.append(('encode_one_hot_%s' % var,
                           prep.PandasOneHotEncoder(colname=var, drop_colname=True)))

# Full preprocessing chain: time extraction, column selection, binning, encoding.
prep_steps = [('extract_times', prep.ExtractTimes()),
              ('get_variables', prep.ExtractColumns(colnames=get_variables))]
prep_steps.extend(binning_steps)
prep_steps.extend(encoding_steps)

prep_pipe = Pipeline(prep_steps)
prep_pipe.fit(X_train)

Pipeline(steps=[('extract_times', ExtractTimes()), ('get_variables', ExtractColumns(colnames=['weather', 'date_year', 'workingday', 'season', 'holiday', 'time_hour', 'atemp', 'humidity', 'date_weekday', 'date_month', 'windspeed'])), ('binning_atemp', BinVariable(bins=[0, 10, 20, 30, 100], colname='atemp', dr...code_one_hot_windspeed_binned', PandasOneHotEncoder(colname='windspeed_binned', drop_colname=True))])

In [34]:
# Run every split through the fitted preprocessing pipeline ...
X_tr_save = prep_pipe.transform(X_train)
X_vd_save = prep_pipe.transform(X_valid)
X_ts_save = prep_pipe.transform(X_test)

# ... and store each (features, untransformed count target) pair on disk.
# NOTE(review): this cell carries an older execution count than its
# neighbours, so the files on disk may predate the current pipeline.
features_folder = os.path.join("../", "datasets", "generated_features")
pickle_out(os.path.join(features_folder, "train_binned.pkl"), (X_tr_save, Y_train))
pickle_out(os.path.join(features_folder, "valid_binned.pkl"), (X_vd_save, Y_valid))
pickle_out(os.path.join(features_folder, "test_binned.pkl"), (X_ts_save, Y_test))

In [46]:
# Preprocess copies of the raw train / valid frames, so the transform is
# never handed the original objects.
X_tr = prep_pipe.transform(X_train.copy())
X_vd = prep_pipe.transform(X_valid.copy())

In [48]:
# Feature selection with the project's random-forest based selector.
# NOTE(review): the saved output of this cell is "KeyError: 'datetime'"; the
# cause is not visible from this cell - re-run on a fresh kernel to confirm
# it still fails. The cell also rebinds X_tr / X_vd to its own output, so it
# is not safe to run twice in a row.
rf_selector = prep.RandomForestFeatureSelector(
    n_estimators=500,
    drop_rate=1,
    feature_threshold=10,
    max_error_increase=0.02,
)
selection_steps = [("rf_selector", rf_selector)]

select_pipe = Pipeline(selection_steps)

# The selector is fitted against the log-transformed count target.
log_count_train = Y_train.apply(log_pandas).values.ravel()
X_tr = select_pipe.fit_transform(X_tr, log_count_train)
X_vd = select_pipe.transform(X_vd)
print("%s %s" % (X_tr.shape, X_vd.shape))

KeyError: 'datetime'

In [13]:
# Randomized hyper-parameter search for a random forest fitted on the
# log-transformed count target.
classification_steps_rf = [('random_forest', RandomForestRegressor(n_estimators=500, n_jobs=3))]

classification_pipe = Pipeline(classification_steps_rf)

# scipy's randint excludes its upper bound, so sp_randint(a, b) draws from a..b-1.
param_dist = {"random_forest__max_depth": [10, 15, 20, None],
              "random_forest__max_features": sp_randint(5, 20),
              # Lower bound is 2, not 1: a node needs at least two samples to
              # be split, and scikit-learn >= 0.18 raises ValueError for an
              # integer min_samples_split below 2.
              "random_forest__min_samples_split": sp_randint(2, 4),
              "random_forest__min_samples_leaf": sp_randint(1, 4),
              "random_forest__bootstrap": [True, False]
             }

random_search = RandomizedSearchCV(classification_pipe,
                                   param_distributions=param_dist,
                                   n_iter=50,
                                   n_jobs=3)

# Recompute the preprocessed frames so the search does not depend on whatever
# an earlier cell left in X_tr / X_vd.
X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

random_search.fit(X_tr, Y_train.apply(log_pandas).values.ravel())

print("%s %s" % ("Best Params", random_search.best_estimator_))

# Predictions come out in log space; map them back before scoring.
Y_pred_valid_count = pd.DataFrame(random_search.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                          Y_valid_count.apply(inv_log_pandas).values.ravel())
print("%s %s" % ("Best model error on valid", valid_error_count))

Best Params Pipeline(steps=[('random_forest', RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=12, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=3, oob_score=False, random_state=None,
           verbose=0, warm_start=False))])
Best model error on valid 0.281643370371


In [13]:
# Sweep the forest size with the other hyper-parameters fixed and report the
# validation RMSLE for each size.
# Relies on X_tr / X_vd holding the preprocessed train / valid frames.
number_of_trees = [1, 2, 3, 5, 10, 50, 100, 200, 300, 500, 1000]

# Loop-invariant arrays: log-space training target and the validation counts
# the predictions are scored against.
y_train_log = Y_train.apply(log_pandas).values.ravel()
y_valid_true = Y_valid_count.apply(inv_log_pandas).values.ravel()

for nr in number_of_trees:
    rf = RandomForestRegressor(n_estimators=nr,
                               bootstrap=True,
                               max_features=11,
                               # 2 instead of 1: a single-sample node can never
                               # be split, so the trees are the same, and
                               # scikit-learn >= 0.18 rejects an integer below 2.
                               min_samples_split=2,
                               min_samples_leaf=1,
                               max_depth=15,
                               n_jobs=3)
    rf.fit(X_tr, y_train_log)

    # Predictions are in log space; map them back before scoring.
    Y_pred_valid_count = pd.DataFrame(rf.predict(X_vd)).apply(inv_log_pandas)
    valid_error_count = rmsle(Y_pred_valid_count.values.ravel(), y_valid_true)
    print("%s %s" % (nr, valid_error_count))

1 0.425439222986
2 0.355814864039
3 0.345535399068
5 0.316738339043
10 0.293629554869
50 0.261220532839
100 0.260434100401
200 0.26177390232
300 0.265106811838
500 0.260916452728
1000 0.261336940465


In [20]:
# Randomized hyper-parameter search for gradient boosting fitted on the
# log-transformed count target.
classification_steps_gbr = [('gradient_boosting', GradientBoostingRegressor(n_estimators=500))]

classification_pipe = Pipeline(classification_steps_gbr)

# scipy's randint excludes its upper bound, so sp_randint(a, b) draws from a..b-1.
param_dist = {"gradient_boosting__max_depth": [5, 10, 15, 20, None],
              # list(...) keeps the concatenation valid where range() is not a list.
              "gradient_boosting__max_features": list(range(3, 11, 1)) + [None],
              # Lower bound is 2, not 1: a node needs at least two samples to
              # be split, and scikit-learn >= 0.18 raises ValueError for an
              # integer min_samples_split below 2.
              "gradient_boosting__min_samples_split": sp_randint(2, 4),
              "gradient_boosting__min_samples_leaf": sp_randint(1, 4),
              "gradient_boosting__warm_start": [True, False]
             }

random_search = RandomizedSearchCV(classification_pipe,
                                   param_distributions=param_dist,
                                   n_iter=50,
                                   n_jobs=3)

# Recompute the preprocessed frames so the search does not depend on whatever
# an earlier cell left in X_tr / X_vd.
X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

random_search.fit(X_tr, Y_train.apply(log_pandas).values.ravel())

print("%s %s" % ("Best Params", random_search.best_estimator_))

# Predictions come out in log space; map them back before scoring.
Y_pred_valid_count = pd.DataFrame(random_search.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                          Y_valid_count.apply(inv_log_pandas).values.ravel())
print("%s %s" % ("Best model error on valid", valid_error_count))

Best Params Pipeline(steps=[('gradient_boosting', GradientBoostingRegressor(alpha=0.9, init=None, learning_rate=0.1, loss='ls',
             max_depth=5, max_features=8, max_leaf_nodes=None,
             min_samples_leaf=2, min_samples_split=3,
             min_weight_fraction_leaf=0.0, n_estimators=500,
             presort='auto', random_state=None, subsample=1.0, verbose=0,
             warm_start=False))])
Best model error on valid 0.265854628586


In [None]:
# Sweep the number of boosting stages with the other hyper-parameters fixed
# and report the validation RMSLE for each setting.
# Relies on X_tr / X_vd holding the preprocessed train / valid frames.
number_of_trees = [1, 2, 3, 5, 10, 50, 100, 200, 300, 500, 1000]

# Loop-invariant arrays: log-space training target and the validation counts
# the predictions are scored against.
y_train_log = Y_train.apply(log_pandas).values.ravel()
y_valid_true = Y_valid_count.apply(inv_log_pandas).values.ravel()

for nr in number_of_trees:
    gbr = GradientBoostingRegressor(n_estimators=nr,
                                    warm_start=True,
                                    max_features=11,
                                    # 2 instead of 1: a single-sample node can
                                    # never be split, so the trees are the same,
                                    # and scikit-learn >= 0.18 rejects an
                                    # integer below 2.
                                    min_samples_split=2,
                                    min_samples_leaf=1,
                                    max_depth=15)
    gbr.fit(X_tr, y_train_log)

    # Predictions are in log space; map them back before scoring.
    Y_pred_valid_count = pd.DataFrame(gbr.predict(X_vd)).apply(inv_log_pandas)
    valid_error_count = rmsle(Y_pred_valid_count.values.ravel(), y_valid_true)
    print("%s %s" % (nr, valid_error_count))

In [14]:
# Estimator step of the final model: a 500-tree random forest with fixed
# hyper-parameters.
classification_steps_best = [
    ('random_forest', RandomForestRegressor(n_estimators=500,
                                            bootstrap=True,
                                            max_features=11,
                                            # 2 instead of 1: a single-sample
                                            # node can never be split, so the
                                            # trees are the same, and
                                            # scikit-learn >= 0.18 rejects an
                                            # integer below 2.
                                            min_samples_split=2,
                                            min_samples_leaf=1,
                                            max_depth=15,
                                            n_jobs=3))
]

In [15]:
# End-to-end model: the preprocessing steps followed by the chosen estimator,
# fitted on the raw training frame against the log-transformed count.
final_steps = prep_steps + classification_steps_best
final_pipe = Pipeline(final_steps)

final_pipe.fit(X_train, Y_train.apply(log_pandas).values.ravel())


def _predict_and_score(features, log_counts):
    """Predict with ``final_pipe`` and score against the given log-space targets.

    Both the predictions and ``log_counts`` are passed through
    ``inv_log_pandas`` before ``rmsle`` is computed.
    Returns ``(predictions_frame, error)``.
    """
    predicted = pd.DataFrame(final_pipe.predict(features)).apply(inv_log_pandas)
    observed = log_counts.apply(inv_log_pandas)
    return predicted, rmsle(predicted.values.ravel(), observed.values.ravel())


Y_pred_valid_count, valid_error_count = _predict_and_score(X_valid, Y_valid_count)
print("%s %s" % ("Final model valid results:", valid_error_count))

Y_pred_test_count, test_error_count = _predict_and_score(X_test, Y_test_count)
print("%s %s" % ("Final model test results:", test_error_count))

Final model valid results: 0.259172114371
Final model test results: 0.342106285347


In [16]:
# Store the fitted end-to-end pipeline under the project's models folder.
# NOTE(review): the keyword is spelled "compresion_mode" at this call site -
# confirm it matches the parameter name declared by pickle_out.
model_filepath = os.path.join(
    "../", "models", "random_forest_simple_features_5_pipeline.pkl"
)
pickle_out(model_filepath, final_pipe, compresion_mode=5)