In [2]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os
from copy import copy

import numpy as np
import pandas as pd

from matplotlib import pylab as plt
import seaborn as sns

from random import choice

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor 
from sklearn.grid_search import RandomizedSearchCV ,GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, make_scorer
from sklearn.feature_selection import SelectKBest

from xgboost import XGBRegressor

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle,rmsle_on_logs,log_pandas,inv_log_pandas
from utils.generic_utils import pickle_out,pickle_in
import utils.preprocessing_utils as prep

In [3]:
data_folder = os.path.join("../","datasets","initial_data_split")

train = pd.read_csv(os.path.join(data_folder,"train.csv"))
valid = pd.read_csv(os.path.join(data_folder,"valid.csv"))
test = pd.read_csv(os.path.join(data_folder,"test.csv"))

X_train = train.drop(["registered","casual"], axis=1)
Y_train = train[["count"]]
Y_train_count = train[["count"]].apply(log_pandas)
Y_train_casual = train[["casual"]].apply(log_pandas)
Y_train_registered = train[["registered"]].apply(log_pandas)

X_valid = valid.drop(["registered","casual"], axis=1)
Y_valid = valid[["count"]]
Y_valid_count = valid[["count"]].apply(log_pandas)
Y_valid_casual = valid[["casual"]].apply(log_pandas)
Y_valid_registered = valid[["registered"]].apply(log_pandas)

X_test = test.drop(["registered","casual"], axis=1)
Y_test  = test[["count"]]
Y_test_count = test[["count"]].apply(log_pandas)
Y_test_casual = test[["casual"]].apply(log_pandas)
Y_test_registered = test[["registered"]].apply(log_pandas)

print X_train.shape,Y_train.shape
print X_valid.shape,Y_valid.shape
print X_test.shape,Y_test.shape

(7452, 10) (7452, 1)
(1722, 10) (1722, 1)
(2286, 10) (2286, 1)


In [4]:
# Continuous variables to discretise and the bin edges used for each of them.
vars_to_bin = ['atemp','humidity','windspeed']
bins_per_var = [[0,10,20,30,100],[0,20,40,60,120],[0,30,1000]]
# Materialised as a list because it is iterated twice below; a lazy zip object
# would be exhausted after the first pass and leave `binned_varnames` empty.
binning_params = list(zip(vars_to_bin,bins_per_var))

# One binning step per variable; the original column is kept (drop_column=False).
binning_steps = [('binning_%s' % var,
                  prep.BinVariable(colname=var, bins=b, drop_column=False))
                 for var,b in binning_params]

# Presumably the column names BinVariable produces -- verify against
# utils.preprocessing_utils.
binned_varnames = ["%s_binned" % var for var,b in binning_params]

# Columns handed on by the ExtractColumns step.
get_variables =['weather','date_year','workingday','season','holiday','time_hour',
                'atemp','humidity','date_weekday','date_month','windspeed']

# Categorical columns (including the binned ones) that get label-encoded and
# then one-hot encoded.
encode_variables =['weather','date_year','workingday','season','holiday']
encode_variables.extend(binned_varnames)

# Two consecutive steps per encoded variable: label encoding, then one-hot
# encoding with the source column dropped (drop_colname=True).
encoding_steps = []
for var in encode_variables:
    encoding_steps.append(('encode_label_%s' % var,
                           prep.PandasLabelEncoder(colname=var)))
    encoding_steps.append(('encode_one_hot_%s' % var,
                           prep.PandasOneHotEncoder(colname=var,
                                                    drop_colname=True)))

# Full preprocessing chain: time extraction -> column selection -> binning -> encoding.
prep_steps = ([('extract_times', prep.ExtractTimes()),
               ('get_variables', prep.ExtractColumns(colnames = get_variables))]
              + binning_steps
              + encoding_steps)

prep_pipe = Pipeline(prep_steps)
prep_pipe.fit(X_train)

Pipeline(steps=[('extract_times', ExtractTimes()), ('get_variables', ExtractColumns(colnames=['weather', 'date_year', 'workingday', 'season', 'holiday', 'time_hour', 'atemp', 'humidity', 'date_weekday', 'date_month', 'windspeed'])), ('binning_atemp', BinVariable(bins=[0, 10, 20, 30, 100], colname='atemp', dr...code_one_hot_windspeed_binned', PandasOneHotEncoder(colname='windspeed_binned', drop_colname=True))])

### All in one: randomized search over all XGBoost hyperparameters at once

In [29]:
classification_steps_all_xgb = [('xgboost',XGBRegressor(n_estimators=200,
                                                         learning_rate=0.1))]

classification_pipe_all = Pipeline(classification_steps_all_xgb)

param_dist_all= {"xgboost__max_depth": range(2,10,1),
                 "xgboost__min_child_weight":range(1,8,2),
                 "xgboost__gamma": [0,0.1,0.2],
                 "xgboost__subsample": [i/10.0 for i in range(6,10)],
                 "xgboost__colsample_bytree": [i/10.0 for i in range(6,10)],
                 "xgboost__reg_alpha":[1e-5, 1e-2, 0.1, 1, 100]
                }

random_search_all = RandomizedSearchCV(classification_pipe_all, 
                                   param_distributions = param_dist_all,
                                   scoring = make_scorer(rmsle_on_logs),
                                   n_iter=100,
                                   n_jobs=3,
                                   cv=10)

X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

random_search_all.fit(X_tr,Y_train.apply(log_pandas).values.ravel())

print "Best Params",random_search_all.best_estimator_
Y_pred_valid_count = pd.DataFrame(random_search_all.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                           Y_valid_count.apply(inv_log_pandas).values.ravel())
print "Best model error on valid",valid_error_count

Best Params Pipeline(steps=[('xgboost', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0.2, learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=100, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9))])
Best model error on valid 0.53155104223


In [30]:
print random_search_all.grid_scores_

[mean: 0.36822, std: 0.07278, params: {'xgboost__colsample_bytree': 0.9, 'xgboost__max_depth': 9, 'xgboost__min_child_weight': 5, 'xgboost__reg_alpha': 0.01, 'xgboost__subsample': 0.9, 'xgboost__gamma': 0}, mean: 0.50386, std: 0.09870, params: {'xgboost__colsample_bytree': 0.8, 'xgboost__max_depth': 8, 'xgboost__min_child_weight': 5, 'xgboost__reg_alpha': 100, 'xgboost__subsample': 0.8, 'xgboost__gamma': 0.1}, mean: 0.55696, std: 0.08568, params: {'xgboost__colsample_bytree': 0.8, 'xgboost__max_depth': 8, 'xgboost__min_child_weight': 7, 'xgboost__reg_alpha': 100, 'xgboost__subsample': 0.6, 'xgboost__gamma': 0}, mean: 0.34990, std: 0.05898, params: {'xgboost__colsample_bytree': 0.7, 'xgboost__max_depth': 7, 'xgboost__min_child_weight': 1, 'xgboost__reg_alpha': 0.1, 'xgboost__subsample': 0.7, 'xgboost__gamma': 0}, mean: 0.46321, std: 0.04601, params: {'xgboost__colsample_bytree': 0.8, 'xgboost__max_depth': 2, 'xgboost__min_child_weight': 1, 'xgboost__reg_alpha': 0.1, 'xgboost__subsample'

### Following the tutorial: tuning one group of hyperparameters at a time

In [23]:
classification_steps_tree_specific_xgb = [('xgboost',XGBRegressor(n_estimators=200,
                                                                  learning_rate=0.1,
                                                                  gamma = 0,
                                                                  subsample=0.8, 
                                                                  colsample_bytree=0.8,
                                                                  scale_pos_weight=1))]

classification_pipe_tree_specific = Pipeline(classification_steps_tree_specific_xgb)

param_dist_tree_specific = {"xgboost__max_depth": range(2,10,1),
                            "xgboost__min_child_weight":range(1,8,2)
                           }

grid_search_tree_specific = GridSearchCV(classification_pipe_tree_specific, 
                                   param_grid = param_dist_tree_specific,
                                   scoring = make_scorer(rmsle_on_logs),
                                   n_jobs=3,
                                   cv=10)

X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

grid_search_tree_specific.fit(X_tr,Y_train.apply(log_pandas).values.ravel())

print "Best Params",grid_search_tree_specific.best_estimator_
Y_pred_valid_count = pd.DataFrame(grid_search_tree_specific.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                           Y_valid_count.apply(inv_log_pandas).values.ravel())
print "Best model error on valid",valid_error_count

KeyboardInterrupt: 

In [8]:
classification_steps_gamma_xgb = [('xgboost',XGBRegressor(n_estimators=10,
                                                         learning_rate=0.1,
                                                         max_depth = 4,
                                                         min_child_weight = 3,
                                                         subsample=0.8, 
                                                         colsample_bytree=0.8,
                                                         scale_pos_weight=1))]

classification_pipe_gamma = Pipeline(classification_steps_gamma_xgb)

param_grid_gamma = { "xgboost__gamma": [0,0.1,0.2]}

grid_search_gamma = GridSearchCV(classification_pipe_gamma, 
                                   param_grid=param_grid_gamma,
                                   scoring = make_scorer(rmsle_on_logs),
                                   n_jobs=3,
                                   cv=3)

X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

grid_search_gamma.fit(X_tr,Y_train.apply(log_pandas).values.ravel())

print "Best Params",grid_search_gamma.best_estimator_
Y_pred_valid_count = pd.DataFrame(grid_search_gamma.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                           Y_valid_count.apply(inv_log_pandas).values.ravel())
print "Best model error on valid",valid_error_count

Best Params Pipeline(steps=[('xgboost', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.8,
       gamma=0.1, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=3, missing=None, n_estimators=10, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8))])
Best model error on valid 1.66012738575


In [12]:
classification_steps_subsample_xgb = [('xgboost',XGBRegressor(n_estimators=10,
                                                         learning_rate=0.1,
                                                         gamma = 0.1,
                                                         max_depth = 4,
                                                         min_child_weight = 3,
                                                         scale_pos_weight=1,
                                                             ))]

classification_pipe_subsample = Pipeline(classification_steps_subsample_xgb)

param_grid_subsample = { "xgboost__subsample": [i/10.0 for i in range(6,10)],
                    "xgboost__colsample_bytree": [i/10.0 for i in range(6,10)],
                   }

grid_search_subsample = GridSearchCV(classification_pipe_gamma, 
                                   param_grid=param_grid_subsample,
                                   scoring = make_scorer(rmsle_on_logs),
                                   n_jobs=3,
                                   cv=3)

X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

grid_search_subsample.fit(X_tr,Y_train.apply(log_pandas).values.ravel())

print "Best Params",grid_search_subsample.best_estimator_
Y_pred_valid_count = pd.DataFrame(grid_search_subsample.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                           Y_valid_count.apply(inv_log_pandas).values.ravel())
print "Best model error on valid",valid_error_count

Best Params Pipeline(steps=[('xgboost', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=3, missing=None, n_estimators=10, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8))])
Best model error on valid 1.68318089282


In [16]:
classification_steps_regul_xgb = [('xgboost',XGBRegressor(n_estimators=10,
                                                         learning_rate=0.1,
                                                         gamma = 0,
                                                         max_depth = 4,
                                                         min_child_weight = 3,
                                                         scale_pos_weight=1,
                                                         subsample = 0.8,
                                                         colsample_bytree = 0.6,
                                                             ))]

classification_pipe_regul = Pipeline(classification_steps_regul_xgb)

param_grid_regul = { "xgboost__reg_alpha":[1e-5, 1e-2, 0.1, 1, 100]}

grid_search_regul = GridSearchCV(classification_pipe_regul, 
                                   param_grid=param_grid_regul,
                                   scoring = make_scorer(rmsle_on_logs),
                                   n_jobs=3,
                                   cv=3)

X_tr = prep_pipe.transform(X_train)
X_vd = prep_pipe.transform(X_valid)

grid_search_regul.fit(X_tr,Y_train.apply(log_pandas).values.ravel())

print "Best Params",grid_search_regul.best_estimator_
Y_pred_valid_count = pd.DataFrame(grid_search_regul.predict(X_vd)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                           Y_valid_count.apply(inv_log_pandas).values.ravel())
print "Best model error on valid",valid_error_count

Best Params Pipeline(steps=[('xgboost', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0, learning_rate=0.1, max_delta_step=0, max_depth=4,
       min_child_weight=3, missing=None, n_estimators=10, nthread=-1,
       objective='reg:linear', reg_alpha=100, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.8))])
Best model error on valid 1.77159671304


In [32]:
print random_search_all.best_estimator_

Pipeline(steps=[('xgboost', XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=0.6,
       gamma=0.2, learning_rate=0.1, max_delta_step=0, max_depth=2,
       min_child_weight=1, missing=None, n_estimators=200, nthread=-1,
       objective='reg:linear', reg_alpha=100, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=0.9))])


In [6]:
# Final regressor: many trees with a small learning rate; every other XGBoost
# hyper-parameter is left at the library default.
# The values reported by the randomized search (gamma=0.2, max_depth=2,
# min_child_weight=1, scale_pos_weight=1, subsample=0.9, colsample_bytree=0.6,
# reg_alpha=100) are left disabled here.
classification_steps_final_xgb = [
    ('xgboost', XGBRegressor(n_estimators=2000, learning_rate=0.01)),
]

classification_pipe_final = Pipeline(classification_steps_final_xgb)

In [7]:
final_steps = prep_steps+classification_steps_final_xgb
final_pipe = Pipeline(final_steps)

final_pipe.fit(X_train,Y_train.apply(log_pandas).values.ravel())

Y_pred_valid_count = pd.DataFrame(final_pipe.predict(X_valid)).apply(inv_log_pandas)
valid_error_count = rmsle(Y_pred_valid_count.values.ravel(),
                       Y_valid_count.apply(inv_log_pandas).values.ravel()
                          )
print "Final model valid results:",valid_error_count

Y_pred_test_count = pd.DataFrame(final_pipe.predict(X_test)).apply(inv_log_pandas)
test_error_count = rmsle(Y_pred_test_count.values.ravel(),
                       Y_test_count.apply(inv_log_pandas).values.ravel()
                          )
print "Final model test results:",test_error_count

Final model valid results: 0.319986817945
Final model test results: 0.33280726894


In [8]:
# Persist the fitted preprocessing + XGBoost pipeline under ../models.
models_dir = os.path.join("../","models")
model_filepath = os.path.join(models_dir,"xgboost_pipeline.pkl")
pickle_out(model_filepath, final_pipe, compresion_mode=5)

In [9]:
kaggle_folder = os.path.join("../","datasets","kaggle_sets")
kaggle_test = pd.read_csv(os.path.join(kaggle_folder,"test.csv"))
full_pipeline = pickle_in(os.path.join("../","models","xgboost_pipeline.pkl"),
                          compresion_mode=5)

X_test = kaggle_test
kaggle_datetime = kaggle_test[["datetime"]]
Y_kaggle = pd.DataFrame(full_pipeline.predict(X_test)).apply(inv_log_pandas)
kaggle_datetime["count"] = Y_kaggle
kaggle_datetime.to_csv(os.path.join("../","submissions",
                                    "xgboost_submission.csv"),index=False)

print kaggle_datetime.head()

              datetime     count
0  2011-01-20 00:00:00  7.200272
1  2011-01-20 01:00:00  4.248402
2  2011-01-20 02:00:00  2.501111
3  2011-01-20 03:00:00  1.824848
4  2011-01-20 04:00:00  1.685915


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
