In [1]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Load the autoreload extension and enable mode 2 so that imported modules
# are reloaded before each cell runs (handy while editing local modules).
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
# Put the parent directory on the import path — presumably so the
# project-local `utils` package imported further down resolves; confirm
# against the repository layout.
sys.path.append("../")

import os
from copy import copy

import numpy as np
import pandas as pd

from matplotlib import pylab as plt
import seaborn as sns

from random import choice

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import RandomizedSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle
from utils.generic_utils import pickle_out
import utils.preprocessing_utils as prep

In [15]:
# Folder holding the pre-split train / validation CSV files.
data_folder = os.path.join("../","datasets","initial_data_split")

train_csv = os.path.join(data_folder,"train.csv")
valid_csv = os.path.join(data_folder,"valid.csv")
train, valid = pd.read_csv(train_csv), pd.read_csv(valid_csv)

# Only the "registered" and "casual" columns are removed from the feature
# frames; the "count" column stays in X_* and is also used as the target.
dropped_columns = ["registered","casual"]
X_train, Y_train = train.drop(dropped_columns, axis=1), train["count"].values
X_valid, Y_valid = valid.drop(dropped_columns, axis=1), valid["count"].values

print("%s %s" % (X_train.shape, Y_train.shape))
print("%s %s" % (X_valid.shape, Y_valid.shape))

(7452, 10) (7452,)
(1722, 10) (1722,)


In [3]:
# Columns handed on to the model by the 'get_variables' step.
get_variables =['weather','date_year','workingday','time_hour',
                'temp','date_weekday','date_month','datetime']
# Columns that receive a label-encoding step followed by a one-hot step.
encode_variables =['weather','date_year','workingday',]

# Two consecutive steps per encoded column, in the order of encode_variables:
# first the label encoder, then the one-hot encoder (with drop_colname=True).
encoding_steps = []
for var in encode_variables:
    label_step = ('encode_label_%s'%var, prep.PandasLabelEncoder(colname=var))
    one_hot_step = ('encode_one_hot_%s'%var,
                    prep.PandasOneHotEncoder(colname=var, drop_colname=True))
    encoding_steps.extend([label_step, one_hot_step])

# Disabled step, kept for reference:
#               ('lagging_count',prep.LaggingValues(colname="count",lag=5)),
base_steps = [
    ('extract_times', prep.ExtractTimes()),
    ('date_to_number', prep.DateToNumber(colname="datetime")),
    ('get_variables', prep.ExtractColumns(colnames = get_variables)),
]
prep_steps = base_steps + encoding_steps

prep_pipe = Pipeline(prep_steps)
# Fit on the training split only, then apply the fitted pipeline to the
# validation split. Note that X_train / X_valid are overwritten with their
# transformed versions here.
X_train = prep_pipe.fit_transform(X_train,Y_train)
X_valid = prep_pipe.transform(X_valid)
print("%s %s" % (X_train.shape, X_valid.shape))

(7452, 13) (1722, 13)


In [4]:
# Summary statistics of the `datetime` column of the transformed training set.
datetime_summary = X_train[['datetime']].describe()
print(datetime_summary)

          datetime
count  7452.000000
mean    510.535427
std       6.908918
min     499.000000
25%     505.000000
50%     511.000000
75%     517.000000
max     522.000000


In [5]:
# Baseline forest: 100 trees, 3 parallel jobs, all other settings left at the
# estimator defaults. oob_score=True is requested so that rf.oob_score_ can be
# read after fitting.
baseline_params = dict(n_estimators=100, n_jobs=3, oob_score=True)
rf = RandomForestRegressor(**baseline_params)
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=3, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [6]:
# Score the baseline forest on both splits with the project's rmsle helper
# (called as rmsle(predictions, targets)) and show the out-of-bag score.
Y_train_pred, Y_valid_pred = rf.predict(X_train), rf.predict(X_valid)
train_error = rmsle(Y_train_pred,Y_train)
valid_error = rmsle(Y_valid_pred,Y_valid)
print("%s %s" % (train_error, valid_error))
print(rf.oob_score_)

0.154123268732 0.271054530755
0.945675005422


In [80]:
# Random-forest based feature selector from the project's preprocessing utils.
# NOTE(review): the meaning of drop_rate / feature_threshold /
# max_error_increase is defined in prep.RandomForestFeatureSelector, which is
# not visible here — confirm before tuning these values.
rf_selector = prep.RandomForestFeatureSelector(
    n_estimators = 10,
    drop_rate = 5,
    feature_threshold = 20,
    max_error_increase = 0.05,
)
selection_steps = [("rf_selector", rf_selector)]

select_pipe = Pipeline(selection_steps)
# Fit the selector on the training split, then apply it to the validation
# split. X_train / X_valid are overwritten with the reduced feature sets.
X_train = select_pipe.fit_transform(X_train,Y_train)
X_valid = select_pipe.transform(X_valid)
print("%s %s" % (X_train.shape, X_valid.shape))

Number of features: 59
-9.58856639326
current error: 0.411433606739
Number of features: 54
-0.0315911617154
current error: 0.379842445024
Number of features: 49
0.0109851264268
current error: 0.390827571451
Number of features: 44
2.01518193372e-05
current error: 0.39084772327
Number of features: 39
0.021804108721
current error: 0.412651831991
Number of features: 34
-0.00378202340047
current error: 0.408869808591
Number of features: 29
-0.0313766956934
current error: 0.377493112897
Number of features: 24
-0.0289780341286
current error: 0.348515078768
(7452, 19) (1722, 19)


In [11]:
# Manual random search over random-forest hyper-parameters.
#
# For each of 10 trials one value per hyper-parameter is drawn at random from
# param_dist, a 200-tree forest is fitted on the training split, and the
# train / validation RMSLE are printed, appended to a text log and collected
# in `rolling_hyperparams`, which is finally turned into a DataFrame and
# pickled.

# Candidate values for each hyper-parameter.
# NOTE(review): newer scikit-learn releases require an integer
# min_samples_split >= 2, so the value 1 sampled below would raise there —
# confirm against the installed version before reusing this grid.
param_dist = {"max_depth": [15,20,25,30,35,None],
              "max_features": range(3,8,1),
              "min_samples_split": range(1,3,1),
              "min_samples_leaf": range(1,3,1)
              }

rolling_hyperparams = []
with open(os.path.join("../","models","random_forest_simple_hyperparams.txt"),"wb") as f:

    for i in range(10):
        print("Iteration:%s\n" % i)
        md = choice(param_dist["max_depth"])
        print("max_depth:%s" % md)
        mf = choice(param_dist["max_features"])
        print("max_features:%s" % mf)
        ms = choice(param_dist["min_samples_split"])
        print("min_samples_split:%s" % ms)
        ml = choice(param_dist["min_samples_leaf"])
        print("min_samples_leaf:%s\n" % ml)

        rf = RandomForestRegressor(n_estimators=200,n_jobs=3,verbose=0,
                max_depth = md,
                max_features=mf,
                min_samples_split=ms,
                min_samples_leaf=ml,
               )
        rf.fit(X_train,Y_train)

        Y_train_pred = rf.predict(X_train)
        Y_valid_pred = rf.predict(X_valid)
        train_error = rmsle(Y_train_pred,Y_train)
        valid_error = rmsle(Y_valid_pred,Y_valid)
        print("Train rmsle:%s" % train_error)
        # Fixed label: this line reports the validation error (it used to be
        # printed as a second "Train rmsle").
        print("Valid rmsle:%s\n" % valid_error)
        f.write("iteration:%s max_depth:%s max_features:%s min_samples_split:%s "\
        "min_samples_leaf:%s "\
        "train_error:%s valid_error:%s\n"%(i,md,mf,ms,ml,train_error,valid_error))

        rolling_hyperparams.append([md,mf,ms,ml,train_error,valid_error])

rolling_hyperparams = pd.DataFrame(
    np.vstack(np.array(rolling_hyperparams)),
    columns=['max_depth', 'max_features','min_samples_split',
             'min_samples_leaf','train_error','valid_error'])
# When max_depth=None is among the sampled values the stacked array has
# object dtype, which would leave the error columns as object too. Cast them
# to float so that sorting / comparing them is always numeric.
error_columns = ['train_error','valid_error']
rolling_hyperparams[error_columns] = rolling_hyperparams[error_columns].astype(float)

# NOTE(review): "faetures" in the file name looks like a typo of "features";
# it is left unchanged because other code may load this exact path.
pickle_out(os.path.join("../","models","random_forest_simple_faetures_hyperparams.pkl"),rolling_hyperparams)

Iteration:0

max_depth:25
max_features:3
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.405002092758
Train rmsle:0.499152273436

Iteration:1

max_depth:20
max_features:4
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.336402269032
Train rmsle:0.414324218522

Iteration:2

max_depth:15
max_features:6
min_samples_split:1
min_samples_leaf:1

Train rmsle:0.183744721425
Train rmsle:0.29880916273

Iteration:3

max_depth:None
max_features:4
min_samples_split:2
min_samples_leaf:1

Train rmsle:0.182718175763
Train rmsle:0.359539002204

Iteration:4

max_depth:30
max_features:5
min_samples_split:1
min_samples_leaf:1

Train rmsle:0.167130104214
Train rmsle:0.31685154171

Iteration:5

max_depth:35
max_features:7
min_samples_split:2
min_samples_leaf:2

Train rmsle:0.235117609317
Train rmsle:0.292993649833

Iteration:6

max_depth:20
max_features:6
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.258526771879
Train rmsle:0.317355028062

Iteration:7

max_depth:25
max_features:5
min_sam

In [12]:
# Order the search results by validation error (lowest first) and show the
# five best trials.
rolling_hyperparams = rolling_hyperparams.sort_values(by='valid_error', ascending=True)
print(rolling_hyperparams.head(5))

  max_depth max_features min_samples_split min_samples_leaf train_error  \
5        35            7                 2                2    0.235118   
2        15            6                 1                1    0.183745   
4        30            5                 1                1     0.16713   
6        20            6                 1                2    0.258527   
7        25            5                 2                1    0.169729   

  valid_error  
5    0.292994  
2    0.298809  
4    0.316852  
6    0.317355  
7    0.320896  


In [13]:
# Validation RMSLE as a function of forest size, with the remaining tree
# settings held fixed.
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]
fixed_tree_params = dict(max_features = 7,
                         min_samples_split = 2,
                         min_samples_leaf = 2,
                         max_depth = 35)

for nr in number_of_trees:
    rf = RandomForestRegressor(n_estimators=nr, **fixed_tree_params)
    rf.fit(X_train, Y_train)
    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    print("%s %s" % (nr, valid_score))

1 0.387333203703
2 0.359489317704
3 0.345646284073
5 0.323617221437
10 0.298700182011
50 0.300759223498
100 0.298561385674
200 0.292897655445
300 0.293232347168
500 0.294254389928
1000 0.295025799827


In [18]:
# Final end-to-end model: the preprocessing steps followed by a 500-tree
# random forest, fitted on the training split and scored on both splits.
classification_steps = [('random_forest',RandomForestRegressor(n_estimators=500,
                                                     max_features = 6,
                                                     min_samples_split = 1,
                                                     min_samples_leaf = 1,
                                                     max_depth = 15))]

final_steps = prep_steps+classification_steps

final_pipe = Pipeline(final_steps)

# final_pipe starts with the preprocessing steps, so it has to be fed RAW
# features. X_train / X_valid have been overwritten with transformed frames by
# the earlier preprocessing / selection cells, so fitting on them only worked
# when the loading cell had been re-run by hand. Re-derive the raw inputs from
# the untouched `train` / `valid` frames so the cell also works on a
# top-to-bottom run.
raw_dropped_columns = ["registered","casual"]
X_train_raw = train.drop(raw_dropped_columns, axis=1)
X_valid_raw = valid.drop(raw_dropped_columns, axis=1)

final_pipe.fit(X_train_raw,Y_train)

Y_train_pred = final_pipe.predict(X_train_raw)
Y_valid_pred = final_pipe.predict(X_valid_raw)
result_train = rmsle(Y_train_pred,Y_train)
result_valid = rmsle(Y_valid_pred,Y_valid)
print(result_train)
print(result_valid)

0.18190918406
0.294356562954


In [19]:
# Persist the fitted end-to-end pipeline under ../models via the project's
# pickle_out helper (keyword spelled as the helper is called here).
model_dir = os.path.join("../","models")
model_filepath = os.path.join(model_dir,"random_forest_simple_features_3_pipeline.pkl")
pickle_out(model_filepath, final_pipe, compresion_mode=5)