In [80]:
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os
from copy import copy

import numpy as np
import pandas as pd

from random import choice

from sklearn.preprocessing import OneHotEncoder
from sklearn.ensemble import RandomForestRegressor 
from sklearn.grid_search import RandomizedSearchCV 
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import SelectKBest

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle
from utils.generic_utils import pickle_out,pickle_in
import utils.preprocessing_utils as prep

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [76]:
# Read the pre-split training and validation CSVs from the shared data folder.
data_folder = os.path.join("../","datasets","initial_data_split")
train, valid = [pd.read_csv(os.path.join(data_folder, csv_name))
                for csv_name in ("train.csv", "valid.csv")]

# Features: every column except "count", "registered" and "casual".
X_train, X_valid = [frame.drop(["count","registered","casual"], axis=1)
                    for frame in (train, valid)]
# Target: the "count" column flattened to a 1-D array.
Y_train, Y_valid = [frame[["count"]].values.ravel()
                    for frame in (train, valid)]

# Quick sanity check of the resulting shapes and the first feature rows.
print("%s %s" % (X_train.shape, Y_train.shape))
print("%s %s" % (X_valid.shape, Y_valid.shape))
print(X_train.head())

(7452, 9) (7452,)
(1722, 9) (1722,)
              datetime  season  holiday  workingday  weather  temp   atemp  \
0  2011-01-01 00:00:00       1        0           0        1  9.84  14.395   
1  2011-01-01 01:00:00       1        0           0        1  9.02  13.635   
2  2011-01-01 02:00:00       1        0           0        1  9.02  13.635   
3  2011-01-01 03:00:00       1        0           0        1  9.84  14.395   
4  2011-01-01 04:00:00       1        0           0        1  9.84  14.395   

   humidity  windspeed  
0        81        0.0  
1        80        0.0  
2        80        0.0  
3        75        0.0  
4        75        0.0  


In [77]:
def _rolling_stat_steps(prefix, transformer_cls, variables, lags, modes):
    """Build (step_name, transformer) pairs for every lag/variable/mode combination.

    Step names follow '<prefix>_<variable>_<mode>_<lag>'; iteration order is
    lag (outermost), then variable, then mode.
    """
    return [('%s_%s_%s_%s' % (prefix, var, mode, lag),
             transformer_cls(colname=var, lag=lag, mode=mode))
            for lag in lags
            for var in variables
            for mode in modes]


# Window sizes shared by the edge-gap features and the rolling statistics.
rolling_lags = [5,20,50,300,1000]
edge_gap_varnames = ['edge_lag_%s' % window for window in rolling_lags]
edge_gap_steps = [('add_edge_gap_lag_%s' % window, prep.AddTimeGaps(lag=window))
                  for window in rolling_lags]

max_lag = 10
numerical_lagging_variables = ['temp','atemp','humidity','windspeed']
categorical_lagging_variables = ['weather','holiday','workingday','season']
# Names of the lagged categorical columns that are encoded below (lags 1 .. max_lag-1).
# NOTE(review): assumes prep.LaggingValues names its outputs '<var>_lag_<k>' and that
# lag `max_lag` itself does not need encoding -- confirm against the transformer.
categorical_lagging_varnames = ['%s_lag_%s' % (var, shift)
                                for var in categorical_lagging_variables
                                for shift in range(1, max_lag, 1)]
all_lagging_variables = numerical_lagging_variables + categorical_lagging_variables

# One LaggingValues step per variable. The step name now uses `max_lag`, the value
# actually passed to the transformer; previously it used `lag`, which was only defined
# through the Python 2 comprehension-variable leak (and held 1000, the last rolling lag).
lagging_values_steps = [('%s_lag_%s' % (var, max_lag),
                         prep.LaggingValues(colname=var, lag=max_lag))
                        for var in all_lagging_variables]

# Rolling statistics, computed for each window size and each calculation mode.
stat_calculation_modes = ["whole","day","night"]
median_steps = _rolling_stat_steps('median', prep.LaggingMedian,
                                   numerical_lagging_variables,
                                   rolling_lags, stat_calculation_modes)
max_steps = _rolling_stat_steps('max', prep.LaggingMax,
                                numerical_lagging_variables,
                                rolling_lags, stat_calculation_modes)
min_steps = _rolling_stat_steps('min', prep.LaggingMin,
                                numerical_lagging_variables,
                                rolling_lags, stat_calculation_modes)
mode_steps = _rolling_stat_steps('mode', prep.LaggingMode,
                                 categorical_lagging_variables,
                                 rolling_lags, stat_calculation_modes)

# NOTE(review): presumably the column names produced by prep.LaggingMode -- the pattern
# differs from the step names above ('mode_<var>_<mode>_<lag>'); verify.
lagging_mode_varnames = ["%s_%s_mode_%s" % (var, mode, lag)
                         for lag in rolling_lags
                         for var in categorical_lagging_variables
                         for mode in stat_calculation_modes]

# Every column listed here gets a label-encoding step followed by a one-hot step.
encode_variables = ['season','weather','holiday','workingday','date_year','day_night']
encode_variables.extend(edge_gap_varnames)
encode_variables.extend(categorical_lagging_varnames)
encode_variables.extend(lagging_mode_varnames)

encoding_steps = []
for var in encode_variables:
    encoding_steps.append(('encode_label_%s' % var, prep.PandasLabelEncoder(colname=var)))
    encoding_steps.append(('encode_one_hot_%s' % var, prep.PandasOneHotEncoder(colname=var)))

# Full preprocessing chain: time extraction, gap/lag features, rolling statistics, encoding.
# Currently disabled optional tail steps:
#   ('drop_columns', prep.DropColumns(colnames = ["datetime"]))
#   ('feature_selection', SelectKBest(k=100))
prep_steps = [('extract_times', prep.ExtractTimes())] + \
             edge_gap_steps + lagging_values_steps + \
             median_steps + max_steps + min_steps + mode_steps + \
             encoding_steps

prep_pipe = Pipeline(prep_steps)
# The pipeline is fitted on the training features only and then applied to validation.
# Note that X_train / X_valid are overwritten with their transformed versions here.
X_train = prep_pipe.fit_transform(X_train,Y_train)
X_valid = prep_pipe.transform(X_valid)
print("%s %s" % (X_train.shape, X_valid.shape))

(7452, 630) (1722, 630)


In [78]:
# Persist the (features, target) pair of each split so the feature generation
# does not have to be repeated.
features_folder = os.path.join("../","datasets","generated_features")
pickle_out(os.path.join(features_folder,"train_lagging_all.pkl"),
           (X_train,Y_train))
pickle_out(os.path.join(features_folder,"valid_lagging_all.pkl"),
           (X_valid,Y_valid))

In [82]:
# Reload the previously stored (features, target) pairs for both splits.
features_folder = os.path.join("../","datasets","generated_features")
X_train,Y_train = pickle_in(
    os.path.join(features_folder,"train_lagging_all.pkl"))
X_valid,Y_valid = pickle_in(
    os.path.join(features_folder,"valid_lagging_all.pkl"))

count    7452.000000
mean        1.505636
std         1.116653
min         0.000000
25%         1.000000
50%         2.000000
75%         3.000000
max         3.000000
Name: season, dtype: float64


In [None]:
# Summary statistics of the "season" column of the training features.
# The original cell read from `xa`, a name that is not defined anywhere in this
# notebook, so it failed on a fresh kernel; X_train is used instead.
# NOTE(review): X_train was chosen because the recorded output has 7452 rows,
# matching the training split -- confirm this is the frame that was intended.
print(X_train["season"].describe())

In [64]:
# Baseline forest: 200 trees on 3 worker processes with out-of-bag scoring enabled.
# All other hyperparameters (max_features, min_samples_split, min_samples_leaf,
# max_depth) are left at the library defaults.
rf = RandomForestRegressor(
    n_jobs=3,
    oob_score=True,
    n_estimators=200,
)
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=200, n_jobs=3, oob_score=True, random_state=None,
           verbose=0, warm_start=False)

In [65]:
# Score the fitted forest on the training split ...
Y_train_pred = rf.predict(X_train)
train_error = rmsle(Y_train_pred,Y_train)

# ... and on the validation split.
Y_valid_pred = rf.predict(X_valid)
valid_error = rmsle(Y_valid_pred,Y_valid)

# Train and validation RMSLE on one line, followed by the out-of-bag score.
print("%s %s" % (train_error, valid_error))
print(rf.oob_score_)

0.168424046546 0.383843515185
0.946185975148


In [185]:
# Manual random search: each iteration draws one value per hyperparameter,
# fits a 200-tree forest and records the train/validation RMSLE.
# NOTE(review): min_samples_split=1 ran with the scikit-learn version used for the
# recorded output; newer releases require an integer value of at least 2 -- confirm
# before re-running on an upgraded environment.
param_dist = {"max_depth": [15,20,25,30,35,None],
              "max_features": range(5,15,1),
              "min_samples_split": range(1,3,1),
              "min_samples_leaf": range(1,3,1)
              }

rolling_hyperparams = []
# Each iteration is also appended to a plain-text log next to the models.
with open(os.path.join("../","models","random_forest_simple_hyperparams.txt"),"wb") as f:
    for i in range(50):
        print("Iteration:%s\n"%i)
        md = choice(param_dist["max_depth"])
        print("max_depth:%s"%md)
        mf = choice(param_dist["max_features"])
        print("max_features:%s"%mf)
        ms = choice(param_dist["min_samples_split"])
        print("min_samples_split:%s"%ms)
        ml = choice(param_dist["min_samples_leaf"])
        print("min_samples_leaf:%s\n"%ml)

        rf = RandomForestRegressor(n_estimators=200,n_jobs=3,verbose=0,
                max_depth = md,
                max_features=mf,
                min_samples_split=ms,
                min_samples_leaf=ml,
               )
        rf.fit(X_train,Y_train)

        Y_train_pred = rf.predict(X_train)
        Y_valid_pred = rf.predict(X_valid)
        train_error = rmsle(Y_train_pred,Y_train)
        valid_error = rmsle(Y_valid_pred,Y_valid)
        print("Train rmsle:%s"%train_error)
        # Fixed label: this line reports the validation error, not the training error.
        print("Valid rmsle:%s\n"%valid_error)
        f.write("iteration:%s max_depth:%s max_features:%s min_samples_split:%s "\
        "min_samples_leaf:%s "\
        "train_error:%s valid_error:%s\n"%(i,md,mf,ms,ml,train_error,valid_error))

        rolling_hyperparams.append([md,mf,ms,ml,train_error,valid_error])
# Stack the per-iteration rows into one 2-D array (one row per iteration).
# The original called `.vstack` on the list itself, which raises AttributeError.
rolling_hyperparams = np.vstack(rolling_hyperparams)
pickle_out(os.path.join("../","models","random_forest_simple_hyperparams.pkl"),rolling_hyperparams)

Iteration:0

max_depth:25
max_features:11
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.355864588928
Train rmsle:0.47874231602

Iteration:1

max_depth:15
max_features:7
min_samples_split:1
min_samples_leaf:1

Train rmsle:0.369293766445
Train rmsle:0.580033228717

Iteration:2

max_depth:30
max_features:6
min_samples_split:2
min_samples_leaf:1

Train rmsle:0.282360322424
Train rmsle:0.565542306743

Iteration:3

max_depth:30
max_features:6
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.478060988588
Train rmsle:0.610556677743

Iteration:4

max_depth:None
max_features:6
min_samples_split:1
min_samples_leaf:2

Train rmsle:0.494751769337
Train rmsle:0.634587514056

Iteration:5

max_depth:30
max_features:7
min_samples_split:2
min_samples_leaf:1

Train rmsle:0.268502970449
Train rmsle:0.539806381725

Iteration:6

max_depth:20
max_features:14
min_samples_split:2
min_samples_leaf:1

Train rmsle:0.206943120846
Train rmsle:0.38938199772

Iteration:7

max_depth:15
max_features:11
min_

KeyboardInterrupt: 

In [82]:
# Sweep the ensemble size and report the validation RMSLE for each setting.
# Every other forest hyperparameter stays at the library default.
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]

for nr in number_of_trees:
    rf = RandomForestRegressor(n_estimators=nr)
    rf.fit(X_train, Y_train)

    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    # One output line per setting: "<number of trees> <validation RMSLE>".
    print("%s %s" % (nr, valid_score))

1 0.524246358104
2 0.434469563019
3 0.395799150779
5 0.380644031519
10 0.351612307916
50 0.356765725173
100 0.347071524506
200 0.341765233624
300 0.34752623493
500 0.344099122685


KeyboardInterrupt: 

In [218]:
# Columns that receive a label-encoding step followed by a one-hot step.
encode_variables = ['season','weather','holiday','workingday','date_year','date_weekday','day_night',
                    'date_month']
encoding_steps = []
for var in encode_variables:
    encoding_steps.append(('encode_label_%s'%var,prep.PandasLabelEncoder(colname=var)))
    encoding_steps.append(('encode_one_hot_%s'%var,prep.PandasOneHotEncoder(colname=var)))

# Assemble the end-to-end pipeline: time extraction, encoding, dropping the raw
# "datetime" column, and finally the random forest regressor.
# NOTE(review): because the pipeline starts with ExtractTimes and drops "datetime",
# it looks like it expects the raw X_train / X_valid from the loading cell rather
# than the feature-engineered frames -- confirm the intended execution order.
final_steps = [('extract_times', prep.ExtractTimes())]
final_steps.extend(encoding_steps)
final_steps.append(('drop_columns', prep.DropColumns(colnames = ["datetime"])))
final_steps.append(('random_forest',
                    RandomForestRegressor(n_jobs=3,
                                          oob_score=True,
                                          n_estimators=500,
                                          max_features=49,
                                          min_samples_split = 3,
                                          min_samples_leaf=3,
                                          max_depth=25)))

final_pipe = Pipeline(final_steps)
final_pipe.fit(X_train,Y_train)

# RMSLE on the training split, then on the validation split.
Y_train_pred = final_pipe.predict(X_train)
result_train = rmsle(Y_train_pred,Y_train)
Y_valid_pred = final_pipe.predict(X_valid)
result_valid = rmsle(Y_valid_pred,Y_valid)
print(result_train)
print(result_valid)

0.231232339306
0.281295493245


In [219]:
# Store the fitted end-to-end pipeline under the models folder.
model_filepath = os.path.join(
    "../",
    "models",
    "random_forest_weekdays_pipeline.pkl",
)
# NOTE(review): the keyword is spelled `compresion_mode`; it is kept as-is because it
# must match the signature of the project's pickle_out helper -- verify the spelling there.
pickle_out(
    model_filepath,
    final_pipe,
    compresion_mode=5,
)