In [5]:
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.grid_search import RandomizedSearchCV as RS
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle
from utils.generic_utils import pickle_out
from utils.preprocessing_utils import preprocess_weekdays,generate_timelag_data
# import seaborn as sb

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
data_folder = os.path.join("../","datasets","initial_data_split")

train = pd.read_csv(os.path.join(data_folder,"train.csv"))
valid = pd.read_csv(os.path.join(data_folder,"valid.csv"))

X_train,Y_train = preprocess_weekdays(train,mode="train")
X_valid,Y_valid = preprocess_weekdays(valid,mode="train")

X_train = generate_timelag_data(X_train,lag=5)
X_valid = generate_timelag_data(X_valid,lag=5)

X_train = X_train[['time_hour','date_year', 'workingday', 'atemp', 'date_month', 'temp',
 'date_weekday', 'temp_lag_1', 'atemp_lag_1', 'humidity', 'season',
 'humidity_lag_1', 'weather', 'weather_lag_1', 'atemp_lag_3']]

X_valid = X_valid[['time_hour','date_year', 'workingday', 'atemp', 'date_month', 'temp',
 'date_weekday', 'temp_lag_1', 'atemp_lag_1', 'humidity', 'season',
 'humidity_lag_1', 'weather', 'weather_lag_1', 'atemp_lag_3']]

print X_train.shape,Y_train.shape

print X_train.shape,Y_train.shape

   season  holiday  workingday  weather  temp   atemp  humidity  windspeed  \
0       1        0           0        1  9.84  14.395        81        0.0   
1       1        0           0        1  9.02  13.635        80        0.0   
2       1        0           0        1  9.02  13.635        80        0.0   
3       1        0           0        1  9.84  14.395        75        0.0   
4       1        0           0        1  9.84  14.395        75        0.0   

   time_hour  date_weekday  date_month  date_year  
0          0             5           1       2011  
1          1             5           1       2011  
2          2             5           1       2011  
3          3             5           1       2011  
4          4             5           1       2011  
(7452, 15) (7452,)
(7452, 15) (7452,)


In [9]:
X_train.head(5)

Unnamed: 0,time_hour,date_year,workingday,atemp,date_month,temp,date_weekday,temp_lag_1,atemp_lag_1,humidity,season,humidity_lag_1,weather,weather_lag_1,atemp_lag_3
0,0,2011,0,14.395,1,9.84,5,9.84,14.395,81,1,81.0,1,1.0,14.395
1,1,2011,0,13.635,1,9.02,5,9.84,14.395,80,1,81.0,1,1.0,14.395
2,2,2011,0,13.635,1,9.02,5,9.02,13.635,80,1,80.0,1,1.0,14.395
3,3,2011,0,14.395,1,9.84,5,9.02,13.635,75,1,80.0,1,1.0,14.395
4,4,2011,0,14.395,1,9.84,5,9.84,14.395,75,1,75.0,1,1.0,13.635


In [10]:
rf = RF()
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [11]:
Y_train_pred = rf.predict(X_train)
Y_valid_pred = rf.predict(X_valid)

print rmsle(Y_train_pred,Y_train)
print rmsle(Y_valid_pred,Y_valid)

0.174682998421
0.275766625857


In [12]:
steps = [('random_forest', RF(n_estimators=100))]

pipe= Pipeline(steps)

param_dist = {"random_forest__max_depth": [5,10,15,20],
              "random_forest__max_features": sp_randint(1, 6),
              "random_forest__min_samples_split": sp_randint(1, 6),
              "random_forest__min_samples_leaf": sp_randint(1, 6)
              }

rf_cv = RS(pipe, 
           param_distributions=param_dist,
           n_iter=100,           
           cv=5,
           verbose=1,
           n_jobs=3,
          )
rf_cv.fit(X_train, Y_train)

Y_valid_pred = rf_cv.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   26.9s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  1.6min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  3.8min
[Parallel(n_jobs=3)]: Done 500 out of 500 | elapsed:  4.4min finished


In [13]:
print rf_cv.best_params_
print valid_score

{'random_forest__min_samples_split': 2, 'random_forest__min_samples_leaf': 1, 'random_forest__max_features': 5, 'random_forest__max_depth': 20}
0.414095433405


In [14]:
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]

for nr in number_of_trees:
    rf = RF(n_estimators=nr,
            max_features = 5,
            min_samples_split = 2,
            min_samples_leaf = 1,
            max_depth = 20
           )
    rf.fit(X_train, Y_train)
    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    print nr, valid_score

1 0.511338802242
2 0.542134872016
3 0.539792152793
5 0.460386022827
10 0.485282787597
50 0.40650818771
100 0.413724499659
200 0.42054596885
300 0.42145827527
500 0.417096210273
1000 0.421804169873


In [17]:
rf = RF(n_estimators = 100,
        max_features = 5,
        min_samples_split = 2,
        min_samples_leaf = 1,
        max_depth = 20,
       )
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=5, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=100, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [18]:
Y_valid_pred = rf.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)
print valid_score

0.429213235563


In [19]:
model_filepath = os.path.join("../","models","random_forest_weekdays_lagged.pkl")
pickle_out(model_filepath,rf,compresion_mode=9)