In [1]:
%load_ext autoreload
%autoreload 2

# from __future__ import absolute_import
import sys
sys.path.append("../")

import os

import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestRegressor as RF
from sklearn.grid_search import RandomizedSearchCV as RS
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error

from scipy.stats import randint as sp_randint

from utils.evaluation_utils import rmsle
from utils.generic_utils import pickle_out
from utils.preprocessing_utils import preprocess_weekdays_important,preprocess_weekdays_important_strict
# import seaborn as sb

In [2]:
data_folder = os.path.join("../","datasets","initial_data_split")

train = pd.read_csv(os.path.join(data_folder,"train.csv"))
valid = pd.read_csv(os.path.join(data_folder,"valid.csv"))

X_train,Y_train = preprocess_weekdays_important(train,mode="train")
X_valid,Y_valid = preprocess_weekdays_important(valid,mode="train")

print X_train.shape,Y_train.shape

print X_train.shape,Y_train.shape

(7452, 8) (7452,)
(7452, 8) (7452,)


In [3]:
X_train.head(5)

Unnamed: 0,workingday,temp,atemp,humidity,time_hour,date_weekday,date_month,date_year
0,0,9.84,14.395,81,0,5,1,2011
1,0,9.02,13.635,80,1,5,1,2011
2,0,9.02,13.635,80,2,5,1,2011
3,0,9.84,14.395,75,3,5,1,2011
4,0,9.84,14.395,75,4,5,1,2011


In [4]:
rf = RF()
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [5]:
Y_train_pred = rf.predict(X_train)
Y_valid_pred = rf.predict(X_valid)

print rmsle(Y_train_pred,Y_train)
print rmsle(Y_valid_pred,Y_valid)

0.174503171831
0.284703216384


In [6]:
steps = [('random_forest', RF(n_estimators=100))]

pipe= Pipeline(steps)

param_dist = {"random_forest__max_depth": [5,10,15,20],
              "random_forest__max_features": sp_randint(1, 6),
              "random_forest__min_samples_split": sp_randint(1, 6),
              "random_forest__min_samples_leaf": sp_randint(1, 6)
              }

rf_cv = RS(pipe, 
           param_distributions=param_dist,
           n_iter=100,           
           cv=5,
           verbose=1,
           n_jobs=3,
          )
rf_cv.fit(X_train, Y_train)

Y_valid_pred = rf_cv.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   23.8s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  1.8min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  4.0min
[Parallel(n_jobs=3)]: Done 500 out of 500 | elapsed:  4.5min finished


In [7]:
print rf_cv.best_params_
print valid_score

{'random_forest__min_samples_split': 3, 'random_forest__min_samples_leaf': 1, 'random_forest__max_features': 5, 'random_forest__max_depth': 20}
0.284295749936


In [8]:
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]

for nr in number_of_trees:
    rf = RF(n_estimators=nr,
            max_features = 5,
            min_samples_split = 3,
            min_samples_leaf = 1,
            max_depth = 20
           )
    rf.fit(X_train, Y_train)
    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    print nr, valid_score

1 0.483369878553
2 0.336238068307
3 0.327379588048
5 0.322107132102
10 0.303635996078
50 0.285588244462
100 0.282819549007
200 0.283556335039
300 0.284580393186
500 0.282991756713
1000 0.283100314592


In [9]:
rf = RF(n_estimators = 500,
        max_features = 5,
        min_samples_split = 3,
        min_samples_leaf = 1,
        max_depth = 20,
       )
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=5, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=3, min_weight_fraction_leaf=0.0,
           n_estimators=500, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [10]:
Y_valid_pred = rf.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)
print valid_score

0.282781195143


In [11]:
model_filepath = os.path.join("../","models","random_forest_weekdays_important.pkl")
pickle_out(model_filepath,rf,compresion_mode=9)

In [3]:
data_folder = os.path.join("../","datasets","initial_data_split")

train = pd.read_csv(os.path.join(data_folder,"train.csv"))
valid = pd.read_csv(os.path.join(data_folder,"valid.csv"))

X_train,Y_train = preprocess_weekdays_important_strict(train,mode="train")
X_valid,Y_valid = preprocess_weekdays_important_strict(valid,mode="train")

print X_train.shape,Y_train.shape

print X_train.shape,Y_train.shape

(7452, 5) (7452,)
(7452, 5) (7452,)


In [4]:
steps = [('random_forest', RF(n_estimators=100))]

pipe= Pipeline(steps)

param_dist = {"random_forest__max_depth": [5,10,15,20],
              "random_forest__max_features": sp_randint(1, 6),
              "random_forest__min_samples_split": sp_randint(1, 6),
              "random_forest__min_samples_leaf": sp_randint(1, 6)
              }

rf_cv = RS(pipe, 
           param_distributions=param_dist,
           n_iter=100,           
           cv=5,
           verbose=1,
           n_jobs=3,
          )
rf_cv.fit(X_train, Y_train)

Y_valid_pred = rf_cv.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


[Parallel(n_jobs=3)]: Done  44 tasks      | elapsed:   25.3s
[Parallel(n_jobs=3)]: Done 194 tasks      | elapsed:  1.7min
[Parallel(n_jobs=3)]: Done 444 tasks      | elapsed:  3.8min
[Parallel(n_jobs=3)]: Done 500 out of 500 | elapsed:  4.3min finished


In [5]:
print rf_cv.best_params_
print valid_score

{'random_forest__min_samples_split': 4, 'random_forest__min_samples_leaf': 1, 'random_forest__max_features': 3, 'random_forest__max_depth': 20}
0.385257877407


In [6]:
number_of_trees = [1,2,3,5,10,50,100,200,300,500,1000]

for nr in number_of_trees:
    rf = RF(n_estimators=nr,
            max_features = 3,
            min_samples_split = 4,
            min_samples_leaf = 1,
            max_depth = 20
           )
    rf.fit(X_train, Y_train)
    Y_valid_pred = rf.predict(X_valid)
    valid_score = rmsle(Y_valid_pred,Y_valid)
    print nr, valid_score

1 0.492437471848
2 0.433237980401
3 0.428760870021
5 0.413447186146
10 0.393340235553
50 0.39361298094
100 0.390330362741
200 0.385030443076
300 0.385757349051
500 0.386418033198
1000 0.385885465733


In [7]:
rf = RF(n_estimators = 1000,
        max_features = 3,
        min_samples_split = 4,
        min_samples_leaf = 1,
        max_depth = 20,
       )
rf.fit(X_train,Y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=20,
           max_features=3, max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=4, min_weight_fraction_leaf=0.0,
           n_estimators=1000, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [8]:
Y_valid_pred = rf.predict(X_valid)
valid_score = rmsle(Y_valid_pred,Y_valid)
print valid_score

0.385521517844


In [9]:
model_filepath = os.path.join("../","models","random_forest_weekdays_important_strict.pkl")
pickle_out(model_filepath,rf,compresion_mode=9)