In [1]:
# Fitting a random forest classifier to the data

import numpy as np
import pandas as pd
import matplotlib.pyplot as pp
import datetime as dt

from sklearn import ensemble

%matplotlib inline

## Data import and cleaning

In [3]:
# Import the training data and merge with the properties data

df_train = pd.read_csv('../data/train_2016_v2.csv')
df_properties = pd.read_csv('../data/properties_2016.csv')
df_train = df_train.merge(df_properties, on=['parcelid'])

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
# drop the columns with non-numeric data
df_train = df_train.drop(df_train.loc[:,df_train.dtypes == 'object'], axis=1)

In [5]:
# remove nans by filling them with the most negative value possible
df_train = df_train.fillna(df_train.min().min()-1)

In [6]:
df_train.shape

(90275, 54)

## Random Forest Regressor

#### Function to create random samples of the data for  training and testing:

In [7]:
def create_random_train_test_samples(df, train_frac):
    len_df = df.shape[0]
    # perform a random permutation of the dataframe
    a = df.sample(frac=1).reset_index()
    # take first frac rows for test and rest for train 
    # return (X_train, X_test, y_train, y_test)
    return (a.loc[:int(train_frac*len_df),:].drop(['logerror'], axis = 1).as_matrix(), 
            a.loc[int(train_frac*len_df):,:].drop(['logerror'], axis = 1).as_matrix(),
            a.loc[:int(train_frac*len_df),'logerror'].as_matrix(),
            a.loc[int(train_frac*len_df):,'logerror'].as_matrix())


In [8]:
X_train, X_test, y_train, y_test = create_random_train_test_samples(df_train, 0.8)

### Random forest with the default parameters (except max_features)

In [9]:
RFR = ensemble.RandomForestRegressor(n_estimators=10,
                                     criterion='mse',
                                     max_features='sqrt',
                                     max_depth=None,
                                     min_samples_split=2)                                                            

In [10]:
# Fit the RF model using the training data:
RFR.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='sqrt', max_leaf_nodes=None, min_samples_leaf=1,
           min_samples_split=2, min_weight_fraction_leaf=0.0,
           n_estimators=10, n_jobs=1, oob_score=False, random_state=None,
           verbose=0, warm_start=False)

In [12]:
# Predict the values of y in the test set and compute the Mean Absolute Error
y_pred = RFR.predict(X_test)

In [13]:
def MAE(y_pred, y_act):
    return np.sum(np.abs(y_act - y_pred))/y_act.size

In [14]:
MAE(y_pred, y_test)

0.079362190054199463

### Tuning the RF model by increading # estimators and fitting model parameters using cross validation

Here, looking at the effect of increasing the parameter min_samples_split

In [190]:
for i in range(2,40,4):
    RFR = ensemble.RandomForestRegressor(n_estimators=20,
                                         criterion='mse',
                                         max_features='sqrt',
                                         max_depth=None,
                                         min_samples_split=i)  
    
    RFR.fit(X_train, y_train)
    y_pred = RFR.predict(X_test)
    print('MAE of {:4} for min_samples_split={}'.format(MAE(y_pred, y_test), i))

MAE of 0.0762055304339 for min_samples_split=2
MAE of 0.0742160303717 for min_samples_split=6
MAE of 0.0733180305649 for min_samples_split=10
MAE of 0.0728908807559 for min_samples_split=14
MAE of 0.0722275760278 for min_samples_split=18
MAE of 0.0716946035025 for min_samples_split=22
MAE of 0.0717987088817 for min_samples_split=26
MAE of 0.071417122531 for min_samples_split=30
MAE of 0.0712103081503 for min_samples_split=34
MAE of 0.0708480684225 for min_samples_split=38


Below, so something similar but with greater range for min_samples_split and more estimators. Also, for each hyperparamter, do model fitting and evaluation n_trials = 5 times and get average MAE:

In [195]:
n_trials = 5
for i in range(2,100,8):
    
    mae = 0
    for j in range(n_trials):
        X_train, X_test, y_train, y_test = create_random_train_test_samples(df_train, 0.8)
        RFR = ensemble.RandomForestRegressor(n_estimators=50,
                                             criterion='mse',
                                             max_features='sqrt',
                                             max_depth=None,
                                             min_samples_split=i)  
    
        RFR.fit(X_train, y_train)
        y_pred = RFR.predict(X_test)
        mae = mae + (MAE(y_pred, y_test)/float(n_trials))
    
    
    print('MAE of {:4} for min_samples_split={}'.format(mae, i))

MAE of 0.0732362886223 for min_samples_split=2
MAE of 0.0712513126889 for min_samples_split=10
MAE of 0.0701304929212 for min_samples_split=18
MAE of 0.0698025453081 for min_samples_split=26
MAE of 0.0691126743413 for min_samples_split=34
MAE of 0.0693640551174 for min_samples_split=42
MAE of 0.068903769104 for min_samples_split=50
MAE of 0.0693977730426 for min_samples_split=58
MAE of 0.0686006818747 for min_samples_split=66
MAE of 0.0689458431241 for min_samples_split=74
MAE of 0.0689833502499 for min_samples_split=82
MAE of 0.0691498850663 for min_samples_split=90
MAE of 0.0687408404196 for min_samples_split=98


In [None]:
pp.figure()
pp.plot(range(2,100,8), [],'x')
pp.xlabel('min_samples_split')
pp.ylabel('MAE')
pp.show()

Based on above, pick min_samples_split = 20 and fit model with a lot of estimators:

In [15]:
X_train, X_test, y_train, y_test = create_random_train_test_samples(df_train, 0.8)
RFR = ensemble.RandomForestRegressor(n_estimators=500,
                                    criterion='mse',
                                    max_features='sqrt',
                                    max_depth=None,
                                    min_samples_split=20)  
    
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)
MAE(y_pred, y_test)
    

0.068784122686759305

Get the feature importance of each feature:

In [17]:
df_train.columns[RFR.feature_importances_ > 0.05]

Index(['parcelid', 'logerror', 'calculatedfinishedsquarefeet', 'latitude',
       'longitude', 'lotsizesquarefeet', 'structuretaxvaluedollarcnt',
       'taxvaluedollarcnt', 'landtaxvaluedollarcnt', 'taxamount'],
      dtype='object')