### Simple Example of random forest

Using the deer kill counts data set. 

* Shows how to use sine and cosine features to encode cyclic variables 

In [1]:
import numpy as np
import pandas as pd
from sklearn import datasets
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('default')
from sklearn.model_selection import train_test_split

In [2]:
try:
    import feather
except:
    !pip install feather-format
    import feather

In [3]:
import os
from urllib.request import urlretrieve
def get_if_not_there(filename='deer_train.feather'):
    """Fetch ``filename`` from the dl_book data folder unless a local copy exists.

    Args:
        filename: Name of the file, used both as the last component of the
            download URL and as the local path it is saved to.

    Returns:
        None. The file is written to ``filename`` as a side effect.
    """
    if os.path.isfile(filename):
        # A local copy is already present; nothing to download.
        return
    source_url = 'https://raw.githubusercontent.com/tensorchiefs/dl_book/master/data/{}'.format(filename)
    urlretrieve(source_url, filename=filename)

# Make sure both data files are available locally (downloaded only if missing).
get_if_not_there(filename='deer_train.feather')
get_if_not_there(filename='deer_test.feather')

In [4]:
# Read the training and test frames from the local feather files.
df_train, df_test = (
    feather.read_dataframe(path)
    for path in ('deer_train.feather', 'deer_test.feather')
)
df_train  # the `time` column is in days

Unnamed: 0,wild,year,time,daytime,weekday
0,0,2002.0,0.000000,night.am,Sunday
1,0,2002.0,0.020833,night.am,Sunday
2,0,2002.0,0.041667,night.am,Sunday
3,1,2002.0,0.062500,night.am,Sunday
4,3,2002.0,0.083333,night.am,Sunday
...,...,...,...,...,...
140251,2,2009.0,2921.895833,night.pm,Sunday
140252,1,2009.0,2921.916667,night.pm,Sunday
140253,0,2009.0,2921.937500,night.pm,Sunday
140254,1,2009.0,2921.958333,night.pm,Sunday


In [5]:
# Optional cyclic encoding
def encode(data, col, max_val):
    """Add sine/cosine encodings of a cyclic column to ``data``.

    The column ``col`` is mapped to an angle ``2 * pi * data[col] / max_val``
    and two new columns, ``col + '_sin'`` and ``col + '_cos'``, are written.

    Args:
        data: Frame that holds ``col``; it is modified in place.
        col: Name of the column to encode.
        max_val: Length of one full cycle, in the same units as ``col``.

    Returns:
        The same ``data`` object that was passed in, now with the two
        additional columns.
    """
    angle = 2 * np.pi * data[col] / max_val
    for suffix, trig in (('_sin', np.sin), ('_cos', np.cos)):
        data[col + suffix] = trig(angle)
    return data

# Add `time_sin` / `time_cos` to both frames, using a period of 365 (days).
# The result is bound back to `df_train` (previously it was assigned to a
# misspelled, unused name and only worked because `encode` mutates in place).
df_train = encode(df_train, 'time', 365)
df_test = encode(df_test, 'time', 365)
df_test[0:4]

Unnamed: 0,wild,year,time,daytime,weekday,time_sin,time_cos
0,0,2010.0,2922.0,night.am,Sunday,0.034422,0.999407
1,2,2010.0,2922.020833,night.am,Sunday,0.03478,0.999395
2,2,2010.0,2922.041667,night.am,Sunday,0.035138,0.999382
3,2,2010.0,2922.0625,night.am,Sunday,0.035497,0.99937


#### Encoding of the categorical features

In [6]:
# Targets: the first column of each frame (`wild` in the displayed table).
y_train = df_train.iloc[:, 0].to_numpy(dtype='float32')
y_test = df_test.iloc[:, 0].to_numpy(dtype='float32')

# Features: everything from the third column on, i.e. the target and the
# year are left out. Categorical columns are one-hot encoded.
X_train = pd.get_dummies(df_train.iloc[:, 2:])
X_test = pd.get_dummies(df_test.iloc[:, 2:])

# The two frames are encoded independently, so make sure the test matrix has
# exactly the training columns in the training order. A category that is
# missing from the test data becomes an all-zero column.
X_test = X_test.reindex(columns=X_train.columns, fill_value=0)

# Rescale the raw time column (first feature column) by a fixed constant.
# Training values end up in roughly [0, 1]; test values lie beyond the
# training period and therefore exceed 1.
TIME_SCALE = 2922.02
X_train.iloc[:, 0] = X_train.iloc[:, 0] / TIME_SCALE
X_test.iloc[:, 0] = X_test.iloc[:, 0] / TIME_SCALE
X_test

Unnamed: 0,time,time_sin,time_cos,daytime_day.am,daytime_night.am,daytime_pre.sunrise.am,daytime_post.sunrise.am,daytime_day.pm,daytime_night.pm,daytime_pre.sunset.pm,daytime_post.sunset.pm,weekday_Monday,weekday_Tuesday,weekday_Wednesday,weekday_Thursday,weekday_Friday,weekday_Saturday,weekday_Sunday
0,0.999993,0.034422,0.999407,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
1,1.000000,0.034780,0.999395,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
2,1.000007,0.035138,0.999382,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
3,1.000015,0.035497,0.999370,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
4,1.000022,0.035855,0.999357,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
35035,1.249785,0.032629,0.999468,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
35036,1.249792,0.032988,0.999456,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
35037,1.249799,0.033346,0.999444,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1
35038,1.249806,0.033705,0.999432,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1


In [7]:
# Transformation to numerical
X_train = X_train.to_numpy(dtype='float32')
X_test = X_test.to_numpy(dtype='float32')
del df_train, df_test
X_train.shape,X_test.shape

((140256, 18), (35040, 18))

In [8]:
from sklearn.ensemble import RandomForestRegressor

# Random forest with 100 trees and a fixed seed.
model = RandomForestRegressor(
    n_estimators=100,
    random_state=0,
)

In [9]:
# Fit the forest on the training features and targets.
model.fit(X_train, y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
                      max_features='auto', max_leaf_nodes=None,
                      min_impurity_decrease=0.0, min_impurity_split=None,
                      min_samples_leaf=1, min_samples_split=2,
                      min_weight_fraction_leaf=0.0, n_estimators=100,
                      n_jobs=None, oob_score=False, random_state=0, verbose=0,
                      warm_start=False)

In [10]:
# Predictions for every row of the test feature matrix.
res = model.predict(X_test)

In [11]:
# Peek at the first ten test predictions.
res[:10]

array([1.65, 1.64, 1.65, 1.67, 1.68, 1.66, 1.69, 1.72, 1.69, 1.62])

In [12]:
# Mean squared error on the test set.
# Recorded values: 11.5320 without, 8.8955 with the feature engineering.
np.mean(np.square(res - y_test))

8.895538045091325

In [13]:
# Gaussian negative log-likelihood on the test set, using one constant
# variance `sig2` taken from the mean squared residual on the training data.
# NOTE(review): `sig2` comes from in-sample predictions of the fitted forest;
# confirm that this is the intended variance estimate.
# Recorded values: 7.708444 without, 6.94 with the feature engineering.
res_train = model.predict(X_train)
sig2 = np.mean(np.square(res_train - y_train))
(
    0.5 * np.log(2 * np.pi * sig2)
    + 0.5 * np.mean(np.square(y_test - res.ravel())) / sig2
)

6.941875609531877