# Random Forest Regression

## Importing the libraries

In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from tqdm import tqdm
import math
import gc

## Importing the dataset

In [2]:
# Load the pre-processed training and test tables, one-hot encode the
# categorical columns, and expose plain NumPy arrays for scikit-learn.
catFeatures = ['location']

# --- Training data ---
train = pd.read_csv('worked/train_worked.csv', index_col='ID')
# Take the label out by name rather than by column position, so the cell
# does not silently pick up a different column if the CSV layout changes.
y_train = train.pop('target').values
train = pd.get_dummies(train, columns=catFeatures)
# Remember the exact feature layout (names and order) the model is trained on.
feature_columns = train.columns.tolist()
x_train = train.values
# Re-attach the label as the last column so the inspection cells that follow
# still show features and target together.
train['target'] = y_train

# --- Test data ---
test = pd.read_csv('worked/test_worked.csv', index_col='ID')
# Drop the target column if the test file carries one; tolerate its absence.
test = test.drop(columns='target', errors='ignore')
test = pd.get_dummies(test, columns=catFeatures)
# The two frames are encoded independently, so a category missing from one of
# them would give x_test a different width/order than x_train. Align the test
# frame to the training feature columns, filling absent dummies with 0.
test = test.reindex(columns=feature_columns, fill_value=0)
x_test = test.values


In [3]:
# Preview the first five rows of the one-hot encoded training frame
# (feature columns followed by the re-attached `target` column).
train.head(5)

Unnamed: 0_level_0,max_temp,min_temp,mean_temp,std_temp,var_temp,median_temp,ptp_temp,max_precip,min_precip,mean_precip,...,std_atmos_press,var_atmos_press,median_atmos_press,ptp_atmos_press,location_A,location_B,location_C,location_D,location_E,target
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ID_train_0,27.208333,19.275,22.299527,2.594011,6.728895,21.358333,7.933333,0.0,0.0,0.0,...,0.070374,0.004952,87.762083,0.2575,0,0,1,0,0,45.126304
ID_train_1,33.616667,17.983333,24.679063,4.266955,18.206903,23.791667,15.633333,0.561,0.0,0.007025,...,0.156,0.024336,90.429167,0.668333,0,0,0,1,0,79.131702
ID_train_10,31.841667,18.458333,24.112317,3.776377,14.26102,23.304167,13.383333,7.804,0.0,0.151767,...,0.179481,0.032213,88.425,0.830833,1,0,0,0,0,32.661304
ID_train_100,27.491667,16.941667,20.845273,3.08014,9.487259,19.541667,10.55,25.787,0.0,1.127273,...,0.161758,0.026166,88.4,0.72,1,0,0,0,0,53.850238
ID_train_1000,28.75,17.525,21.870732,3.28687,10.803515,20.95,11.225,0.136,0.0,0.0051,...,0.118369,0.014011,88.5525,0.450833,1,0,0,0,0,177.41875


In [4]:
# List the column labels of the training frame after encoding.
train.columns

Index(['max_temp', 'min_temp', 'mean_temp', 'std_temp', 'var_temp',
       'median_temp', 'ptp_temp', 'max_precip', 'min_precip', 'mean_precip',
       'std_precip', 'var_precip', 'median_precip', 'ptp_precip',
       'max_rel_humidity', 'min_rel_humidity', 'mean_rel_humidity',
       'std_rel_humidity', 'var_rel_humidity', 'median_rel_humidity',
       'ptp_rel_humidity', 'max_wind_dir', 'min_wind_dir', 'mean_wind_dir',
       'std_wind_dir', 'var_wind_dir', 'median_wind_dir', 'ptp_wind_dir',
       'max_wind_spd', 'min_wind_spd', 'mean_wind_spd', 'std_wind_spd',
       'var_wind_spd', 'median_wind_spd', 'ptp_wind_spd', 'max_atmos_press',
       'min_atmos_press', 'mean_atmos_press', 'std_atmos_press',
       'var_atmos_press', 'median_atmos_press', 'ptp_atmos_press',
       'location_A', 'location_B', 'location_C', 'location_D', 'location_E',
       'target'],
      dtype='object')

In [5]:
# (number of rows, number of columns) of the training frame.
train.shape

(15539, 48)

In [6]:
# Display the feature matrix: a NumPy array holding every column of `train`
# except the last one.
x_train

array([[27.20833333, 19.275     , 22.29952651, ...,  1.        ,
         0.        ,  0.        ],
       [33.61666667, 17.98333333, 24.67906336, ...,  0.        ,
         1.        ,  0.        ],
       [31.84166667, 18.45833333, 24.11231692, ...,  0.        ,
         0.        ,  0.        ],
       ...,
       [34.10833333, 16.95      , 24.63708333, ...,  0.        ,
         1.        ,  0.        ],
       [30.9       , 15.89166667, 23.47293388, ...,  0.        ,
         1.        ,  0.        ],
       [30.45      , 19.03333333, 24.01260331, ...,  1.        ,
         0.        ,  0.        ]])

## Splitting the dataset into the Training set and Test set, then training and cross-validating the model

In [7]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score, train_test_split

# One seed for both the split and the forest, so a fresh "Restart & Run All"
# reproduces the same hold-out set and therefore the same scores.
SEED = 32

# Hold out 20% of the labelled rows for the final evaluation cells.
x_trainer, x_tester, y_trainer, y_tester = train_test_split(
    x_train, y_train, test_size=0.20, random_state=SEED
)

# Fit the forest on the remaining 80%; this fitted `reg` is the model used
# for the hold-out predictions later on.
reg = RandomForestRegressor(n_estimators=150, random_state=SEED)
reg.fit(x_trainer, y_trainer)

# 5-fold cross-validation on the training portion only. cross_val_score works
# on clones of the estimator, so the fitted `reg` above is left untouched.
# With no `scoring` argument the estimator's own score (R^2 for regressors)
# is reported for each fold.
cvs = cross_val_score(reg, x_trainer, y_trainer, cv=5)

print(cvs)
print(np.mean(cvs))

[0.57434551 0.56924536 0.48979925 0.57988844 0.53763919]
0.5501835497908841


In [8]:
# Predict on the held-out rows and print the result next to the true values:
# left column = prediction, right column = actual target.
np.set_printoptions(precision=2)  # two decimals are enough for eyeballing
y_pred = reg.predict(x_tester)
side_by_side = np.column_stack((y_pred, y_tester))
print(side_by_side)

[[49.89 58.12]
 [62.47 49.35]
 [67.61 62.61]
 ...
 [32.49 55.49]
 [55.24 41.18]
 [44.14 29.57]]


In [9]:
from sklearn import metrics

# Error metrics on the hold-out set. scikit-learn metric functions take
# (y_true, y_pred) in that order; MAE and MSE happen to be symmetric, but
# keeping the documented order avoids wrong results if an asymmetric metric
# (e.g. R^2) is added here later.
mae = metrics.mean_absolute_error(y_tester, y_pred)
mse = metrics.mean_squared_error(y_tester, y_pred)

print('Mean Absolute Error:', mae)
print('Mean Squared Error:', mse)
# RMSE is derived from the MSE computed above instead of recomputing it.
print('Root Mean Squared Error:', np.sqrt(mse))

Mean Absolute Error: 15.90902412087097
Mean Squared Error: 686.3402187083042
Root Mean Squared Error: 26.19809570767128


## Training the Random Forest Regression model on the whole dataset

## Predicting the Test set results

## Evaluating the Model Performance