# Decision Tree Regression Model

## Import Libraries

In [1]:
import pandas as pd
import numpy as np

#build the model, import decision tree
from sklearn.tree import DecisionTreeRegressor, DecisionTreeClassifier
from sklearn.metrics import accuracy_score

#get error metrics to evaluate model
#from sklearn.metrics import mean_absolute_error, r2_score, mean_squared_error
from sklearn import metrics

In [2]:
# Outlier-free variant: train/test features and targets, read in the same
# order as the names on the left-hand side.
X_train, X_test, y_train, y_test = (
    pd.read_csv(f'../dataset/no_outliers/{split}.csv')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
)

# Same four split files, taken from the variant that keeps the outliers.
X_train_with, X_test_with, y_train_with, y_test_with = (
    pd.read_csv(f'../dataset/with_outliers/{split}.csv')
    for split in ('X_train', 'X_test', 'y_train', 'y_test')
)

In [3]:
# Preview the training features loaded from the no_outliers split
# (the rendered table is truncated in the middle).
X_train

Unnamed: 0,season,year,month,holiday,weekday,weather,temperature,humidity,windspeed
0,0.000000,1.0,0.000000,0.0,0.666667,0.0,0.165650,0.318182,0.468072
1,0.333333,1.0,0.363636,0.0,1.000000,0.0,0.639257,0.293661,0.212724
2,0.666667,0.0,0.545455,0.0,0.000000,0.0,0.835451,0.471293,0.533538
3,0.666667,0.0,0.636364,0.0,0.333333,0.0,0.734190,0.391148,0.217658
4,0.000000,0.0,0.000000,0.0,0.333333,0.5,0.199404,0.840910,0.274944
...,...,...,...,...,...,...,...,...,...
573,1.000000,1.0,1.000000,0.0,1.000000,0.0,0.335474,0.537680,0.168555
574,0.666667,0.0,0.545455,0.0,0.500000,0.0,0.870259,0.510766,0.273255
575,1.000000,1.0,0.727273,0.0,0.833333,0.5,0.708874,0.594498,0.320748
576,0.666667,1.0,0.545455,0.0,0.833333,0.5,0.767942,0.815790,0.438620


In [4]:
# Regressor instance used by the fitting cells below; random_state is pinned
# so that repeated fits on the same data produce the same tree.
dt = DecisionTreeRegressor(
    random_state=42,
)

### Without Outliers (Mean, R^2)

In [5]:
# Train on the outlier-free split. fit() returns the estimator itself, so
# dt_without is a second name for the same fitted object as dt.
dt.fit(X_train, y_train)
dt_without = dt

In [6]:
# Predict the target for the outlier-free test features with the model
# fitted on the outlier-free training split.
y_pred_dt = dt_without.predict(X=X_test)
y_pred_dt  # last expression: shown as the cell output

array([0.79245283, 0.94902145, 0.76983476, 0.63131372, 0.59955467,
       0.80264854, 0.39622642, 0.16219384, 0.12586429, 0.1804758 ,
       0.54107582, 0.65041603, 0.24082972, 0.43654049, 0.48798781,
       0.24082972, 0.46126802, 0.56720966, 0.5074417 , 0.83007149,
       0.53591937, 0.31899684, 0.50533224, 0.78155397, 0.60705496,
       0.65193953, 0.5309973 , 0.804758  , 0.15223251, 0.21235205,
       0.44509551, 0.51048869, 0.8845658 , 0.64373608, 0.79245283,
       0.724833  , 0.87741709, 0.41907887, 0.39212469, 0.21903199,
       0.2101254 , 0.49783195, 0.26133833, 0.804758  , 0.53486464,
       0.63295441, 0.47029181, 0.42669636, 0.32462206, 1.        ,
       0.30903551, 0.89569905, 0.54494316, 0.89569905, 0.62346185,
       0.72307512, 0.23133716, 0.28958162, 0.90132427, 0.88866753,
       0.58080394, 0.66694011, 0.7056135 , 0.04793156, 0.60728935,
       0.47474511, 0.28149537, 0.33669284, 0.62346185, 0.7426462 ,
       0.58748389, 0.76432673, 0.34044299, 0.45962733, 0.28161

In [7]:
# Error metrics for the outlier-free model on its own test split.
mean = metrics.mean_absolute_error(y_test, y_pred_dt)    # MAE
mean2 = metrics.mean_squared_error(y_test, y_pred_dt)    # MSE
rmean2 = np.sqrt(mean2)                                  # RMSE = sqrt(MSE)
r2 = metrics.r2_score(y_test, y_pred_dt)
# score() on a regressor also reports R^2; dt_without is the same fitted
# object as dt at this point, so this matches r2 above.
coefficient = dt_without.score(X_test, y_test)

# One line per metric, in the same order and with the same labels as before.
print(
    'Mean Absolute Error: ' + str(mean),
    'Mean Squared Error: ' + str(mean2),
    'Root Mean Squared Error: ' + str(rmean2),
    'R^2:  ' + str(r2),
    'Coefficient:  ' + str(coefficient),
    sep='\n',
)

Mean Absolute Error: 0.06948843637480451
Mean Squared Error: 0.008135021701381634
Root Mean Squared Error: 0.0901943551525351
R^2:  0.8584918417063379
Coefficient:  0.8584918417063379


In [8]:
# r2 is the coefficient of determination of a regression model, not a
# classification accuracy: it measures the share of target variance the
# model explains and can be negative for a model worse than predicting the
# mean. Label it as what it is.
print('R^2 (% of variance explained): ', r2*100,'%')

Accuracy:  85.84918417063379 %


### With Outliers

In [9]:
# Build a fresh, unfitted estimator with the same hyper-parameters before
# training on the with-outliers split. Calling fit() on the existing object
# would refit it in place, and because dt_without is just another name for
# that object, the outlier-free model would silently be replaced.
dt = DecisionTreeRegressor(**dt.get_params())
dt_with = dt.fit(X_train_with, y_train_with)

In [10]:
# Predict the target for the with-outliers test features using the model
# fitted on the with-outliers training split.
y_pred_dt_with = dt_with.predict(X=X_test_with)
y_pred_dt_with  # last expression: shown as the cell output

array([0.52314544, 0.529591  , 0.22325091, 0.11519981, 0.5059182 ,
       0.41579749, 0.52607524, 0.53474745, 0.69096449, 0.04793156,
       0.18551506, 0.28864409, 0.24082972, 0.61877417, 0.55584203,
       0.45529122, 0.90765264, 0.83054026, 0.37747568, 0.09058948,
       0.88866753, 0.20684402, 0.6138521 , 0.56404547, 0.10523849,
       0.79174968, 0.38239775, 0.60728935, 0.77545998, 0.87144029,
       0.11297316, 0.588773  , 0.80042189, 0.5932263 , 0.28864409,
       0.47767491, 0.72881753, 0.724833  , 0.25430681, 0.30141802,
       0.7352631 , 0.12586429, 0.56744404, 0.43818118, 0.88866753,
       0.804758  , 0.26672917, 0.50580101, 0.04910348, 0.37817884,
       0.76983476, 0.7706551 , 0.79104653, 0.86030704, 0.57635064,
       0.40993789, 0.38673386, 0.80042189, 0.79725771, 0.42212586,
       0.7352631 , 0.48599555, 0.69776163, 1.        , 0.90003516,
       0.55935779, 0.50580101, 0.72307512, 0.79081214, 0.48119067,
       0.82128208, 0.82421188, 0.37407711, 0.79784367, 0.61994

In [11]:
# Error metrics for the with-outliers model on its own test split.
mean_with = metrics.mean_absolute_error(y_test_with, y_pred_dt_with)   # MAE
mean2_with = metrics.mean_squared_error(y_test_with, y_pred_dt_with)   # MSE
rmean2_with = np.sqrt(mean2_with)                                      # RMSE = sqrt(MSE)
r2_with = metrics.r2_score(y_test_with, y_pred_dt_with)
# Score through dt_with, the same handle that produced y_pred_dt_with, so
# both R^2 values are tied to one model; score() on a regressor returns R^2.
coefficient_with = dt_with.score(X_test_with, y_test_with)

# The first label previously read 'Mean: ', which misnamed the metric; it is
# the mean absolute error, labelled the same way as in the outlier-free cell.
print('Mean Absolute Error:', mean_with)
print('Mean Squared Error:', mean2_with)
print('Root Mean Squared Error:', rmean2_with)
print ('R^2: ',r2_with)
print ('Coefficient: ', coefficient_with)

Mean:  0.07826836348039744
Mean Squared Error: 0.011313025698433201
Root Mean Squared Error: 0.10636270821313831
R^2:  0.8063255649138468
Coefficient:  0.8063255649138468


In [12]:
# r2_with is the coefficient of determination of a regression model, not a
# classification accuracy: it measures the share of target variance the
# model explains and can be negative for a model worse than predicting the
# mean. Label it as what it is.
print('R^2 (% of variance explained): ', r2_with*100,'%')

Accuracy:  80.63255649138468 %
