# Decision Tree - Regression (Bike Rental)

[ch2-decision-trees.ipynb](https://github.com/kyopark2014/ML-Algorithms/blob/main/xgboost/src/ch2-decision-trees.ipynb)

In [1]:
import pandas as pd

# Remote copy of the cleaned bike-rental data. To work offline, point this at
# a local 'bike_rentals_cleaned.csv' instead.
BIKE_RENTALS_CSV = 'https://raw.githubusercontent.com/rickiepark/handson-gb/main/Chapter02/bike_rentals_cleaned.csv'

df_bikes = pd.read_csv(BIKE_RENTALS_CSV)

In [2]:
# Preview the first rows of the loaded frame.
df_bikes.head()

Unnamed: 0,instant,season,yr,mnth,holiday,weekday,workingday,weathersit,temp,atemp,hum,windspeed,cnt
0,1,1.0,0.0,1.0,0.0,6.0,0.0,2,0.344167,0.363625,0.805833,0.160446,985
1,2,1.0,0.0,1.0,0.0,0.0,0.0,2,0.363478,0.353739,0.696087,0.248539,801
2,3,1.0,0.0,1.0,0.0,1.0,1.0,1,0.196364,0.189405,0.437273,0.248309,1349
3,4,1.0,0.0,1.0,0.0,2.0,1.0,1,0.2,0.212122,0.590435,0.160296,1562
4,5,1.0,0.0,1.0,0.0,3.0,1.0,1,0.226957,0.22927,0.436957,0.1869,1600


In [3]:
# Summarize the columns: non-null counts, dtypes and memory usage.
df_bikes.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 731 entries, 0 to 730
Data columns (total 13 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   instant     731 non-null    int64  
 1   season      731 non-null    float64
 2   yr          731 non-null    float64
 3   mnth        731 non-null    float64
 4   holiday     731 non-null    float64
 5   weekday     731 non-null    float64
 6   workingday  731 non-null    float64
 7   weathersit  731 non-null    int64  
 8   temp        731 non-null    float64
 9   atemp       731 non-null    float64
 10  hum         731 non-null    float64
 11  windspeed   731 non-null    float64
 12  cnt         731 non-null    int64  
dtypes: float64(10), int64(3)
memory usage: 74.4 KB


In [4]:
# Total number of missing values across every column of the frame.
df_bikes.isna().sum().sum()

0

### Split train/test dataset

In [5]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X_bikes, y_bikes = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

# Seeded split into training and held-out test sets.
X_train, X_test, y_train, y_test = train_test_split(
    X_bikes,
    y_bikes,
    random_state=2,
)

In [6]:
# Shape of the training features as (rows, columns).
X_train.shape

(548, 12)

### Decision Tree - Regression

In [7]:
from sklearn.tree import DecisionTreeRegressor

# Regression tree with a pinned random_state; it is fitted in later cells.
dt = DecisionTreeRegressor(random_state=2)

In [8]:
from sklearn.model_selection import cross_val_score

# 5-fold cross-validation over the full dataset. The scorer reports negated
# MSE, which is why the values in `scores` come out negative.
scores = cross_val_score(
    dt,
    X_bikes,
    y_bikes,
    cv=5,
    scoring='neg_mean_squared_error',
)

scores

array([-1568188.30612245,  -544181.64383562, -2488944.17808219,
       -1304270.75342466, -2123235.61643836])

### Evaluation

In [9]:
import numpy as np

# The CV scores are negated MSE values: flip the sign, then take the square
# root to obtain one RMSE per fold.
rmse = np.sqrt(-scores)

print(f'Avg. RMSE: {rmse.mean():.2f}')

Avg. RMSE: 1233.36


In [10]:
# Fit the tree on the training split only; the test split stays held out.
dt.fit(X_train, y_train)

DecisionTreeRegressor(random_state=2)

In [11]:
import numpy as np
from sklearn.metrics import mean_squared_error

# Score the fitted tree on the held-out split.
y_pred = dt.predict(X_test)

reg_mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

944.806364062907

In [12]:
# Walk the fitted tree's node arrays. A node whose left and right child
# indices are both -1 is counted as a leaf.
tree = dt.tree_
leaf_ids = [
    node
    for node in range(tree.node_count)
    if tree.children_left[node] == -1 and tree.children_right[node] == -1
]

# Report every leaf that still holds more than one training sample.
for node in leaf_ids:
    if tree.n_node_samples[node] > 1:
        print('Node Index:', node, ', # of Samples:', tree.n_node_samples[node])

leaf_node_count = len(leaf_ids)
print('# of Leaf Nodes:', leaf_node_count)

Node Index: 123 , # of Samples: 2
# of Leaf Nodes: 547


### Hyperparameter

#### GridSearchCV

In [13]:
from sklearn.model_selection import GridSearchCV

# Candidate values for max_depth.
params = {'max_depth': [None, 2, 3, 4, 6, 8, 10, 20]}

dt = DecisionTreeRegressor(random_state=2)

# 5-fold grid search on the training split, scored by negated MSE.
grid_reg = GridSearchCV(
    estimator=dt,
    param_grid=params,
    scoring='neg_mean_squared_error',
    cv=5,
    return_train_score=True,
    n_jobs=-1,
)
grid_reg.fit(X_train, y_train)

best_params = grid_reg.best_params_
print("Best parameters:", best_params)

Best parameters: {'max_depth': 6}


In [14]:
# best_score_ is on the negated-MSE scale used for scoring; convert it to RMSE.
best_score = np.sqrt(-grid_reg.best_score_)

print(f"Best score: {best_score:.3f}")

Best score: 951.398


In [15]:
from sklearn.metrics import mean_squared_error

# Evaluate the estimator selected by the grid search on the held-out split.
best_model = grid_reg.best_estimator_
y_pred = best_model.predict(X_test)

rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
print(f'RMSE: {rmse_test:.3f}')

RMSE: 864.670


### Verify the learned hyperparameter, max_depth

In [16]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Refit a standalone tree with max_depth=6 and score it on the held-out split.
model = DecisionTreeRegressor(max_depth=6, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

model_mse = mean_squared_error(y_test, y_pred)
model_rmse = np.sqrt(model_mse)
# NOTE(review): this is a single hold-out RMSE, not an average. The label is
# kept unchanged so the printed line matches the recorded output.
print(f'Avg. RMSE: {model_rmse:.3f}')

# Count the nodes whose left and right child indices are both -1 (leaves).
tree = model.tree_
leaf_node_count = sum(
    1
    for left, right in zip(tree.children_left, tree.children_right)
    if left == -1 and right == -1
)
print('# of Leaf Nodes:', leaf_node_count)

Avg. RMSE: 864.670
# of Leaf Nodes: 54


#### min_samples_leaf

In [17]:
def grid_search(params, reg=None):
    """Grid-search `params` with 5-fold CV and print a short report.

    Fits a GridSearchCV on the notebook-level X_train / y_train, then prints
    the best parameter combination, the best cross-validated score converted
    to RMSE, and the RMSE of the search's predictions on X_test / y_test.

    Parameters
    ----------
    params : dict
        Parameter grid handed to GridSearchCV.
    reg : estimator, optional
        Estimator to tune. When omitted, a new
        DecisionTreeRegressor(random_state=2) is created for this call.

    Returns
    -------
    None
        The results are printed, not returned.
    """
    # Build the default inside the call: a default written in the signature is
    # created once at definition time and then shared by every call.
    if reg is None:
        reg = DecisionTreeRegressor(random_state=2)

    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)
    grid_reg.fit(X_train, y_train)

    best_params = grid_reg.best_params_
    print("Best parameter:", best_params)

    # best_score_ is negated MSE; flip the sign before taking the root.
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Best score: {:.3f}".format(best_score))

    # Predict through the fitted search object and score on the held-out split.
    y_pred = grid_reg.predict(X_test)
    rmse_test = np.sqrt(mean_squared_error(y_test, y_pred))
    print('RMSE: {:.3f}'.format(rmse_test))

In [18]:
# Tune min_samples_leaf on its own.
grid_search(
    params={
        'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
    }
)

Best parameter: {'min_samples_leaf': 8}
Best score: 896.083
RMSE: 855.620


In [19]:
# Tune max_depth and min_samples_leaf jointly over the full candidate lists.
grid_search(
    params={
        'max_depth': [None, 2, 3, 4, 6, 8, 10, 20],
        'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
    }
)

Best parameter: {'max_depth': 6, 'min_samples_leaf': 2}
Best score: 870.396
RMSE: 913.000


In [20]:
# Repeat the joint search over a narrower set of candidate values.
grid_search(
    params={
        'max_depth': [6, 7, 8, 9, 10],
        'min_samples_leaf': [3, 5, 7, 9],
    }
)

Best parameter: {'max_depth': 9, 'min_samples_leaf': 7}
Best score: 888.905
RMSE: 878.538


### Verify the new hyperparameter, min_samples_leaf

In [21]:
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Refit a standalone tree with min_samples_leaf=8 and score it on the
# held-out split.
model = DecisionTreeRegressor(min_samples_leaf=8, random_state=2)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

model_mse = mean_squared_error(y_test, y_pred)
model_rmse = np.sqrt(model_mse)

# NOTE(review): this is a single hold-out RMSE, not an average. The label is
# kept unchanged so the printed line matches the recorded output.
print(f'Avg. RMSE: {model_rmse:.3f}')

# Collect the nodes whose left and right child indices are both -1 (leaves).
tree = model.tree_
leaf_ids = [
    node
    for node in range(tree.node_count)
    if tree.children_left[node] == -1 and tree.children_right[node] == -1
]
leaf_node_count = len(leaf_ids)
print('# of Leaf Nodes:', leaf_node_count)

Avg. RMSE: 855.620
# of Leaf Nodes: 54
