# Introduction

In [None]:
"""
What? Decision tree

An ensemble method's prediction is produced by several machine learning models working together.
The individual models that make up the ensemble are called base learners.
Decision trees, the most commonly used base learners, are unique in the machine learning landscape. 
Instead of multiplying column values by numeric weights, as in linear regression and logistic regression,
decision trees split the data by asking questions about the columns.

Corey Wade. “Hands-On Gradient Boosting with XGBoost and scikit-learn.”
https://github.com/PacktPublishing/Hands-On-Gradient-Boosting-with-XGBoost-and-Scikit-learn
"""

# Imports

In [18]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import GridSearchCV

# Classification

In [6]:

# Load the cleaned census data set
df_census = pd.read_csv('../DATASETS/census_cleaned.csv')

# Every column except the last is a feature; the last column is used as the target
X, y = df_census.iloc[:, :-1], df_census.iloc[:, -1]

# Seeded train/test split (no test_size is passed, so scikit-learn's default proportion applies)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=2
)

In [8]:
# Seeded decision tree classifier so the fit is repeatable between runs
clf = DecisionTreeClassifier(random_state=2)

# Learn the tree from the census training split
clf.fit(X_train, y_train)

# Predict labels for the held-out census rows
y_pred = clf.predict(X_test)

# accuracy_score is documented as (y_true, y_pred); accuracy is symmetric, so the
# displayed value is the same, but the documented order keeps this consistent with
# the mean_squared_error(y_true, y_pred) calls later in the notebook
accuracy_score(y_test, y_pred)

0.8131679154894976

# Regression

In [13]:
# Load the cleaned bike-rental data set
df_bikes = pd.read_csv('../DATASETS/bike_rentals_cleaned.csv')

# Every column except the last is a feature; the last column is the regression target
X_bikes, y_bikes = df_bikes.iloc[:, :-1], df_bikes.iloc[:, -1]

# NOTE: this rebinds X_train / X_test / y_train / y_test, which previously held the
# census split from the classification section. Every cell below this point works
# on the bike-rental split.
X_train, X_test, y_train, y_test = train_test_split(
    X_bikes, y_bikes, random_state=2
)

In [14]:
# Seeded decision tree regressor with default hyperparameters
reg = DecisionTreeRegressor(random_state=2)

# 5-fold cross-validation over the full bike-rental data. The scorer returns the
# mean squared error negated, hence the sign flip below.
scores = cross_val_score(
    reg,
    X_bikes,
    y_bikes,
    scoring='neg_mean_squared_error',
    cv=5,
)

# One RMSE per fold: undo the negation, then take the square root
rmse = np.sqrt(-scores)

# Report the RMSE averaged across the folds
print('RMSE mean: %0.2f' % rmse.mean())

RMSE mean: 1233.36


In [None]:
"""
The RMSE is 1233.36. This is worse than the 972.06 obtained from Linear Regression and the 887.31 obtained by
XGBoost. Is the model overfitting the data because the variance is too high? This question may be answered by seeing
how well the decision tree makes predictions on the training set alone.
"""

In [17]:
# Fit on the training split and score on that same split: a near-zero error here,
# set against the much larger cross-validation error, points to overfitting.
# random_state=2 matches the other estimators in this notebook so the cell is
# reproducible on a fresh kernel.
reg = DecisionTreeRegressor(random_state=2)
reg.fit(X_train, y_train)

# Predictions for the very rows the tree was trained on
y_pred = reg.predict(X_train)

# RMSE on the training split
reg_mse = mean_squared_error(y_train, y_pred)
reg_rmse = np.sqrt(reg_mse)
reg_rmse

0.0

In [None]:
"""
An RMSE of 0.0 means that the model has perfectly fit every data point! This perfect score combined with a 
cross-validation error of 1233.36 is proof that the decision tree is overfitting the data with high variance. 
The model fits the training set perfectly but misses badly on held-out data.

Solutions? Hyperparameters may rectify the situation.
"""

# Regression -> hyperparameter tuning

In [None]:
"""
Generally speaking, decreasing max hyperparameters and increasing min hyperparameters will reduce variance
and prevent overfitting.
"""

In [19]:
# Candidate tree depths to compare (None is included alongside the explicit depths)
params = {'max_depth': [None, 2, 3, 4, 6, 8, 10, 20]}

# Seeded base regressor handed to the search
reg = DecisionTreeRegressor(random_state=2)

# 5-fold grid search scored by negated MSE; n_jobs=-1 requests parallel execution
grid_reg = GridSearchCV(
    reg,
    params,
    scoring='neg_mean_squared_error',
    cv=5,
    n_jobs=-1,
)

# Run the search on the bike-rental training split only
grid_reg.fit(X_train, y_train)

# Report the winning setting
best_params = grid_reg.best_params_
print("Best params:", best_params)

Best params: {'max_depth': 6}


In [None]:
"""
As you can see, a max_depth value of 6 resulted in the best cross-validation score on the training set.
Remember that this is done through cross-validation: the reported score is the mean over the folds, which gives
a better estimate of the error.
"""

In [20]:
# best_score_ is on the 'neg_mean_squared_error' scale used by the search, so negate
# it and take the square root to express it as an RMSE.
# NOTE: the printed label says "Training score", but this is the cross-validated
# score obtained by the search on the training split, not a plain training-set fit error.
best_score = np.sqrt(-grid_reg.best_score_)
print("Training score: {:.3f}".format(best_score))

Training score: 951.173


In [27]:
# Estimator selected by the grid search
best_model = grid_reg.best_estimator_

# Predict the target for the held-out bike-rental test split
y_pred = best_model.predict(X_test)

# RMSE on the test split. mean_squared_error is already imported in the notebook's
# import cell, so it is not re-imported here.
rmse_test = mean_squared_error(y_test, y_pred)**0.5

# Print rmse_test
print('Test score: {:.3f}'.format(rmse_test))

Test score: 864.670


In [None]:
"""
Variance has been substantially reduced.
"""

In [22]:
# Create grid_search function
def grid_search(params, reg=None):
    """Run a 5-fold grid search and print the best params, CV RMSE and test RMSE.

    The search is fitted on the notebook-level ``X_train`` / ``y_train`` and the
    selected model is evaluated on the notebook-level ``X_test`` / ``y_test``,
    so the split cell must have been run before calling this function.

    Parameters
    ----------
    params : dict
        Hyperparameter grid passed to ``GridSearchCV``.
    reg : estimator, optional
        Regressor to tune. When omitted, a fresh
        ``DecisionTreeRegressor(random_state=2)`` is created for this call.

    Returns
    -------
    None
        Results are printed, not returned.
    """
    # Build the default estimator per call instead of once at definition time,
    # so no estimator instance is shared between calls
    if reg is None:
        reg = DecisionTreeRegressor(random_state=2)

    # Instantiate GridSearchCV as grid_reg
    grid_reg = GridSearchCV(reg, params, scoring='neg_mean_squared_error', cv=5, n_jobs=-1)

    # Fit grid_reg on X_train and y_train
    grid_reg.fit(X_train, y_train)

    # Extract and print the best params
    best_params = grid_reg.best_params_
    print("Best params:", best_params)

    # best_score_ is a negated MSE; convert it to RMSE. The label reads
    # "Training score", but the value is the cross-validated score on the training split.
    best_score = np.sqrt(-grid_reg.best_score_)
    print("Training score: {:.3f}".format(best_score))

    # Predict test set labels with the fitted search object
    y_pred = grid_reg.predict(X_test)

    # Compute and print the RMSE on the test split
    rmse_test = mean_squared_error(y_test, y_pred)**0.5
    print('Test score: {:.3f}'.format(rmse_test))

In [23]:
# Shape of the training samples, as (rows, columns) of the bike-rental training split.
# The row count is the context for judging the min_samples_leaf candidates tried below.
X_train.shape

(548, 12)

In [24]:
# Tune min_samples_leaf on its own, leaving every other hyperparameter at its default
grid_search(params={
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
})

Best params: {'min_samples_leaf': 8}
Training score: 895.859
Test score: 855.620


In [25]:
# Search max_depth and min_samples_leaf jointly, reusing both earlier candidate lists
grid_search(params={
    'max_depth': [None, 2, 3, 4, 6, 8, 10, 20],
    'min_samples_leaf': [1, 2, 4, 6, 8, 10, 20, 30],
})

Best params: {'max_depth': 6, 'min_samples_leaf': 2}
Training score: 870.381
Test score: 913.000


In [None]:
"""
The result may be a surprise. Even though the training score has improved, the test score has not. min_samples_leaf 
has decreased from 8 to 2, while max_depth has remained the same.
This is a valuable lesson in hyperparameter tuning: Hyperparameters should not be chosen in isolation.
"""

In [26]:
# Narrower joint search: depths 5-9 and leaf sizes 3-9
grid_search(params={
    'max_depth': [5, 6, 7, 8, 9],
    'min_samples_leaf': [3, 5, 7, 9],
})

Best params: {'max_depth': 9, 'min_samples_leaf': 7}
Training score: 888.787
Test score: 878.538
