## Decision Tree

Check the performances of the model using Decision Tree.

1. with default setting
2. with optimized hyperparamters using grid search
3. with best max_depth

In [45]:
import pandas as pd
import numpy as np
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

In [46]:
# import data
PARENT = "Predicting_House_Prices"
path = Path(PARENT).parent / "../Data/X_train_model2.csv"
X_train = pd.read_csv(path)

path2 = Path(PARENT).parent / "../Data/X_valid_model2.csv"
X_valid = pd.read_csv(path2)

path3 = Path(PARENT).parent / "../Data/y_train_model2.csv"
y_train = pd.read_csv(path3)

path4 = Path(PARENT).parent / "../Data/y_valid_model2.csv"
y_valid = pd.read_csv(path4)

In [47]:
X_train = X_train.drop(columns="Unnamed: 0")
X_valid = X_valid.drop(columns="Unnamed: 0")
y_train = y_train.drop(columns="Unnamed: 0")
y_valid = y_valid.drop(columns="Unnamed: 0")

In [5]:
print(X_train.shape, X_valid.shape)

(15421, 1759) (6609, 1759)


### with default setting

In [48]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error

dtree = DecisionTreeRegressor()
dtree.fit(X_train, y_train)

dtree_score = dtree.score(X_valid, y_valid)
dtree_preds = dtree.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, dtree_preds)} \n"
    f" Score : {dtree_score:.5f} \n",
)

 Mean Squared Error : 2671867113435.0376 
 Score : 0.21066 



In [54]:
(mean_squared_error(y_valid, dtree_preds))**0.5

1634584.6914231877

### with optimized hyperparameters using grid search

In [39]:
from sklearn.model_selection import train_test_split

# 30% of training data is used for hyperparameter tuning for efficient search
X_temp, _, y_temp, _ = train_test_split(X_train, y_train, random_state=0, test_size=0.7)
print(len(X_temp), len(y_temp))

4575 4575


In [40]:
from sklearn.model_selection import GridSearchCV

# List Hyperparameters
parameters={"splitter":["best","random"],
            "max_depth" : [1,3,5,7,9,11,12],
            "min_samples_leaf":[1,2,3,4,5,6,7,8,9,10],
            "min_weight_fraction_leaf":[0.1,0.2,0.3,0.4,0.5,0.6,0.7,0.8,0.9],
            "max_features":["auto","log2","sqrt",None],
            "max_leaf_nodes":[None,10,20,30,40,50,60,70,80,90] }

# Create new KNN object
dtree2 = DecisionTreeRegressor()

# Use GridSearch
clf = GridSearchCV(dtree2, param_grid=parameters, scoring='neg_mean_squared_error', cv=3)

# Fit the model
best_model = clf.fit(X_temp, y_temp)

# Print The value of best Hyperparameters
best_model.best_params_

{'max_depth': 7,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'min_samples_leaf': 1,
 'min_weight_fraction_leaf': 0.1,
 'splitter': 'best'}

In [49]:
# decision tree with optimized parameters
dtree2 = DecisionTreeRegressor(max_depth=7, max_features='auto', max_leaf_nodes=None, 
                                min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter='best')
dtree2.fit(X_train, y_train)

dtree2_score = dtree2.score(X_valid, y_valid)
dtree2_preds = dtree2.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, dtree2_preds)} \n"
    f" Score : {dtree2_score:.5f} \n",
)

 Mean Squared Error : 2783654474621.3984 
 Score : 0.17764 



In [50]:
# decision tree with optimized parameters explored before changing preprocessing
dtree3 = DecisionTreeRegressor(max_depth=3, max_features=None, max_leaf_nodes=None, 
                                min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter='best')
dtree3.fit(X_train, y_train)

dtree3_score = dtree3.score(X_valid, y_valid)
dtree3_preds = dtree3.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, dtree3_preds)} \n"
    f" Score : {dtree3_score:.5f} \n",
)

 Mean Squared Error : 2803488084008.795 
 Score : 0.17178 



In [10]:
dtree4 = DecisionTreeRegressor(max_depth=8, max_features=None, max_leaf_nodes=None, 
                                min_samples_leaf=1, min_weight_fraction_leaf=0.1, splitter='best')
dtree4.fit(X_train, y_train)

dtree4_score = dtree4.score(X_valid, y_valid)
dtree4_preds = dtree4.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, dtree4_preds)} \n"
    f" Score : {dtree4_score:.5f} \n",
)

 Score : 0.01513 



## with best max_depth

In [51]:
# only explore 'max_depth'
for max_d in range(1,31):
  model = DecisionTreeRegressor(max_depth=max_d, random_state=42)
  model.fit(X_train, y_train)
  print('The Training Accuracy for max_depth {} is:'.format(max_d), model.score(X_train, y_train))
  print('The Validation Accuracy for max_depth {} is:'.format(max_d), model.score(X_valid, y_valid))
  print('')

The Training Accuracy for max_depth 1 is: 0.21790021892812983
The Validation Accuracy for max_depth 1 is: 0.2418158880413822

The Training Accuracy for max_depth 2 is: 0.3845693203644871
The Validation Accuracy for max_depth 2 is: 0.34907186117044753

The Training Accuracy for max_depth 3 is: 0.47784422576501473
The Validation Accuracy for max_depth 3 is: 0.2194162580625283

The Training Accuracy for max_depth 4 is: 0.5561482959882471
The Validation Accuracy for max_depth 4 is: 0.2651151350321991

The Training Accuracy for max_depth 5 is: 0.6716371648272204
The Validation Accuracy for max_depth 5 is: 0.287952815171007

The Training Accuracy for max_depth 6 is: 0.7248498007812829
The Validation Accuracy for max_depth 6 is: 0.24526496884070936

The Training Accuracy for max_depth 7 is: 0.7721397248337589
The Validation Accuracy for max_depth 7 is: 0.20821641376818012

The Training Accuracy for max_depth 8 is: 0.8120714420253415
The Validation Accuracy for max_depth 8 is: 0.27131348000481

In [52]:
# decision tree with best max_depth
dtree5 = DecisionTreeRegressor(max_depth=2)
dtree5.fit(X_train, y_train)

dtree5_score = dtree5.score(X_valid, y_valid)
dtree5_preds = dtree5.predict(X_valid)

print(
    f" Mean Squared Error : {mean_squared_error(y_valid, dtree5_preds)} \n"
    f" Score : {dtree5_score:.5f} \n",
)

 Mean Squared Error : 2203360682072.455 
 Score : 0.34907 



In [55]:
(mean_squared_error(y_valid, dtree5_preds))**0.5

1484372.1508006188