In [2]:
import pandas as pd

# Read the Ames housing dataset; the path is relative to the notebook's
# working directory.
ames_housing = pd.read_csv("../datasets/ames_housing_no_missing.csv")

# Separate the column to predict from the remaining feature columns.
target_name = "SalePrice"
target = ames_housing[target_name]
data = ames_housing.drop(columns=[target_name])

# Hand-picked subset of columns used as numerical features in the first
# experiments of this notebook.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Feature matrix restricted to the numerical columns listed above.
data_numerical = data[numerical_features]


We will compare the generalization performance of a decision tree and a linear regression based on two separate predictive models and evaluate them by 10-fold cross-validation.

We create the models using sklearn.linear_model.LinearRegression and sklearn.tree.DecisionTreeRegressor, with the default parameters for the linear regression and random_state=0 for the decision tree.

A sklearn.preprocessing.StandardScaler scales the numerical data in the linear regression model.

By comparing the cross-validation test scores for both models fold-to-fold, we count the number of times the linear model has a better test score than the decision tree model.

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_validate
from sklearn.tree import DecisionTreeRegressor
import numpy as np

# Linear baseline: standardize the features, then fit a linear regression.
linear_model = make_pipeline(StandardScaler(), LinearRegression())

# 10-fold cross-validation on the numerical features; the fitted pipeline of
# every fold is kept alongside the scores (return_estimator=True).
cv_results_l = cross_validate(
    linear_model,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)

# Mean test score across the folds, shown as the cell output.
cv_results_l["test_score"].mean()

0.7179515064979931

In [4]:
# Decision-tree baseline with a fixed seed so the result is reproducible.
# The tree is wrapped in a single-step pipeline and receives the raw
# (unscaled) numerical features.
tree_model = make_pipeline(DecisionTreeRegressor(random_state=0))

# Same 10-fold cross-validation setup as for the linear model.
cv_results_t = cross_validate(
    tree_model,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)

# Mean test score across the folds, shown as the cell output.
cv_results_t["test_score"].mean()

0.6210257285885292

In [5]:
# Per-fold test scores of the linear model (m_l) and the tree model (m_t).
m_l = cv_results_l["test_score"]
m_t = cv_results_t["test_score"]

# Element-wise comparison, then count the folds where the linear model wins.
linear_wins = (m_l > m_t).sum()
print(
    "Linear regression is better than decision tree for "
    f"{linear_wins} CV iterations out of 10 folds"
)

Linear regression is better than decision tree for 9 CV iterations out of 10 folds


#### Optimizing the tree model
Instead of using the default parameters for the decision tree regressor, we will optimize the max_depth of the tree, varying it from 1 level up to 15 levels. For the evaluation we will use a nested cross-validation with a grid-search (sklearn.model_selection.GridSearchCV), setting cv=10 for both the inner and outer cross-validations.

In [26]:
from sklearn.model_selection import GridSearchCV

# Candidate depths 1, 2, ..., 15 (np.arange excludes the stop value 16).
param_grid = {"max_depth": np.arange(1, 16, 1)}

# Inner loop: 10-fold grid search over max_depth.
tree_model2 = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid=param_grid,
    cv=10,
)

# Outer loop: 10-fold cross-validation of the whole grid search, keeping the
# fitted search object of every outer fold.
cv_results_t2 = cross_validate(
    tree_model2,
    data_numerical,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
print(
    "The generalization performance of the tree model: "
    f"{cv_results_t2['test_score'].mean()}"
)

The generalization performance of the tree model: 0.6966361061945607


In [17]:
# Show the max_depth selected by the grid search fitted in each outer fold.
fitted_searches = cv_results_t2["estimator"]
for fitted_search in fitted_searches:
    print(fitted_search.best_params_)


{'max_depth': 5}
{'max_depth': 7}
{'max_depth': 6}
{'max_depth': 6}
{'max_depth': 8}
{'max_depth': 6}
{'max_depth': 7}
{'max_depth': 8}
{'max_depth': 7}
{'max_depth': 6}


The **optimal depth** ranges from 5 to 8 across the outer cross-validation folds.

#### Tree model including categorical features

Instead of using only the numerical features, we will now use the entire dataset available in the variable `data` and build a preprocessor for it. We use an OrdinalEncoder to encode the categorical columns.

In addition, we set the max_depth of the decision tree to 7 (fixed, no need to tune it with a grid-search) and evaluate this model using cross_validate as in the previous questions.

In [27]:
from sklearn.compose import make_column_selector as selector
from sklearn.preprocessing import OrdinalEncoder
from sklearn.compose import ColumnTransformer


# Split the columns by dtype: object columns are treated as categorical,
# every other column as numerical.
numerical_columns_selector = selector(dtype_exclude=object)
categorical_columns_selector = selector(dtype_include=object)

numerical_columns = numerical_columns_selector(data)
categorical_columns = categorical_columns_selector(data)

# Categories not seen during fit are encoded as -1 instead of raising, so a
# category that appears only in a test fold does not break cross-validation.
categorical_preprocessor = OrdinalEncoder(
    handle_unknown="use_encoded_value",
    unknown_value=-1,
)
numerical_preprocessor = StandardScaler()

# The categorical step is named after the transformer it actually wraps
# (an OrdinalEncoder, not a one-hot encoder), so get_params() and
# named_transformers_ are not misleading.
preprocessor = ColumnTransformer([
    ("ordinal-encoder", categorical_preprocessor, categorical_columns),
    ("standard_scaler", numerical_preprocessor, numerical_columns),
])

In [35]:
# Tree of fixed depth 7 fed by the preprocessor, so that both the numerical
# and the categorical columns of `data` are used.
tree_model3 = make_pipeline(
    preprocessor,
    DecisionTreeRegressor(max_depth=7, random_state=0),
)

# 10-fold cross-validation on the full feature set.
cv_results_t3 = cross_validate(
    tree_model3,
    data,
    target,
    cv=10,
    return_estimator=True,
    n_jobs=2,
)
print(
    "The generalization performance of the tree model with numerical and categorical features: "
    f"{cv_results_t3['test_score'].mean()}"
)

The generalization performance of the tree model with numerical and categorical features: 0.7668234519871657


In [32]:
# Count the folds in which the all-features tree beats the depth-tuned,
# numerical-only tree.
all_features_wins = (
    cv_results_t3["test_score"] > cv_results_t2["test_score"]
).sum()
print(
    "A tree model using both numerical and categorical features is better than a "
    "tree with optimal depth using only numerical features for "
    f"{all_features_wins} CV "
    "iterations out of 10 folds."
)

A tree model using both numerical and categorical features is better than a tree with optimal depth using only numerical features for 9 CV iterations out of 10 folds.


#### Conclusion
A tree model using both numerical and categorical features is better than a tree with optimal depth using only numerical features for 9 CV iterations out of 10 folds.