In [1]:
import pandas as pd

# Name of the column to predict.
target_name = "SalePrice"

# Read the Ames housing table without NA detection, so every cell keeps the
# literal value stored in the CSV.
ames_housing = pd.read_csv(
    "../datasets/ames_housing_no_missing.csv",
    na_filter=False,  # required for pandas>2.0
)

# Separate the prediction target from the feature columns.
target = ames_housing[target_name]
data = ames_housing.drop(columns=[target_name])


In [2]:
# Columns used as the numerical feature subset in the experiments below.
numerical_features = [
    "LotFrontage",
    "LotArea",
    "MasVnrArea",
    "BsmtFinSF1",
    "BsmtFinSF2",
    "BsmtUnfSF",
    "TotalBsmtSF",
    "1stFlrSF",
    "2ndFlrSF",
    "LowQualFinSF",
    "GrLivArea",
    "BedroomAbvGr",
    "KitchenAbvGr",
    "TotRmsAbvGrd",
    "Fireplaces",
    "GarageCars",
    "GarageArea",
    "WoodDeckSF",
    "OpenPorchSF",
    "EnclosedPorch",
    "3SsnPorch",
    "ScreenPorch",
    "PoolArea",
    "MiscVal",
]

# Restrict the feature table to the columns listed above.
data_numerical = data[numerical_features]

In [3]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Linear regression fitted on standardized features; the pipeline is
# configured so that its transformers output pandas DataFrames.
linear_model = make_pipeline(StandardScaler(), LinearRegression())
linear_model = linear_model.set_output(transform="pandas")

# Depth-limited decision tree with a fixed seed.
tree_model = DecisionTreeRegressor(random_state=0, max_depth=3)



In [4]:
from sklearn.model_selection import cross_validate
# 10-fold cross-validation of the linear pipeline, scored with negated MSE.
# Train scores and the fitted estimator of every fold are kept as well.
cv_results = cross_validate(
    estimator=linear_model,
    X=data_numerical,
    y=target,
    scoring="neg_mean_squared_error",
    cv=10,
    return_estimator=True,
    return_train_score=True,
)

In [5]:
# The scorer returns negated MSE, so flip the sign to get per-fold errors.
test_error = -cv_results["test_score"]
mse_mean = test_error.mean()
mse_std = test_error.std()
print(
    "Mean squared error of linear regression model on the test set:\n"
    f"{mse_mean:.2e} ± {mse_std:.2e}"
)

Mean squared error of linear regression model on the test set:
1.83e+09 ± 1.29e+09


In [6]:
# Both models are evaluated with the same 10-fold, negated-MAE protocol.
mae_cv_options = {
    "cv": 10,
    "scoring": "neg_mean_absolute_error",
    "return_train_score": True,
    "return_estimator": True,
}

cv_results_linear = cross_validate(
    linear_model, data_numerical, target, **mae_cv_options
)
cv_results_tree = cross_validate(
    tree_model, data_numerical, target, **mae_cv_options
)

# Flip the sign of the scores to get per-fold MAE, then count the folds in
# which the linear model has the lower error.
linear_mae = -cv_results_linear["test_score"]
tree_mae = -cv_results_tree["test_score"]
sum(linear_mae < tree_mae)

10

In [7]:
# Per-fold test MAE of the decision tree (scores are negated MAE, so flip the sign).
-cv_results_tree["test_score"]

array([32712.27311199, 31978.05518764, 36410.55100452, 36433.48442425,
       38545.84976819, 31686.140671  , 33713.76871368, 34456.45215463,
       36530.49676244, 33264.35932377])

In [8]:
# Per-fold test MAE of the linear model (scores are negated MAE, so flip the sign).
-cv_results_linear["test_score"]

array([25549.50375637, 25048.20115671, 25755.16502856, 26942.54483046,
       27356.87128544, 24821.12657318, 24415.98862711, 26309.08964105,
       32983.62484936, 25164.68932662])

In [17]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate tree depths. np.arange excludes its stop value, so this grid
# covers depths 1 through 14.
# NOTE(review): if depth 15 is meant to be searched too, the stop must be 16 — confirm.
param_grid = {"max_depth": np.arange(1, 15, 1)}

# Seed the tree so that ties between equally good splits are broken the same
# way on every run. Without a fixed random_state the selected depth (and every
# score derived from this search) can change between executions. The seed
# matches the one used for `tree_model`.
tree_cv = GridSearchCV(
    DecisionTreeRegressor(random_state=0),
    param_grid=param_grid,
    cv=10,
)
tree_cv.fit(data_numerical, target)

In [18]:
# Depth selected by the grid search fitted on the numerical features.
best_depth = tree_cv.best_params_["max_depth"]
print(f"Optimal depth found via CV: {best_depth}")

Optimal depth found via CV: 6


In [19]:
# Nested cross-validation: the grid search is re-fitted inside each of the
# 10 outer folds and its refitted best tree is scored with negated MAE.
tree_gridcv_results = cross_validate(
    estimator=tree_cv,
    X=data_numerical,
    y=target,
    scoring="neg_mean_absolute_error",
    cv=10,
    return_estimator=True,
    return_train_score=True,
)


In [20]:
# Raw per-fold scores of the tuned tree; these are negated MAE values (not sign-flipped).
tree_gridcv_results["test_score"]

array([-28118.55947549, -27388.99410932, -30254.25825185, -29609.82296436,
       -28886.80239616, -28487.3524868 , -28904.20639881, -24866.00930184,
       -30461.30978273, -25975.3283891 ])

In [21]:
# Count the folds in which the linear model has a lower MAE than the tuned tree.
linear_mae = -cv_results_linear["test_score"]
tuned_tree_mae = -tree_gridcv_results["test_score"]
sum(linear_mae < tuned_tree_mae)

8

In [26]:
# Question 4
from sklearn.compose import ColumnTransformer
from sklearn.compose import make_column_selector as selector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OrdinalEncoder

# Columns stored with object dtype are treated as categorical.
categorical_columns_selector = selector(dtype_include=object)
categorical_columns = categorical_columns_selector(data)

# Integer-encode the categories; values not seen during fit are encoded as -1.
categorical_preprocessor = OrdinalEncoder(
    unknown_value=-1,
    handle_unknown="use_encoded_value",
)

# Encode the categorical columns and pass every other column through unchanged.
preprocessor = ColumnTransformer(
    transformers=[
        ("cat_preprocessor", categorical_preprocessor, categorical_columns),
    ],
    remainder="passthrough",
)

# Full-dataset model: preprocessing followed by a seeded depth-7 tree.
tree_regression = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("regressor", DecisionTreeRegressor(max_depth=7, random_state=2)),
    ]
)


# Shared evaluation protocol: 10 folds, negated MAE, keep train scores and
# the fitted estimators.
tree_mae_cv_options = {
    "cv": 10,
    "scoring": "neg_mean_absolute_error",
    "return_train_score": True,
    "return_estimator": True,
}

# Tree trained on all columns (categorical ones ordinal-encoded).
tree_cv_results = cross_validate(
    tree_regression, data, target, **tree_mae_cv_options
)

# Same tree configuration trained on the numerical feature subset only.
cv_results_tree_num = cross_validate(
    DecisionTreeRegressor(max_depth=7, random_state=2),
    data_numerical,
    target,
    **tree_mae_cv_options,
)

In [27]:
# Count the folds in which the numerical-only tree has a lower MAE than the
# tree trained on all features.
numerical_only_mae = -cv_results_tree_num["test_score"]
all_features_mae = -tree_cv_results["test_score"]
sum(numerical_only_mae < all_features_mae)

2