In [1]:
import pandas as pd
import numpy as np
import os
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error

# Tune the cost-complexity pruning strength (ccp_alpha) of a regression tree
# with 10-fold cross-validation, then report the selected value together with
# the fitted tree's mean squared error on the same data it was trained on.

# Read the data. The path is relative to the notebook's working directory.
data = pd.read_csv("test_sample.csv")

# Separate the target column "Y" from the regressors (every other column).
X = data.drop("Y", axis=1)
y = data["Y"]

# Base estimator; random_state is fixed so repeated runs give the same tree.
tree_reg = DecisionTreeRegressor(random_state=0)

# Candidate pruning strengths: 10 evenly spaced values from 0.01 to 0.1.
# NOTE(review): the recorded run selected 0.1, the upper edge of this grid, so
# the true optimum may lie above it -- consider widening the range.
param_grid = {'ccp_alpha': np.linspace(0.01, 0.1, 10)}

# 10-fold cross-validated grid search. scikit-learn maximises the score, hence
# the negated MSE as the scoring function.
grid_search = GridSearchCV(tree_reg, param_grid, cv=10, scoring='neg_mean_squared_error')

# Run the search. With the default refit=True, GridSearchCV finishes by
# refitting a clone of the estimator with the best parameters on all of X, y.
grid_search.fit(X, y)

# Pruning strength with the best mean cross-validated score.
optimal_ccp_alpha = grid_search.best_params_['ccp_alpha']

# Reuse the tree that GridSearchCV already refitted on the full data set
# instead of constructing and fitting an identical one a second time.
tree_reg = grid_search.best_estimator_

# Predict on the training data itself.
y_pred = tree_reg.predict(X)

# In-sample (training) MSE. This is not an out-of-sample error estimate; the
# cross-validated MSE of the selected model is -grid_search.best_score_.
mse = mean_squared_error(y, y_pred)

# Report the selected ccp_alpha and the in-sample MSE of the fitted tree.
print("Optimal ccp_alpha value:", optimal_ccp_alpha)
print("Tree regression MSE:", mse)

Optimal ccp_alpha value: 0.1
Tree regression MSE: 10.971333071187875
